In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, KBinsDiscretizer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from lightgbm import LGBMClassifier
from category_encoders import OneHotEncoder
from sklearn.model_selection import cross_val_predict
from warnings import filterwarnings
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import os
import matplotlib.pyplot as plt
import seaborn as sns

filterwarnings('ignore')


In [4]:
train = pd.read_csv("./data/train_2.csv")

In [6]:
target = train['target']
scores = []
train = train.drop(["id", "target", "B_15"], axis=1)

In [7]:
app_train = pd.get_dummies(train)
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(app_train)
train_imputed = imp_mean.transform(app_train)
scaler = StandardScaler()
scaler.fit(train_imputed)
train_imputed = scaler.transform(train_imputed)
train_imputed = pd.DataFrame(train_imputed, columns=app_train.columns)

In [9]:
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)
random_forest.fit(train_imputed, target)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.2min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=50, verbose=1, warm_start=False)

In [10]:
features = list(train_imputed.columns)
feature_importance_values = random_forest.feature_importances_
feature_importances = pd.DataFrame({'feature': features, 'importance':feature_importance_values})
print(feature_importances[feature_importances["importance"]>0.004])
features_selected = ["B_1", "B_2", "B_3", "B_4", "B_7", "B_8", "B_9", "B_10", "B_11", "B_12", "B_13", "D_6", "D_13", "D_34", "D_40", "D_56", "D_66", "D_116", "D_138", "D_142", "D_145", "D_166", "B_14_A", "B_14_B"]
chandan_features = ['B_10','B_3','B_12','B_8','B_7','B_4','D_121','D_26','D_17','B_11','D_56','D_138','D_1','D_40','D_166',
            'C_10','D_102','D_132','D_99','C_14','C_3','C_2','D_13','D_34','D_66','D_2','D_142','D_143','D_21','D_156','D_158','D_37','B_9',
            'D_14','C_12','D_28','D_6','D_29','D_54','D_117','C_5','D_86','D_107','D_30']
new_df = pd.DataFrame()
for f in chandan_features:
    new_df[f] = train_imputed[f]

    feature  importance
2       B_1    0.025756
3       B_2    0.006684
4       B_3    0.104691
5       B_4    0.063729
6       B_7    0.016498
7       B_8    0.050068
8       B_9    0.007586
9      B_10    0.320476
10     B_11    0.026757
11     B_12    0.133366
12     B_13    0.005373
63     D_34    0.004072
69     D_40    0.004204
95     D_66    0.004059
171   D_142    0.004050
174   D_145    0.004028
195   D_166    0.004043
205  B_14_A    0.010597
206  B_14_B    0.009808


In [49]:
def comparison(df, labels, scores):
    lr = LogisticRegression(n_jobs=-1)
    gnb = GaussianNB()
    lgbm = LGBMClassifier(n_jobs=-1,eta=0.01,max_depth=4)
    xgb = XGBClassifier(n_jobs=-1, nthreads=-1)
    models = [lr, gnb, lgbm, xgb]
    for model in models:
        scores.append(cross_val_predict(model, df, labels, cv=5, method="predict_proba", n_jobs=-1, verbose=20))
    

In [None]:
comparison(new_df, target, scores)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


In [38]:
for oof in scores:
    print(roc_auc_score(target, oof[:,1]))

0.6202676679532161
0.5804651745342949
0.6338427869548475
0.632673299288413


In [1]:
from ludwig import LudwigModel


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [17]:
model_parameters = {"input_features": [{"name": "B_10", "type": "numerical"}, {"name": "B_3", "type": "numerical"}, {"name": "B_12", "type": "numerical"}, {"name": "B_8", "type": "numerical"}, {"name": "B_7", "type": "numerical"}, {"name": "B_4", "type": "numerical"}, {"name": "D_121", "type": "numerical"}, {"name": "D_26", "type": "numerical"}, {"name": "D_17", "type": "numerical"}, {"name": "B_11", "type": "numerical"}, {"name": "D_56", "type": "numerical"}, {"name": "D_138", "type": "numerical"}, {"name": "D_1", "type": "numerical"}, {"name": "D_40", "type": "numerical"}, {"name": "D_166", "type": "numerical"}, {"name": "C_10", "type": "numerical"}, {"name": "D_102", "type": "numerical"}, {"name": "D_132", "type": "numerical"}, {"name": "D_99", "type": "numerical"}, {"name": "C_14", "type": "numerical"}, {"name": "C_3", "type": "numerical"}, {"name": "C_2", "type": "numerical"}, {"name": "D_13", "type": "numerical"}, {"name": "D_34", "type": "numerical"}, {"name": "D_66", "type": "numerical"}, {"name": "D_2", "type": "numerical"}, {"name": "D_142", "type": "numerical"}, {"name": "D_143", "type": "numerical"}, {"name": "D_21", "type": "numerical"}, {"name": "D_156", "type": "numerical"}, {"name": "D_158", "type": "numerical"}, {"name": "D_37", "type": "numerical"}, {"name": "B_9", "type": "numerical"}, {"name": "D_14", "type": "numerical"}, {"name": "C_12", "type": "numerical"}, {"name": "D_28", "type": "numerical"}, {"name": "D_6", "type": "numerical"}, {"name": "D_29", "type": "numerical"}, {"name": "D_54", "type": "numerical"}, {"name": "D_117", "type": "numerical"}, {"name": "C_5", "type": "numerical"}, {"name": "D_86", "type": "numerical"}, {"name": "D_107", "type": "numerical"}, {"name": "D_30", "type": "numerical"}], "output_features": [{"name": "target", "type": "numerical"}]}

In [None]:
new_df["target"] = target 

In [25]:
model = LudwigModel(model_parameters)
train_stats = model.train(new_df)

Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Use tf.cast instead.


In [26]:
model.save("/home/jupyter/kaggle_days_sf_comp/nn_model")

In [27]:
test = pd.read_csv("./data/test_2.csv")

In [29]:
chandan_features = ['B_10','B_3','B_12','B_8','B_7','B_4','D_121','D_26','D_17','B_11','D_56','D_138','D_1','D_40','D_166',
            'C_10','D_102','D_132','D_99','C_14','C_3','C_2','D_13','D_34','D_66','D_2','D_142','D_143','D_21','D_156','D_158','D_37','B_9',
            'D_14','C_12','D_28','D_6','D_29','D_54','D_117','C_5','D_86','D_107','D_30']
test_df = pd.DataFrame()
for f in chandan_features:
    test_df[f] = test[f]

In [30]:
predictions = model.predict(test_df)

In [31]:
predictions

Unnamed: 0,target_predictions
0,-0.001214
1,-0.004780
2,-0.001777
3,-0.006437
4,-0.006558
5,-0.002141
6,-0.004893
7,-0.003272
8,-0.005046
9,-0.001502
