In [28]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, KBinsDiscretizer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from lightgbm import LGBMClassifier
from category_encoders import OneHotEncoder
from sklearn.model_selection import cross_val_predict
from warnings import filterwarnings
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import os
import matplotlib.pyplot as plt
import seaborn as sns

filterwarnings('ignore')


In [29]:
train = pd.read_csv("./data/features.csv")
labels = pd.read_csv("./data/train_2.csv")

In [30]:
target = labels['target']
scores = []
train = train.drop("Unnamed: 0", axis=1)

In [34]:
train["B15"]

KeyError: 'B15'

In [14]:
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)
random_forest.fit(train, target)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=50, verbose=1, warm_start=False)

In [35]:
features = list(train.columns)
feature_importance_values = random_forest.feature_importances_
feature_importances = pd.DataFrame({'feature': features, 'importance':feature_importance_values})
print(feature_importances[feature_importances["importance"]>0.004])
features_selected = ["B_1", "B_2", "B_3", "B_4", "B_7", "B_8", "B_9", "B_10", "B_11", "B_12", "B_13", "D_6", "D_13", "D_34", "D_40", "D_56", "D_66", "D_116", "D_138", "D_142", "D_145", "D_166", "B_14_A", "B_14_B"]
chandan_features = ['B_10','B_3','B_12','B_8','B_7','B_4','D_121','D_26','D_17','B_11','D_56','D_138','D_1','D_40','D_166',
            'C_10','D_102','D_132','D_99','C_14','C_3','C_2','D_13','D_34','D_66','D_2','D_142','D_143','D_21','D_156','D_158','D_37','B_9',
            'D_14','C_12','D_28','D_6','D_29','D_54','D_117','C_5','D_86','D_107','D_30']
new_df = pd.DataFrame()
for f in chandan_features:
    new_df[f] = train[f]

    feature  importance
2       B_1    0.025914
3       B_2    0.006413
4       B_3    0.101107
5       B_4    0.062924
6       B_7    0.015251
7       B_8    0.049039
8       B_9    0.006556
9      B_10    0.321729
10     B_11    0.026659
11     B_12    0.131798
12     B_13    0.005916
35      D_6    0.004015
42     D_13    0.004005
63     D_34    0.004204
69     D_40    0.004120
85     D_56    0.004085
95     D_66    0.004150
145   D_116    0.004018
167   D_138    0.004138
171   D_142    0.004184
174   D_145    0.004078
195   D_166    0.004022
207  B_14_A    0.010507
208  B_14_B    0.009526


In [36]:
def comparison(df, labels, scores):
    lr = LogisticRegression(n_jobs=-1)
    gnb = GaussianNB()
    lgbm = LGBMClassifier(n_jobs=-1,eta=0.01,max_depth=4)
    xgb = XGBClassifier(n_jobs=-1, nthreads=-1)
    models = [lr, gnb, lgbm, xgb]
    for model in models:
        scores.append(cross_val_predict(model, df, labels, cv=5, method="predict_proba", n_jobs=-1, verbose=20))
    

In [None]:
comparison(new_df, target, scores)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   11.2s remaining:   16.8s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   12.2s remaining:    8.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.3s remaining:    3.5s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    2.3s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=

In [26]:
for oof in scores:
    print(roc_auc_score(target, oof[:,1]))

0.6167784590332982
0.5818974079154993
0.625689791762019
0.6235947601130992
