In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, KBinsDiscretizer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from lightgbm import LGBMClassifier
from category_encoders import OneHotEncoder
from sklearn.model_selection import cross_val_predict
from warnings import filterwarnings
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn import svm
import os
import matplotlib.pyplot as plt
import seaborn as sns

filterwarnings('ignore')


In [None]:
# Import the top-level package so its version can be inspected below.
import sklearn
# NOTE(review): this repeats the sklearn.preprocessing import from the first cell.
from sklearn.preprocessing import LabelEncoder, StandardScaler, KBinsDiscretizer

# Last expression of the cell: shows the installed scikit-learn version.
sklearn.__version__

In [None]:
# Load the train/test CSVs; flip `do_sample` to True to work on a reproducible
# 10% subset of the training rows while iterating.
do_sample = False

train = pd.read_csv("data/train.csv")
if do_sample:
    train = train.sample(frac=0.1, random_state=1)

test = pd.read_csv("data/test.csv")

print("train data shape", train.shape)
print("test data shape", test.shape)


In [None]:
# train["A_avg"] = (train["A_1"]+train["A_2"]) / 2
# for i in range(165):
#     train["XD_"+str(i+1)+str(i+2)] = (train["D_"+str(i+1)]+train["D_"+str(i+2)])/2 
    
# train.fillna(value=-1)
# test.fillna(value=-1)

# target_column = "target"
# id_column = "id"
# categorical_cols = [c for c in test.columns if test[c].dtype in [np.object]]
# numerical_cols = [c for c in test.columns if test[c].dtype in [np.float, np.int] and c not in [target_column, id_column]]
# print("Number of features", len(categorical_cols)+len(numerical_cols))


In [None]:
# Build the numeric matrices used for the random-forest feature ranking:
# drop identifier/target columns, one-hot encode, mean-impute, then standardise.
train_input = train.drop(['id', 'target', 'B_15'], axis=1)
test_input = test.drop(['id', 'B_15'], axis=1)

train_labels = train['target']

app_train = pd.get_dummies(train_input)
# get_dummies is applied to each frame independently, so a category level that
# appears in only one of them yields a different column set (and order).
# Re-index the test dummies onto the training columns: levels unseen in test
# become all-zero columns, levels unseen in train are dropped. Without this the
# imputer/scaler fitted on train cannot be applied to test reliably.
app_test = pd.get_dummies(test_input).reindex(columns=app_train.columns, fill_value=0)

# Fit the imputer on the training data only and reuse its statistics for test.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
train_imputed = imp_mean.fit_transform(app_train)
test_imputed = imp_mean.transform(app_test)

# Same rule for scaling: statistics come from train, test is only transformed.
scaler = StandardScaler()
train_imputed = scaler.fit_transform(train_imputed)
test_imputed = scaler.transform(test_imputed)


In [None]:

# Fit a random forest on the imputed/scaled training matrix and collect the
# importance it assigns to every one-hot encoded feature.
features = app_train.columns.tolist()

random_forest = RandomForestClassifier(
    n_estimators=100,
    random_state=50,
    verbose=1,
    n_jobs=-1,
)
random_forest.fit(train_imputed, train_labels)

feature_importance_values = random_forest.feature_importances_
feature_importances = pd.DataFrame(
    {'feature': features, 'importance': feature_importance_values}
)


In [None]:
def plot_feature_importances(df, top_n=15):
    """Plot the most important features and return the ranked table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'feature' column and an 'importance' column.
    top_n : int, default 15
        Number of top-ranked features to draw in the bar chart.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` sorted by descending importance, with the old index
        kept as an 'index' column and a new 'importance_normalized' column
        (each importance divided by the sum of all importances).
    """
    # Sort features according to importance; sort_values returns a new frame,
    # so the caller's DataFrame is not modified.
    df = df.sort_values('importance', ascending=False).reset_index()

    # Normalise the feature importances to add up to one
    df['importance_normalized'] = df['importance'] / df['importance'].sum()

    top = df.head(top_n)
    # Reversed positions put the most important feature at the top of the chart.
    positions = list(reversed(range(len(top))))

    # Make a horizontal bar chart of feature importances
    plt.figure(figsize=(10, 6))
    ax = plt.subplot()
    ax.barh(positions,
            top['importance_normalized'],
            align='center', edgecolor='k')

    # Set the yticks and labels
    ax.set_yticks(positions)
    ax.set_yticklabels(top['feature'])

    # Plot labeling
    plt.xlabel('Normalized Importance')
    plt.title('Feature Importance')
    plt.show()

    return df
# Draw the chart and keep the returned table, which is sorted by importance and
# carries the 'importance_normalized' column used for feature selection below.
feature_importances_sorted = plot_feature_importances(feature_importances)

In [None]:
# Keep only the features whose normalised importance exceeds the threshold.
fe_threshold = 0.001
selected_features = feature_importances_sorted.loc[
    feature_importances_sorted['importance_normalized'] > fe_threshold,
    'feature',
].values
print(selected_features)

In [None]:
# Wrap the imputed/scaled arrays back into labelled DataFrames and keep only the
# selected features.
# NOTE: this cell rebinds train_imputed/test_imputed, so re-run the
# preprocessing cell before re-running this one.
train_imputed = pd.DataFrame(train_imputed, columns=app_train.columns)
test_imputed = pd.DataFrame(test_imputed, columns=app_test.columns)

# Attach the target as a column (axis=1). The default axis=0 would stack the
# labels underneath the features as extra rows. The labels' index is reset so
# it lines up positionally with the fresh RangeIndex of train_imputed even when
# `train` was sub-sampled and carries a non-contiguous index.
train_imputed = pd.concat(
    [train_imputed[selected_features], train_labels.reset_index(drop=True)],
    axis=1,
)
test_imputed = test_imputed[selected_features]


In [None]:
# Column roles and the shared preprocessing step used by every model pipeline.
target_column = 'target'
id_column = 'id'

# The np.object / np.float / np.int aliases were removed from NumPy, so the
# dtypes are tested directly: `object` for categorical columns, and dtype kind
# 'f' (float) or 'i' (signed integer) for numerical ones.
categorical_cols = [c for c in test.columns if test[c].dtype == object]
numerical_cols = [
    c for c in test.columns
    if test[c].dtype.kind in 'fi' and c not in [target_column, id_column]
]

# make_column_transformer expects (transformer, columns) tuples.
# Numerical columns are imputed (SimpleImputer defaults) then standardised;
# categorical columns are one-hot encoded.
preprocess = make_column_transformer(
    (make_pipeline(SimpleImputer(), StandardScaler()), numerical_cols),
    (OneHotEncoder(), categorical_cols))

In [15]:
# Alternative preprocessing definition: scale the numerical columns and one-hot
# encode the categorical ones (no imputation step here).
column_transformer = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols),
    ]
)

In [33]:
# LightGBM pipeline. `eta` is an alias of `learning_rate` and `num_iterations`
# an alias of `n_estimators`; only the canonical scikit-learn names are passed
# so each setting is specified once, with the same values as before.
classifier_lgbm = make_pipeline(
    preprocess,
    LGBMClassifier(
        n_jobs=-1,
        max_depth=5,
        max_bin=512,
        learning_rate=0.01,
        n_estimators=1000,
    ),
)

In [100]:
# XGBoost pipeline. Parallelism is controlled by `n_jobs`; the former
# `nthreads=-1` keyword is not an XGBoost parameter (the legacy name is
# `nthread`) and has been removed.
classifier_xgb = make_pipeline(
    preprocess,
    XGBClassifier(n_jobs=-1)
)

In [101]:
# Logistic-regression pipeline on top of the shared preprocessing step.
classifier_lr = make_pipeline(preprocess, LogisticRegression(n_jobs=-1))

In [102]:
# Gaussian naive Bayes pipeline on top of the shared preprocessing step.
classifier_gnb = make_pipeline(preprocess, GaussianNB())

In [None]:
classifier_rf = make_pipeline(preprocess, RandomForestClassifier(random_state=42))
# probability=True is required here: soft voting averages predict_proba outputs,
# and SVC only exposes predict_proba when probability estimates are enabled.
classifier_svm = make_pipeline(preprocess, svm.SVC(probability=True, random_state=42))
#pipe_lr_pca = make_pipeline(column_transformer, LogisticRegression(random_state=42))
#pipe_rf_pca = make_pipeline(column_transformer,RandomForestClassifier(random_state=42))
#pipe_svm_pca = make_pipeline( svm.SVC(column_transformer,random_state=42))

# Soft-voting ensemble over all model pipelines defined above.
voting_classifier = VotingClassifier(
    estimators=[
        ('lgbm', classifier_lgbm),
        ('xgb', classifier_xgb),
        ('rf', classifier_rf),
        ('lr', classifier_lr),
        ('gnb', classifier_gnb),
        ('classifier_svm', classifier_svm),
    ],
    voting='soft',
    n_jobs=-1,
)

In [None]:
# Out-of-fold class probabilities for the LightGBM pipeline (5-fold CV).
# The full `train` frame is passed as X; the pipeline's column transformer picks
# the numerical/categorical columns it was configured with.
oof_pred = cross_val_predict(
    classifier_lgbm,
    train,
    y=train[target_column],
    cv=5,
    n_jobs=-1,
    verbose=20,
    method="predict_proba",
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.


In [None]:
# Score the out-of-fold probabilities of the positive class (column 1), then
# refit the pipeline on the full training frame for the final predictions.
print(
    "Cross validation AUC {:.4f}".format(
        roc_auc_score(train[target_column], oof_pred[:, 1])
    )
)
classifier_lgbm.fit(train, train[target_column])


In [None]:
# Predict the positive-class probability for every test row and write the
# submission file in the layout of the provided sample submission.
test_preds = classifier_lgbm.predict_proba(test)[:, 1]
sub = pd.read_csv("data/sample_submission.csv")

# Predictions are assigned positionally, so the row counts must agree.
# NOTE(review): this assumes sample_submission.csv lists ids in the same order
# as test.csv - confirm against the data files.
if len(sub) != len(test_preds):
    raise ValueError(
        "sample submission has {} rows but {} predictions were produced".format(
            len(sub), len(test_preds)
        )
    )
sub[target_column] = test_preds

# Make sure the output directory exists so to_csv does not fail on a fresh checkout.
os.makedirs("results", exist_ok=True)
sub.to_csv("results/submit_results.csv", index=False)
print('done')

In [None]:
%%bash
# Upload the generated CSV to the kaggledays-sf-hackathon competition with the
# submission message "crawl".
# NOTE(review): presumably requires the kaggle CLI to be installed and
# authenticated in this environment - confirm before running.
kaggle competitions submit -c kaggledays-sf-hackathon -f ./results/submit_results.csv -m "crawl"