In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import roc_auc_score, confusion_matrix, log_loss, accuracy_score
from sklearn.preprocessing import LabelEncoder, RobustScaler, KBinsDiscretizer, PolynomialFeatures
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE
from lightgbm import LGBMClassifier
from category_encoders import OneHotEncoder
from sklearn.model_selection import cross_val_predict, RepeatedStratifiedKFold, GridSearchCV, StratifiedKFold
from warnings import filterwarnings
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn import svm
import os
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import time
from datetime import datetime
filterwarnings('ignore')


In [4]:
# Load data
do_sample = False
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
print("train data shape", train.shape)
print("test data shape", test.shape)


train data shape (250, 302)
test data shape (19750, 301)


In [5]:
train_input = train.drop(['id','target'],axis = 1)
test_input  = test.drop(['id'],axis = 1)
train_cols = train_input.columns
test_cols = test_input.columns

train_labels = train['target']

#app_train = pd.get_dummies(train_input)
#app_test = pd.get_dummies(test_input)

#imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
#imp_mean.fit(app_train)
#train_imputed = imp_mean.transform(app_train)
#test_imputed = imp_mean.transform(app_test)

scaler = RobustScaler()
scaler.fit(train_input)
train_imputed = scaler.transform(train_input)
test_imputed = scaler.transform(test_input)
train_df = pd.DataFrame(train_imputed, columns = train_cols)
test_df = pd.DataFrame(test_imputed, columns = test_cols)

In [7]:
poly = PolynomialFeatures()
train_df = poly.fit_transform(train_df)
test_df = poly.fit_transform(test_df)

MemoryError: 

In [21]:
rfe = RFE(LogisticRegression(n_jobs=-1, penalty='l1', C=0.2, class_weight='balanced', solver='saga'),n_features_to_select=200)
rfe = rfe.fit(train_df, train_labels)

In [25]:
predictions = rfe.predict_proba(test_df)[:,1]

In [26]:
predictions

array([0.74124924, 0.5429478 , 0.63459652, ..., 0.37093393, 0.9215228 ,
       0.23987409])

In [4]:
def cross_validation(model_, train_, target_):
    
    clfs = []
    folds = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 20, random_state = 42)
    
    valid_pred = pd.DataFrame(index = train_.index)
    
    # Cross-validation cycle
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(target_, target_)):
        print('--- Fold {} started at {}'.format(n_fold, time.ctime()))
        
        train_x, train_y = train_.iloc[train_idx], target_.iloc[train_idx]
        valid_x, valid_y = train_.iloc[valid_idx], target_.iloc[valid_idx]
        
        
        model_.fit(train_x, train_y)
    
        

        predict = model_.predict_proba(valid_x)[:, 1]
    
        tn, fp, fn, tp = confusion_matrix(valid_y, (predict >= .5) * 1).ravel()
        auc = roc_auc_score(valid_y, predict)
        acc = accuracy_score(valid_y, (predict >= .5) * 1)
        loss = log_loss(valid_y, predict)
        print('TN =', tn, 'FN =', fn, 'FP =', fp, 'TP =', tp)
        print('AUC = ', auc, 'Loss =', loss, 'Acc =', acc)
        
        #valid_pred[n_fold] = pd.Series(predict, index = valid_x.index)
        if auc > 0.8:
            clfs.append(model_)
        del train_x, train_y, valid_x, valid_y, predict
        gc.collect()

    return clfs

In [5]:
logreg = LogisticRegression(n_jobs=-1, penalty='l1', C=0.2, class_weight='balanced', solver='saga')

In [6]:
models = cross_validation(logreg, train_df, train_labels)

--- Fold 0 started at Thu Apr 25 11:43:09 2019
TN = 11 FN = 6 FP = 7 TP = 26
AUC =  0.8194444444444444 Loss = 0.5024640675658794 Acc = 0.74
--- Fold 1 started at Thu Apr 25 11:43:09 2019
TN = 13 FN = 12 FP = 5 TP = 20
AUC =  0.7864583333333333 Loss = 0.5814435029083824 Acc = 0.66
--- Fold 2 started at Thu Apr 25 11:43:09 2019
TN = 11 FN = 7 FP = 7 TP = 25
AUC =  0.8194444444444444 Loss = 0.5197987969722148 Acc = 0.72
--- Fold 3 started at Thu Apr 25 11:43:09 2019
TN = 8 FN = 6 FP = 10 TP = 26
AUC =  0.7708333333333333 Loss = 0.536646283607311 Acc = 0.68
--- Fold 4 started at Thu Apr 25 11:43:09 2019
TN = 8 FN = 4 FP = 10 TP = 28
AUC =  0.7725694444444444 Loss = 0.529391744026143 Acc = 0.72
--- Fold 5 started at Thu Apr 25 11:43:10 2019
TN = 11 FN = 9 FP = 7 TP = 23
AUC =  0.7690972222222222 Loss = 0.568784872406496 Acc = 0.68
--- Fold 6 started at Thu Apr 25 11:43:10 2019
TN = 10 FN = 8 FP = 8 TP = 24
AUC =  0.717013888888889 Loss = 0.6096012264825561 Acc = 0.68
--- Fold 7 started at T

--- Fold 59 started at Thu Apr 25 11:43:21 2019
TN = 13 FN = 15 FP = 5 TP = 17
AUC =  0.734375 Loss = 0.6204848965399103 Acc = 0.6
--- Fold 60 started at Thu Apr 25 11:43:21 2019
TN = 10 FN = 7 FP = 8 TP = 25
AUC =  0.7482638888888888 Loss = 0.5656277824409117 Acc = 0.7
--- Fold 61 started at Thu Apr 25 11:43:21 2019
TN = 13 FN = 10 FP = 5 TP = 22
AUC =  0.7291666666666667 Loss = 0.57680817166554 Acc = 0.7
--- Fold 62 started at Thu Apr 25 11:43:22 2019
TN = 12 FN = 7 FP = 6 TP = 25
AUC =  0.828125 Loss = 0.4815990190253265 Acc = 0.74
--- Fold 63 started at Thu Apr 25 11:43:22 2019
TN = 11 FN = 4 FP = 7 TP = 28
AUC =  0.795138888888889 Loss = 0.5398418800202718 Acc = 0.78
--- Fold 64 started at Thu Apr 25 11:43:22 2019
TN = 13 FN = 9 FP = 5 TP = 23
AUC =  0.795138888888889 Loss = 0.5582965902239816 Acc = 0.72
--- Fold 65 started at Thu Apr 25 11:43:22 2019
TN = 12 FN = 7 FP = 6 TP = 25
AUC =  0.7777777777777777 Loss = 0.5193758780807347 Acc = 0.74
--- Fold 66 started at Thu Apr 25 11:4

In [10]:
predict = pd.DataFrame()
i = 0
for model in models:
    predict[i] = model.predict_proba(test_df)[:,1]
    i += 1
predictions = predict.mean(axis=1)

In [11]:
predictions

0        0.641045
1        0.395032
2        0.725760
3        0.804511
4        0.718820
5        0.373126
6        0.226903
7        0.174661
8        0.664140
9        0.132114
10       0.728307
11       0.447155
12       0.527075
13       0.489219
14       0.304647
15       0.732668
16       0.554350
17       0.718524
18       0.679486
19       0.405016
20       0.473495
21       0.649419
22       0.467863
23       0.557912
24       0.326506
25       0.556473
26       0.048270
27       0.855978
28       0.686605
29       0.667381
           ...   
19720    0.365093
19721    0.916168
19722    0.630523
19723    0.114699
19724    0.744705
19725    0.607666
19726    0.879052
19727    0.632335
19728    0.577922
19729    0.893650
19730    0.373457
19731    0.178191
19732    0.758980
19733    0.384564
19734    0.775695
19735    0.403274
19736    0.744211
19737    0.735188
19738    0.843716
19739    0.622005
19740    0.678477
19741    0.706220
19742    0.372368
19743    0.112050
19744    0

In [27]:
sub = pd.read_csv("data/sample_submission.csv")
print(predictions.shape)
print(sub.shape)
sub['target'] = predictions
sub.to_csv("results/submit_results.csv", index=False)
print('done')

(19750,)
(19750, 2)
done
