In [1]:
'''Import basic modules'''
import pandas as pd
import numpy as np
import string

'''Display markdown formatted output like bold, italic bold etc.'''
from IPython.display import Markdown
def bold(string):
    """Render ``string`` as Markdown in the notebook output area.

    The caller supplies the Markdown markup itself (e.g. ``'**text**'``);
    this function only wraps the text in a ``Markdown`` object and displays it.
    Note that the parameter name shadows the imported ``string`` module inside
    this function body.
    """
    rendered = Markdown(string)
    display(rendered)

# Ignore deprecation, future, and user warnings.
import warnings as wrn

# Register one 'ignore' filter per category, in the same order as before.
for _category in (DeprecationWarning, FutureWarning, UserWarning):
    wrn.filterwarnings('ignore', category=_category)

In [3]:
%%time

# Load data: read the train and test CSV files from the sibling data directory.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Report (rows, columns) of each frame, one per line.
print(train.shape, test.shape, sep='\n')

(600000, 25)
(400000, 24)
CPU times: user 3.42 s, sys: 197 ms, total: 3.61 s
Wall time: 3.29 s


In [4]:
'''Variable Description'''
def description(df):
    """Build a one-row-per-column summary of ``df``.

    Prints the frame's shape, then returns a DataFrame with these columns:

    - ``Name``: column name
    - ``dtypes``: column dtype
    - ``Missing``: number of null entries in the column
    - ``Uniques``: number of distinct non-null values
    - ``First Value`` / ``Second Value`` / ``Third Value``: the column's value
      in rows 0, 1 and 2 of ``df``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        The summary, indexed 0..n_columns-1.
    """
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame({
        'Name': list(df.columns),
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
    })
    sample_labels = ('First Value', 'Second Value', 'Third Value')
    for position, label in enumerate(sample_labels):
        # Frames shorter than three rows have no such row; fill with NaN
        # instead of letting iloc raise IndexError.
        summary[label] = df.iloc[position].values if position < len(df) else np.nan
    return summary
# Show a bold heading, then the per-column summary of the training frame.
# description(train) is the cell's last expression, so its return value is
# what the notebook renders as the cell output.
bold('**Variable Description of  train Data:**')
description(train)

**Variable Description of  train Data:**

Dataset Shape: (600000, 25)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value,Third Value
0,id,int64,0,600000,0,1,2
1,bin_0,float64,17894,2,0,1,0
2,bin_1,float64,18003,2,0,1,1
3,bin_2,float64,17930,2,0,0,0
4,bin_3,object,18014,2,F,F,F
5,bin_4,object,18047,2,N,Y,N
6,nom_0,object,18252,3,Red,Red,Red
7,nom_1,object,18156,6,Trapezoid,Star,
8,nom_2,object,18035,6,Hamster,Axolotl,Hamster
9,nom_3,object,18121,6,Russia,,Canada


In [5]:
def replace_nan(data):
    """Fill missing values in every column of ``data`` with that column's mode.

    The frame is modified in place and nothing is returned. When a column has
    several modes, the first one returned by ``Series.mode()`` is used.
    Columns without missing values are left untouched, as are columns that
    contain only missing values (they have no mode to fill with).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute; mutated in place.
    """
    for column in data.columns:
        series = data[column]
        if not series.isna().any():
            continue
        modes = series.mode()
        if modes.empty:
            # All values are missing: mode() is empty, so indexing [0] would
            # raise. Leave the column as it is.
            continue
        data[column] = series.fillna(modes.iloc[0])


# Impute missing values in place, column by column, with the column's mode.
# NOTE(review): each frame is filled from its own modes, so test is imputed
# with test statistics rather than train's — confirm this is intended.
replace_nan(train)
replace_nan(test)

In [6]:
%%time

# Subset: pull the label and id columns out of the frames (in place), leaving
# only feature columns behind. Note this cell is not re-runnable on its own:
# once the columns are removed, a second run raises KeyError.
target = train.pop('target')
train_id = train.pop('id')
test_id = test.pop('id')

# Report the remaining (rows, columns) of each frame, one per line.
print(train.shape, test.shape, sep='\n')

(600000, 23)
(400000, 23)
CPU times: user 98.8 ms, sys: 52.1 ms, total: 151 ms
Wall time: 154 ms


In [7]:
%%time

# One Hot Encode
# Stack train on top of test so both splits are encoded into the same set of
# dummy columns.
traintest = pd.concat([train, test])
dummies = pd.get_dummies(
    traintest,
    columns=traintest.columns,
    drop_first=True,
    sparse=True,
)

# Split back by position: the first len(train) rows came from train, the
# remainder from test.
train_ohe = dummies.iloc[:len(train)]
test_ohe = dummies.iloc[len(train):]

print(train_ohe.shape, test_ohe.shape, sep='\n')

(600000, 5678)
(400000, 5678)
CPU times: user 4min 17s, sys: 8.91 s, total: 4min 26s
Wall time: 2min 46s


In [8]:
%%time
# Convert the sparse-backed dataframes to SciPy CSR matrices.
# The names are rebound from DataFrame to matrix, so this cell cannot be run
# twice without re-running the one-hot-encoding cell first.
train_ohe, test_ohe = (
    train_ohe.sparse.to_coo().tocsr(),
    test_ohe.sparse.to_coo().tocsr(),
)
type(train_ohe)

CPU times: user 1.74 s, sys: 63.8 ms, total: 1.8 s
Wall time: 1.6 s


scipy.sparse.csr.csr_matrix

In [10]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score as auc
from sklearn.linear_model import LogisticRegression
import optuna

In [11]:
# train_ohe and test_ohe are already the separate train / test splits of the
# one-hot matrix, so they are used as-is. Slicing test_ohe from row len(train)
# onwards (as was done before) selects rows past its end and yields an empty
# matrix.
Xtrain = train_ohe
Xtest = test_ohe

# 10-fold stratified splitter used by `objective` for hyper-parameter search.
kf = StratifiedKFold(n_splits=10)

def objective(trial):
    """Optuna objective: negated mean cross-validated ROC AUC for one ``C``.

    Samples the regularisation parameter ``C`` log-uniformly between
    ``10e-10`` (i.e. 1e-9) and 10, fits a class-balanced lbfgs logistic
    regression on the module-level ``Xtrain`` / ``target`` using the
    module-level splitter ``kf``, and returns the mean ``roc_auc`` across
    folds with its sign flipped, so a lower return value means a better model.
    """
    reg_strength = trial.suggest_loguniform('C', 10e-10, 10)
    classifier = LogisticRegression(
        C=reg_strength,
        class_weight='balanced',
        max_iter=10000,
        solver='lbfgs',
        n_jobs=-1,
    )
    fold_aucs = cross_val_score(classifier, Xtrain, target, cv=kf, scoring='roc_auc')
    return -fold_aucs.mean()
# Create the Optuna study with default settings. `objective` returns the
# negated mean AUC, so this presumably relies on the study's default direction
# being "minimize" — TODO confirm. No study.optimize(...) call appears in the
# visible cells.
study=optuna.create_study()

In [12]:
%%time

# Model
def run_cv_model(train, test, target, model_fn, params=None, eval_fn=None, label='model', n_splits=10):
    """Run K-fold cross-validation and average the per-fold test predictions.

    Parameters
    ----------
    train : matrix supporting ``train[index_array]`` row selection and ``.shape``
        Training features.
    test : object
        Test features; passed unchanged to ``model_fn`` on every fold.
    target : pandas.Series or array-like
        Labels aligned row-for-row with ``train``.
    model_fn : callable
        Called as ``model_fn(dev_X, dev_y, val_X, val_y, test, params)`` and
        expected to return ``(pred_val_y, pred_test_y)``.
    params : dict, optional
        Keyword settings handed to ``model_fn``; a fresh copy is passed on
        each fold. Defaults to an empty dict.
    eval_fn : callable, optional
        ``eval_fn(val_y, pred_val_y)`` -> score. When omitted no scores are
        collected.
    label : str
        Name used in progress messages and in the returned dict.
    n_splits : int
        Number of folds (default 10).

    Returns
    -------
    dict
        ``'label'``: ``label``; ``'train'``: out-of-fold predictions for every
        training row; ``'test'``: test predictions averaged over the folds;
        ``'cv'``: list of per-fold scores.
    """
    if params is None:
        params = {}
    # Select labels by position: fold indices are row positions, which only
    # coincide with Series labels when the index is the default 0..n-1 range.
    labels = target.iloc if hasattr(target, 'iloc') else target

    kf = KFold(n_splits=n_splits)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros(train.shape[0])
    for i, (dev_index, val_index) in enumerate(kf.split(train, target), start=1):
        print('Started {} fold {}/{}'.format(label, i, n_splits))
        dev_X, val_X = train[dev_index], train[val_index]
        dev_y, val_y = labels[dev_index], labels[val_index]
        pred_val_y, pred_test_y = model_fn(dev_X, dev_y, val_X, val_y, test, dict(params))
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index] = pred_val_y
        if eval_fn is not None:
            cv_score = eval_fn(val_y, pred_val_y)
            cv_scores.append(cv_score)
            print(label + ' cv score {}: {}'.format(i, cv_score))
    print('{} cv scores : {}'.format(label, cv_scores))
    if cv_scores:
        # Skip the summary statistics when no eval_fn was given; the mean/std
        # of an empty list is NaN and only triggers a runtime warning.
        print('{} cv mean score : {}'.format(label, np.mean(cv_scores)))
        print('{} cv std score : {}'.format(label, np.std(cv_scores)))
    # Average over the actual number of folds. A fixed divisor of 5.0 with
    # 10 folds would return test predictions at twice their true scale.
    pred_full_test = pred_full_test / n_splits
    results = {'label': label,
              'train': pred_train, 'test': pred_full_test,
              'cv': cv_scores}
    return results


def runLR(train_X, train_y, test_X, test_y, test_X2, params):
    """Fit a logistic regression and score two feature sets.

    A ``LogisticRegression`` built from ``params`` is fitted on
    ``(train_X, train_y)``. The function returns a pair: the predicted
    probability of the second class (column 1 of ``predict_proba``) for
    ``test_X`` and for ``test_X2``, in that order. ``test_y`` is accepted to
    match the ``model_fn`` call signature but is not used.
    """
    print('Train LR')
    classifier = LogisticRegression(**params)
    classifier.fit(train_X, train_y)

    def second_class_proba(features):
        # Column 1 of predict_proba holds the probability of the second class.
        return classifier.predict_proba(features)[:, 1]

    print('Predict 1/2')
    first_scores = second_class_proba(test_X)
    print('Predict 2/2')
    second_scores = second_class_proba(test_X2)
    return first_scores, second_scores


# Logistic-regression settings for the final cross-validated run.
# NOTE(review): C looks like a tuned value (presumably taken from the Optuna
# study) — confirm its provenance. Unlike `objective`, neither class_weight
# nor max_iter is set here.
lr_params = {'solver': 'lbfgs', 'C':  0.5691135059661381}
# Cross-validate on the one-hot matrices, scoring each fold with ROC AUC
# (`auc` is sklearn's roc_auc_score).
results = run_cv_model(train_ohe, test_ohe, target, runLR, lr_params, auc, 'lr')

Started lr fold 1/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 1: 0.7824542976442759
Started lr fold 2/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 2: 0.7839734005368464
Started lr fold 3/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 3: 0.7852724452075415
Started lr fold 4/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 4: 0.7833413169679234
Started lr fold 5/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 5: 0.7852775493657906
Started lr fold 6/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 6: 0.7830002018708714
Started lr fold 7/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 7: 0.7835201605706091
Started lr fold 8/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 8: 0.7849721180315684
Started lr fold 9/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 9: 0.7850282203518497
Started lr fold 10/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 10: 0.7821219833012829
lr cv scores : [0.7824542976442759, 0.7839734005368464, 0.7852724452075415, 0.7833413169679234, 0.7852775493657906, 0.