In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [18]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
ss = pd.read_csv('sample_submission.csv')

In [19]:
def download_preds(preds_test, file_name = 'hacklive_sub.csv'):

  ## 1. Setting the target column with our obtained predictions
  ss[TARGET_COL] = preds_test

  ## 2. Saving our predictions to a csv file

  ss.to_csv(file_name, index = False)
  from google.colab import files
  files.download(file_name)

  

In [20]:
import numpy as np

In [21]:
from sklearn.model_selection import train_test_split, StratifiedKFold

In [22]:
from sklearn.metrics import accuracy_score, f1_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from imblearn.over_sampling import SMOTE


In [23]:
from sklearn.model_selection import StratifiedKFold

In [24]:
#Function for running cross validation
def run_gradient_boosting(clf, fit_params, train, test, features):
  N_SPLITS = 10
  oofs = np.zeros(len(train_proc))
  preds = np.zeros((len(test_proc)))

  folds = StratifiedKFold(n_splits = N_SPLITS)

  for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train[TARGET_COL])):
    print(f'\n------------- Fold {fold_ + 1} -------------')

    ### Training Set
    X_trn, y_trn = train[features].iloc[trn_idx], target.iloc[trn_idx]

    ### Validation Set
    X_val, y_val = train[features].iloc[val_idx], target.iloc[val_idx]

    ### Test Set
    X_test = test[features]
    scaler = StandardScaler()
    _ = scaler.fit(X_trn)

    X_trn = scaler.transform(X_trn)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)
    #print(X_trn)
    #exit(0)
    
    _ = clf.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], **fit_params)

    ### Instead of directly predicting the classes we will obtain the probability of positive class.
    preds_val = clf.predict_proba(X_val)[:, 1]
    preds_test = clf.predict_proba(X_test)[:, 1]

    roc_score = roc_auc_score(y_val,preds_val)
    print("ROC for validation set is {}".format(roc_score))

    oofs[val_idx] = preds_val
    preds += preds_test / N_SPLITS


  oofs_score = roc_auc_score(target, oofs.round())
  print('ROC score for oofs is {}'.format(oofs_score))
  

  return oofs, preds

In [27]:
ID_COL,TARGET_COL = 'ID','Is_Lead'

In [26]:
def join_df(train, test):

  df = pd.concat([train, test], axis=0).reset_index(drop = True)
  features = [c for c in df.columns if c not in [ID_COL, TARGET_COL]]

  return df, features

def split_df_and_get_features(df, train_nrows):

  train, test = df[:train_nrows].reset_index(drop = True), df[train_nrows:].reset_index(drop = True)
  features = [c for c in train.columns if c not in [ID_COL, TARGET_COL]]
  
  return train, test, features

In [16]:
df, features = join_df(train_data,test_data)
df.head()

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0.0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0.0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0.0
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0.0
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0.0


In [97]:
#Feature engineering on the complete dataset
df['occupation_mean_balance'] = df.groupby('Occupation')['Avg_Account_Balance'].transform('mean')
df['is_active_avg_balance'] = df.groupby('Is_Active')['Avg_Account_Balance'].transform('mean')
df['gender_avg_balance'] = df.groupby('Gender')['Avg_Account_Balance'].transform('mean')
df['gender_avg_age'] = df.groupby('Gender')['Age'].transform('mean')
df['gender_and_channelcode'] = df['Gender'].astype('str') + df['Channel_Code'].astype('str')
df['gender_and_occupation'] = df['Gender'].astype('str') + df['Occupation'].astype('str')
df['gender_and_active'] = df['Gender'].astype('str') + df['Is_Active'].astype('str')



In [98]:
from category_encoders import CountEncoder

In [99]:
#dropping Avg account balance and preparin the datset for training and validation
df.drop('Avg_Account_Balance',axis=1,inplace=True)

categorical_col = ['gender_and_active','Region_Code','Gender','Credit_Product','Is_Active','Occupation','Channel_Code','gender_and_channelcode','gender_and_occupation']
#Applying feature encoding
ce = CountEncoder()
count_encoded = ce.fit_transform(df[categorical_col])
df = df.join(count_encoded.add_suffix("_count"))
df.drop(categorical_col,axis=1,inplace=True)


ID_COL , TARGET_COL = 'ID','Is_Lead'
features = [c for c in train_data.columns if c not in [ID_COL,TARGET_COL]]

train_proc, test_proc , features = split_df_and_get_features(df,train_data.shape[0])


target = train_data[TARGET_COL]


In [102]:
#catboost model training
clf = CatBoostClassifier(n_estimators = 3000,
                       learning_rate = 0.02,
                       rsm = 0.4, ## Analogous to colsample_bytree
                       random_state=2054,
                       
                       )

fit_params = {'verbose': 200, 'early_stopping_rounds': 300}

cb_oofs, cb_preds = run_gradient_boosting(clf, fit_params, train_proc, test_proc, features)

optimized_roc = roc_auc_score(target, (cb_oofs  * 1))
print(f'Optimized ROC is {optimized_roc}')


------------- Fold 1 -------------
0:	learn: 0.6763171	test: 0.6764365	best: 0.6764365 (0)	total: 83.8ms	remaining: 4m 11s
200:	learn: 0.3458747	test: 0.3482469	best: 0.3482469 (200)	total: 15.2s	remaining: 3m 32s
400:	learn: 0.3431044	test: 0.3460274	best: 0.3460274 (400)	total: 30.1s	remaining: 3m 15s
600:	learn: 0.3418284	test: 0.3453588	best: 0.3453588 (600)	total: 45.1s	remaining: 3m
800:	learn: 0.3406374	test: 0.3449334	best: 0.3449334 (800)	total: 1m	remaining: 2m 46s
1000:	learn: 0.3395704	test: 0.3447927	best: 0.3447911 (972)	total: 1m 16s	remaining: 2m 32s
1200:	learn: 0.3386386	test: 0.3447620	best: 0.3447435 (1132)	total: 1m 31s	remaining: 2m 17s
1400:	learn: 0.3377591	test: 0.3447264	best: 0.3447182 (1366)	total: 1m 47s	remaining: 2m 2s
1600:	learn: 0.3369332	test: 0.3446922	best: 0.3446770 (1539)	total: 2m 2s	remaining: 1m 47s
1800:	learn: 0.3361592	test: 0.3447178	best: 0.3446770 (1539)	total: 2m 18s	remaining: 1m 32s
Stopped by overfitting detector  (300 iterations wai

In [103]:
#training LightGBM model 
clf = LGBMClassifier(n_estimators = 200,
                        learning_rate = 0.05,
                        colsample_bytree = 0.5,
                        )
fit_params = {'verbose': 100, 'early_stopping_rounds': 100}

lgb_oofs, lgb_preds = run_gradient_boosting(clf, fit_params, train_proc, test_proc, features)


optimized_roc = roc_auc_score(target, (lgb_oofs * 1))
print(f'Optimized ROC is {optimized_roc}')


------------- Fold 1 -------------
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.345848
[200]	valid_0's binary_logloss: 0.344697
Did not meet early stopping. Best iteration is:
[197]	valid_0's binary_logloss: 0.344681
ROC for validation set is 0.8728690773544818

------------- Fold 2 -------------
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.345081
[200]	valid_0's binary_logloss: 0.344377
Did not meet early stopping. Best iteration is:
[191]	valid_0's binary_logloss: 0.34436
ROC for validation set is 0.8726673223943127

------------- Fold 3 -------------
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.34626
[200]	valid_0's binary_logloss: 0.344958
Did not meet early stopping. Best iteration is:
[200]	valid_0's binary_logloss: 0.344958
ROC for validation set is 0.8734977957285555

------------- Fold 4 -------------
Training until valida

In [104]:
#training XGB Classifier
clf = XGBClassifier(n_estimators = 1000,
                    max_depth = 6,
                    learning_rate = 0.05,
                    colsample_bytree = 0.5,
                    random_state=1452,
                    )

fit_params = {'verbose': 200, 'early_stopping_rounds': 200}

xgb_oofs, xgb_preds = run_gradient_boosting(clf, fit_params, train_proc, test_proc, features)


optimized_f1 = roc_auc_score(target, (xgb_oofs * 1))
print(f'Optimized F1 is {optimized_f1}')


------------- Fold 1 -------------
[0]	validation_0-error:0.215521
Will train until validation_0-error hasn't improved in 200 rounds.
[200]	validation_0-error:0.139625
[400]	validation_0-error:0.139299
Stopping. Best iteration:
[337]	validation_0-error:0.13877

ROC for validation set is 0.8730797789643918

------------- Fold 2 -------------
[0]	validation_0-error:0.215358
Will train until validation_0-error hasn't improved in 200 rounds.
[200]	validation_0-error:0.138811
Stopping. Best iteration:
[146]	validation_0-error:0.13816

ROC for validation set is 0.8724101531212467

------------- Fold 3 -------------
[0]	validation_0-error:0.211899
Will train until validation_0-error hasn't improved in 200 rounds.
[200]	validation_0-error:0.140601
Stopping. Best iteration:
[165]	validation_0-error:0.139991

ROC for validation set is 0.8736284534251052

------------- Fold 4 -------------
[0]	validation_0-error:0.212591
Will train until validation_0-error hasn't improved in 200 rounds.
[200]	va

In [105]:
#preparing the input from CatBoost, LightGBM , XGBoost
train_new = train_data[[ID_COL, TARGET_COL]]
test_new = test_data[[ID_COL]]

train_new['lgb'] = lgb_oofs
test_new['lgb'] = lgb_preds

train_new['cb'] = cb_oofs
test_new['cb'] = cb_preds

train_new['xgb'] = xgb_oofs
test_new['xgb'] = xgb_preds

features = [c for c in train_new.columns if c not in [ID_COL, TARGET_COL]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

In [107]:
#final ensembled model
clf = CatBoostClassifier(n_estimators = 3000,
                       learning_rate = 0.04,
                       rsm = 0.4, ## Analogous to colsample_bytree
                       random_state=2054,
                       
                       )

fit_params = {'verbose': 200, 'early_stopping_rounds': 300}

cb_oofs, cb_preds = run_gradient_boosting(clf, fit_params, train_new, test_new, features)

optimized_f1 = roc_auc_score(target, (cb_oofs  * 1))
print(f'Optimized ROC is {optimized_f1}')


------------- Fold 1 -------------
0:	learn: 0.6811419	test: 0.6811420	best: 0.6811420 (0)	total: 49.7ms	remaining: 2m 29s
200:	learn: 0.3431813	test: 0.3444260	best: 0.3444260 (200)	total: 11.4s	remaining: 2m 39s
400:	learn: 0.3426966	test: 0.3444138	best: 0.3444026 (224)	total: 23.1s	remaining: 2m 29s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.3444026339
bestIteration = 224

Shrink model to first 225 iterations.
ROC for validation set is 0.8731569581193186

------------- Fold 2 -------------
0:	learn: 0.6811419	test: 0.6811420	best: 0.6811420 (0)	total: 48.9ms	remaining: 2m 26s
200:	learn: 0.3433219	test: 0.3440693	best: 0.3440668 (199)	total: 11.2s	remaining: 2m 35s
400:	learn: 0.3428063	test: 0.3439520	best: 0.3439410 (378)	total: 22.7s	remaining: 2m 27s
600:	learn: 0.3423262	test: 0.3439443	best: 0.3439387 (539)	total: 34.4s	remaining: 2m 17s
800:	learn: 0.3418952	test: 0.3439748	best: 0.3439352 (717)	total: 46.2s	remaining: 2m 6s
1000:	learn: 0.3415365	

In [108]:
cb_preds_rounded = (cb_preds) * 1
download_preds(cb_preds_rounded, file_name = 'final.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>