In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.utils import np_utils
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split,GridSearchCV      # import GridSearchCV
from sklearn.pipeline import make_pipeline        # import pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Sanity check: show which files are present under ../input before reading them.
print(os.listdir("../input"))

In [None]:
# Read the train and test CSVs from the input directory, then report the
# (rows, columns) shape of each frame.
train_path = "../input/train.csv"
test_path = "../input/test.csv"
traindata = pd.read_csv(train_path)
testdata = pd.read_csv(test_path)
for frame in (traindata, testdata):
    print(frame.shape)

In [None]:
# Preview the first rows of the raw training frame.
traindata.head()

In [None]:
# Separate the label from the features, keep the test identifiers for the
# submission, and stack train + test features into a single frame.
y = traindata.loc[:, 'target']
# Positional slicing: assumes the train columns are [id, target, features...]
# and the test columns are [id, features...] -- TODO confirm against the CSVs.
traindata = traindata.iloc[:, 2:]
testid = testdata.iloc[:, 0]
testdata = testdata.iloc[:, 1:]
# DataFrame.append was deprecated and later removed from pandas; pd.concat is
# the drop-in equivalent. Row labels are kept as-is (as append did), so the
# combined index contains duplicates; train rows come first, test rows after.
totaldata = pd.concat([traindata, testdata], sort=False)
totaldata.head()

In [None]:
y.value_counts(normalize=True)   # proportion of each target class (fractions, not counts)

In [None]:
# totaldata.isnull().sum().max()
# No missing values found

In [None]:
# Summary statistics of the combined train + test feature frame.
totaldata.describe()

In [None]:
# Box plots of the first five columns, one per cell of a 3x4 subplot grid.
plt.figure(figsize=(20, 20))
for position, column in enumerate(totaldata.columns[:5], start=1):
    plt.subplot(3, 4, position)
    sns.boxplot(totaldata[column])

In [None]:
def outlier_treatment(data, lower_q=0.01, upper_q=0.99):
    """Cap every numeric column of ``data`` at the given lower/upper quantiles.

    Values below a column's ``lower_q`` quantile are raised to that quantile
    and values above its ``upper_q`` quantile are lowered to it. The quantiles
    are computed per column on the data passed in.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to treat. It is not modified.
    lower_q, upper_q : float, optional
        Quantiles (between 0 and 1) used as the lower and upper caps.
        Defaults reproduce the original 1st / 99th percentile capping.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the numeric columns capped. Non-numeric
        columns are returned unchanged.
    """
    data_X = data.copy()
    for col in data_X.select_dtypes(include="number").columns:
        low, high = data_X[col].quantile([lower_q, upper_q]).values
        # Assign the clipped column back as a whole. The previous chained
        # form `data_X[col][mask] = value` writes through an intermediate
        # Series, which triggers SettingWithCopyWarning and does not update
        # the frame at all when pandas Copy-on-Write is enabled.
        data_X[col] = data_X[col].clip(lower=low, upper=high)

    return data_X

In [None]:
# Percentile-capped copy of the combined features.
# NOTE(review): totaldata_ot is not referenced again in the visible notebook;
# the later cells keep working on totaldata.
totaldata_ot = outlier_treatment(totaldata)

In [None]:
# Histograms with KDE overlays for the first five columns, to eyeball skewness.
# NOTE(review): distplot is deprecated in recent seaborn releases -- confirm
# the installed version before upgrading.
plt.figure(figsize=(15, 15))
for slot, feature in enumerate(totaldata.columns[:5], start=1):
    plt.subplot(3, 4, slot)
    sns.distplot(totaldata[feature], kde=True)

The variables are on different scales and are only very slightly skewed. We will apply a log transformation to variables with skewness > 0.75.

In [None]:
from scipy.stats import skew
def skew_treatment(data, threshold=0.75):
    """Apply ``log1p`` to the strongly right-skewed numeric columns of ``data``.

    Skewness is computed per numeric column with ``scipy.stats.skew`` after
    dropping missing values. Columns whose skewness exceeds ``threshold`` are
    replaced by ``np.log1p`` of their values.

    ``log1p`` is only finite for values greater than -1, so a skewed column
    containing any value <= -1 is left untransformed rather than being
    silently filled with NaN / -inf.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to treat. It is not modified.
    threshold : float, optional
        Skewness above which a column is transformed (default 0.75).

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the selected columns log-transformed.
    """
    data_X = data.copy()

    numeric_cols = data_X.select_dtypes(include="number").columns
    if len(numeric_cols) == 0:
        return data_X

    # finding skewness of all numeric variables
    skewness = data_X[numeric_cols].apply(lambda x: skew(x.dropna()))

    # keeping features whose skewness is above the threshold
    skewed_feats = skewness[skewness > threshold].index
    if len(skewed_feats) == 0:
        return data_X

    # log1p(x) is NaN for x < -1 and -inf for x == -1: only transform columns
    # whose minimum is strictly above -1.
    is_safe = data_X[skewed_feats].min() > -1
    log_cols = is_safe[is_safe].index
    if len(log_cols) > 0:
        data_X[log_cols] = np.log1p(data_X[log_cols])

    return data_X

In [None]:
# Skew-treated copy of the combined features.
# NOTE(review): totaldata_sk is only previewed below; the later cells keep
# working on totaldata.
totaldata_sk = skew_treatment(totaldata)

In [None]:
# Preview the first rows of the skew-treated frame.
totaldata_sk.head()

I tried both outlier treatment and skewness treatment, but the model did not perform any better, so I am not using them in the final code. The code is kept above in case you want to try it.

Since the data is unbalanced, we can try oversampling or undersampling:
    1. Undersampling
I am not trying oversampling since the data is already huge, and oversampling would slow down the entire execution.

In [None]:
#reducing y=0 labels from training set
# Re-label rows 0..n-1 so that positions in totaldata line up with labels in y.
totaldata = totaldata.reset_index(drop=True)
y = y.reset_index(drop=True)

# The first ntrain rows of totaldata are the training rows (train was stacked
# before test when totaldata was built).
ntrain = int(traindata.shape[0])
train_data = totaldata.iloc[:ntrain,:]

# Drop a number of label-0 rows equal to 60% of the training set size, but
# never more than actually exist (np.random.choice would raise otherwise).
zero_index = y[y==0].index
remove_n = min(int(ntrain*0.6), len(zero_index))
# Seeded generator so the same rows are dropped on every Restart & Run All.
rng = np.random.RandomState(42)
drop_indices = rng.choice(zero_index, remove_n, replace=False)
print('Shape of training data before dropping rows having 0 labels: ', train_data.shape)
train_data = train_data.drop(drop_indices, axis=0)
# Series.drop returns a new object, so y itself keeps all labels.
y1 = y.drop(drop_indices)
print('Shape of training data after dropping rows having 0 labels: ',train_data.shape)

#checking proportion of different classes in y
y1.value_counts(normalize=True)

In [None]:
# Rows from position ntrain onward are the test features.
test_data = totaldata.iloc[ntrain: ,:]

In [None]:
# Disabled alternative that splits train/test without undersampling; it is
# kept as a bare string literal, so none of it is executed.
'''
#we will divide data into train, test data
ntrain = int(traindata.shape[0])
train_data = totaldata.iloc[:ntrain,:]            
train_data_y = y

test_data = totaldata.iloc[ntrain: ,:]             

print(train_data.shape[0])
print(test_data.shape[0])
'''

In [None]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both the training and the test features.
rs = RobustScaler().fit(train_data)
train_data, test_data = rs.transform(train_data), rs.transform(test_data)

Modelling:

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the undersampled training rows for validation.
# random_state makes the split reproducible across runs; stratify keeps the
# class proportions of y1 the same in both partitions, which matters because
# the target is imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    train_data, y1, test_size=0.25, shuffle=True, random_state=42, stratify=y1
)

In [None]:
# Class proportions in the training split.
y_train.value_counts(normalize=True)

Used XGBoost and got an AUC score of 0.889. The cell below is commented out (kept as a string) so that the whole notebook runs faster.

In [None]:
# Disabled XGBoost experiment; it is kept as a bare string literal, so none of
# it is executed.
'''
# Using XGBClassifier
xgb = XGBClassifier(n_estimators=600, reg_alpha = 0.01)
xgb.fit(x_train, y_train, verbose=1)
xgb_pred = xgb.predict_proba(x_test)

#probability that a customer will make this transaction (target class 1)
prob = [1 - item[0] for item in xgb_pred] 
prob[:5]

roc_auc_score(y_test, prob)
'''

In [None]:
# Disabled earlier LightGBM configuration; it is kept as a bare string
# literal, so none of it is executed.
'''
#Using Light GBM model
import lightgbm as lgb
d_train = lgb.Dataset(x_train, label=y_train)
params = {}
params['learning_rate'] = 0.05
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
params['max_leaves'] = 100
clf = lgb.train(params, d_train, 2000)
lgb_pred = clf.predict(x_test)         #output will be probabilties
'''

In [None]:
#Using Light GBM model
import lightgbm as lgb

# Wrap the training split in LightGBM's Dataset container.
d_train = lgb.Dataset(x_train, label=y_train)

# Booster hyper-parameters.
# The previous dict passed both 'bagging_fraction': 0.8 and 'subsample': 0.81.
# 'subsample' is an alias of 'bagging_fraction'; when both are given LightGBM
# keeps the primary name and ignores the alias, so the 0.81 value never took
# effect. It is removed here to avoid a misleading setting.
params = {'num_leaves': 9,
         'min_data_in_leaf': 42,
         'objective': 'binary',
         'max_depth': 16,
         'learning_rate': 0.0123,
         'boosting': 'gbdt',
         'bagging_freq': 5,
         'bagging_fraction': 0.8,
         'feature_fraction': 0.8201,
         'bagging_seed': 11,
         'reg_alpha': 1.728910519108444,
         'reg_lambda': 4.9847051755586085,
         'random_state': 42,
         'metric': 'auc',
         'verbosity': -1,
         'min_gain_to_split': 0.01077313523861969,
         'min_child_weight': 19.428902804238373,
         'num_threads': 4}

# Number of boosting rounds; no validation set or early stopping is used, so
# all rounds are always trained.
NUM_BOOST_ROUND = 15000
clf = lgb.train(params, d_train, num_boost_round=NUM_BOOST_ROUND)
lgb_pred = clf.predict(x_test)         #output will be probabilities

In [None]:
# ROC AUC of the LightGBM predictions on the held-out split.
roc_auc_score(y_test, lgb_pred)

In [None]:
# Predict on the scaled test features; y_pred is written to the submission below.
y_pred = clf.predict(test_data)

In [None]:
# Build the submission from plain arrays. Constructing it as
# pd.DataFrame(data=testid, columns=['ID_code']) only yields the ids when the
# Series name happens to equal 'ID_code'; using the values directly removes
# that dependency and any index alignment with y_pred.
sub = pd.DataFrame({'ID_code': testid.values, 'target': y_pred})
sub.to_csv('submission.csv', index=False)