In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [59]:
import collections

In [169]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [116]:
from sklearn.metrics import accuracy_score, precision_score, roc_curve, roc_auc_score, confusion_matrix,\
recall_score

In [138]:
from sklearn.preprocessing import StandardScaler

In [61]:
import datetime

In [62]:
import time

In [63]:
pd.options.display.max_colwidth = 100

In [64]:
df = pd.read_csv('train_janatahack2.csv')

In [65]:
df.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00006/D02554/;A00002/B00003/C00006/D28436/;A00002/B0...,female
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00031/D02617/;A00001/B00009/C00031/D29407/;A00001/B0...,male
2,u19037,01/12/14 15:58,01/12/14 15:58,A00002/B00001/C00020/D16944/,female
3,u14556,23/11/14 2:57,23/11/14 3:00,A00002/B00004/C00018/D10284/;A00002/B00004/C00018/D10285/;A00002/B00004/C00018/D10286/,female
4,u24295,17/12/14 16:44,17/12/14 16:46,A00001/B00001/C00012/D30805/;A00001/B00001/C00012/D30806/,male


In [66]:
df.isnull().sum()

session_id     0
startTime      0
endTime        0
ProductList    0
gender         0
dtype: int64

In [67]:
df.tail()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
10495,u15442,18/11/14 7:39,18/11/14 7:42,A00002/B00002/C00007/D06407/;A00002/B00002/C00007/D06409/,female
10496,u17986,25/11/14 15:16,25/11/14 15:16,A00006/B00030/C00334/D11660/,female
10497,u22508,09/12/14 10:11,09/12/14 10:11,A00002/B00002/C00007/D18028/,female
10498,u17087,22/11/14 11:27,22/11/14 11:27,A00003/B00012/C00131/D09453/;A00003/B00012/C00073/D09454/,female
10499,u23137,19/12/14 3:11,19/12/14 3:19,A00002/B00001/C00010/D02309/;A00002/B00002/C00002/D32450/;A00002/B00001/C00059/D21254/;A00003/B0...,female


In [68]:
df.dtypes

session_id     object
startTime      object
endTime        object
ProductList    object
gender         object
dtype: object

In [69]:
df.shape

(10500, 5)

### Data Cleaning

In [70]:
def time2milliseconds(s):
    return time.mktime(datetime.datetime.strptime(s, "%d/%m/%Y  %H:%M").timetuple())

In [71]:
def dateFormat(x):
    new_x = ""
    lst = x.split("/")
    
    new_x = lst[0]+'/'
    new_x = new_x + lst[1]+'/'
    new_x = new_x + "20"+lst[2]
    
    return time2milliseconds(new_x)
    

In [72]:
dateFormat("20/01/13 11:13")

1358660580.0

In [73]:
df['startTimeMillisecond'] = df['startTime'].apply(lambda x:dateFormat(x))

In [74]:
df['endTimeMillisecond'] = df['endTime'].apply(lambda x:dateFormat(x))

In [102]:
df['totalTime'] = df['startTimeMillisecond'] - df['endTimeMillisecond']

In [75]:
s = set()
def productListCleaning(x):
    lst = x.split("/")
    l = []
    
    for e in lst:
        if ';' in e:
            l.append(e[1:])
            s.add(e)
        if e != "" and ';' not in e:
            l.append(e)
            s.add(e)
    
    return sorted(l)

In [76]:
df['cleanedProductList'] = df['ProductList'].apply(lambda x:productListCleaning(x))

In [77]:
df['cleanedProductList']

0        [A00002, A00002, A00002, A00002, B00003, B00003, B00003, B00003, C00006, C00006, C00006, C00006,...
1        [A00001, A00001, A00001, A00001, A00001, A00001, A00001, B00009, B00009, B00009, B00009, B00009,...
2                                                                           [A00002, B00001, C00020, D16944]
3           [A00002, A00002, A00002, B00004, B00004, B00004, C00018, C00018, C00018, D10284, D10285, D10286]
4                                           [A00001, A00001, B00001, B00001, C00012, C00012, D30805, D30806]
                                                        ...                                                 
10495                                       [A00002, A00002, B00002, B00002, C00007, C00007, D06407, D06409]
10496                                                                       [A00006, B00030, C00334, D11660]
10497                                                                       [A00002, B00002, C00007, D18028]
10498              

In [78]:
def productCount(x):
    return [len(list(group)) for key, group in groupby(x)]

In [79]:
def productCountDict(x):
    return dict(collections.Counter(x))

In [80]:
df['productCountDict'] = df['cleanedProductList'].apply(lambda x : productCountDict(x))

In [81]:
df['productCountDict'].head()

0            {'A00002': 4, 'B00003': 4, 'C00006': 4, 'D02554': 1, 'D28435': 1, 'D28436': 1, 'D28437': 1}
1    {'A00001': 7, 'B00009': 7, 'C00031': 7, 'D02617': 1, 'D25444': 1, 'D29404': 1, 'D29407': 1, 'D29...
2                                                   {'A00002': 1, 'B00001': 1, 'C00020': 1, 'D16944': 1}
3                         {'A00002': 3, 'B00004': 3, 'C00018': 3, 'D10284': 1, 'D10285': 1, 'D10286': 1}
4                                      {'A00001': 2, 'B00001': 2, 'C00012': 2, 'D30805': 1, 'D30806': 1}
Name: productCountDict, dtype: object

In [82]:
lst = list(s)
for e in lst:
    df[e] = 0

In [83]:
df.shape

(10500, 16979)

In [84]:
#df.loc[df['session_id'] == 'u16159', 'A00002'] = 4

In [85]:
for session_id,d in zip(df['session_id'], df['productCountDict']): 
    for key, values in d.items():
        df.loc[df['session_id'] == session_id, key] = values

In [98]:
df.drop(columns = ['startTime', 'endTime', 'ProductList', 'productCountDict','cleanedProductList', 'session_id'
                  'startTimeMillisecond', 'endTimeMillisecond'],
        axis = 1, inplace = True)

In [181]:
df.columns

Index(['gender', 'D17301', 'D27215', 'D22821', 'D05366', 'D04914', 'D35122',
       'B00052', 'D30057', 'D11191',
       ...
       'D35422', 'D22661', 'D35940', 'D01141', 'D15961', 'D16672', 'D20717',
       'D11409', 'D26864', 'totalTime'],
      dtype='object', length=16972)

In [120]:
df['gender'] = df['gender'].map({'male':1,'female':0})

In [121]:
X = df.drop(columns = ['gender'], axis = 1)

In [122]:
y = df['gender']

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

In [124]:
X_train.shape

(7350, 16971)

In [125]:
X_test.shape

(3150, 16971)

In [126]:
y_train.shape

(7350,)

### Pre-processing data

In [141]:
sc = StandardScaler()

In [143]:
X_train_std = sc.fit_transform(X_train)

In [146]:
X_test_std = sc.transform(X_test)

### Models

### Logistic Regression

In [147]:
lg = LogisticRegression(n_jobs=-1)

In [148]:
lg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [149]:
y_predict = lg.predict(X_test)

In [150]:
accuracy_score(y_test, y_predict)

0.7996825396825397

In [151]:
confusion_matrix(y_test, y_predict)

array([[2455,   10],
       [ 621,   64]])

In [152]:
precision_score(y_test, y_predict)

0.8648648648648649

In [153]:
recall_score(y_test, y_predict)

0.09343065693430656

In [154]:
roc_auc_score(y_test, y_predict)

0.5446869309012304

### Logistic Regression CV

In [162]:
lg_cv = LogisticRegressionCV(n_jobs=-1)
lg_cv.fit(X_train, y_train)
y_predict = lg_cv.predict(X_test)

In [163]:
accuracy_score(y_test, y_predict)

0.8187301587301588

In [164]:
print("Confusion matrix is -- ",confusion_matrix(y_test, y_predict))
print("Presion score is -- ",precision_score(y_test, y_predict))

print("Recall score is -- ",recall_score(y_test, y_predict))
print("ROC_AUC score is -- ",roc_auc_score(y_test, y_predict))


Confusion matrix is --  [[2438   27]
 [ 544  141]]
Presion score is --  0.8392857142857143
Recall score is --  0.20583941605839415
ROC_AUC score is --  0.5974430346012051


### Randomforest classifier

In [165]:
rf = RandomForestClassifier(n_jobs=-1)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [166]:
y_predict = rf.predict(X_test)

In [167]:
accuracy_score(y_test, y_predict)

0.8825396825396825

In [168]:
print("Confusion matrix is -- ",confusion_matrix(y_test, y_predict))
print("Presion score is -- ",precision_score(y_test, y_predict))

print("Recall score is -- ",recall_score(y_test, y_predict))
print("ROC_AUC score is -- ",roc_auc_score(y_test, y_predict))


Confusion matrix is --  [[2358  107]
 [ 263  422]]
Presion score is --  0.7977315689981096
Recall score is --  0.6160583941605839
ROC_AUC score is --  0.7863253431249168


### RandomForest using cross validation

In [170]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]


In [171]:
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [172]:
rf = RandomForestClassifier()

In [173]:
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, 
                               verbose=2, random_state=42, n_jobs = -1)

In [174]:
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 35.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 82.6min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [175]:
y_predict = rf_random.predict(X_test)

In [176]:
accuracy_score(y_test, y_predict)

0.8860317460317461

In [177]:
print("Confusion matrix is -- ",confusion_matrix(y_test, y_predict))
print("Presion score is -- ",precision_score(y_test, y_predict))
print("Recall score is -- ",recall_score(y_test, y_predict))
print("ROC_AUC score is -- ",roc_auc_score(y_test, y_predict))

Confusion matrix is --  [[2374   91]
 [ 268  417]]
Presion score is --  0.8208661417322834
Recall score is --  0.6087591240875913
ROC_AUC score is --  0.7859211441938971


### Test Data

In [199]:
df_test = pd.read_csv('test_janatahack2.csv') 

In [200]:
df_test.head()

Unnamed: 0,session_id,startTime,endTime,ProductList
0,u12112,08/12/14 13:36,08/12/14 13:36,A00002/B00003/C00006/D19956/
1,u19725,19/12/14 13:52,19/12/14 13:52,A00002/B00005/C00067/D02026/
2,u11795,01/12/14 10:44,01/12/14 10:44,A00002/B00002/C00004/D12538/
3,u22639,08/12/14 20:19,08/12/14 20:22,A00002/B00003/C00079/D22781/;A00002/B00003/C00079/D22782/;A00002/B00003/C00079/D19325/;A00002/B0...
4,u18034,15/12/14 19:33,15/12/14 19:33,A00002/B00001/C00010/D23419/


In [201]:
df_test.isnull().sum()

session_id     0
startTime      0
endTime        0
ProductList    0
dtype: int64

In [202]:
df_test['startTimeMillisecond'] = df_test['startTime'].apply(lambda x:dateFormat(x))

In [203]:
df_test['endTimeMillisecond'] = df_test['endTime'].apply(lambda x:dateFormat(x))

In [204]:
df_test['cleanedProductList'] = df_test['ProductList'].apply(lambda x:productListCleaning(x))

In [205]:
df_test['productCountDict'] = df_test['cleanedProductList'].apply(lambda x : productCountDict(x))

In [206]:
sesson_id = df_test['session_id']

In [207]:
lst = list(s)
for e in lst:
    df_test[e] = 0

In [208]:
for session_id,d in zip(df_test['session_id'], df_test['productCountDict']): 
    for key, values in d.items():
        df_test.loc[df_test['session_id'] == session_id, key] = values

In [209]:
df_test.columns

Index(['session_id', 'startTime', 'endTime', 'ProductList',
       'startTimeMillisecond', 'endTimeMillisecond', 'cleanedProductList',
       'productCountDict', 'D17301', 'D22821',
       ...
       'D33320', 'D23693', 'D15277', 'D34355', 'D11787', 'D02070', 'D13480',
       'D02431', 'D17735', 'D01141'],
      dtype='object', length=22379)

In [210]:
df_test['totalTime'] = df_test['startTimeMillisecond'] - df_test['endTimeMillisecond']

In [211]:
df_test.drop(columns = ['startTime', 'endTime', 'ProductList', 'productCountDict','cleanedProductList', 'session_id',
                  'startTimeMillisecond', 'endTimeMillisecond'],
        axis = 1, inplace = True)

In [212]:
df_test.columns

Index(['D17301', 'D22821', 'D04914', 'D30057', 'D16326', 'D07620', 'D03881',
       'D31079', 'D18465', 'D23766',
       ...
       'D23693', 'D15277', 'D34355', 'D11787', 'D02070', 'D13480', 'D02431',
       'D17735', 'D01141', 'totalTime'],
      dtype='object', length=22372)

In [213]:
rf_random.predict(df_test)

ValueError: Number of features of the model must match the input. Model n_features is 16971 and input n_features is 22372 