# IEEE Big Data Challenge

## Initial steps

Import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the training DataFrame from a pickle file; the relative path is resolved
# against the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code — only load this file if it
# comes from a trusted source.
training = pd.read_pickle("../Datasets/encoded_train.pkl")

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
training.head(n=5)

Unnamed: 0,notified,timestamp_dist,correlatedcount,n1,n2,n3,n4,n5,n6,n7,...,p8d_1,p8d_2,p8d_3,p8d_4,BENCH,SC,MW,BW,ON,DK
0,0,65684,69,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
1,0,1188030,5302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
2,0,43716,346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
4,0,2401,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0


## Training dataset preparation

In [4]:
from sklearn.model_selection import train_test_split

# Hold-out configuration: fraction of rows reserved for testing and the seed
# that makes the shuffle repeatable.
seed = 1011
test_size = 0.3

# 'notified' is the target column; every remaining column is used as a feature.
X = training.drop(columns='notified')
y = training['notified']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

## Random Forests

In [38]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest: 800 trees, each limited to depth 30, with a fixed random_state.
# NOTE(review): no n_jobs is passed here, unlike the grid-search cell, which uses n_jobs=-1.
clf = RandomForestClassifier(
    random_state=0,
    max_depth=30,
    n_estimators=800,
)

In [39]:
# Train the baseline forest in place; fit() returns the estimator itself, so no
# re-assignment is needed. The trailing ';' keeps the cell output empty.
clf.fit(X_train, y_train);

In [40]:
# Per-class probabilities for the hold-out rows; the evaluation cell reads column 1.
# NOTE(review): column order follows clf.classes_ — presumably column 1 is
# notified == 1; confirm.
predictions = clf.predict_proba(X=X_test)

### Evaluation

In [41]:
from sklearn import metrics

# Area under the ROC curve on the hold-out set, scored with the second
# probability column of `predictions`.
metrics.roc_auc_score(y_true=y_test, y_score=predictions[:, 1])

0.9184494556260057

In [None]:
# NOTE(review): bare literal in a cell with no execution count — presumably a
# ROC AUC value pasted from an earlier run; confirm its origin, then move it
# into a markdown note or delete the cell.
0.9156252529319485


## Parameter tuning

### Grid search

In [15]:
from sklearn.model_selection import GridSearchCV

# Search space: 6 forest sizes (1000..3500, step 500) x 3 depth limits
# (30, 40, 50) = 18 candidate configurations.
n_estimators = list(range(1000, 4000, 500))
max_depth = list(range(30, 60, 10))

hyperF = dict(n_estimators=n_estimators, max_depth=max_depth)

# 3-fold cross-validated search scored by ROC AUC, run on all available cores.
# refit=True retrains the best candidate on the whole of X_train, so the fitted
# search object can be used directly for prediction afterwards.
# NOTE(review): this rebinds `clf`, replacing the baseline forest fitted earlier.
clf = RandomForestClassifier(random_state=0)
gridF = GridSearchCV(clf, hyperF, cv=3, verbose=6,
                     n_jobs=-1, scoring='roc_auc', refit=True)

# fit() returns the search object itself, so bestF is the fitted GridSearchCV.
bestF = gridF.fit(X_train, y_train)

# Hold-out predictions are intentionally not computed here: the evaluation cell
# that scores them calls bestF.predict_proba(X_test) itself, so doing it in this
# cell as well only repeated an expensive prediction pass.

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed: 14.4min finished


In [16]:
# Report the best mean cross-validated score and the hyper-parameters behind it.
for _label, _value in (
    ("Best score: ", bestF.best_score_),
    ("Best params: ", bestF.best_params_),
):
    print(_label, _value)

Best score:  0.9036670407966692
Best params:  {'max_depth': 40, 'n_estimators': 3500}


In [17]:
# Score the tuned model on the hold-out set: per-class probabilities from the
# fitted search object, then ROC AUC computed from the second probability column.
predictions = bestF.predict_proba(X=X_test)
metrics.roc_auc_score(y_true=y_test, y_score=predictions[:, 1])

0.9077027154516224