In [44]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import warnings
# Suppress every Python warning for the rest of the session. This also hides
# deprecation warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')

In [45]:
from utils_all import *

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

from sklearn import metrics

import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score

# Model building III: TF-IDF and PCA

In this notebook we will:
* Extract a tf-idf matrix from the text field
* Apply PCA to it and keep 5 components
* Rebuild the same models and compare the results

In [3]:
# Reload the variable previously persisted with IPython's `%store` magic.
# It must have been stored in an earlier session for this cell to succeed.
%store -r DATA_NUM_CL_WITH_NAN_TXT_XY

In [4]:
# Work on a copy: the next cell adds a `domain` column to `data` in place, and
# without the copy that column would also be written onto the %store-restored
# frame (and persisted if the variable is ever stored again).
data = DATA_NUM_CL_WITH_NAN_TXT_XY.copy()

Construct numeric dataset

In [5]:
# Add a `domain` column computed from each row's URL. This mutates `data` in place.
# `get_domain` is not defined in this notebook (presumably provided by `utils_all`).
data['domain'] = data['url'].apply(get_domain)

# Keep only the int64/float64 columns and fill missing values with the column mean.
numeric_cols = data.select_dtypes(include=['int64', 'float64'])
data_num = numeric_cols.fillna(numeric_cols.mean())

# Re-attach the non-numeric columns, in this order, next to the numeric features.
for extra_col in ('meta_name', 'domain', 'text'):
    data_num[extra_col] = data[extra_col]

# Random Forest classifier

### Name event component

In [25]:
# Evaluate the "Random Forest" model on the "name" component with the tf-idf + PCA
# text features switched on. The log printed below reports 5 iterations followed
# by the averaged metrics. `analyse` is not defined in this notebook (presumably
# it comes from the `utils_all` star import); its return value is used further
# down as a metrics frame.
result_name = analyse("name", data_num, "Random Forest", tf_idf=True)

Current model: Random Forest, Prameter optimization: False
1/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8720
2/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8792
3/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8431
4/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8563
5/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8470
The final CV=5 score for name and Random Forest: 0.8595
METRICS:
   f1_score  mean_accuracy  precision    recall
0  0.856522       0.859513   0.861599  0.860582
Feature importance:


### Location event component


In [26]:
# Same evaluation as for "name", here for the "location" component:
# Random Forest with tf-idf + PCA enabled, 5 iterations per the log below.
result_location = analyse("location", data_num, "Random Forest", tf_idf=True)

Current model: Random Forest, Prameter optimization: False
1/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.7805
2/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8348
3/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.7414
4/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8256
5/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.7974
The final CV=5 score for location and Random Forest: 0.7960
METRICS:
   f1_score  mean_accuracy  precision    recall
0  0.813813       0.795961   0.766402  0.872227
Feature importance:


### Start Date event component


In [27]:
# Random Forest with tf-idf + PCA enabled for the "startDate" component.
# The result is labelled "date" in the summary tables further down.
result_date = analyse("startDate", data_num, "Random Forest", tf_idf=True)

Current model: Random Forest, Prameter optimization: False
1/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8712
2/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.9214
3/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8629
4/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.9372
5/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.9135
The final CV=5 score for startDate and Random Forest: 0.9013
METRICS:
   f1_score  mean_accuracy  precision    recall
0  0.907239       0.901255   0.897584  0.922642
Feature importance:


### Description event component


In [28]:
# Random Forest with tf-idf + PCA enabled for the "description" component.
# NOTE: this section names the variable `result_description`, while the later
# model sections use `result_descr`.
result_description = analyse("description", data_num, "Random Forest", tf_idf=True)

Current model: Random Forest, Prameter optimization: False
1/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.7180
2/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8431
3/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8590
4/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8842
5/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8541
The final CV=5 score for description and Random Forest: 0.8317
METRICS:
   f1_score  mean_accuracy  precision    recall
0  0.840922       0.831679   0.816643  0.879164
Feature importance:


# Result for Random forest

In [31]:
# Stack the per-component metric frames returned by `analyse` into one table.
result_all_rf = pd.concat([result_name, result_date, result_location, result_description])
# Labels must follow the concat order above (startDate is reported as "date").
result_all_rf['meta_name'] = ['name', 'date', 'location', 'description']
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; plain column
# assignment broadcasts the scalar to every row and works on all versions.
result_all_rf['model'] = 'Random forest'

In [32]:
# Display the Random Forest summary with numeric columns rounded to 4 decimals
# (returns a rounded copy; `result_all_rf` itself is not modified).
result_all_rf.round(4)

Unnamed: 0,f1_score,mean_accuracy,precision,recall,meta_name,model
0,0.8565,0.8595,0.8616,0.8606,name,Random forest
0,0.9072,0.9013,0.8976,0.9226,date,Random forest
0,0.8138,0.796,0.7664,0.8722,location,Random forest
0,0.8409,0.8317,0.8166,0.8792,description,Random forest


# Logistic regression

In [33]:
# Run the tf-idf + PCA evaluation with "Logistic regression" once per event
# component. The generator is consumed left to right, so `analyse` is called in
# the listed order and each result lands in the matching variable.
result_name, result_date, result_location, result_descr = (
    analyse(component, data_num, "Logistic regression", tf_idf=True)
    for component in ("name", "startDate", "location", "description")
)

Current model: Logistic regression, Prameter optimization: False
1/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.7587
2/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.7671
3/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.7183
4/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.7440
5/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.7569
The final CV=5 score for name and Logistic regression: 0.7490
METRICS:
   f1_score  mean_accuracy  precision   recall
0  0.750989       0.749004   0.735665  0.78118
Feature importance:
Current model: Logistic regression, Prameter optimization: False
1/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.7319
2/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8687
3/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8809
4/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.7163
5/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8742
The final CV=5 score for startDate and Logistic regression: 0.8144
METRICS:
   f1_sco

# Result for Logistic regression

In [34]:
# Stack the per-component metric frames for Logistic regression into one table.
result_all_lr = pd.concat([result_name, result_date, result_location, result_descr])
# Labels must follow the concat order above (startDate is reported as "date").
result_all_lr['meta_name'] = ['name', 'date', 'location', 'description']
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; plain column
# assignment broadcasts the scalar to every row and works on all versions.
result_all_lr['model'] = 'Logistic regression'
# Last expression of the cell: display the table rounded to 4 decimals.
result_all_lr.round(4)

Unnamed: 0,f1_score,mean_accuracy,precision,recall,meta_name,model
0,0.751,0.749,0.7357,0.7812,name,Logistic regression
0,0.7989,0.8144,0.8559,0.7894,date,Logistic regression
0,0.7603,0.7623,0.7667,0.7745,location,Logistic regression
0,0.7681,0.7676,0.7359,0.8285,description,Logistic regression


# SVM

In [35]:
# Run the tf-idf + PCA evaluation with "SVM" once per event component. The
# generator is consumed left to right, so `analyse` is called in the listed
# order and each result lands in the matching variable.
result_name, result_date, result_location, result_descr = (
    analyse(component, data_num, "SVM", tf_idf=True)
    for component in ("name", "startDate", "location", "description")
)

Current model: SVM, Prameter optimization: False
1/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8342
2/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8267
3/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8458
4/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8444
5/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.7940
The final CV=5 score for name and SVM: 0.8290
METRICS:
   f1_score  mean_accuracy  precision    recall
0  0.823153        0.82901   0.821516  0.835133
Feature importance:
Current model: SVM, Prameter optimization: False
1/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8237
2/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8840
3/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.9011
4/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8461
5/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8931
The final CV=5 score for startDate and SVM: 0.8696
METRICS:
   f1_score  mean_accuracy  precision    recall
0  0.869368       0.869

# Result for SVM

In [36]:
# Stack the per-component metric frames for SVM into one table.
result_all_svm = pd.concat([result_name, result_date, result_location, result_descr])
# Labels must follow the concat order above (startDate is reported as "date").
result_all_svm['meta_name'] = ['name', 'date', 'location', 'description']
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; plain column
# assignment broadcasts the scalar to every row and works on all versions.
result_all_svm['model'] = 'SVM'
# Last expression of the cell: display the table rounded to 4 decimals.
result_all_svm.round(4)

Unnamed: 0,f1_score,mean_accuracy,precision,recall,meta_name,model
0,0.8232,0.829,0.8215,0.8351,name,SVM
0,0.8694,0.8696,0.8904,0.8608,date,SVM
0,0.8099,0.807,0.8088,0.8138,location,SVM
0,0.8202,0.8177,0.7846,0.8692,description,SVM


# Extreme Random Forest

In [37]:
# Run the tf-idf + PCA evaluation with "Extreme Random Forest" once per event
# component. The generator is consumed left to right, so `analyse` is called in
# the listed order and each result lands in the matching variable.
result_name, result_date, result_location, result_descr = (
    analyse(component, data_num, "Extreme Random Forest", tf_idf=True)
    for component in ("name", "startDate", "location", "description")
)

Current model: Extreme Random Forest, Prameter optimization: False
1/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8371
2/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8321
3/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8734
4/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8536
5/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.8899
The final CV=5 score for name and Extreme Random Forest: 0.8572
METRICS:
   f1_score  mean_accuracy  precision    recall
0  0.849008       0.857206   0.811897  0.898433
Feature importance:
Current model: Extreme Random Forest, Prameter optimization: False
1/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.9103
2/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.9097
3/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.9152
4/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.9103
5/5 iteration ...
tf-idf and PCA: True
Mean accuracy 0.9068
The final CV=5 score for startDate and Extreme Random Forest: 0.9105
METRICS:

# Results for Extreme Random Forest

In [38]:
# Stack the per-component metric frames for Extreme Random Forest into one table.
result_all_erf = pd.concat([result_name, result_date, result_location, result_descr])
# Labels must follow the concat order above (startDate is reported as "date").
result_all_erf['meta_name'] = ['name', 'date', 'location', 'description']
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; plain column
# assignment broadcasts the scalar to every row and works on all versions.
result_all_erf['model'] = 'Extreme Random Forest'
# Last expression of the cell: display the table rounded to 4 decimals.
result_all_erf.round(4)

Unnamed: 0,f1_score,mean_accuracy,precision,recall,meta_name,model
0,0.849,0.8572,0.8119,0.8984,name,Extreme Random Forest
0,0.9122,0.9105,0.8776,0.9535,date,Extreme Random Forest
0,0.8213,0.8076,0.7548,0.9116,location,Extreme Random Forest
0,0.864,0.8653,0.832,0.9079,description,Extreme Random Forest


In [39]:
# Combine the four per-model summary tables into a single frame, in the order
# Random forest, SVM, Logistic regression, Extreme Random Forest.
result_all = pd.concat([result_all_rf, result_all_svm, result_all_lr, result_all_erf])

# Result for all models

In [40]:
# Group the rows by component: sort on `meta_name` in descending alphabetical
# order (name, location, description, date) and round to 4 decimals.
# Rebinds `result_all` to the sorted, rounded frame.
result_all = result_all.sort_values(by=['meta_name'],ascending=False).round(4)

In [41]:
# Export the summary (rounded to 2 decimals) as a LaTeX table. The `with` block
# guarantees the file is flushed and closed; the original passed a bare
# `open(...)` handle that was never closed.
with open('summary_result_PCA', 'w') as latex_file:
    result_all.round(2).to_latex(latex_file)

In [43]:
# Display the combined table for all models and components.
result_all

Unnamed: 0,f1_score,mean_accuracy,precision,recall,meta_name,model
0,0.8565,0.8595,0.8616,0.8606,name,Random forest
0,0.8232,0.829,0.8215,0.8351,name,SVM
0,0.751,0.749,0.7357,0.7812,name,Logistic regression
0,0.849,0.8572,0.8119,0.8984,name,Extreme Random Forest
0,0.8138,0.796,0.7664,0.8722,location,Random forest
0,0.8099,0.807,0.8088,0.8138,location,SVM
0,0.7603,0.7623,0.7667,0.7745,location,Logistic regression
0,0.8213,0.8076,0.7548,0.9116,location,Extreme Random Forest
0,0.8409,0.8317,0.8166,0.8792,description,Random forest
0,0.8202,0.8177,0.7846,0.8692,description,SVM


Here we used:
* Numerical features + tf-idf of the text field, reduced with PCA
* Fair splitting
* Cross validation k = 5