In [72]:
# import necessary packages
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import model_selection, preprocessing 
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold
from sklearn import tree

<H3>Load train and test data from csv to pandas dataframes

In [73]:
train_df = pd.read_csv('I_examples.csv')
test_df = pd.read_csv('J_examples.csv')
train_df = train_df.drop(['string', 'doc_id', 'word_loc'], axis =1)
test_df = test_df.drop(['string', 'doc_id', 'word_loc'], axis =1)
train_feature_data = train_df.drop(['label'], axis = 1)
train_feature_data = train_feature_data.values
train_label_data = train_df['label'].values

<H3>Feature data for training 

In [74]:
train_feature_data

array([[ 6,  1,  0, ...,  1,  0,  1],
       [15,  2,  0, ...,  1,  0,  0],
       [ 8,  1,  0, ...,  1,  1,  0],
       ..., 
       [ 7,  1,  0, ...,  1,  0,  1],
       [12,  1,  0, ...,  1,  0,  1],
       [ 3,  1,  0, ...,  0,  0,  0]])

<H3>Label data for training

In [80]:
train_label_data

array([0, 1, 0, ..., 0, 0, 0])

<H3>k-fold stratified cross validation on training data

In [76]:
n_splits = 10
skf = StratifiedKFold(n_splits,random_state=1)
skf.get_n_splits(train_feature_data,train_label_data)

10

<H3>Trying out different classifiers such as Decision Tree, RandomForest, SVM, Linear Regression and Logistic Regression
<H3>Decision Tree Classifier

In [98]:
decisiontree_classifier = tree.DecisionTreeClassifier()
precision_list = []
recall_list = []
fscore_list = []
for train_index, test_index in skf.split(train_feature_data,train_label_data):
    decisiontree_classifier.fit(train_feature_data[train_index],train_label_data[train_index])
    y_pred = decisiontree_classifier.predict(train_feature_data[test_index])
    precision,recall,fscore,support = precision_recall_fscore_support(train_label_data[test_index],y_pred, average='macro')
#     print(precision,recall,fscore)
    precision_list.append(precision)
    recall_list.append(recall)
    fscore_list.append(fscore)
print("\n\nAverage Precision: "+ str(sum(precision_list)/n_splits)+
      "\nAverage Recall: " + str(sum(recall_list)/n_splits)+
      "\nAverage fscore: "+ str(sum(fscore_list)/n_splits))




Average Precision: 0.910033464899
Average Recall: 0.908526936534
Average fscore: 0.90920343611


<H3>Random Forest Classifier

In [99]:
from sklearn.ensemble import RandomForestClassifier
randomforest_classifier = RandomForestClassifier(random_state=1)
precision_list = []
recall_list = []
fscore_list = []
for train_index, test_index in skf.split(train_feature_data,train_label_data):
    randomforest_classifier.fit(train_feature_data[train_index],train_label_data[train_index])
    y_pred = randomforest_classifier.predict(train_feature_data[test_index])
    precision,recall,fscore,support = precision_recall_fscore_support(train_label_data[test_index], y_pred, average='macro')
#     print(precision,recall,fscore)
    precision_list.append(precision)
    recall_list.append(recall)
    fscore_list.append(fscore)
print("\n\nAverage Precision: "+ str(sum(precision_list)/n_splits)+
      "\nAverage Recall: " + str(sum(recall_list)/n_splits)+
      "\nAverage fscore: "+ str(sum(fscore_list)/n_splits))




Average Precision: 0.916221345677
Average Recall: 0.91127355784
Average fscore: 0.913627013296


<H2>SVM Classifier

In [100]:
from sklearn import svm
svm_clf = svm.SVC(kernel = 'rbf', random_state = 1, gamma = 0.1, C = 10.0)
precision_list = []
recall_list = []
fscore_list = []
for train_index, test_index in skf.split(train_feature_data,train_label_data):
    svm_clf.fit(train_feature_data[train_index],train_label_data[train_index])
    y_pred = svm_clf.predict(train_feature_data[test_index])
    precision,recall,fscore,support = precision_recall_fscore_support(train_label_data[test_index], y_pred, average='macro')
#     print(precision,recall,fscore)
    precision_list.append(precision)
    recall_list.append(recall)
    fscore_list.append(fscore)
print("\n\nAverage Precision: "+ str(sum(precision_list)/n_splits)+
      "\nAverage Recall: " + str(sum(recall_list)/n_splits)+
      "\nAverage fscore: "+ str(sum(fscore_list)/n_splits))




Average Precision: 0.914892919417
Average Recall: 0.906628812171
Average fscore: 0.910567297396


<H3> Linear Regression

In [102]:
linreg = LinearRegression()
precision_list = []
recall_list = []
fscore_list = []
for train_index, test_index in skf.split(train_feature_data,train_label_data):
    linreg.fit(train_feature_data[train_index],train_label_data[train_index])
    y_pred = linreg.predict(train_feature_data[test_index])
    # apply a threshold (using mean value)
    thresh = round(np.mean(y_pred), 2)
    y_pred = np.where(y_pred > thresh, 1, 0)
    precision,recall,fscore,support = precision_recall_fscore_support(train_label_data[test_index], y_pred, average='macro')
#     print(precision,recall,fscore)
    precision_list.append(precision)
    recall_list.append(recall)
    fscore_list.append(fscore)
print("\n\nAverage Precision: "+ str(sum(precision_list)/n_splits)+
      "\nAverage Recall: " + str(sum(recall_list)/n_splits)+
      "\nAverage fscore: "+ str(sum(fscore_list)/n_splits))





Average Precision: 0.746905143007
Average Recall: 0.849258260986
Average fscore: 0.755272893862


<H3>Logistic Regression

In [103]:
logreg = LogisticRegression(C = 100.0, random_state = 1)
precision_list = []
recall_list = []
fscore_list = []
for train_index, test_index in skf.split(train_feature_data,train_label_data):
    logreg.fit(train_feature_data[train_index],train_label_data[train_index])
    y_pred = logreg.predict(train_feature_data[test_index])
    precision,recall,fscore,support = precision_recall_fscore_support(train_label_data[test_index], y_pred, average='macro')
#     print(precision,recall,fscore)
    precision_list.append(precision)
    recall_list.append(recall)
    fscore_list.append(fscore)
print("\n\nAverage Precision: "+ str(sum(precision_list)/n_splits)+
      "\nAverage Recall: " + str(sum(recall_list)/n_splits)+
      "\nAverage fscore: "+ str(sum(fscore_list)/n_splits))






Average Precision: 0.892948000182
Average Recall: 0.890114361209
Average fscore: 0.891187328498


<H3>From the above results RandomForest Classifier seems to be the best classfier among the tested classifiers
<H3>RandomForest Classifier on TestSet

In [105]:
from sklearn.ensemble import RandomForestClassifier
randomforest_classifier = RandomForestClassifier(random_state=1)
randomforest_classifier.fit(train_feature_data,train_label_data)
y_pred = randomforest_classifier.predict(test_feature_data)
precision, recall, fscore ,support = precision_recall_fscore_support(test_label_data, y_pred, average='macro')
print("Precision on Test Set: "+ str(precision) +
      "\nRecall on Test Set: "+ str(recall) +
      "\nFScore on Test Set: "+ str(fscore) ) 

Precision on Test Set: 0.926112957475
Recall on Test Set: 0.925876400585
FScore on Test Set: 0.92599460843
