# ENPM808W: HW3

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
cwd = os.getcwd()
train_data_path = os.path.join(cwd, 'Quiz_bowl_data', 'qb.train.csv')
test_data_path = os.path.join(cwd, 'Quiz_bowl_data', 'qb.test.csv')

## Load and preview Data

In [3]:
train_data = pd.read_csv(train_data_path)
train_data.head(5)

Unnamed: 0,row,body_score,page,answer,text,category,tournaments,answer_type,corr,inlinks
0,1,127.398036,Comus (John Milton),Comus (John Milton),First performed in Ludlow Castle by the childr...,Literature,2000 ACF Nationals,work,True,62
1,2,50.212336,Circe,Comus (John Milton),First performed in Ludlow Castle by the childr...,Literature,2000 ACF Nationals,,False,5
2,3,44.767071,Satyr,Comus (John Milton),First performed in Ludlow Castle by the childr...,Literature,2000 ACF Nationals,,False,6
3,4,44.058274,Philip K. Dick,Wilfred Owen,This author is convinced by another to publish...,Literature,2009 ACF Winter,people,False,22
4,5,40.675249,Honore de Balzac,Wilfred Owen,This author is convinced by another to publish...,Literature,2009 ACF Winter,,False,0


In [4]:
test_data = pd.read_csv(test_data_path)
test_data.head()

Unnamed: 0,row,body_score,page,text,category,tournaments,answer_type,inlinks
0,31,40.023617,Death in Venice,The protagonist of this novella is inspired to...,Literature,2001 ACF Fall,work,4
1,32,27.538799,The Eye of the World,The protagonist of this novella is inspired to...,Literature,2001 ACF Fall,work,1
2,33,26.976121,Carmilla,The protagonist of this novella is inspired to...,Literature,2001 ACF Fall,work,2
3,88,45.848831,A Passage to India,In a symbolic incident in this novel Ronny Hea...,Literature,2001 ACF Regionals,work,5
4,91,99.811169,The Cherry Orchard,"We first meet Fiers, the aging footman, in Act...",Literature,2001 ACF Regionals,work,11


### 1. New feature: log of inlinks

In [5]:
# add feature to training data
train_data['inlinks']+=1
train_data['log_inlinks'] = np.log2(train_data['inlinks'])

# add feature to test data
test_data['inlinks']+=1
test_data['log_inlinks'] = np.log2(test_data['inlinks'])

### 2. New feature: length of text revealed

In [6]:
# add feature to training data
train_data['len_text'] = [ len(i.split(' ')) for i in train_data['text']]

# add feature to test data
test_data['len_text'] = [ len(i.split(' ')) for i in test_data['text']]

## Best classifier

* To arrive on the best classifier, I tried using different classfiers such as logistic regression, decision trees and SVM. 
* Finally, I chose SVM because it had the best performance out of the 3.
* I split the training data in train and developement set in the ratio of 80:20. 
* If the classifier performs well on the dev set as well as on the public leaderboard on kaggle(30% of the data), we can be sure that this model will generalize well.
* I tried using different sets of data in my training data, but the following combination seemed to work the best: 'log_inlinks', 'body_score', 'len_text'
* I achieved an accuracy of 0.8224 on the dev set.




In [7]:
num_train = train_data.shape[0]

# train_data
train_x = pd.DataFrame(train_data[:int(0.8*num_train)], columns=['log_inlinks', 'body_score', 'len_text'])
train_y = pd.DataFrame(train_data[:int(0.8*num_train)], columns=['corr'])

# dev data
dev_x = pd.DataFrame(train_data[int(0.8*num_train):], columns=['log_inlinks', 'body_score', 'len_text'])
dev_y = pd.DataFrame(train_data[int(0.8*num_train):], columns=['corr'])

# test data
test_x = pd.DataFrame(test_data, columns=['log_inlinks', 'body_score', 'len_text'])

In [8]:
# fit model
svm_model = SVC()
svm_model.fit(train_x, train_y)

# prediction
y_pred = svm_model.predict(dev_x)
y_test_pred = svm_model.predict(test_x)

# confusion matrix and accuracy
cm = confusion_matrix(dev_y, y_pred)
accuracy = accuracy_score(dev_y, y_pred)
tn, fp, fn, tp = cm.ravel()

# print results
print("Confusion matrix: \n {} \n".format(cm))
print("The accuracy score: {} \n".format(accuracy))
print('True negatives : {}'.format(tn))
print('False positives : {}'.format(fp))
print('False negatives : {}'.format(fn))
print('True positives : {}'.format(tp))

  return f(**kwargs)


Confusion matrix: 
 [[883  26]
 [261 446]] 

The accuracy score: 0.8224009900990099 

True negatives : 883
False positives : 26
False negatives : 261
True positives : 446


In [9]:
# write to csv
test_y_df = pd.DataFrame()
test_y_df['row'] = test_data['row']
test_y_df['corr'] = y_test_pred
test_y_df.to_csv('submission.csv', index=False)


## Error Analysis

* Our final model has a high number of false negatives. False negatives form almost 90% of the miss-classified examples.
* Below I have shown 10 examples of some false negatives that were encountered and corresponding values for those examples.
* The reason why our model classfies these examples as false is because they have an unusually low body score. 
* This is why our model classifies them as false even when the guess is correct. 
* This can be traced down to the fact that in some cases body score is not reliable and is noisy. 
* A high body score should correlate to a guess being correct and vice versa as the body score is a measure of the confidence for how well the text of the Wikipedia page mathed the question text. 

In [10]:
# false negatives
false_neg_index = []
dev_y_arr = np.array(dev_y)[:, 0]
for i in range(len(y_pred)):
    if y_pred[i]==False and dev_y_arr[i]==True:
        false_neg_index.append(i)

In [11]:
dev_data = pd.DataFrame(train_data[int(0.8*num_train):])
count = 0
for i in false_neg_index:
    count+=1
    print('{}. Sentence: {} '.format(count, dev_data['text'].iloc[i]))
    print('\n {} \n \n'.format(dev_data.iloc[i,:]))
    if count == 10:
        break

1. Sentence: Headmaster Michael and Nancy attempt to modernize a rural village run by a traditional priest in this man's short story "Dead Men's Path. 

 row                                                        11134
body_score                                                18.759
page                                               Chinua Achebe
answer                                             Chinua Achebe
text           Headmaster Michael and Nancy attempt to modern...
category                                              Literature
tournaments                                        2005 ACF Fall
answer_type                                               people
corr                                                        True
inlinks                                                       16
log_inlinks                                                    4
len_text                                                      23
Name: 6486, dtype: object 
 

2. Sentence: Near the beginning of t