In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
sys.path.insert(0, '../src')
import prework as pwk
import string
from time import time

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import recall_score, accuracy_score, precision_score, roc_auc_score
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
df_0 = pd.read_pickle('../data/pickles/20180117_part_one.pkl')

In [4]:
'---'.join(df_0.columns)

'doc_id---path---author_code---essay_content---label---unique_lemma---avg_stc_length---total_stc---DT_pos---POS_adjv_body---POS_adjv_repeat_rate---POS_adjv_repeat_cnt---DT_archs---DT_max_dp_cnts---DT_ROOT_idx---DT_pass_cnt---DT_mark_cnt---DT_pos_join---DT_archs_join---DT_insent_pos_ngram---DT_insent_arch_ngram---DT_max_dp_cnts_std---DT_ROOT_idx_mean---DT_pass_cnt_sum---DT_mark_cnt_sum'

# Explore per-class Accuracies based on Different Features

### Model to Evaluate: Logistic Regression

In [5]:
lrclf = LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Features to Investigate

- unique_lemma
- avg_stc_length
- DT_max_dp_cnts_std
- DT_ROOT_idx_mean
- DT_pos_join
- DT_insent_arch_ngram

In [6]:
X = df_0[['unique_lemma']]
y = df_0['label']
X_train, X_test, y_train, y_test = train_test_split(X, y)

lrclf.fit(X_train, y_train)
y_pred = lrclf.predict(X_test)

pwk.print_confusion_matrix(y_test.values, y_pred)

Predicted,JPN,KOR,PAK,SIN,IDN,ENS,THA,CHN,PHL,All,Recall
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
CHN,20,15,1,64,0,0,0,0,0,100,0.0
ENS,10,17,2,87,0,0,0,0,0,116,0.0
IDN,39,13,3,41,0,0,0,0,0,96,0.0
JPN,41,18,3,29,0,0,0,0,0,91,0.450549
KOR,41,19,1,27,0,0,0,0,0,88,0.215909
PAK,30,18,0,45,0,0,0,0,0,93,0.0
PHL,24,17,2,66,0,0,0,0,0,109,0.0
SIN,3,8,3,81,0,0,0,0,0,95,0.852632
THA,55,29,1,27,0,0,0,0,0,112,0.0
All,263,154,16,467,0,0,0,0,0,900,0.156667


`The result shows that feature` **unique_lemma** `is great at identifying Singapore learners`

In [7]:
X = df_0[['avg_stc_length']]
y = df_0['label']
X_train, X_test, y_train, y_test = train_test_split(X, y)

lrclf.fit(X_train, y_train)
y_pred = lrclf.predict(X_test)

pwk.print_confusion_matrix(y_test.values, y_pred)

Predicted,ENS,JPN,KOR,PHL,SIN,THA,IDN,PAK,CHN,All,Recall
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
CHN,18,35,12,28,9,6,0,0,0,108,0.0
ENS,81,2,1,6,6,1,0,0,0,97,0.835052
IDN,28,25,15,26,9,5,0,0,0,108,0.0
JPN,9,59,7,13,6,2,0,0,0,96,0.614583
KOR,15,52,11,12,7,1,0,0,0,98,0.112245
PAK,16,48,18,14,9,2,0,0,0,107,0.0
PHL,38,11,11,18,10,4,0,0,0,92,0.195652
SIN,59,2,4,14,10,2,0,0,0,91,0.10989
THA,26,41,9,15,8,4,0,0,0,103,0.038835
All,290,275,88,146,74,27,0,0,0,900,0.203333


`The result shows that feature` **avg_stc_length** `is great at identifying native English speakers`

In [8]:
X = df_0[['DT_max_dp_cnts_std']]
y = df_0['label']
X_train, X_test, y_train, y_test = train_test_split(X, y)

lrclf.fit(X_train, y_train)
y_pred = lrclf.predict(X_test)

pwk.print_confusion_matrix(y_test.values, y_pred)

Predicted,CHN,ENS,PAK,JPN,PHL,IDN,KOR,SIN,THA,All,Recall
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
CHN,65,9,12,0,0,0,0,0,0,86,0.755814
ENS,44,15,28,0,0,0,0,0,0,87,0.172414
IDN,67,13,28,0,0,0,0,0,0,108,0.0
JPN,55,17,36,0,0,0,0,0,0,108,0.0
KOR,54,12,33,0,0,0,0,0,0,99,0.0
PAK,35,16,50,0,0,0,0,0,0,101,0.49505
PHL,60,18,21,0,0,0,0,0,0,99,0.0
SIN,62,13,29,0,0,0,0,0,0,104,0.0
THA,49,22,37,0,0,0,0,0,0,108,0.0
All,491,135,274,0,0,0,0,0,0,900,0.144444


`The result shows that feature` **DT_max_dp_cnts_std** `is great at identifying Chinese learners`

In [10]:
X = df_0[['DT_ROOT_idx_mean']]
y = df_0['label']
X_train, X_test, y_train, y_test = train_test_split(X, y)

lrclf.fit(X_train, y_train)
y_pred = lrclf.predict(X_test)

pwk.print_confusion_matrix(y_test.values, y_pred)

Predicted,PAK,PHL,SIN,JPN,CHN,IDN,ENS,KOR,THA,All,Recall
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
CHN,15,41,43,0,0,0,0,0,0,99,0.0
ENS,27,22,55,0,0,0,0,0,0,104,0.0
IDN,36,35,31,0,0,0,0,0,0,102,0.0
JPN,63,29,11,0,0,0,0,0,0,103,0.0
KOR,54,20,26,0,0,0,0,0,0,100,0.0
PAK,68,12,7,0,0,0,0,0,0,87,0.781609
PHL,36,17,39,0,0,0,0,0,0,92,0.184783
SIN,15,26,66,0,0,0,0,0,0,107,0.616822
THA,63,25,18,0,0,0,0,0,0,106,0.0
All,377,227,296,0,0,0,0,0,0,900,0.167778


`The result shows that feature` **DT_ROOT_idx_mean** `is great at identifying Pakistani and Singapore learners`

In [11]:
df_0['DT_pos_join'][0]

u'NOUN VERB ADP PRON VERB ADJ ADP NOUN NOUN PART VERB ADV PUNCT NOUN NOUN PUNCT ADV PUNCT DET ADJ NOUN ADP NOUN NOUN VERB VERB DET ADJ PUNCT NOUN NOUN PUNCT DET ADP PRON VERB DET NOUN PUNCT NOUN NOUN VERB VERB PRON PART VERB ADP DET NOUN ADV CCONJ VERB PRON ADJ NOUN PUNCT VERB DET NOUN ADP NOUN ADP NOUN PUNCT ADV PROPN VERB DET NOUN NOUN PUNCT PRON VERB ADP NOUN ADP DET NOUN CCONJ VERB PART VERB DET NOUN NOUN DET NOUN PUNCT ADV VERB ADP ADJ NOUN PUNCT PRON ADV VERB DET ADJ NOUN ADP VERB ADP NOUN PUNCT ADP DET ADJ NOUN PART NOUN PUNCT PRON VERB ADP PRON VERB VERB NOUN PRON VERB ADJ NOUN VERB ADV VERB PUNCT PRON VERB DET NOUN ADJ PRON VERB ADP DET NOUN ADP DET NOUN ADP ADJ ADJ PUNCT NOUN PART VERB ADJ NOUN NOUN DET NOUN VERB PUNCT ADV PUNCT PRON VERB ADJ ADP NOUN NOUN PART VERB DET ADJ PUNCT NOUN NOUN PUNCT ADV PUNCT ADJ NOUN ADV VERB ADV VERB PRON PART VERB ADJ PUNCT NOUN NOUN PUNCT PRON VERB ADJ ADP ADJ NOUN CCONJ NOUN PUNCT ADP ADJ NOUN PUNCT VERB DET ADJ NOUN PUNCT NOUN NOUN VERB ADJ

In [12]:
X = df_0['DT_pos_join']
y = df_0['label']
X_train, X_test, y_train, y_test = train_test_split(X, y)

vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(3,3), max_features=800).fit(X_train)
X_train_dtm = vectorizer.transform(X_train)
X_test_dtm = vectorizer.transform(X_test)

lrclf.fit(X_train_dtm, y_train)
y_pred = lrclf.predict(X_test_dtm)

pwk.print_confusion_matrix(y_test.values, y_pred)

Predicted,CHN,ENS,IDN,JPN,KOR,PAK,PHL,SIN,THA,All,Recall
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
CHN,65,0,1,13,2,4,0,9,0,94,0.691489
ENS,1,68,0,4,3,4,5,18,1,104,0.653846
IDN,9,1,34,6,11,31,12,6,4,114,0.298246
JPN,4,0,2,69,4,5,0,3,1,88,0.784091
KOR,6,1,8,31,32,10,2,6,3,99,0.323232
PAK,0,1,2,2,1,86,0,1,1,94,0.914894
PHL,4,5,1,5,1,19,42,14,2,93,0.451613
SIN,1,10,1,3,2,4,7,79,0,107,0.738318
THA,7,7,3,15,9,22,4,4,36,107,0.336449
All,97,93,52,148,65,185,72,140,48,900,0.567778


In [13]:
df_0['DT_insent_arch_ngram'][0]

u'nsubj_ROOT_mark ROOT_mark_nsubj mark_nsubj_ccomp nsubj_ccomp_acomp ccomp_acomp_mark acomp_mark_compound mark_compound_nsubj compound_nsubj_aux nsubj_aux_advcl aux_advcl_amod advcl_amod_punct amod_punct_compound punct_compound_dobj compound_dobj_punct advmod_punct_det punct_det_amod det_amod_nsubj amod_nsubj_prep nsubj_prep_compound prep_compound_pobj compound_pobj_aux pobj_aux_ROOT aux_ROOT_det ROOT_det_compound det_compound_punct compound_punct_compound punct_compound_dobj compound_dobj_punct nsubj_prep_pobj prep_pobj_ROOT pobj_ROOT_mark ROOT_mark_compound mark_compound_punct compound_punct_compound punct_compound_nsubj compound_nsubj_aux nsubj_aux_ccomp aux_ccomp_nsubj ccomp_nsubj_aux nsubj_aux_ccomp aux_ccomp_prep ccomp_prep_det prep_det_pobj det_pobj_advmod pobj_advmod_cc advmod_cc_conj cc_conj_dative conj_dative_amod dative_amod_dobj amod_dobj_punct advcl_det_dobj det_dobj_prep dobj_prep_pobj prep_pobj_prep pobj_prep_pobj prep_pobj_punct pobj_punct_advmod punct_advmod_nsubj advm

In [14]:
X = df_0['DT_insent_arch_ngram']
y = df_0['label']
X_train, X_test, y_train, y_test = train_test_split(X, y)

vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1,1), max_features=800).fit(X_train)
X_train_dtm = vectorizer.transform(X_train)
X_test_dtm = vectorizer.transform(X_test)

lrclf.fit(X_train_dtm, y_train)
y_pred = lrclf.predict(X_test_dtm)

pwk.print_confusion_matrix(y_test.values, y_pred)

Predicted,CHN,ENS,IDN,JPN,KOR,PAK,PHL,SIN,THA,All,Recall
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
CHN,65,2,5,11,4,1,2,8,1,99,0.656566
ENS,2,64,0,1,0,4,0,17,0,88,0.727273
IDN,3,4,41,8,4,23,11,9,5,108,0.37963
JPN,3,0,5,76,3,7,0,3,1,98,0.77551
KOR,6,5,5,33,29,7,1,15,3,104,0.278846
PAK,3,0,0,0,0,91,0,2,0,96,0.947917
PHL,5,19,10,0,0,16,37,17,2,106,0.349057
SIN,6,11,0,0,0,4,2,65,0,88,0.738636
THA,7,13,3,16,2,17,4,5,46,113,0.40708
All,100,118,69,145,42,170,57,141,58,900,0.571111


`Both results show that tfidf matrix of feature` **DT_insent_arch_ngram** `is great at identifying Pakistani learners`