In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import prework as pwk
import string
from time import time

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import recall_score, accuracy_score, precision_score, roc_auc_score
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
df_0 = pd.read_pickle('data/pickles/20180117_part_one.pkl')

In [4]:
'---'.join(df_0.columns)

'doc_id---path---author_code---essay_content---label---unique_lemma---avg_stc_length---total_stc---DT_pos---POS_adjv_body---POS_adjv_repeat_rate---POS_adjv_repeat_cnt---DT_archs---DT_max_dp_cnts---DT_ROOT_idx---DT_pass_cnt---DT_mark_cnt---DT_pos_join---DT_archs_join---DT_insent_pos_ngram---DT_insent_arch_ngram---DT_max_dp_cnts_std---DT_ROOT_idx_mean---DT_pass_cnt_sum---DT_mark_cnt_sum'

# Explore per-class Accuracies based on Different Features

### Model to Evaluate: Logistic Regression

In [5]:
lrclf = LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Features to Investigate

- unique_lemma
- avg_stc_length
- DT_max_dp_cnts_std
- DT_ROOT_idx_mean
- DT_pos_join
- DT_insent_arch_ngram

In [20]:
X = df_0[['unique_lemma']]
y = df_0['label']
X_train, X_test, y_train, y_test = train_test_split(X, y)

lrclf.fit(X_train, y_train)
y_pred = lrclf.predict(X_test)

pwk.print_confusion_matrix(y_test.values, y_pred)

Predicted,ENS,JPN,PAK,SIN,THA,IDN,KOR,CHN,PHL,All,Accuracy
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
CHN,8,21,6,64,0,0,0,0,0,99,0.0
ENS,6,12,5,65,0,0,0,0,0,88,0.068182
IDN,14,59,4,30,1,0,0,0,0,108,0.0
JPN,11,49,7,30,1,0,0,0,0,98,0.5
KOR,12,44,8,40,0,0,0,0,0,104,0.0
PAK,15,28,8,45,0,0,0,0,0,96,0.083333
PHL,8,24,4,70,0,0,0,0,0,106,0.0
SIN,4,5,3,76,0,0,0,0,0,88,0.863636
THA,14,62,8,28,1,0,0,0,0,113,0.00885
All,92,304,53,448,3,0,0,0,0,900,0.155556


`The result shows that feature` **unique_lemma** `is great at identifying Singapore learners`

In [36]:
X = df_0[['avg_stc_length']]
y = df_0['label']
X_train, X_test, y_train, y_test = train_test_split(X, y)

lrclf.fit(X_train, y_train)
y_pred = lrclf.predict(X_test)

pwk.print_confusion_matrix(y_test.values, y_pred)

Predicted,CHN,ENS,IDN,JPN,KOR,PHL,SIN,PAK,THA,All,Accuracy
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
CHN,1,33,25,23,19,3,0,0,0,104,0.009615
ENS,0,86,4,1,4,0,3,0,0,98,0.877551
IDN,3,32,17,29,17,1,1,0,0,100,0.17
JPN,0,9,10,52,21,3,3,0,0,98,0.530612
KOR,1,22,5,49,14,1,2,0,0,94,0.148936
PAK,1,11,12,44,23,5,3,0,0,99,0.0
PHL,0,61,14,12,12,3,1,0,0,103,0.029126
SIN,0,85,10,1,2,1,2,0,0,101,0.019802
THA,2,24,19,36,16,1,5,0,0,103,0.0
All,8,363,116,247,128,18,20,0,0,900,0.194444


`The result shows that feature` **avg_stc_length** `is great at identifying native English speakers`

In [51]:
X = df_0[['DT_max_dp_cnts_std']]
y = df_0['label']
X_train, X_test, y_train, y_test = train_test_split(X, y)

lrclf.fit(X_train, y_train)
y_pred = lrclf.predict(X_test)

pwk.print_confusion_matrix(y_test.values, y_pred)

Predicted,CHN,JPN,KOR,PAK,IDN,ENS,SIN,THA,PHL,All,Accuracy
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
CHN,67,5,4,18,0,0,0,0,0,94,0.712766
ENS,59,11,0,41,0,0,0,0,0,111,0.0
IDN,68,13,1,22,0,0,0,0,0,104,0.0
JPN,49,14,3,25,0,0,0,0,0,91,0.153846
KOR,43,13,2,34,0,0,0,0,0,92,0.021739
PAK,24,16,6,55,0,0,0,0,0,101,0.544554
PHL,54,17,4,34,0,0,0,0,0,109,0.0
SIN,59,10,6,19,0,0,0,0,0,94,0.0
THA,55,12,4,33,0,0,0,0,0,104,0.0
All,478,111,30,281,0,0,0,0,0,900,0.153333


`The result shows that feature` **DT_max_dp_cnts_std** `is great at identifying Chinese learners`

In [60]:
X = df_0[['DT_ROOT_idx_mean']]
y = df_0['label']
X_train, X_test, y_train, y_test = train_test_split(X, y)

lrclf.fit(X_train, y_train)
y_pred = lrclf.predict(X_test)

pwk.print_confusion_matrix(y_test.values, y_pred)

Predicted,ENS,JPN,PAK,SIN,THA,IDN,KOR,CHN,PHL,All,Accuracy
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
CHN,3,8,20,66,6,0,0,0,0,103,0.0
ENS,3,8,29,51,4,0,0,0,0,95,0.031579
IDN,6,7,41,56,5,0,0,0,0,115,0.0
JPN,4,9,46,31,10,0,0,0,0,100,0.09
KOR,1,14,37,48,8,0,0,0,0,108,0.0
PAK,1,5,67,22,4,0,0,0,0,99,0.676768
PHL,3,8,25,59,4,0,0,0,0,99,0.0
SIN,2,2,11,60,5,0,0,0,0,80,0.75
THA,1,7,57,32,4,0,0,0,0,101,0.039604
All,24,68,333,425,50,0,0,0,0,900,0.158889


`The result shows that feature` **DT_ROOT_idx_mean** `is great at identifying Pakistani and Singapore learners`

In [120]:
df_0['DT_pos_join'][0]

u'NOUN VERB ADP PRON VERB ADJ ADP NOUN NOUN PART VERB ADV PUNCT NOUN NOUN PUNCT ADV PUNCT DET ADJ NOUN ADP NOUN NOUN VERB VERB DET ADJ PUNCT NOUN NOUN PUNCT DET ADP PRON VERB DET NOUN PUNCT NOUN NOUN VERB VERB PRON PART VERB ADP DET NOUN ADV CCONJ VERB PRON ADJ NOUN PUNCT VERB DET NOUN ADP NOUN ADP NOUN PUNCT ADV PROPN VERB DET NOUN NOUN PUNCT PRON VERB ADP NOUN ADP DET NOUN CCONJ VERB PART VERB DET NOUN NOUN DET NOUN PUNCT ADV VERB ADP ADJ NOUN PUNCT PRON ADV VERB DET ADJ NOUN ADP VERB ADP NOUN PUNCT ADP DET ADJ NOUN PART NOUN PUNCT PRON VERB ADP PRON VERB VERB NOUN PRON VERB ADJ NOUN VERB ADV VERB PUNCT PRON VERB DET NOUN ADJ PRON VERB ADP DET NOUN ADP DET NOUN ADP ADJ ADJ PUNCT NOUN PART VERB ADJ NOUN NOUN DET NOUN VERB PUNCT ADV PUNCT PRON VERB ADJ ADP NOUN NOUN PART VERB DET ADJ PUNCT NOUN NOUN PUNCT ADV PUNCT ADJ NOUN ADV VERB ADV VERB PRON PART VERB ADJ PUNCT NOUN NOUN PUNCT PRON VERB ADJ ADP ADJ NOUN CCONJ NOUN PUNCT ADP ADJ NOUN PUNCT VERB DET ADJ NOUN PUNCT NOUN NOUN VERB ADJ

In [116]:
X = df_0['DT_pos_join']
y = df_0['label']
X_train, X_test, y_train, y_test = train_test_split(X, y)

vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(3,3), max_features=800).fit(X_train)
X_train_dtm = vectorizer.transform(X_train)
X_test_dtm = vectorizer.transform(X_test)

lrclf.fit(X_train_dtm, y_train)
y_pred = lrclf.predict(X_test_dtm)

pwk.print_confusion_matrix(y_test.values, y_pred)

Predicted,CHN,ENS,IDN,JPN,KOR,PAK,PHL,SIN,THA,All,Accuracy
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
CHN,58,2,3,10,5,5,2,12,0,97,0.597938
ENS,1,74,0,4,2,2,1,19,0,103,0.718447
IDN,7,2,38,8,9,17,8,10,9,108,0.351852
JPN,0,0,2,66,10,3,0,1,2,84,0.785714
KOR,2,4,6,26,31,5,4,8,6,92,0.336957
PAK,1,2,2,3,0,97,2,1,1,109,0.889908
PHL,5,7,3,3,2,13,50,17,0,100,0.5
SIN,4,7,2,5,2,4,5,80,1,110,0.727273
THA,8,4,7,12,11,14,4,2,35,97,0.360825
All,86,102,63,137,72,160,76,150,54,900,0.587778


In [122]:
df_0['DT_insent_arch_ngram'][0]

u'nsubj_ROOT_mark ROOT_mark_nsubj mark_nsubj_ccomp nsubj_ccomp_acomp ccomp_acomp_mark acomp_mark_compound mark_compound_nsubj compound_nsubj_aux nsubj_aux_advcl aux_advcl_amod advcl_amod_punct amod_punct_compound punct_compound_dobj compound_dobj_punct advmod_punct_det punct_det_amod det_amod_nsubj amod_nsubj_prep nsubj_prep_compound prep_compound_pobj compound_pobj_aux pobj_aux_ROOT aux_ROOT_det ROOT_det_compound det_compound_punct compound_punct_compound punct_compound_dobj compound_dobj_punct nsubj_prep_pobj prep_pobj_ROOT pobj_ROOT_mark ROOT_mark_compound mark_compound_punct compound_punct_compound punct_compound_nsubj compound_nsubj_aux nsubj_aux_ccomp aux_ccomp_nsubj ccomp_nsubj_aux nsubj_aux_ccomp aux_ccomp_prep ccomp_prep_det prep_det_pobj det_pobj_advmod pobj_advmod_cc advmod_cc_conj cc_conj_dative conj_dative_amod dative_amod_dobj amod_dobj_punct advcl_det_dobj det_dobj_prep dobj_prep_pobj prep_pobj_prep pobj_prep_pobj prep_pobj_punct pobj_punct_advmod punct_advmod_nsubj advm

In [119]:
X = df_0['DT_insent_arch_ngram']
y = df_0['label']
X_train, X_test, y_train, y_test = train_test_split(X, y)

vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1,1), max_features=800).fit(X_train)
X_train_dtm = vectorizer.transform(X_train)
X_test_dtm = vectorizer.transform(X_test)

lrclf.fit(X_train_dtm, y_train)
y_pred = lrclf.predict(X_test_dtm)

pwk.print_confusion_matrix(y_test.values, y_pred)

Predicted,CHN,ENS,IDN,JPN,KOR,PAK,PHL,SIN,THA,All,Accuracy
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
CHN,58,8,9,15,1,2,2,3,0,98,0.591837
ENS,1,70,2,0,0,3,3,13,1,93,0.752688
IDN,5,4,49,7,1,14,7,2,2,91,0.538462
JPN,5,0,4,74,2,2,0,3,1,91,0.813187
KOR,12,9,14,26,25,15,2,0,1,104,0.240385
PAK,1,0,1,1,0,91,1,0,0,95,0.957895
PHL,10,13,8,0,0,14,41,13,2,101,0.405941
SIN,12,16,5,4,0,1,10,67,0,115,0.582609
THA,9,4,20,13,3,13,0,3,47,112,0.419643
All,113,124,112,140,32,155,66,104,54,900,0.58


`Both results show that tfidf matrix of feature` **DT_insent_arch_ngram** `is great at identifying Pakistani learners`