In [1]:
import warnings
# Silence known-noisy warnings so the recorded cell outputs stay readable.
# First filter: ignore every DeprecationWarning regardless of its message.
warnings.filterwarnings("ignore", category=DeprecationWarning) 
# Remaining filters: ignore any warning whose message matches the given regular expression.
warnings.filterwarnings('ignore','.*encoding is deprecated, Use raw=False instead..*')
warnings.filterwarnings('ignore','.*the matrix subclass is not the recommended way to represent matrices or deal with linear algebra.*')
warnings.filterwarnings('ignore','.*Precision and F-score are ill-defined.*')
warnings.filterwarnings('ignore','.*Data with input dtype int64 was converted to float64 by StandardScaler.*')

# importing required packages
from pathlib import Path

from collections import Counter

import _pickle as cPickle

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from __future__ import print_function

from pprint import pprint
from time import time
import logging


from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support, cohen_kappa_score 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm  import SVC
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE

from imblearn.pipeline import Pipeline as imb_pipeline
from sklearn.pipeline import Pipeline as skl_pipeline

from sklearn.metrics import make_scorer, f1_score


from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# Print the current module's docstring (the recorded output of this cell shows
# IPython's default "Automatically created module ..." text).
print(__doc__)

# Configure root logging at INFO level with timestamp, level and message.
# No `stream` is passed, so basicConfig's default stream handler is used.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

# Seed NumPy's global random number generator for repeatable runs
np.random.seed(42)
%matplotlib inline

  from numpy.core.umath_tests import inner1d


Automatically created module for IPython interactive environment


In [2]:
# All project folders are resolved relative to the parent of the working directory.
myDir = Path.cwd().parents[0]

# Input data
dataFolder = myDir.joinpath('data/asap-sas')
ratingsFolder = myDir.joinpath('data/ratings')

# Outputs: figures and fitted models
figureFolder = myDir.joinpath('figures')
modelFolder = myDir.joinpath('experiments/models')

print(modelFolder)

C:\Users\Devanshi\Desktop\finalProject\experiments\models


In [3]:
# Read the training CSV into a dataframe; row 0 of the file holds the column names.
training_csv = dataFolder / 'training.csv'
df = pd.read_csv(training_csv, header=0)
df.head(2)

Unnamed: 0,EssaySet,subject,studentGrade,EssayText,Score1,styleScore,totalChars,total_words,words_no_punct,words_no_punct_no_stop,count_content_words,count_stopwords,avg_word_len,maturity,concreteness,content_only_text,function_based_text,function_only_text,pos_tags
0,7,English,10,One trait that describes rose is hard-working....,1,1,120,27,23,12,15,11,4.173913,5.487419,2.449177,trait that describes rose is hard working...,one __NOUN__ __ADJ__ __VERB__ __VERB__ __VERB_...,"One - . I this because she to , but sh...",NUM NOUN ADJ VERB VERB VERB ADV PUNCT VERB PUN...
1,9,English,10,First the author has an introduction to grab t...,1,1,190,38,33,18,20,15,4.666667,5.333607,2.461765,First author has introduction grab reader...,__ADV__ the __NOUN__ __VERB__ an __NOUN__ to _...,"the an to the 's . , the if in the ...",ADV DET NOUN VERB DET NOUN PART VERB DET NOUN ...


In [4]:
# Drop every row whose subject is 'Biology'; all other subjects are kept.
df = df[df.subject != 'Biology']
print(df.shape)

# Keep only the essay text as the feature frame and split Score1 off as the target.
X = df[['EssayText', 'Score1']].reset_index(drop=True)
y = X.pop('Score1')

print(X.shape, y.shape)

(10895, 19)
(10895, 1) (10895,)


## Experiments around Features and Hyperparameter Tuning

### 1) Custom features

### 2) BoW & TF-IDF (n-grams)

In [33]:
# Shared seed handed to SMOTE and to the classifiers that accept a random_state below.
random_state = 42

In [34]:
def get_pipeline(smote_on, clf, step_name=None):
    """Build the CountVectorizer -> (SMOTE) -> TF-IDF -> classifier pipeline.

    Parameters
    ----------
    smote_on : int
        ``1`` builds an imbalanced-learn pipeline with a SMOTE step inserted
        between the vectoriser and the TF-IDF transformer. Any other value
        builds a plain scikit-learn pipeline without resampling.
    clf : estimator
        Classifier used as the final pipeline step.
    step_name : str, optional
        Name of the classifier step. It must match the prefix used in the
        grid-search parameter keys (e.g. ``'LogR'`` for ``'LogR__C'``).
        When omitted, the module-level ``model_name`` is used, which keeps
        existing two-argument calls working.

    Returns
    -------
    Pipeline
        ``imblearn`` pipeline when ``smote_on == 1``, otherwise ``sklearn`` pipeline.

    Raises
    ------
    NameError
        If ``step_name`` is omitted and no global ``model_name`` exists yet.
    """
    if step_name is None:
        # Backward-compatible fallback: the original implementation always read
        # the classifier step name from the global set by the training loop.
        step_name = model_name

    steps = [('vect', CountVectorizer())]

    use_smote = smote_on == 1
    if use_smote:
        # NOTE(review): newer imbalanced-learn releases call this argument
        # `sampling_strategy` instead of `ratio` - confirm against the installed version.
        steps.append(('smote', SMOTE(ratio='auto',random_state=random_state)))

    steps.append(('tfidf', TfidfTransformer(smooth_idf=True)))
    steps.append((step_name, clf))

    # Only the imbalanced-learn pipeline knows how to run a resampling step.
    pipeline_cls = imb_pipeline if use_smote else skl_pipeline
    return pipeline_cls(steps)

In [39]:
#======================EXPERIMENT CONFIG===================
# Label for this experiment and the dataframe column that supplies the text
experiment_name = 'gridSearch_Content'
text_column = 'EssayText'

# Grid-search metrics: micro-averaged F1 and Cohen's kappa
scoring = {
    'f1-micro': make_scorer(f1_score, average='micro'),
    'cohens_kappa': make_scorer(cohen_kappa_score),
}

# Candidate classifiers as (short name, estimator) pairs; the short name doubles
# as the key into `model_params` and as the prefix of the parameter names there.
models = [
    ('LogR', LogisticRegression(random_state=random_state, multi_class='multinomial',
                                class_weight='balanced', solver='newton-cg')),
    ('NB', MultinomialNB()),
    ('SVM', SVC(random_state=random_state, class_weight='balanced')),
    ('RF', RandomForestClassifier(random_state=random_state, class_weight='balanced')),
]

# Classifier grids, one entry per short name.
# Currently not searched: 'LogR__max_iter':[50,100,150] and 'SVM__gamma':[0.1,1,10]
model_params = {
    'LogR': {'LogR__C': [1, 0.1, 0.01]},
    'NB': {'NB__alpha': [1, 0.5, 0.1]},
    'SVM': {'SVM__kernel': ['linear', 'rbf'], 'SVM__C': [0.1, 1, 10]},
    'RF': {'RF__n_estimators': [50, 100, 150], 'RF__max_depth': [50, 100, 500]},
}

# Vectoriser / TF-IDF grid shared by every classifier
vect_params = {
    'vect__max_df': (0.9, 1.0),
    'vect__max_features': (None, 8000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'vect__stop_words': (None, 'english'),
    'tfidf__use_idf': (True, False),
}




In [None]:
# Grid search over every (model, subject, SMOTE on/off) combination.
# For each model this writes:
#   * one pickle per subject holding the best refit pipeline, and
#   * one Excel sheet with the cross-validation results of every candidate.

# Make sure the output folder exists before anything is written to it
modelFolder.mkdir(parents=True, exist_ok=True)

# For each model
for model_name, clf in models:
    print('\n================{}================'.format(model_name))

    # One dict per grid search (subject x SMOTE setting) for this model
    tmp_results = []

    # for each subject
    for subject in list(df.subject.unique()):
        print('\n----------{}----------'.format(subject))

        # Text and score of the current subject only
        X = df[(df['subject'] == subject)][[text_column,'Score1']].copy()
        X.reset_index(drop=True,inplace=True)
        y= X.pop('Score1')

        target_names = [str(i) for i in sorted(y.unique())] #Convert to string

        print(X.shape, y.shape)

        # Best refit-metric score saved so far for this subject/model. The pickle
        # name has no SMOTE component, so without this guard the SMOTE run would
        # always overwrite the non-SMOTE estimator, even when it scored worse.
        best_saved_score = None

        # for SMOTE sampling false (0) or true (1)
        for smote_on in range(0,2):

            myrow = {'Subject':subject, 'Model':model_name, 'Smote':smote_on}

            # get_pipeline names the classifier step after the global `model_name`
            pipeline = get_pipeline(smote_on, clf)

            # Vectoriser grid plus the grid of the current classifier
            parameters = {**vect_params, **model_params[model_name]}

            #----------------------------GRID SEARCH ----------------------------------------------
            grid_search = GridSearchCV(pipeline, parameters, scoring=scoring, refit='cohens_kappa',
                                        cv=5, n_jobs=-1, verbose=1, return_train_score=True)

            print("Performing grid search...")
            print("pipeline:", [name for name, _ in pipeline.steps])
            print("parameters:")
            pprint(parameters)

            t0 = time()
            grid_search.fit(X[text_column], y)
            print("done in %0.3fs" % (time() - t0))

            print("\nBest score: %0.3f" % grid_search.best_score_)
            print("Best parameters set:")
            best_parameters = grid_search.best_estimator_.get_params()

            for param_name in sorted(parameters.keys()):
                print("\t%s: %r" % (param_name, best_parameters[param_name]))

            # Attach the per-candidate result arrays to the identifying columns
            myrow.update(grid_search.cv_results_)

            #append results into list
            tmp_results.append(myrow)

            # Save the best estimator for each subject & model: only write when this
            # SMOTE setting beats the one already stored under the same file name.
            if best_saved_score is None or grid_search.best_score_ > best_saved_score:
                best_saved_score = grid_search.best_score_
                with open(modelFolder/'{}_{}_content.pkl'.format(subject,model_name), 'wb') as fid:
                    cPickle.dump(grid_search.best_estimator_, fid)

    # Each entry mixes scalars (Subject/Model/Smote) with per-candidate arrays.
    # pd.DataFrame(row) broadcasts the scalars, giving one row per candidate;
    # building a frame from the list of dicts directly would instead store whole
    # arrays inside single cells.
    if tmp_results:
        results = pd.concat([pd.DataFrame(row) for row in tmp_results], ignore_index=True)
    else:
        results = pd.DataFrame()

    #Save the results in excel; the context manager writes and closes the workbook
    with pd.ExcelWriter(modelFolder/'{}_content_GS.xlsx'.format(model_name)) as writer:
        results.to_excel(writer, sheet_name='Sheet1')
            



----------English----------
(4296, 1) (4296,)
Performing grid search...
pipeline: ['vect', 'tfidf', 'LogR']
parameters:
{'LogR__C': [1, 0.1, 0.01],
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.9, 1.0),
 'vect__max_features': (None, 8000),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__stop_words': (None, 'english')}
Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   34.9s


In [None]:
# Display the results frame produced by the grid-search loop. `results` is
# reassigned once per model inside that loop, so only the last model's results remain here.
results

In [51]:
# #############################################################################
# Stand-alone run: one text pipeline (counts -> TF-IDF -> SMOTE -> logistic
# regression) tuned over a small vectoriser grid.
pipeline = imb_pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('resampler', SMOTE(ratio='all', random_state=random_state)),
    ('clf', LogisticRegression(random_state=42, multi_class='multinomial', solver='newton-cg')),
])

# Only the vectoriser is searched here. Further candidates that were left out
# because every extra entry multiplies the number of fits:
#     'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
#     'vect__stop_words':(None,'english'),
#     'tfidf__use_idf': (True, False),
#     'clf__max_iter': (50,100,150),
#     'clf__C': (0.1,0.001),
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000),
}

# Multi-metric search; the final pipeline is refit on the best Cohen's kappa
search_options = dict(scoring=scoring, refit='cohens_kappa', cv=5,
                      n_jobs=-1, verbose=1, return_train_score=True)
grid_search = GridSearchCV(pipeline, parameters, **search_options)

print("Performing grid search...")
print("pipeline:", [step[0] for step in pipeline.steps])
print("parameters:")
pprint(parameters)

# Time the whole search
t0 = time()
grid_search.fit(X['EssayText'], y)
print("done in %0.3fs" % (time() - t0))
print()

# Report the winning score and the searched parameters of the refit pipeline
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'resampler', 'clf']
parameters:
{'vect__max_df': (0.5, 0.75, 1.0), 'vect__max_features': (None, 5000)}
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   22.0s finished


done in 23.631s

Best score: 0.488
Best parameters set:
	vect__max_df: 0.75
	vect__max_features: None


In [52]:
# Inspect the pipeline that the grid search refit with its best-scoring parameters.
grid_search.best_estimator_

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.75, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        stri...alty='l2', random_state=42, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False))])

In [53]:
# Mean cross-validated score of the best candidate on the refit metric ('cohens_kappa').
grid_search.best_score_

0.48827234700588906

In [77]:

# Persist the refit best pipeline of the most recent grid search
classifier_path = modelFolder / 'testing_classifier.pkl'
with classifier_path.open('wb') as fid:
    cPickle.dump(grid_search.best_estimator_, fid)

# Loading it back (only unpickle files from a trusted source):
# with open('my_dumped_classifier.pkl', 'rb') as fid:
#     gnb_loaded = cPickle.load(fid)


In [64]:

# Tabulate `myrow`, which must still be defined by a previously executed cell.
# NOTE(review): the recorded output (Model 'Logistic Regression', only two vect__
# parameters) does not match the grid-search loop in this notebook - confirm which
# cell produced `myrow` before relying on this table.
pd.DataFrame(myrow)

Unnamed: 0,Subject,Model,Smote,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_vect__max_df,param_vect__max_features,params,...,mean_test_cohens_kappa,std_test_cohens_kappa,rank_test_cohens_kappa,split0_train_cohens_kappa,split1_train_cohens_kappa,split2_train_cohens_kappa,split3_train_cohens_kappa,split4_train_cohens_kappa,mean_train_cohens_kappa,std_train_cohens_kappa
0,English,Logistic Regression,1,1.458831,0.197223,0.184738,0.003708,0.5,,"{'vect__max_df': 0.5, 'vect__max_features': None}",...,0.48341,0.035603,6,0.748034,0.741879,0.735919,0.742846,0.739428,0.741621,0.004
1,English,Logistic Regression,1,1.495641,0.039982,0.187302,0.006626,0.5,5000.0,"{'vect__max_df': 0.5, 'vect__max_features': 5000}",...,0.485109,0.038627,5,0.742376,0.736173,0.734622,0.739368,0.731145,0.736737,0.003867
2,English,Logistic Regression,1,1.58623,0.013895,0.189941,0.01105,0.75,,"{'vect__max_df': 0.75, 'vect__max_features': N...",...,0.488272,0.033995,1,0.743665,0.740591,0.735,0.740618,0.724567,0.736888,0.006767
3,English,Logistic Regression,1,1.557021,0.036691,0.184569,0.003592,0.75,5000.0,"{'vect__max_df': 0.75, 'vect__max_features': 5...",...,0.485507,0.034119,3,0.738896,0.738815,0.730651,0.734946,0.717107,0.732083,0.008078
4,English,Logistic Regression,1,1.573363,0.031348,0.193581,0.016257,1.0,,"{'vect__max_df': 1.0, 'vect__max_features': None}",...,0.488272,0.033995,1,0.743665,0.740591,0.735,0.740618,0.724567,0.736888,0.006767
5,English,Logistic Regression,1,1.551547,0.016249,0.176738,0.009753,1.0,5000.0,"{'vect__max_df': 1.0, 'vect__max_features': 5000}",...,0.485507,0.034119,3,0.738896,0.738815,0.730651,0.734946,0.717107,0.732083,0.008078


In [54]:
# Cross-validation results of the most recent grid search, one row per parameter combination.
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_vect__max_df,param_vect__max_features,params,split0_test_f1-macro,split1_test_f1-macro,split2_test_f1-macro,...,mean_test_cohens_kappa,std_test_cohens_kappa,rank_test_cohens_kappa,split0_train_cohens_kappa,split1_train_cohens_kappa,split2_train_cohens_kappa,split3_train_cohens_kappa,split4_train_cohens_kappa,mean_train_cohens_kappa,std_train_cohens_kappa
0,1.458831,0.197223,0.184738,0.003708,0.5,,"{'vect__max_df': 0.5, 'vect__max_features': None}",0.614089,0.659351,0.660316,...,0.48341,0.035603,6,0.748034,0.741879,0.735919,0.742846,0.739428,0.741621,0.004
1,1.495641,0.039982,0.187302,0.006626,0.5,5000.0,"{'vect__max_df': 0.5, 'vect__max_features': 5000}",0.612016,0.670881,0.6635,...,0.485109,0.038627,5,0.742376,0.736173,0.734622,0.739368,0.731145,0.736737,0.003867
2,1.58623,0.013895,0.189941,0.01105,0.75,,"{'vect__max_df': 0.75, 'vect__max_features': N...",0.622326,0.668459,0.664109,...,0.488272,0.033995,1,0.743665,0.740591,0.735,0.740618,0.724567,0.736888,0.006767
3,1.557021,0.036691,0.184569,0.003592,0.75,5000.0,"{'vect__max_df': 0.75, 'vect__max_features': 5...",0.618829,0.667661,0.661506,...,0.485507,0.034119,3,0.738896,0.738815,0.730651,0.734946,0.717107,0.732083,0.008078
4,1.573363,0.031348,0.193581,0.016257,1.0,,"{'vect__max_df': 1.0, 'vect__max_features': None}",0.622326,0.668459,0.664109,...,0.488272,0.033995,1,0.743665,0.740591,0.735,0.740618,0.724567,0.736888,0.006767
5,1.551547,0.016249,0.176738,0.009753,1.0,5000.0,"{'vect__max_df': 1.0, 'vect__max_features': 5000}",0.618829,0.667661,0.661506,...,0.485507,0.034119,3,0.738896,0.738815,0.730651,0.734946,0.717107,0.732083,0.008078


### 4) Word Embeddings