## Imports packages

In [4]:
#For  Colab 
#from google.colab import drive
#drive.mount('/content/drive')


In [5]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [37]:
import warnings
warnings.filterwarnings("ignore")                     #Ignoring unnecessory warnings

import re 
import nltk                                         #Natural language processing tool-kit
from nltk.corpus import stopwords                   #Stopwords corpus
from nltk.stem import PorterStemmer                 # Stemmer

from sklearn.feature_extraction.text import CountVectorizer          #For Bag of words
from sklearn.feature_extraction.text import TfidfVectorizer          #For TF-IDF
from gensim.models import Word2Vec                                   #For Word2Vec

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot

from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K

from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import accuracy_score


## Imports data

In [7]:
#Local
df = pd.read_csv("../data/training_set_rel3.tsv", sep='\t', encoding='ISO-8859-1')

#Colab
#df = pd.read_csv("/content/drive/MyDrive/Automated_Essay_Scoring/data/training_set_rel3.tsv", sep='\t', encoding='ISO-8859-1')


df = df.dropna(axis=1)
df = df.drop(columns=['rater1_domain1', 'rater2_domain1'])
df = df.drop(columns=['essay_id'])
df.head()

Unnamed: 0,essay_set,essay,domain1_score
0,1,"Dear local newspaper, I think effects computer...",8
1,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,1,"Dear @LOCATION1, I know having computers has a...",8


In [8]:
#df['essay'][1]

In [9]:
#we rescale to have all scores to be in the same interval
for i in range(len(df)):
    if df['essay_set'][i] == 1:
        df['domain1_score'][i] = df['domain1_score'][i] - 2
    elif df['essay_set'][i] == 2:
        df['domain1_score'][i] = (df['domain1_score'][i] - 1)*2
    elif df['essay_set'][i] == 3:
        df['domain1_score'][i] = (df['domain1_score'][i])*10/3
    elif df['essay_set'][i] == 4:
        df['domain1_score'][i] = (df['domain1_score'][i])*10/3
    elif df['essay_set'][i] == 5:
        df['domain1_score'][i] = (df['domain1_score'][i])*10/4
    elif df['essay_set'][i] == 6:
        df['domain1_score'][i] = (df['domain1_score'][i])*10/4
    elif df['essay_set'][i] == 7:
        df['domain1_score'][i] = (df['domain1_score'][i])/3
    elif df['essay_set'][i] == 8:
        df['domain1_score'][i] = (df['domain1_score'][i])/6

## Text pre-treatment

In [10]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
len(stop_words) #finding stop words

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/GuillaumeKunsch/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


179

In [11]:
snow = nltk.stem.SnowballStemmer('english')

corpus = []
for i in range(0, len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['essay'][i])
    review = review.lower()
    review = review.split()
    
    review = [snow.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [12]:
voc_size=5000
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
type(onehot_repr)

list

In [13]:
sent_length=400
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ...  492 3258 4761]
 [   0    0    0 ... 3500 1051 1707]
 [   0    0    0 ... 1648 4919 2295]
 ...
 [3162 1714 4364 ... 2897 3757 1911]
 [   0    0    0 ... 4826 2098 4811]
 [   0    0    0 ... 3162 2433 2922]]


## NN architecture

In [26]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/GuillaumeKunsch/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [50]:
def essay_to_wordlist(essay_v, remove_stopwords):
    """Remove the tagged labels and word tokenize the sentence."""
    essay_v = re.sub("[^a-zA-Z]", " ", essay_v)
    words = essay_v.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return (words)

def essay_to_sentences(essay_v, remove_stopwords):
    """Sentence tokenize the essay and call essay_to_wordlist() for word tokenization."""
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(essay_v.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(essay_to_wordlist(raw_sentence, remove_stopwords))
    return sentences

def makeFeatureVec(words, model, num_features):
    """Make Feature Vector from the words list of an Essay."""
    featureVec = np.zeros((num_features,),dtype="float32")
    num_words = 0.
    index2word_set = set(model.wv.index_to_key)
    for word in words:
        if word in index2word_set:
            num_words += 1
            featureVec = np.add(featureVec,model.wv[word])        
    featureVec = np.divide(featureVec,num_words)
    return featureVec

def getAvgFeatureVecs(essays, model, num_features):
    """Main function to generate the word vectors for word2vec model."""
    counter = 0
    essayFeatureVecs = np.zeros((len(essays),num_features),dtype="float32")
    for essay in essays:
        essayFeatureVecs[counter] = makeFeatureVec(essay, model, num_features)
        counter = counter + 1
    return essayFeatureVecs

In [21]:
def get_model():
    """Define the model."""
    model = Sequential()
    model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True))
    model.add(LSTM(64, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['accuracy','mae'])
    model.summary()

    return model

In [23]:
X=df
y = X['domain1_score']

In [63]:
cv = KFold(n_splits = 5, shuffle = True)
results = []
y_pred_list = []

count = 1
for traincv, testcv in cv.split(X):
    print("\n--------Fold {}--------\n".format(count))
    X_test, X_train, y_test, y_train = X.iloc[testcv], X.iloc[traincv], y.iloc[testcv], y.iloc[traincv]
    
    train_essays = X_train['essay']
    test_essays = X_test['essay']
    
    sentences = []
    
    for essay in train_essays:
            # Obtaining all sentences from the training essays.
            sentences += essay_to_sentences(essay, remove_stopwords = True)
            
    # Initializing variables for word2vec model.
    num_features = 300 
    min_word_count = 40
    num_workers = 4
    context = 10
    downsampling = 1e-3

    print("Training Word2Vec Model...")
    model = Word2Vec(sentences, workers=num_workers, vector_size=num_features, min_count = min_word_count, window = context, sample = downsampling)

    model.init_sims(replace=True)
    model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

    clean_train_essays = []
    
    # Generate training and testing data word vectors.
    for essay_v in train_essays:
        clean_train_essays.append(essay_to_wordlist(essay_v, remove_stopwords=True))
    trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)
    
    clean_test_essays = []
    for essay_v in test_essays:
        clean_test_essays.append(essay_to_wordlist( essay_v, remove_stopwords=True ))
    testDataVecs = getAvgFeatureVecs( clean_test_essays, model, num_features )
    
    trainDataVecs = np.array(trainDataVecs)
    testDataVecs = np.array(testDataVecs)
    # Reshaping train and test vectors to 3 dimensions. (1 represnts one timestep)
    trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
    testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))
    
    lstm_model = get_model()
    lstm_model.fit(trainDataVecs, y_train, batch_size=64, epochs=50)
    #lstm_model.load_weights('./model_weights/final_lstm.h5')
    y_pred = lstm_model.predict(testDataVecs)
    
    # Save any one of the 5 models.
    if count == 5:
         lstm_model.save('./final_lstm.h5')
    
    # Round y_pred to the nearest integer.
    y_pred = np.around(y_pred)
    
    # Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
    result = cohen_kappa_score(y_test.values,y_pred,weights='quadratic')
    acc = accuracy_score(y_test.values,y_pred)
    print("acc Score: {}".format(acc))
    print("Kappa Score: {}".format(result))
    results.append(result)

    count += 1



--------Fold 1--------

Training Word2Vec Model...
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 1, 300)            721200    
                                                                 
 lstm_3 (LSTM)               (None, 64)                93440     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 814,705
Trainable params: 814,705
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch

KeyboardInterrupt: 

In [None]:
print("Average Kappa score after a 5-fold cross validation: ",np.around(np.array(results).mean(),decimals=4))

Average Kappa score after a 5-fold cross validation:  0.9217


In [None]:
demo = ["Dear@CAPS1 @CAPS2, I believe that using computers will benefit us in many ways like talking and becoming friends will others through websites like facebook and mysace. Using computers can help us find coordibates, locations, and able ourselfs to millions of information. Also computers will benefit us by helping with jobs as in planning a house plan and typing a @NUM1 page report for one of our jobs in less than writing it. Now lets go into the wonder world of technology. Using a computer will help us in life by talking or making friends on line. Many people have myspace, facebooks, aim, these all benefit us by having conversations with one another. Many people believe computers are bad but how can you make friends if you can never talk to them? I am very fortunate for having a computer that can help with not only school work but my social life and how I make friends. Computers help us with finding our locations, coordibates and millions of information online. If we didn't go on the internet a lot we wouldn't know how to go onto websites that @MONTH1 help us with locations and coordinates like @LOCATION1. Would you rather use a computer or be in @LOCATION3. When your supposed to be vacationing in @LOCATION2. Million of information is found on the internet. You can as almost every question and a computer will have it. Would you rather easily draw up a house plan on the computers or take @NUM1 hours doing one by hand with ugly erazer marks all over it, you are garrenteed that to find a job with a drawing like that. Also when appling for a job many workers must write very long papers like a @NUM3 word essay on why this job fits you the most, and many people I know don't like writing @NUM3 words non-stopp for hours when it could take them I hav an a computer. That is why computers we needed a lot now adays. I hope this essay has impacted your descion on computers because they are great machines to work with. The other day I showed my mom how to use a computer and she said it was the greatest invention sense sliced bread! Now go out and buy a computer to help you chat online with friends, find locations and millions of information on one click of the button and help your self with getting a job with neat, prepared, printed work that your boss will love."]
demo_df = pd.DataFrame(demo,columns=['essay'])
demo_df.head()

Unnamed: 0,essay
0,"Dear@CAPS1 @CAPS2, I believe that using comput..."


In [None]:
type(demo_df['essay'])

pandas.core.series.Series

In [None]:
content = "Dear@CAPS1 @CAPS2, I believe that using computers will benefit us in many ways like talking and becoming friends will others through websites like facebook and mysace. Using computers can help us find coordibates, locations, and able ourselfs to millions of information. Also computers will benefit us by helping with jobs as in planning a house plan and typing a @NUM1 page report for one of our jobs in less than writing it. Now lets go into the wonder world of technology. Using a computer will help us in life by talking or making friends on line. Many people have myspace, facebooks, aim, these all benefit us by having conversations with one another. Many people believe computers are bad but how can you make friends if you can never talk to them? I am very fortunate for having a computer that can help with not only school work but my social life and how I make friends. Computers help us with finding our locations, coordibates and millions of information online. If we didn't go on the internet a lot we wouldn't know how to go onto websites that @MONTH1 help us with locations and coordinates like @LOCATION1. Would you rather use a computer or be in @LOCATION3. When your supposed to be vacationing in @LOCATION2. Million of information is found on the internet. You can as almost every question and a computer will have it. Would you rather easily draw up a house plan on the computers or take @NUM1 hours doing one by hand with ugly erazer marks all over it, you are garrenteed that to find a job with a drawing like that. Also when appling for a job many workers must write very long papers like a @NUM3 word essay on why this job fits you the most, and many people I know don't like writing @NUM3 words non-stopp for hours when it could take them I hav an a computer. That is why computers we needed a lot now adays. I hope this essay has impacted your descion on computers because they are great machines to work with. The other day I showed my mom how to use a computer and she said it was the greatest invention sense sliced bread! Now go out and buy a computer to help you chat online with friends, find locations and millions of information on one click of the button and help your self with getting a job with neat, prepared, printed work that your boss will love."

In [None]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
num_features = 300
      
model = KeyedVectors.load_word2vec_format( "./word2vecmodel.bin", binary=True)
clean_test_essays = []
clean_test_essays.append(essay_to_wordlist( content, remove_stopwords=True ))
testDataVecs = getAvgFeatureVecs( clean_test_essays, model, num_features )
testDataVecs = np.array(testDataVecs)
testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

# lstm_model = get_model()
lstm_model.load_weights("./final_lstm.h5")
preds = lstm_model.predict(testDataVecs)

In [None]:
int(np.around(preds))

9

In [None]:
# val_essays = demo_df['essay']
# sentences = []

# for essay in val_essays:
#         sentences += essay_to_sentences(essay, remove_stopwords = True)
        

# num_features = 300 
# min_word_count = 40
# num_workers = 4
# context = 10
# downsampling = 1e-3


# model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling)

# model.init_sims(replace=True)

# clean_train_essays = []

# # Generate training and testing data word vectors.
# for essay_v in val_essays:
#     clean_train_essays.append(essay_to_wordlist(essay_v, remove_stopwords=True))
# trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)

# trainDataVecs = np.array(trainDataVecs)
# # Reshaping train and test vectors to 3 dimensions. (1 represnts one timestep)
# trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))


# y_pred = lstm_model.predict(trainDataVecs)



# y_pred = np.around(y_pred)


# # result = cohen_kappa_score(y_test.values,y_pred,weights='quadratic')
# # print("Kappa Score: {}".format(result))
# print(y_pred)

In [None]:
y_test.T.shape

(2064,)

In [None]:
y_pred1 = y_pred.T.reshape(2595,)

ValueError: cannot reshape array of size 2064 into shape (2595,)

In [None]:
y_pred1

NameError: name 'y_pred1' is not defined

In [None]:
y_test1 = y_test.T.reshape

AttributeError: 'Series' object has no attribute 'reshape'

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred1)

NameError: name 'y_pred1' is not defined

In [None]:
# Save a palette to a variable:
palette = sns.color_palette("bright")
 
# Use palplot and pass in the variable:
sns.palplot(palette)

NameError: name 'sns' is not defined

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
array =cm[0:10,0:10]
df_cm = pd.DataFrame(array)
plt.figure(figsize = (20,20))
sns.heatmap(df_cm,cmap = 'Blues',square=True, annot=True)

NameError: name 'cm' is not defined

In [None]:
cm[0:10,0:10]

NameError: name 'cm' is not defined

In [None]:
arr = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
print(arr.shape)
print(arr[1:4, 4:1])
# print(arr)

(2, 5)
[]


In [None]:
cm.shape

NameError: name 'cm' is not defined

In [None]:
corr = cm

mask = np.triu(np.ones_like(corr, dtype=np.bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.title('Fig:1',size=15)

NameError: name 'cm' is not defined

In [None]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred1, average='macro')

NameError: name 'y_pred1' is not defined

In [None]:
from sklearn.metrics import classification_report
cr = classification_report(y_test, y_pred1)

NameError: name 'y_pred1' is not defined

In [None]:
cr

NameError: name 'cr' is not defined

In [None]:
import seaborn as sns
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt

def plot_classification_report(y_tru, y_prd, figsize=(10, 10), ax=None):

    plt.figure(figsize=figsize)

    xticks = ['precision', 'recall', 'f1-score', 'support']
    yticks = list(np.unique(y_tru))
    yticks += ['avg']

    rep = np.array(precision_recall_fscore_support(y_tru, y_prd)).T
    avg = np.mean(rep, axis=0)
    avg[-1] = np.sum(rep[:, -1])
    rep = np.insert(rep, rep.shape[0], avg, axis=0)

    sns.heatmap(rep,
                annot=True, 
                cbar=False, 
                xticklabels=xticks, 
                yticklabels=yticks,
                ax=ax)

plot_classification_report(y_test, y_pred1)

NameError: name 'y_pred1' is not defined

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import itertools


def plot_classification_report(classificationReport,
                               title='Classification report',
                               cmap='RdBu'):

    classificationReport = classificationReport.replace('\n\n', '\n')
    classificationReport = classificationReport.replace(' / ', '/')
    lines = classificationReport.split('\n')

    classes, plotMat, support, class_names = [], [], [], []
    for line in lines[1:]:  # if you don't want avg/total result, then change [1:] into [1:-1]
        t = line.strip().split()
        if len(t) < 2:
            continue
        classes.append(t[0])
        v = [float(x) for x in t[1: len(t) - 1]]
        support.append(int(t[-1]))
        class_names.append(t[0])
        plotMat.append(v)

    plotMat = np.array(plotMat)
    xticklabels = ['Precision', 'Recall', 'F1-score']
    yticklabels = ['{0} ({1})'.format(class_names[idx], sup)
                   for idx, sup in enumerate(support)]

    plt.imshow(plotMat, interpolation='nearest', cmap=cmap, aspect='auto')
    plt.title(title)
    plt.colorbar()
    plt.xticks(np.arange(3), xticklabels, rotation=45)
    plt.yticks(np.arange(len(classes)), yticklabels)

    upper_thresh = plotMat.min() + (plotMat.max() - plotMat.min()) / 10 * 8
    lower_thresh = plotMat.min() + (plotMat.max() - plotMat.min()) / 10 * 2
    for i, j in itertools.product(range(plotMat.shape[0]), range(plotMat.shape[1])):
        plt.text(j, i, format(plotMat[i, j], '.2f'),
                 horizontalalignment="center",
                 color="white" if (plotMat[i, j] > upper_thresh or plotMat[i, j] < lower_thresh) else "black")

    plt.ylabel('Metrics')
    plt.xlabel('Classes')
    plt.tight_layout()


def main():

    sampleClassificationReport = cr
    plot_classification_report(sampleClassificationReport)
    plt.show()
    plt.close()


if __name__ == '__main__':
    main()

NameError: name 'cr' is not defined

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,plot_confusion_matrix

In [None]:
classifiers = {
    "LogisiticRegression": LogisticRegression(),
    "KNearest": KNeighborsClassifier(n_neighbors=1),
    "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "MultinimialNB": MultinomialNB()
}


In [None]:
trainDataVecs.shape[2]

300

In [None]:
trainDataVecs1 = np.reshape(trainDataVecs,trainDataVecs.shape[0],trainDataVecs.shape[2])
trainDataVecs1.shape

ValueError: cannot reshape array of size 2477700 into shape (8259,)

In [None]:
from sklearn.model_selection import cross_val_score

classifier = KNeighborsClassifier()

classifier.fit(trainDataVecs, y_train)
training_score = cross_val_score(classifier, train_vectors, df_train["domain1_score"], cv=5)
print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")

ValueError: Found array with dim 3. Estimator expected <= 2.

In [None]:
np.unique(df['domain1_score'],return_counts=True)

KeyError: 'domain1_score'

In [None]:
df

Unnamed: 0,essay,score
0,Write a persuasive essay to a newspaper reflec...,1
1,If the people that are publishing and writing ...,1
2,I do think that there should be a censorship i...,1
3,I belive that they need to keep every thing an...,1
4,Hi my name is @CAPS1 @CAPS2 @PERSON1 and I'm h...,1
...,...,...
10318,A time I was patient was when I had to be pati...,10
10319,Being patient is hard to be. First of all its ...,10
10320,One day I was watching @CAPS1. I was bored the...,10
10321,My own way about patient is that when somebody...,10


In [None]:
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(df['domain1_score']),
                                                 train)

KeyError: 'domain1_score'