In [1]:
# importing required packages
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
# from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split,KFold
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn import ensemble
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import cohen_kappa_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K

In [2]:
import csv

In [3]:
importing_dataset = pd.read_csv('https://github.com/dnyanada02/SmartGrading/blob/main/Dataset/training_set_rel3.tsv?raw=true', quoting=csv.QUOTE_NONE, sep='\t', encoding='ISO-8859-1')
# dependent variable
scores = importing_dataset['domain1_score']
dataset = importing_dataset.loc[:,['essay_id', 'essay_set', 'essay', 'domain1_score']]
dataset.dropna()
dataset

# dataset = pd.read_csv("/content/training_set_rel3.tsv",sep='\t', encoding='ISO-8859-1',
#                             usecols = ['essay_id', 'essay_set', 'essay','domain1_score']).dropna(axis=1)
# scores = dataset['domain1_score']
# dataset

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"""Dear local newspaper, I think effects compute...",8
1,2,1,"""Dear @CAPS1 @CAPS2, I believe that using comp...",9
2,3,1,"""Dear, @CAPS1 @CAPS2 @CAPS3 More and more peop...",7
3,4,1,"""Dear Local Newspaper, @CAPS1 I have found tha...",10
4,5,1,"""Dear @LOCATION1, I know having computers has ...",8
...,...,...,...,...
12973,21626,8,""" In most stories mothers and daughters are ei...",35
12974,21628,8,""" I never understood the meaning laughter is t...",32
12975,21629,8,"""When you laugh, is @CAPS5 out of habit, or is...",40
12976,21630,8,""" Trippin' on fe...",40


In [4]:
dataset.describe()

Unnamed: 0,essay_id,essay_set,domain1_score
count,12978.0,12978.0,12978.0
mean,10295.432809,4.179458,6.799276
std,6308.588616,2.136749,8.970357
min,1.0,1.0,0.0
25%,4439.25,2.0,2.0
50%,10045.5,4.0,3.0
75%,15680.75,6.0,8.0
max,21633.0,8.0,60.0


In [5]:
# Generating word tokens after removing characters other than alphabets, converting them to lower case and
# removing stopwords from the text'''

def word_tokens(essay_text):
    essay_text = re.sub("[^a-zA-Z]", " ", essay_text)
    words = essay_text.lower().split()
    stop_words = set(stopwords.words("english"))
    words = [w for w in words if not w in stop_words]
    return (words)

In [6]:
# Generating sentence tokens from the essay and finally the word tokens

def sentence_tokens(essay_text):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sent_tokens = tokenizer.tokenize(essay_text.strip())
    sentences = []
    for sent_token in sent_tokens:
        if len(sent_token) > 0:
            sentences.append(word_tokens(sent_token))
    return sentences

In [7]:
# Generating a vector of features

def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,),dtype="float32")
    num_words = 0.
    index2word_set = set(model.wv.index2word)
    for word in words:
        if word in index2word_set:
            num_words += 1
            featureVec = np.add(featureVec,model[word])        
    featureVec = np.divide(featureVec,num_words)
    return featureVec

In [8]:
# Generating word vectors to be used in word2vec model

def getAvgFeatureVecs(essays, model, num_features):
    counter = 0
    essayFeatureVecs = np.zeros((len(essays),num_features),dtype="float32")
    for essay_text in essays:
        essayFeatureVecs[counter] = makeFeatureVec(essay_text, model, num_features)
        counter = counter + 1
    return essayFeatureVecs

In [9]:
def get_model():
    model = Sequential()
    model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True))
    model.add(LSTM(64, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

In [10]:
X=dataset.copy()

In [11]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
# Applying k-fold cross validation

cv = KFold(n_splits=5, shuffle=True)
cv.get_n_splits(len(dataset))
results = []
y_pred_list = []

count = 1
for traincv, testcv in cv.split(dataset):
    print("\n------------Fold {}------------\n".format(count))
    X_test, X_train, y_test, y_train = dataset.iloc[testcv], dataset.iloc[traincv], scores.iloc[testcv], scores.iloc[traincv]
    
    train_essays = X_train['essay']
    test_essays = X_test['essay']
    
    sentences = []
    
    for essay in train_essays:
            # Obtaining all sentences from the training set of essays.
            sentences += sentence_tokens(essay)
            
    # Initializing variables for word2vec model.
    num_features = 300 
    min_word_count = 40
    num_workers = 4
    context = 10
    downsampling = 1e-3

    print("Training Word2Vec Model...")
    model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling)

    model.init_sims(replace=True)
    model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

    clean_train_essays = []
    
    # Generate training and testing data word vectors.
    for essay_text in train_essays:
        clean_train_essays.append(word_tokens(essay_text))
    trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)
    
    clean_test_essays = []
    for essay_text in test_essays:
        clean_test_essays.append(word_tokens(essay_text))
    testDataVecs = getAvgFeatureVecs(clean_test_essays, model, num_features)
    
    trainDataVecs = np.array(trainDataVecs)
    testDataVecs = np.array(testDataVecs)
    # Reshaping train and test vectors to 3 dimensions. (1 represnts one timestep)
    trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
    testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))
    trainDataVecs = np.nan_to_num(trainDataVecs.astype(np.float32))
    testDataVecs = np.nan_to_num(testDataVecs.astype(np.float32))

    lstm_model = get_model()
    lstm_model.fit(trainDataVecs, y_train, batch_size=64, epochs=50)
    #lstm_model.load_weights('./model_weights/final_lstm.h5')
    lstm_model.save("LSTM.h5")
    y_pred = lstm_model.predict(testDataVecs)
    
    # Round y_pred to the nearest integer.
    y_pred = np.around(y_pred)
    
    '''Evaluation metric used : 
    1. Mean squared error
    2. Variance
    3. Cohen's kappa score
    Expected results - Minimum error, maximum variance(For variance, best possible score is 1.0, lower 
    values are worse.) and maximum kappa score(1 depicting the best scores)'''
    
    # Mean squared error
    print("Mean squared error: {0:.2f}".format(mean_squared_error(y_test.values, y_pred)))

    # Explained variance score: 1 is perfect prediction
    print('Variance: {0:.2f}'.format(explained_variance_score(y_test.values, y_pred)))  
    
    #Cohen's kappa score
    result = cohen_kappa_score(y_test.values,y_pred,weights='quadratic')
    print("Kappa Score: {0:.2f}".format(result))
    results.append(result)

    count += 1


------------Fold 1------------

Training Word2Vec Model...


  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1, 300)            721200    
                                                                 
 lstm_1 (LSTM)               (None, 64)                93440     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 814,705
Trainable params: 814,705
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50

  # Remove the CWD from sys.path while we load stuff.


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 1, 300)            721200    
                                                                 
 lstm_3 (LSTM)               (None, 64)                93440     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 814,705
Trainable params: 814,705
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/

In [13]:
print("Average Kappa score after a 5-fold cross validation: ",np.around(np.array(results).mean(),decimals=2))

Average Kappa score after a 2-fold cross validation:  0.95


# Prediction

In [14]:
# As lstm outperforms all other models, so using it for predicting the scores for the final dataset
valid_set = pd.read_csv('https://github.com/dnyanada02/SmartGrading/blob/main/Dataset/valid_set.tsv?raw=true', sep='\t', encoding='ISO-8859-1')

In [15]:
valid_set = valid_set.drop(['domain2_predictionid'], axis = 1)

In [16]:
valid_set.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_predictionid
0,1788,1,"Dear @ORGANIZATION1, @CAPS1 more and more peop...",1788
1,1789,1,Dear @LOCATION1 Time @CAPS1 me tell you what I...,1789
2,1790,1,"Dear Local newspaper, Have you been spending a...",1790
3,1791,1,"Dear Readers, @CAPS1 you imagine how life woul...",1791
4,1792,1,"Dear newspaper, I strongly believe that comput...",1792


In [17]:
valid_test_essays = valid_set['essay']

In [18]:
valid_test_essays

0       Dear @ORGANIZATION1, @CAPS1 more and more peop...
1       Dear @LOCATION1 Time @CAPS1 me tell you what I...
2       Dear Local newspaper, Have you been spending a...
3       Dear Readers, @CAPS1 you imagine how life woul...
4       Dear newspaper, I strongly believe that comput...
                              ...                        
4213     Have you ever noticed that if two little kids...
4214                                Laughter @CAPS1 I ...
4215     Laughter in @CAPS1 A laugh is not just an act...
4216      LAUGHTER @CAPS1 i was younger my friend live...
4217     You know how the saying goes live, laugh, lov...
Name: essay, Length: 4218, dtype: object

In [19]:
sentences = []
    
for valid_essay in valid_test_essays:
        sentences += sentence_tokens(valid_essay)
            
num_features = 300 
min_word_count = 40
num_workers = 4
context = 10
downsampling = 1e-3

print("Training Word2Vec Model...")
model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling)

model.init_sims(replace=True)
model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

valid_clean_test_essays = []
    
# Generate training and testing data word vectors.
for essay_text in valid_test_essays:
    valid_clean_test_essays.append(word_tokens(essay_text))
valid_testDataVecs = getAvgFeatureVecs(valid_clean_test_essays, model, num_features)

valid_testDataVecs = np.array(valid_testDataVecs)
# Reshaping train and test vectors to 3 dimensions. (1 represnts one timestep)
valid_testDataVecs = np.reshape(valid_testDataVecs, (valid_testDataVecs.shape[0], 1, valid_testDataVecs.shape[1]))
    
predicted_scores = lstm_model.predict(valid_testDataVecs)
    
# Round y_pred to the nearest integer.
predicted_scores = np.around(predicted_scores)

Training Word2Vec Model...


  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


In [20]:
submission = valid_set.drop(['essay'], axis = 1)

In [21]:
predicted_score = predicted_scores.tolist()

In [22]:
predicted_score = pd.Series([score for sublist in predicted_scores for score in sublist])

In [23]:
predicted_score.head()

0    10.0
1     6.0
2    15.0
3     9.0
4     7.0
dtype: float32

In [24]:
submission = pd.concat([submission, predicted_score], axis = 1).rename(columns = {0:"predicted_score"}).iloc[:,[2,0,1,3]]
submission.to_excel("Submission.xls",index=False)

  


In [25]:
from sklearn.metrics import classification_report,confusion_matrix,cohen_kappa_score

# print('training accuracy:',trainDataVecs[1]*100)
# print('testing accuracy:',testDataVecs[1]*100)

# y_pred = lstm_model.predict(testDataVecs)
# y_pred = np.around(y_pred)
print(classification_report(y_test.values,y_pred))
print('Confusion matix:\n',confusion_matrix(y_test.values,y_pred))
print('Cohen-kappa score:',cohen_kappa_score(y_test.values,y_pred,weights='quadratic'))

              precision    recall  f1-score   support

           0       0.17      0.07      0.10       209
           1       0.43      0.09      0.15       850
           2       0.38      0.71      0.49      1256
           3       0.44      0.48      0.46      1411
           4       0.55      0.13      0.21       720
           5       0.23      0.31      0.27        51
           6       0.12      0.15      0.13        60
           7       0.16      0.25      0.19        81
           8       0.47      0.37      0.42       355
           9       0.20      0.36      0.26       174
          10       0.24      0.13      0.16       183
          11       0.03      0.01      0.02        77
          12       0.00      0.00      0.00        66
          13       0.09      0.10      0.09        40
          14       0.07      0.05      0.06        56
          15       0.13      0.17      0.15        47
          16       0.18      0.15      0.16        96
          17       0.15    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
