In [2]:
import pandas as pd
import numpy as np
from string import punctuation
import re

In [14]:
# Read the essay dataset from a CSV file in the working directory (decoded as UTF-8)
# and preview its first rows.
essays = pd.read_csv('essay_dataset_with_ner_sample11.csv', encoding = "utf-8")
essays.head()

Unnamed: 0,ID,EssayBody,ParagraphCount,WordCount,EssayScore
0,2,"According to some people , the role of a fathe...",4,419,6.0
1,20271,"With the growth in internet technology , stude...",4,287,7.5
2,13400,"Nowadays , education has become a vital role i...",4,331,7.0
3,20272,It is important to consider whether to give ho...,4,312,7.5
4,29811,There will always be some popular leisure acti...,4,279,8.5


In [15]:
# Size of the loaded frame as (rows, columns).
essays.shape

(2430, 5)

In [16]:

# Drop every character that cannot be encoded as ASCII from each essay body and
# trim leading/trailing whitespace from what is left.
essays['EssayBody'] = essays['EssayBody'].apply(
    lambda body: body.encode('ascii', 'ignore').strip().decode()
)


In [6]:
# Data-quality check: count missing values per column and list each column's dtype.
isnull = essays.isnull().sum()
dtype = essays.dtypes
# Both summaries are pandas Series; str.format renders them with their default text repr.
print('Number of null values in each column:\n \n{}\n'.format(isnull))
print('Data Type of each column:\n \n{} \n'.format(dtype))

Number of null values in each column:
 
ID                0
EssayBody         0
ParagraphCount    0
WordCount         0
EssayScore        0
dtype: int64

Data Type of each column:
 
ID                  int64
EssayBody          object
ParagraphCount      int64
WordCount           int64
EssayScore        float64
dtype: object 



In [8]:
# Preview the first rows of the essays frame.
# NOTE(review): the recorded output below shows different IDs than the preview right
# after loading, so it was probably produced by an earlier run — re-execute to refresh.
essays.head()

Unnamed: 0,ID,EssayBody,ParagraphCount,WordCount,EssayScore
0,1,"Over DATE , our cities faced unprecedented gro...",4,350,6.0
1,2,"According to some people , the role of a fathe...",4,419,6.0
2,3,"Thesedays , number of crime commit by the youn...",3,197,5.0
3,4,Some people believe that sharing details of re...,4,281,6.0
4,5,People DATE have contradictory opinion regardi...,12,410,5.5


In [52]:
def clean_topic(text):
    """Normalize a topic string.

    The text is lowercased, every character listed in ``string.punctuation``
    is deleted, and any run of whitespace (spaces, tabs, newlines) is
    collapsed to a single space with both ends trimmed.
    """
    without_punctuation = text.lower().translate(str.maketrans("", "", punctuation))
    return " ".join(without_punctuation.split())

In [17]:
def new_label(value):
    """Translate a numeric essay score into its band name.

    Scores strictly above 7.5 are 'High Band', scores strictly above 6 (up to
    and including 7.5) are 'Moderate Band', and every other value is
    'Low Band'.
    """
    # Thresholds are checked from the highest band downwards; the first one
    # the score exceeds decides the band.
    for lower_bound, band in ((7.5, 'High Band'), (6, 'Moderate Band')):
        if value > lower_bound:
            return band
    return 'Low Band'

In [53]:
# Normalize the topic text only when the loaded dataset actually has an
# 'EssayTopic' column. The CSV read at the top of this notebook is previewed with
# ID, EssayBody, ParagraphCount, WordCount and EssayScore only, so an
# unconditional column lookup would raise KeyError on a fresh run.
if 'EssayTopic' in essays.columns:
    essays['EssayTopic'] = essays['EssayTopic'].map(clean_topic)

In [18]:
# Replace each numeric EssayScore with its band name (via new_label) and preview the result.
# NOTE(review): the numeric column is overwritten in place, so this cell is not
# re-runnable — a second run would feed band strings into new_label's numeric comparisons.
essays['EssayScore'] = essays['EssayScore'].map(new_label)
essays.head()

Unnamed: 0,ID,EssayBody,ParagraphCount,WordCount,EssayScore
0,2,"According to some people , the role of a fathe...",4,419,Low Band
1,20271,"With the growth in internet technology , stude...",4,287,Moderate Band
2,13400,"Nowadays , education has become a vital role i...",4,331,Moderate Band
3,20272,It is important to consider whether to give ho...,4,312,Moderate Band
4,29811,There will always be some popular leisure acti...,4,279,High Band


In [19]:
possible_labels = essays.EssayScore.unique()

# The band ids are used as an ordinal target (regression loss, quadratic-weighted
# kappa), so Low < Moderate < High must hold regardless of which band happens to
# appear first in the data. `unique()` returns values in order of appearance, so
# the known bands are pinned explicitly instead of being numbered by that order.
band_order = ['Low Band', 'Moderate Band', 'High Band']
label_dict = {band: index for index, band in enumerate(band_order)}

# Any label that is not one of the known bands still gets an id (appended after
# the known ones, in order of appearance) so nothing is left unmapped.
for possible_label in possible_labels:
    if possible_label not in label_dict:
        label_dict[possible_label] = len(label_dict)
label_dict

{'Low Band': 0, 'Moderate Band': 1, 'High Band': 2}

In [54]:
# Disabled alternative: encode EssayScore through pandas categorical codes instead of label_dict.
#essays['EssayScore'] = essays.EssayScore.astype("category").cat.codes
# Preview the frame.
essays.head()

Unnamed: 0,ID,EssayTopic,EssayBody,ParagraphCount,WordCount,EssayScore,CC_Score,LR_Score,GRA_Score,TA_Score
0,1,increasing the price of petrol is the best way...,"Over the last century, our cities faced unprec...",4,350,6.0,5.5,5.5,6.5,6.5
1,2,fatherhood ought to be emphasized as much as m...,"According to some people, the role of a father...",4,419,6.0,5.5,5.5,6.5,7.0
2,3,best way to reduce the number of crime among y...,"Thesedays, number of crime commit by the young...",3,197,5.0,5.5,5.0,6.5,5.0
3,4,some people believe that it is good to share a...,Some people believe that sharing details of re...,4,281,6.0,6.5,5.5,6.5,6.0
4,5,some people believe that teaching children at ...,People these days have contradictory opinion r...,12,410,5.5,5.5,5.0,5.5,5.0


In [20]:
# Swap each band name for its integer id from label_dict, store the column as int,
# and preview the encoded frame.
essays = (
    essays
    .assign(EssayScore=essays['EssayScore'].replace(label_dict))
    .astype({'EssayScore': int})
)
essays.head()

Unnamed: 0,ID,EssayBody,ParagraphCount,WordCount,EssayScore
0,2,"According to some people , the role of a fathe...",4,419,0
1,20271,"With the growth in internet technology , stude...",4,287,1
2,13400,"Nowadays , education has become a vital role i...",4,331,1
3,20272,It is important to consider whether to give ho...,4,312,1
4,29811,There will always be some popular leisure acti...,4,279,2


_______

_________

In [21]:
# Features and target are kept as single-column DataFrames (list selector), and
# copied so later changes to them do not write back into `essays`.
X = essays.loc[:, ['EssayBody']].copy()
y = essays.loc[:, ['EssayScore']].copy()

In [22]:
import nltk
# Fetch the NLTK resources used later in this notebook: 'punkt' (loaded by the
# sentence tokenizer) and 'stopwords' (the English stop-word list).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ekrembakay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ekrembakay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
import numpy as np
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec


_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def essay_to_wordlist(essay_v, remove_stopwords):
    """Split an essay (or a single sentence) into lowercase alphabetic tokens.

    Every character outside ``a-z``/``A-Z`` (digits, punctuation, symbols) is
    replaced by a space, the text is lowercased and split on whitespace.
    Upper-case placeholders such as ``DATE`` are ordinary letters to this
    function: they are kept and lowercased, not removed.

    Args:
        essay_v: the text to tokenize.
        remove_stopwords: when truthy, tokens found in NLTK's English
            stop-word list are dropped.

    Returns:
        A list of lowercase word strings (possibly empty).
    """
    letters_only = re.sub("[^a-zA-Z]", " ", essay_v)
    words = letters_only.lower().split()
    if remove_stopwords:
        # This function is called once per essay/sentence; the stop-word set
        # is cached so the corpus file is not re-read on every call.
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]
    return words

def essay_to_sentences(essay_v, remove_stopwords):
    """Split an essay into sentences and tokenize each one into words.

    The essay is stripped, cut into sentences with NLTK's English punkt
    tokenizer, and every non-empty sentence is passed to
    ``essay_to_wordlist``.

    Args:
        essay_v: the raw essay text.
        remove_stopwords: forwarded to ``essay_to_wordlist`` for every
            sentence, so the caller's choice is honored (it was previously
            ignored and stop words were always kept).

    Returns:
        A list with one word list per non-empty sentence.
    """
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(essay_v.strip())
    return [
        essay_to_wordlist(raw_sentence, remove_stopwords=remove_stopwords)
        for raw_sentence in raw_sentences
        if len(raw_sentence) > 0
    ]

def makeFeatureVec(words, model, num_features):
    """Average the Word2Vec vectors of the in-vocabulary words of one essay.

    Args:
        words: iterable of word strings for a single essay.
        model: trained Word2Vec model; ``model.wv`` supplies the vocabulary
            and the per-word vectors.
        num_features: length of each word vector (and of the result).

    Returns:
        A float32 array of length ``num_features`` holding the mean vector of
        all words found in the model's vocabulary. When no word is in the
        vocabulary (or ``words`` is empty) the zero vector is returned instead
        of dividing by zero, which would fill the vector with NaN.
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    # key_to_index is the model's word -> index dict: constant-time membership
    # checks without rebuilding a set of the whole vocabulary on every call.
    vocabulary = model.wv.key_to_index
    num_words = 0.
    for word in words:
        if word in vocabulary:
            num_words += 1
            featureVec = np.add(featureVec, model.wv[word])
    if num_words == 0:
        return featureVec
    return np.divide(featureVec, num_words)

def getAvgFeatureVecs(essays, model, num_features):
    """Build the matrix of averaged word vectors for a collection of essays.

    Each entry of ``essays`` is a word list; row ``i`` of the returned
    float32 array of shape ``(len(essays), num_features)`` is the result of
    ``makeFeatureVec`` for the ``i``-th essay.
    """
    essayFeatureVecs = np.zeros((len(essays), num_features), dtype="float32")
    for row, essay_words in enumerate(essays):
        essayFeatureVecs[row] = makeFeatureVec(essay_words, model, num_features)
    return essayFeatureVecs

In [24]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K

def get_model():
    """Build, compile and return the two-layer LSTM regressor.

    Input is a sequence of one timestep with 300 features. The stack is
    LSTM(300, returning sequences) -> LSTM(32) -> Dropout(0.5) ->
    Dense(1, relu). The model is compiled with mean-squared-error loss, the
    rmsprop optimizer and mean absolute error as metric; its summary is
    printed before it is returned.
    """
    layers = [
        LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True),
        LSTM(32, recurrent_dropout=0.4),
        Dropout(0.5),
        Dense(1, activation='relu'),
    ]
    model = Sequential(layers)
    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()
    return model

In [12]:
from constants import GLOVE_DIR
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential
import keras.backend as K
from utils import tokenizer, load_embedding_matrix

def get_model(embedding_dimension, essay_length):
    """Build and compile a Sequential model: pretrained Embedding -> LSTM -> Dropout -> mean -> Dense.

    Args:
        embedding_dimension: used as the Embedding output size and forwarded
            to ``load_embedding_matrix``.
        essay_length: passed to the Embedding layer as ``input_length``.

    Returns:
        The compiled model (loss: mean squared error, optimizer: rmsprop,
        metric: mae). Its summary is printed as a side effect.

    NOTE(review): this reuses the name ``get_model`` that an earlier cell
    defines without parameters, while the cross-validation cell calls
    ``get_model()`` with no arguments. The recorded output of this cell is
    ``ModuleNotFoundError: No module named 'constants'``, so the definition
    did not take effect in that run.
    """
    # One more row than the tokenizer's word index — presumably index 0 is
    # reserved (e.g. padding); confirm against utils.tokenizer.
    vocabulary_size = len(tokenizer.word_index) + 1
    # Pretrained vectors read from GLOVE_DIR; the matrix layout is defined by
    # utils.load_embedding_matrix, which is not visible here.
    embedding_matrix = load_embedding_matrix(glove_directory=GLOVE_DIR, embedding_dimension=embedding_dimension)

    model = Sequential()

    # Embedding initialized with the pretrained matrix and kept fixed (trainable=False).
    model.add(Embedding(vocabulary_size, embedding_dimension, weights=[embedding_matrix], input_length=essay_length, trainable=False, mask_zero=False))
    model.add(LSTM(64, dropout=0.4, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    # Average along axis 1, keeping that axis with size 1.
    model.add(Lambda(lambda x: K.mean(x, axis=1, keepdims=True)))
    # Sigmoid output is limited to (0, 1) — presumably targets are scaled to
    # that range by the caller; TODO confirm.
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

ModuleNotFoundError: No module named 'constants'

In [25]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score

# Cross-validated training: for each fold, train Word2Vec on the training essays,
# turn every essay into one averaged word vector, fit the LSTM from get_model()
# on those vectors and score the held-out fold with quadratic-weighted kappa.
# NOTE(review): no random_state is given, so the shuffled folds differ between runs.
cv = KFold(5, shuffle=True)
results = []
# NOTE(review): y_pred_list is never appended to in this cell.
y_pred_list = []

count = 1
for traincv, testcv in cv.split(X):
    print("\n--------Fold {}--------\n".format(count))
    X_test, X_train, y_test, y_train = X.iloc[testcv], X.iloc[traincv], y.iloc[testcv], y.iloc[traincv]
    
    train_essays = X_train['EssayBody']
    test_essays = X_test['EssayBody']
    
    sentences = []
    
    for essay in train_essays:
            # Obtaining all sentences from the training essays.
            sentences += essay_to_sentences(essay, remove_stopwords = True)
            
    # Initializing variables for word2vec model.
    num_features = 300 
    min_word_count = 40
    num_workers = 4
    context = 10
    downsampling = 1e-3

    print("Training Word2Vec Model...")
    # Word2Vec is fitted on the training essays of this fold only.
    model = Word2Vec(sentences, workers=num_workers, vector_size=num_features, min_count = min_word_count, window = context, sample = downsampling)

    # NOTE(review): the recorded output of this cell echoes this line, which looks
    # like a warning from gensim (possibly a deprecation) — confirm against the
    # installed version.
    model.init_sims(replace=True)
    # The same file name is written on every fold, so later folds overwrite it.
    model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

    clean_train_essays = []
    
    # Generate training and testing data word vectors.
    for essay_v in train_essays:
        clean_train_essays.append(essay_to_wordlist(essay_v, remove_stopwords=True))
    trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)
    
    clean_test_essays = []
    for essay_v in test_essays:
        clean_test_essays.append(essay_to_wordlist( essay_v, remove_stopwords=True ))
    testDataVecs = getAvgFeatureVecs(clean_test_essays, model, num_features )
    
    trainDataVecs = np.array(trainDataVecs)
    testDataVecs = np.array(testDataVecs)
    # Reshape train and test vectors to 3 dimensions: (samples, 1, features),
    # where the middle 1 represents a single timestep.
    trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
    testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))
    
    # A fresh model is built for every fold.
    lstm_model = get_model()
    lstm_model.fit(trainDataVecs, y_train, batch_size=32, epochs=50)
    #lstm_model.load_weights('./model_weights/final_lstm.h5')
    y_pred = lstm_model.predict(testDataVecs)
    
    # Round predictions and targets to the nearest integer so they can be
    # compared as discrete labels.
    y_pred = np.around(y_pred)
    y_test = np.around(y_test)
  
    
    # Save the model trained on the fifth fold.
    # NOTE(review): the KFold above has 5 splits, and the unconditional `break` at
    # the end of the loop body stops after the first fold, so `count` never
    # reaches 5 and this save is never executed as written.
    if count == 5:
         lstm_model.save('./model_weights/final_lstm.h5')
    
    # Evaluate the fold with quadratic-weighted Cohen's kappa.
    result = cohen_kappa_score(y_test.values,y_pred,weights='quadratic')
    print("Kappa Score: {}".format(result))
    results.append(result)
    
    count += 1
    # Only the first fold is run; remove this `break` to evaluate all folds.
    break


--------Fold 1--------

Training Word2Vec Model...


  model.init_sims(replace=True)


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_6 (LSTM)               (None, 1, 300)            721200    
                                                                 
 lstm_7 (LSTM)               (None, 32)                42624     
                                                                 
 dropout_3 (Dropout)         (None, 32)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 763,857
Trainable params: 763,857
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/

In [17]:
# Copy the current y_test into a NumPy array for inspection.
y_test2 = np.array(y_test)

In [18]:
# Shape of the prediction array.
y_pred.shape

(4851, 1)

In [26]:
# Display the predictions.
# NOTE(review): this prints the whole array; a slice such as y_pred[:10] would keep the output short.
y_pred

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],

In [27]:
# Display the target array.
# NOTE(review): this prints the whole array; a slice such as y_test2[:10] would keep the output short.
y_test2

array([[6. ],
       [6. ],
       [7.5],
       [6. ],
       [6. ],
       [7.5],
       [5.5],
       [6.5],
       [7. ],
       [7. ],
       [6. ],
       [5. ],
       [7.5],
       [6. ],
       [8.5],
       [8. ],
       [7. ],
       [7.5],
       [7. ],
       [6.5],
       [7.5],
       [8. ],
       [7. ],
       [7. ],
       [6.5],
       [6. ],
       [5.5],
       [7. ],
       [6.5],
       [5.5],
       [6. ],
       [6.5],
       [6. ],
       [6. ],
       [6.5],
       [6. ],
       [7. ],
       [6.5],
       [6.5],
       [6.5],
       [7. ],
       [6. ],
       [6.5],
       [6.5],
       [8.5],
       [6.5],
       [9. ],
       [8. ],
       [7.5],
       [7. ],
       [6. ],
       [6. ],
       [6.5],
       [8. ],
       [7.5],
       [7. ],
       [7.5],
       [6. ],
       [8.5],
       [6. ],
       [5.5],
       [7. ],
       [7.5],
       [6. ],
       [7.5],
       [6. ],
       [7. ],
       [6. ],
       [8.5],
       [8. ],
       [6. ],
      

In [28]:
# Round the targets to the nearest integer (the cross-validation cell applies the same rounding).
y_test = np.around(y_test)

In [25]:
# Shift every prediction down by one, then round to the nearest integer.
# NOTE(review): not idempotent — each re-run subtracts another 1. The execution
# count (In [25]) is out of sequence with the neighbouring cells, so this may be
# a leftover from an earlier experiment; confirm it is still needed.
y_pred = np.around(y_pred - 1)


In [29]:
# Targets as a plain NumPy array, rounded to the nearest integer, shown for inspection.
y_test2 = np.around(y_test.values)
y_test2

array([[0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [2],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [2],
       [1],
       [2],
       [1],
       [1],
       [0],
       [0],
       [1],
       [2],
       [2],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [2],
       [2],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [2],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [2],
       [0],
       [2],
       [0],
       [1],
       [1],
       [1],
       [0],
       [2],
    