In [None]:
# Mount Google Drive into the Colab filesystem; the CSV files read further
# down are addressed through this mount point.

from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# importing libraries 

import numpy as np
import pandas as pd
import nltk
import re

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize,word_tokenize
from gensim.models import Word2Vec

import keras.backend as K
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import cohen_kappa_score

In [None]:
# Read the training CSV from the mounted Drive folder into a DataFrame.

df = pd.read_csv('/content/drive/MyDrive/ai_hiring/train.csv')

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,promptId,uniqueId,essay,evaluator_rating
0,0,1,1_323,"At present age, our education system is not go...",3.0
1,1,1,1_238,I am agree the tightly defined curriculum of o...,4.0
2,2,1,1_212,I strongly agree with the statement that tight...,2.0
3,3,1,1_117,Our education system is nice quitely but i dis...,2.0
4,4,1,1_229,i am totally agree with the statement that tig...,3.0


In [None]:
# Discard the two columns that are not used as features or target.
df = df.drop(['Unnamed: 0', 'uniqueId'], axis=1)

In [None]:
# Give the rating column the shorter name used by the rest of the notebook.
df = df.rename({'evaluator_rating': 'score'}, axis='columns')

In [None]:
df

Unnamed: 0,promptId,essay,score
0,1,"At present age, our education system is not go...",3.0
1,1,I am agree the tightly defined curriculum of o...,4.0
2,1,I strongly agree with the statement that tight...,2.0
3,1,Our education system is nice quitely but i dis...,2.0
4,1,i am totally agree with the statement that tig...,3.0
...,...,...,...
1235,5,The entire world is in the race of producing a...,3.0
1236,5,The race in the development of weapons are pro...,2.5
1237,5,In an era where every second person hopes and ...,4.0
1238,5,INTRODUCTION :Since the beginning of the time ...,3.0


In [None]:
df['essay'][0]

'At present age, our education system is not good because so many things are done which is not required. Education systemis not work properly for so many regions in present time like corruption etc. but after that many educated people try to remove this type of sitution inour country and they try to give education to children or youngers age who is the future of our country. Teachers always want that their students done their work on time in class and whatever task or assignment they given to student they work on that task with honesty and politely and in case they not know or not able to work on that thing then ask them and the teacher suggested him very polietly. Our education system is work properly in privot schools, colleges, institution because in that place all person active for their work and they done resposibility very excelent way they do not think to others. Not opposite but it will not perform good in Government schools, colleges because in that place upper department not 

In [None]:
# Separate the regression target from the features.
# The drop is non-mutating: `df` keeps its 'score' column, so re-running this
# cell does not fail on a missing column, and X_train is an independent frame
# rather than an alias of `df`.
y_train = df['score']
X_train = df.drop(columns=['score'])

In [None]:
y_train

0       3.0
1       4.0
2       2.0
3       2.0
4       3.0
       ... 
1235    3.0
1236    2.5
1237    4.0
1238    3.0
1239    4.5
Name: score, Length: 1240, dtype: float64

In [None]:
X_train

Unnamed: 0,promptId,essay
0,1,"At present age, our education system is not go..."
1,1,I am agree the tightly defined curriculum of o...
2,1,I strongly agree with the statement that tight...
3,1,Our education system is nice quitely but i dis...
4,1,i am totally agree with the statement that tig...
...,...,...
1235,5,The entire world is in the race of producing a...
1236,5,The race in the development of weapons are pro...
1237,5,In an era where every second person hopes and ...
1238,5,INTRODUCTION :Since the beginning of the time ...


In [None]:
# Read the unlabeled test CSV from the same Drive folder as the training data.
X_test = pd.read_csv('/content/drive/MyDrive/ai_hiring/test.csv')

In [None]:
X_test

Unnamed: 0.1,Unnamed: 0,promptId,uniqueId,essay
0,0,1,1_315,Curriculum has been adopted in many schools. T...
1,1,1,1_214,"I strongly agree with the statement , The tig..."
2,2,1,1_196,Imagination and creativity is the most importa...
3,3,1,1_178,In our eduction system leaves no room for imag...
4,4,1,1_201,"I will agree at some what extend, because if w..."
...,...,...,...,...
300,300,5,5_146,Earth is a creation of God and everything that...
301,301,5,5_65,production of arms and weapons in this present...
302,302,5,5_151,Race to become more powerful can destroy the e...
303,303,5,5_404,In its attempt to harness the power of the ato...


In [None]:
# Apply the same column pruning to the test frame as to the training frame.
X_test = X_test.drop(['Unnamed: 0', 'uniqueId'], axis=1)

In [None]:
X_test

Unnamed: 0,promptId,essay
0,1,Curriculum has been adopted in many schools. T...
1,1,"I strongly agree with the statement , The tig..."
2,1,Imagination and creativity is the most importa...
3,1,In our eduction system leaves no room for imag...
4,1,"I will agree at some what extend, because if w..."
...,...,...
300,5,Earth is a creation of God and everything that...
301,5,production of arms and weapons in this present...
302,5,Race to become more powerful can destroy the e...
303,5,In its attempt to harness the power of the ato...


In [None]:
X_train.shape, X_test.shape

((1240, 2), (305, 2))

# Preprocessing 

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Essay texts of each split as plain Python lists, in row order.
train_e = X_train['essay'].to_list()
test_e = X_test['essay'].to_list()

In [None]:
# Accumulators for the tokenised sentences of each split.
train_sents, test_sents = [], []

# English stop-word list from NLTK, held in a set.
stop_words = set(stopwords.words('english'))

def sent2word(x, stopword_set=None):
    """Turn a piece of text into a list of lowercase, stop-word-free tokens.

    Every character outside A-Z / a-z is replaced by a space, the text is
    lowercased, split on whitespace, and tokens found in the stop-word set
    are dropped.

    Parameters
    ----------
    x : str
        Raw sentence or essay text.
    stopword_set : collection of str, optional
        Tokens to discard. Defaults to the module-level ``stop_words``.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if stopword_set is None:
        stopword_set = stop_words
    # str.lower() returns a new string, so its result has to be kept;
    # otherwise capitalised words such as "Not" or "The" never match the
    # lowercase entries of the stop-word set and leak into the output.
    x = re.sub("[^A-Za-z]", " ", x).lower()
    return [w for w in x.split() if w not in stopword_set]

def essay2word(essay):
    """Split an essay into sentences and tokenise each of them.

    The essay is stripped of surrounding whitespace, cut into sentences with
    the tokenizer loaded from 'tokenizers/punkt/english.pickle', and every
    non-empty sentence is passed through ``sent2word``.

    Returns a list containing one token list per sentence.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = punkt.tokenize(essay.strip())
    return [sent2word(sentence) for sentence in sentences if len(sentence) > 0]

# Gather the tokenised sentences of every essay, one split at a time.
for essay_text in train_e:
    train_sents.extend(essay2word(essay_text))

for essay_text in test_e:
    test_sents.extend(essay2word(essay_text))

In [None]:
len(train_sents)

11527

In [None]:
len(test_sents)

2728

In [None]:
train_sents[5]

['Not',
 'opposite',
 'perform',
 'good',
 'Government',
 'schools',
 'colleges',
 'place',
 'upper',
 'department',
 'saw',
 'work',
 'lower',
 'department',
 'free',
 'work',
 'education',
 'system',
 'give',
 'challenge',
 'private',
 'institutions']

# Preparing the LSTM model

In [None]:
def get_model(input_shape=(1, 300)):
    """Build and compile the stacked-LSTM score regressor.

    Architecture: LSTM(300, returns sequences) -> LSTM(64) -> Dropout(0.5)
    -> Dense(1). The model is compiled with mean-squared-error loss and the
    RMSprop optimizer, and its summary is printed before it is returned.

    Parameters
    ----------
    input_shape : sequence of int, optional
        Shape of one sample as (timesteps, features). Defaults to (1, 300),
        i.e. one 300-dimensional vector per essay.

    Returns
    -------
    The compiled Sequential model.
    """
    model = Sequential()
    model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=input_shape, return_sequences=True))
    model.add(LSTM(64, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    # ReLU on the output unit means predictions are never negative.
    model.add(Dense(1, activation='relu'))
    # Only MAE is tracked: 'accuracy' is a classification metric and is not
    # meaningful for a continuous target trained with an MSE loss.
    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()
    return model

In [None]:
# Training Word2Vec model

# Hyperparameters forwarded to Word2Vec below.
num_features = 300       # passed as `size`; also the width of the essay vectors
min_word_count = 40      # passed as `min_count`
num_workers = 4          # passed as `workers`
context = 10             # passed as `window`
downsampling = 1e-3      # passed as `sample`

# The embeddings are fitted on the tokenised training sentences only.
model = Word2Vec(
    train_sents,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# NOTE(review): init_sims(replace=True) presumably normalises the stored
# vectors in place -- confirm against the installed gensim version.
model.init_sims(replace=True)
# Persist the word vectors in binary word2vec format.
model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

In [None]:
def makeVec(words, model, num_features):
    """Average the word vectors of the in-vocabulary tokens of one essay.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single essay.
    model : object
        Trained embedding model exposing ``model.wv.index2word`` (the
        vocabulary) and ``model.wv[word]`` (the vector of a word).
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        float32 vector of shape ``(num_features,)`` holding the mean of the
        matched word vectors, or all zeros when no token is in the
        vocabulary.
    """
    vec = np.zeros((num_features,), dtype="float32")
    noOfWords = 0.
    index2word_set = set(model.wv.index2word)
    for i in words:
        if i in index2word_set:
            noOfWords += 1
            # Look the vector up through .wv, the same attribute the
            # vocabulary is read from.
            vec = np.add(vec, model.wv[i])
    # With no in-vocabulary token the mean is undefined: dividing by zero
    # would fill the vector with NaN and poison the downstream training, so
    # the zero vector is returned instead.
    if noOfWords == 0:
        return vec
    return np.divide(vec, noOfWords)


def getVecs(essays, model, num_features):
    """Build the essay-by-feature matrix of averaged word vectors.

    Row ``k`` of the result is ``makeVec(essays[k], model, num_features)``.

    Parameters
    ----------
    essays : sequence of token lists
        One list of tokens per essay.
    model : object
        Embedding model handed through to ``makeVec``.
    num_features : int
        Width of every essay vector.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(essays), num_features)``.
    """
    essay_vecs = np.zeros((len(essays), num_features), dtype="float32")
    for row, essay_words in enumerate(essays):
        essay_vecs[row] = makeVec(essay_words, model, num_features)
    return essay_vecs


# Tokenise each essay as a single unit (no sentence splitting) and turn every
# essay into one averaged word vector.
clean_train = [sent2word(essay_text) for essay_text in train_e]
training_vectors = getVecs(clean_train, model, num_features)

clean_test = [sent2word(essay_text) for essay_text in test_e]
testing_vectors = getVecs(clean_test, model, num_features)

  


In [None]:
training_vectors.shape

(1240, 300)

In [None]:
training_vectors

array([[-0.00319389,  0.01608094, -0.10613039, ...,  0.00924635,
        -0.02388303,  0.00655016],
       [ 0.00046103,  0.01696447, -0.08171739, ...,  0.01048832,
        -0.03859491, -0.00837543],
       [ 0.0055596 ,  0.0113463 , -0.09916937, ...,  0.00373044,
        -0.03052321, -0.01034497],
       ...,
       [ 0.00364791, -0.00915951, -0.09941565, ..., -0.0038315 ,
        -0.00329722, -0.00383761],
       [ 0.00184123, -0.0084358 , -0.09609666, ..., -0.00323701,
        -0.00380062, -0.00473638],
       [-0.00104747, -0.01097726, -0.09413584, ..., -0.00450476,
         0.00253275, -0.0016561 ]], dtype=float32)

In [None]:
testing_vectors

array([[-0.0007781 ,  0.0166339 , -0.08895762, ...,  0.01036535,
        -0.03185473, -0.004624  ],
       [ 0.00284804,  0.01608943, -0.08169953, ...,  0.0097451 ,
        -0.03750173, -0.01363392],
       [ 0.00908039,  0.01049536, -0.10315957, ...,  0.00209458,
        -0.03337025, -0.00953309],
       ...,
       [ 0.00816296, -0.02046868, -0.09700317, ..., -0.01008742,
         0.006501  , -0.00326278],
       [ 0.00423583, -0.01289677, -0.10273359, ..., -0.00572433,
        -0.00116716, -0.00552742],
       [ 0.00817247, -0.0172872 , -0.11039079, ..., -0.00994854,
         0.0046935 , -0.00446463]], dtype=float32)

In [None]:
training_vectors.shape

(1240, 300)

In [None]:
testing_vectors.shape

(305, 300)

In [None]:
# Add a length-1 timestep axis so each essay matches the (1, 300) per-sample
# input shape declared in get_model: (n, features) -> (n, 1, features).
# The feature axis is given as -1 so NumPy infers it from the data; this makes
# the cell safe to re-run, because an array that is already 3-D keeps its
# (n, 1, features) shape instead of raising on an impossible reshape.
training_vectors = np.asarray(training_vectors)
testing_vectors = np.asarray(testing_vectors)

training_vectors = training_vectors.reshape(training_vectors.shape[0], 1, -1)
testing_vectors = testing_vectors.reshape(testing_vectors.shape[0], 1, -1)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 1, 300)            721200    
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 814,705
Trainable params: 814,705
Non-trainable params: 0
_________________________________________________________________


In [None]:
training_vectors.shape

(1240, 1, 300)

In [None]:
testing_vectors.shape

(305, 1, 300)

In [None]:
# Build a fresh model and fit it on the whole training set for 100 epochs in
# batches of 64. No validation data is passed to fit(), so the metrics it
# reports are training metrics only.
lstm_model = get_model()
lstm_model.fit(training_vectors, y_train, batch_size=64, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f1a479fc3d0>

In [None]:
# Persist the trained model, then score the test essays.
lstm_model.save('final_lstm_v1.h5')
y_pred = lstm_model.predict(testing_vectors)
# The training scores come in half-point steps (e.g. 2.5, 4.5), so snap the
# predictions to the nearest 0.5; rounding to whole numbers would make every
# half-point score unreachable.
y_pred = np.around(y_pred * 2) / 2
# Show a small sample rather than the whole prediction array.
y_pred[:10]

array([[3.],
       [3.],
       [2.],
       [3.],
       [3.],
       [3.],
       [2.],
       [2.],
       [2.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [2.],
       [3.],
       [3.],
       [2.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [2.],
       [3.],
       [3.],
       [3.],
       [3.],
       [2.],
       [3.],
       [3.],
       [3.],
       [2.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [2.],
       [2.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [2.],
       [3.],
       [2.],
       [2.],
       [2.],
       [3.],
       [2.],
       [2.],
       [3.],
       [3.],

In [None]:
y_pred.shape

(305, 1)

In [None]:
# Collapse the (n, 1) prediction column into a 1-D array of n scores.
y_pred = y_pred.reshape(-1)

In [None]:
y_pred

array([3., 3., 2., 3., 3., 3., 2., 2., 2., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 2., 3., 3., 2., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 2., 3., 3., 3., 3., 2., 3., 3., 3., 2., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 2., 2., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 2.,
       3., 2., 2., 2., 3., 2., 2., 3., 3., 3., 2., 2., 3., 3., 2., 3., 3.,
       3., 2., 2., 2., 2., 2., 3., 2., 2., 2., 2., 2., 3., 3., 2., 3., 2.,
       3., 3., 3., 2., 1., 2., 2., 3., 3., 2., 2., 2., 3., 2., 2., 2., 2.,
       3., 2., 3., 3., 2., 2., 2., 2., 2., 3., 2., 3., 2., 2., 2., 2., 2.,
       3., 3., 2., 2., 2., 3., 3., 2., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 2.

In [None]:
# Re-read the original test CSV (all columns, including uniqueId) to serve as
# the frame the predictions are attached to.
predicted_score = pd.read_csv('/content/drive/MyDrive/ai_hiring/test.csv')

In [None]:
predicted_score

Unnamed: 0.1,Unnamed: 0,promptId,uniqueId,essay
0,0,1,1_315,Curriculum has been adopted in many schools. T...
1,1,1,1_214,"I strongly agree with the statement , The tig..."
2,2,1,1_196,Imagination and creativity is the most importa...
3,3,1,1_178,In our eduction system leaves no room for imag...
4,4,1,1_201,"I will agree at some what extend, because if w..."
...,...,...,...,...
300,300,5,5_146,Earth is a creation of God and everything that...
301,301,5,5_65,production of arms and weapons in this present...
302,302,5,5_151,Race to become more powerful can destroy the e...
303,303,5,5_404,In its attempt to harness the power of the ato...


In [None]:
# Append the model output as a new 'predicted_score' column.
predicted_score = predicted_score.assign(predicted_score=y_pred)

In [None]:
predicted_score

Unnamed: 0.1,Unnamed: 0,promptId,uniqueId,essay,predicted_score
0,0,1,1_315,Curriculum has been adopted in many schools. T...,3.0
1,1,1,1_214,"I strongly agree with the statement , The tig...",3.0
2,2,1,1_196,Imagination and creativity is the most importa...,2.0
3,3,1,1_178,In our eduction system leaves no room for imag...,3.0
4,4,1,1_201,"I will agree at some what extend, because if w...",3.0
...,...,...,...,...,...
300,300,5,5_146,Earth is a creation of God and everything that...,2.0
301,301,5,5_65,production of arms and weapons in this present...,2.0
302,302,5,5_151,Race to become more powerful can destroy the e...,2.0
303,303,5,5_404,In its attempt to harness the power of the ato...,2.0


In [None]:
# Write the test frame with its predictions to the current working directory,
# without the DataFrame index.
predicted_score.to_csv('test_prediction.csv', index = False)

In [None]:
# Read the predictions back through the same relative path they were written
# to, so this check works from any working directory and not only /content.
pred = pd.read_csv('test_prediction.csv')
pred

Unnamed: 0.1,Unnamed: 0,promptId,uniqueId,essay,predicted_score
0,0,1,1_315,Curriculum has been adopted in many schools. T...,3.0
1,1,1,1_214,"I strongly agree with the statement , The tig...",3.0
2,2,1,1_196,Imagination and creativity is the most importa...,2.0
3,3,1,1_178,In our eduction system leaves no room for imag...,3.0
4,4,1,1_201,"I will agree at some what extend, because if w...",3.0
...,...,...,...,...,...
300,300,5,5_146,Earth is a creation of God and everything that...,2.0
301,301,5,5_65,production of arms and weapons in this present...,2.0
302,302,5,5_151,Race to become more powerful can destroy the e...,2.0
303,303,5,5_404,In its attempt to harness the power of the ato...,2.0
