In [0]:

import os
from sklearn.preprocessing import scale
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix

import en_core_web_sm
nlp = en_core_web_sm.load()
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import pandas as pd
import numpy as np
from time import time 
import multiprocessing
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split


import keras 
from keras.models import Sequential, Model 
from keras import layers
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Input, Embedding
from keras.layers.merge import Concatenate


nltk.download('wordnet')


nltk.download('punkt')

Using TensorFlow backend.


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
def getTokensFromFiles(filePath, label):
  main=[]
  for i in os.listdir(filePath):
    if i != ".ipynb_checkpoints":
      x=i
      try:
        fname = open(filePath +"/"+ x).read()
      except UnicodeDecodeError:
        continue
          
      f1=re.sub(r'\W+', ' ',fname)
      
      wordnet_lemmatizer = WordNetLemmatizer()
      
      words = nltk.word_tokenize(f1)
      
      f2=" "
      for w in words:
        f2=f2+" "+wordnet_lemmatizer.lemmatize(w)
    
      doc = nlp(f2)
      
      tokens = [token.text for token in doc if not token.is_stop]
      
      l=[]
      l.append(tokens)
      if label != -1:
        l.append(label)
      main.append(l)

  return main



In [0]:

def trainw2v(trainSet):
  cores = multiprocessing.cpu_count() 
  w2v_model = Word2Vec(min_count=20,
                       window=2,
                       size=1000,
                       sample=6e-5, 
                       alpha=0.03, 
                       min_alpha=0.0007, 
                       negative=20,
                       workers=cores-1)

  t = time()
  w2v_model.build_vocab(trainSet["Tokens"], progress_per=1000)
  print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

  w2v_model.train(trainSet["Tokens"], total_examples=w2v_model.corpus_count, epochs=10000, report_delay=1)
  print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

  return w2v_model;

In [0]:
def buildWordVector(w2v_model, tfidf, tokens, size):
    vec = np.zeros(size).reshape((1, size))
    
    count = 0.
    for word in tokens:
        try:
    
            vec += w2v_model[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    
    
    return vec


In [0]:

def saveModel(train_vecs_w2v, y_train, test_vecs_w2v, y_test):
  dnn_model1 = Sequential()
  dnn_model1.add(Dense(128, activation='relu', input_dim=1000))
  dnn_model1.add(Dropout(0.7))
  dnn_model1.add(Dense(1, activation='sigmoid'))
  dnn_model1.compile(optimizer='adadelta',
                loss='binary_crossentropy',
                metrics=['accuracy'])
  dnn_model1.summary()
  earlystop = EarlyStopping(monitor = 'loss',
                          min_delta = 0,
                          patience = 3,
                          verbose = 1,
                          restore_best_weights = True)  
  history = dnn_model1.fit(train_vecs_w2v, y_train, epochs=40, batch_size=50,validation_data=(test_vecs_w2v,y_test))
  loss, accuracy = dnn_model1.evaluate(train_vecs_w2v, y_train, verbose=False)
  print("Training Accuracy: {:.4f}".format(accuracy))
  loss, accuracy = dnn_model1.evaluate(test_vecs_w2v, y_test, verbose=False)
  print("Testing Accuracy:  {:.4f}".format(accuracy))
  dnn_model1.save("/content/sample_data/DNN_Model2", overwrite=True, include_optimizer=True)



In [0]:

#Preparing tokens from raw data

training_test = []
training_test = training_test + getTokensFromFiles("/content/drive/My Drive/resume/Training & Test/Resume", 1)
training_test = training_test + getTokensFromFiles("/content/drive/My Drive/resume/Training & Test/Non Resume", 0)
prediction = getTokensFromFiles("/content/drive/My Drive/resume/Prediction/Resume", -1)
prediction = prediction + getTokensFromFiles("/content/drive/My Drive/resume/Prediction/Non Resume", -1)


#Creating Dataframes for Training & prediction tokens


train_df = pd.DataFrame(training_test, columns = ['Tokens', 'Label'])
pred_df = pd.DataFrame(prediction, columns = ['Tokens'])



In [7]:
#Train Word2Vec model with personal data

from gensim.models import KeyedVectors
from gensim.models import Word2Vec


trained_w2v_model =  KeyedVectors.load_word2vec_format('/content/sample_data/GoogleNews-vectors-negative300.bin.gz', binary=True)

y = train_df['Label'].values
X = np.array(train_df["Tokens"])

#Split the training and test set in provided ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
X_pred = np.array(pred_df["Tokens"])

vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)

matrix = vectorizer.fit_transform([x for x in X_train])

tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print ('vocab size :', len(tfidf))


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


EOFError: ignored

In [0]:
#Build the word vectors for train, test & prediction dataset 
train_vecs_w2v = np.concatenate([buildWordVector(trained_w2v_model, tfidf , z, 1000) for z in map(lambda x: x, X_train)])
test_vecs_w2v = np.concatenate([buildWordVector(trained_w2v_model, tfidf , z, 1000) for z in map(lambda x: x, X_test)])
pred_vec_w2v = np.concatenate([buildWordVector(trained_w2v_model, tfidf , z, 1000) for z in map(lambda x: x, X_pred)])

#Scale the vectors
train_vecs_w2v = scale(train_vecs_w2v)
test_vecs_w2v = scale(test_vecs_w2v)
pred_vec_w2v = scale(pred_vec_w2v)


print ('shape for training set : ',train_vecs_w2v.shape,
      '\nshape for test set : ', test_vecs_w2v.shape,
       '\nshape for predicted set : ', pred_vec_w2v.shape,) 


  


shape for training set :  (135, 1000) 
shape for test set :  (58, 1000) 
shape for predicted set :  (7, 1000)


In [0]:
from keras.callbacks import EarlyStopping 
saveModel(train_vecs_w2v, y_train, test_vecs_w2v, y_test)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 128)               128128    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
Total params: 128,257
Trainable params: 128,257
Non-trainable params: 0
_________________________________________________________________
Train on 135 samples, validate on 58 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 

In [0]:
from keras.models import load_model

trained_model = load_model("/content/sample_data/DNN_Model2")

l = trained_model.predict(pred_vec_w2v, verbose=True)
print(l)

[[9.9996626e-01]
 [9.9995935e-01]
 [4.3215469e-02]
 [9.9999976e-01]
 [5.1797028e-03]
 [7.1375431e-03]
 [9.3743120e-06]]
