In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
import nltk
from nltk import word_tokenize
from nltk.chunk import RegexpParser
from random import shuffle
import numpy as np
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from nltk.util import ngrams
from sklearn.metrics import precision_recall_fscore_support as score
from nltk.util import everygrams

In [27]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [28]:
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.metrics.scores import accuracy, precision, recall, f_measure
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC,LinearSVC
import collections
import os
import re
import json

In [29]:
train_path = "../json_data_train.json"
test_path = "../json_data_test.json"
glove_embedding_path = './glove.6B.300d.txt'

In [30]:
def read_json_file(path):
    # Opening JSON file
    f = open(path)
    # returns JSON object as
    # a dictionary
    data = json.load(f)
    text_value = dict()
    for key in data:
      val = data.get(key)
      text = val['text']
      value = val['keywords']
      text_value[text] = value
    return text_value


In [31]:
def get_candidate_phrases(file):
    #Tokenize
    tokens = word_tokenize(file)
    # Chunking
    tagged = nltk.pos_tag(tokens)
    chunkGram = r""" PHRASE: 
                        {(<JJ>*  <NN.*>* )* <NN.*>*}
                """
    chunkParser = RegexpParser(chunkGram)
    chunked = chunkParser.parse(tagged)

    candidate_keywords = []
    for tree in chunked.subtrees():
        if tree.label() == 'PHRASE':
            candidate_keyword = ' '.join([x for x,y in tree.leaves()])
            candidate_keywords.append(candidate_keyword)
    candidate_keywords = [w for w in candidate_keywords if len(w) > 3 and len(w.split(' ')) < 10 and w.isalpha() ] 
    return candidate_keywords

In [32]:
embeddings_index = {}
f = open(glove_embedding_path, encoding="utf8")
for line in tqdm(f):
    values = line.split()
    word = values[0]
    try:
       coefs = np.asarray(values[1:], dtype='float32')
       embeddings_index[word] = coefs
    except ValueError:
       pass
f.close()
print('Found %s word vectors.' % len(embeddings_index))


400000it [00:29, 13783.93it/s]

Found 400000 word vectors.





In [33]:
def create_phrase_vec(phrase):
  M = []
  for w in phrase:
    try:
      M.append(embeddings_index[w])
    except:
      continue
  M = np.array(M)
  v = M.sum(axis=0)
  if type(v) != np.ndarray:
    return np.zeros(300)
  return  v / np.sqrt((v ** 2).sum())

In [34]:
def create_features(doc,phrase,candidate_list):
  features = {}
  features['length'] = len(phrase.split(' '))
  features['part_of_speech'] = ' '.join([pos for word,pos in nltk.pos_tag(nltk.word_tokenize(phrase))])
  phrase_list = create_phrase_vec(phrase)
  if len(phrase_list):
    for i in range(0,len(phrase_list)):
      features['vector' + str(i)] = phrase_list[i]
  position_list = [ m.start()/float(len(doc)) for m in re.finditer(re.escape(phrase),doc,flags=re.IGNORECASE)] 
  if len(position_list):
      for i in range(0,len(position_list)):
          features['occurrance' + str(i)] = position_list[i]
  features['frequency'] = len(position_list)/ float(len(set(candidate_list)))
  return features

In [35]:
def create_feature_list(train_path, class_mapping):
  feature_list = []
  features = read_json_file(train_path)
  candidate_list = []
  for text, value in tqdm(features.items()):
    candidates = get_candidate_phrases(text)
    candidate_list += candidates
    keywords =  list(value.keys())
    classes = list(value.values())
    for cd in candidates:
      feature = create_features(text,cd,candidates)
      if cd in keywords:
        tag = value.get(cd)[0][2]
        label = class_mapping[tag]
      else:
        label = class_mapping['Normal']
      feature_list.append([feature,label])    
  return  feature_list

In [36]:
class_mapping = {
 'Normal': 0,
 'Task' : 1,
 'Process' : 2,
 'Material': 3
}

In [37]:
feature_list = create_feature_list(train_path, class_mapping)
print(" ")
print('Length of feature list',len(feature_list))

100%|██████████| 350/350 [00:09<00:00, 37.55it/s]

 
Length of feature list 6265





In [38]:
print('number of keywords', len([x for x,y in feature_list if y != 0]))
print('number of Task keywords', len([x for x,y in feature_list if y == 1]))
print('number of Process keywords', len([x for x,y in feature_list if y == 2]))
print('number of Material keywords', len([x for x,y in feature_list if y == 3]))
print('number of non keywords', len([x for x,y in feature_list if y == 0]))


number of keywords 857
number of Task keywords 41
number of Process keywords 278
number of Material keywords 538
number of non keywords 5408


In [39]:
shuffle(feature_list)
non_candidates_list_train = [(x,y) for x,y in feature_list if y == 0][:500]
task_list_train = [(x,y) for x,y in feature_list if y == 1][:41]
process_list_train = [(x,y) for x,y in feature_list if y == 2][:278]
material_list_train = [(x,y) for x,y in feature_list if y == 3][:500]
shuffle(non_candidates_list_train)
shuffle(task_list_train)
shuffle(process_list_train)
shuffle(material_list_train)
training_set = []
training_set.extend(non_candidates_list_train)
training_set.extend(task_list_train)
training_set.extend(process_list_train)
training_set.extend(material_list_train)

In [40]:
feature_list = np.array(feature_list)
X = feature_list[:,0]
Y = feature_list[:,1]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

In [42]:
X_train = np.array(X_train).reshape(-1,1)
y_train = np.array(y_train).reshape(-1,1)
feature_list_train = np.concatenate((X_train,y_train),axis=1)
print('Train size',feature_list_train.shape)

X_test = np.array(X_test).reshape(-1,1)
y_test = np.array(y_test).reshape(-1,1)
feature_list_test = np.concatenate((X_test,y_test),axis=1)
print('Test size',feature_list_test.shape)



Train size (5012, 2)
Test size (1253, 2)


In [43]:
def evaluation_metric(y_test,y_pred):
  precision, recall, fscore, support = score(y_test, y_pred,average=None,labels=[0,1,2,3])
  print('Class Non Keyword: ')
  print('Precision: ',precision[0])
  print('Recall: ',recall[0])
  print('Fscore: ',fscore[0])
  print('support: ',support[0])
  print(' ')
  print('Class Task: ')
  print('Precision: ',precision[1])
  print('Recall: ',recall[1])
  print('Fscore: ',fscore[1])
  print('support: ',support[1])
  print(' ')
  print('Class Process: ')
  print('Precision: ',precision[2])
  print('Recall: ',recall[2])
  print('Fscore: ',fscore[2])
  print('support: ',support[2])
  print(' ')
  print('Class Material: ')
  print('Precision: ',precision[3])
  print('Recall: ',recall[3])
  print('Fscore: ',fscore[3])
  print('support: ',support[3])
  print(' ')
  print("Overall:")
  print("Micro averaged F score",f1_score(y_test, y_pred, average='micro'))
  print("Macro averaged F score",f1_score(y_test, y_pred, average='macro'))

In [44]:
classifier =  SklearnClassifier(LinearSVC(max_iter=5000)).train(training_set)
y_pred = classifier.classify_many(feature_list_test[:,0])
y_test = feature_list_test[:,1].tolist()
print ("Accuracy : ",nltk.classify.accuracy(classifier,feature_list_test) * 100)
print(' ')
evaluation_metric(y_test, y_pred)


Accuracy :  64.96408619313647
 
Class Non Keyword: 
Precision:  0.9290240811153359
Recall:  0.677449168207024
Fscore:  0.7835382148583645
support:  1082
 
Class Task: 
Precision:  0.0
Recall:  0.0
Fscore:  0.0
support:  8
 
Class Process: 
Precision:  0.13
Recall:  0.23636363636363636
Fscore:  0.16774193548387098
support:  55
 
Class Material: 
Precision:  0.18681318681318682
Recall:  0.6296296296296297
Fscore:  0.288135593220339
support:  108
 
Overall:
Micro averaged F score 0.6496408619313647
Macro averaged F score 0.3098539358906436


  _warn_prf(average, modifier, msg_start, len(result))


In [45]:
def create_test_feature_list(test_path, class_mapping):
  feature_list = []
  features = read_json_file(train_path)
  for text, value in tqdm(features.items()):
    candidates = get_candidate_phrases(text)
    keywords =  list(value.keys())
    classes = list(value.values())
    print(classes)

    for i,kw in enumerate(keywords):
      feature = create_features(text,kw,candidates)
      label = class_mapping[classes[i][0][2]]
      feature_list.append([feature,label])
      for cd in candidates:
        if cd not in keywords:
          feature = create_features(text,cd,candidates)
          label = class_mapping['Normal']
          feature_list.append([feature,label])    
  return feature_list 

In [46]:
test_feature_list = create_feature_list(test_path,class_mapping)
test_feature_list = np.asarray(test_feature_list)
print(" ")
print('Length of test feature list',len(test_feature_list))

100%|██████████| 100/100 [00:03<00:00, 33.27it/s]

 
Length of test feature list 2135





In [47]:
print("Accuracy : ",nltk.classify.accuracy(classifier,test_feature_list) * 100)
print(' ')
y_pred = classifier.classify_many(test_feature_list[:,0])
y_test = test_feature_list[:,1].tolist()
evaluation_metric(y_test,y_pred)

Accuracy :  63.559718969555036
 
Class Non Keyword: 
Precision:  0.9132385938668661
Recall:  0.674585635359116
Fscore:  0.7759771210676835
support:  1810
 
Class Task: 
Precision:  0.0
Recall:  0.0
Fscore:  0.0
support:  19
 
Class Process: 
Precision:  0.14906832298136646
Recall:  0.22857142857142856
Fscore:  0.18045112781954886
support:  105
 
Class Material: 
Precision:  0.17582417582417584
Recall:  0.5572139303482587
Fscore:  0.26730310262529833
support:  201
 
Overall:
Micro averaged F score 0.6355971896955503
Macro averaged F score 0.30593283787813264


  _warn_prf(average, modifier, msg_start, len(result))
