# UE20CS334 - Natural Language Processing - Project
## Team 03
## Literary device Identification - Personification

### Team Members

| Name                  | SRN           |
| --------------------- | ------------- |
| Ajay Anil Kumar       | PES2UG20CS028 |
| C V Eswar Sai Reddy   | PES2UG20CS096 |
| Rudra Narayan Samanta | PES2UG20CS286 |

### Import Dataset

In [1]:
import pandas as pd
import numpy as np
MOH_train = pd.read_csv("CLS/train0.tsv",sep='\t',header=0,names = ['index'	,'label'	,'sentence'	,'pos'	,'v_index'])
MOH_test = pd.read_csv("CLS/test0.tsv",sep='\t',header=0,names = ['index'	,'label'	,'sentence'	,'pos'	,'v_index'])

### Import required Libraries

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import tensorflow as tf
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from tensorflow import keras
from keras import regularizers, optimizers
from keras.models import Model, Sequential
from keras.layers import Dense, Flatten, Conv1D, MaxPooling1D, Activation, Dropout, GlobalMaxPooling1D
from tensorflow.keras.layers import BatchNormalization
import fasttext
from sklearn.svm import SVC
import pickle
ft_model = fasttext.load_model("cc.en.300.bin")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cvesw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cvesw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cvesw\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
train_df = pd.concat([MOH_train])
test_df = pd.concat([MOH_test])

### Data cleaning and Preprocessing

In [4]:
def preprocess(dataset):
  stop_words = set(stopwords.words('english'))
  for j,i in enumerate(dataset["sentence"]):
    word_tokens = word_tokenize(i)
    filtered_sentence = [lemmatizer.lemmatize(w.lower()) for w in word_tokens if not w.lower() in stop_words and w.isalpha() and len(w)>2]
    dataset["sentence"][j] = filtered_sentence

In [5]:
train_df = train_df.drop_duplicates(subset=["sentence","label"], keep='first')
train_df = train_df.drop_duplicates(subset=["sentence"], keep='last')
train_df = train_df.reset_index(level=0, drop=True, inplace=False, col_level=0, col_fill='')
preprocess(train_df)

test_df = test_df.drop_duplicates(subset=["sentence","label"], keep='first')
test_df = test_df.drop_duplicates(subset=["sentence"], keep='last')
test_df = test_df.reset_index(level=0, drop=True, inplace=False, col_level=0, col_fill='')
preprocess(test_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["sentence"][j] = filtered_sentence


In [6]:
train_df = train_df.dropna(subset=['sentence'])
train_df = train_df.reset_index(level=0, drop=True, inplace=False, col_level=0, col_fill='')
test_df = test_df.dropna(subset=['sentence'])
test_df = test_df.reset_index(level=0, drop=True, inplace=False, col_level=0, col_fill='')

### Converting Words to Fasttext Vectors

In [7]:
n_features = ft_model.get_dimension()

In [8]:
def text_to_vector(listOfWords):
    listOfVectors = []
    for i in listOfWords:
        x = np.zeros((n_features))
        x = ft_model.get_word_vector(i).astype('float32')
        listOfVectors.append(x)
    
    while(len(listOfVectors) < 8):
        x = np.zeros((n_features))
        listOfVectors.append(x)

    listOfVectors = np.array(listOfVectors)
    return listOfVectors

def dataset_to_vector(dataset):
    for j,i in enumerate(dataset["sentence"]):
        dataset["sentence"][j] = text_to_vector(i)

In [9]:
dataset_to_vector(train_df)
dataset_to_vector(test_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["sentence"][j] = text_to_vector(i)


In [10]:
x_train, y_train = train_df['sentence'], train_df['label']
x_test, y_test = test_df['sentence'], test_df['label']

### Reshaping data for input into CNN

In [11]:
def reshape(data):
    rows = data.shape[0]
    words = data[0].shape[0]
    embed_dim = data[0].shape[1]

    x = np.zeros(rows*words*embed_dim)
    x = np.reshape(x,(rows,words,embed_dim))

    for i in range(0,rows):
        for j in range(words):
            x[i][j] = data[i][j]
        
    return x

In [12]:
x_train = reshape(x_train)
x_test = reshape(x_test)

### CNN Architecture that creates 32 features for each sentence

In [13]:
cnn_features = Sequential()

cnn_features.add(Conv1D(128, 1, strides=1, batch_input_shape=(None,8,300), padding='same'))
cnn_features.add(BatchNormalization())
cnn_features.add(Activation('relu'))
cnn_features.add(Dropout(0.2))
cnn_features.add(MaxPooling1D())

cnn_features.add(Conv1D(64, 1, strides=1, padding='same'))
cnn_features.add(BatchNormalization())
cnn_features.add(Activation('relu'))
cnn_features.add(Dropout(0.2))
cnn_features.add(MaxPooling1D())

cnn_features.add(Conv1D(32, 1, strides=1, padding='same'))
cnn_features.add(BatchNormalization())
cnn_features.add(Activation('relu'))
cnn_features.add(Dropout(0.2))

cnn_features.add(MaxPooling1D())
cnn_features.add(Flatten())


In [14]:
from keras.models import model_from_json
import pickle

In [15]:
json_file = open('metaphor/cnn_model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)

loaded_model.load_weights("metaphor/cnn_model.h5")
print("Loaded model from disk")

Loaded model from disk


### SVM Model

In [16]:
X_train = loaded_model.predict(x_train)
X_test = loaded_model.predict(x_test)



In [17]:
svc_model = SVC(C=1,kernel='rbf')
svc_model.fit(X_train, y_train)
pickle.dump(svc_model, open("metaphor/svc_model.pkl", "wb"))

In [18]:
svc_model = pickle.load(open("metaphor/svc_model.pkl", 'rb'))
svc_pred = svc_model.predict(X_test)

### Results

In [19]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_test, svc_pred))
print("f1:",metrics.f1_score(y_test, svc_pred))
print("Precision:",metrics.precision_score(y_test, svc_pred))
print("Recall:",metrics.recall_score(y_test, svc_pred))

Accuracy: 0.6984126984126984
f1: 0.732394366197183
Precision: 0.7428571428571429
Recall: 0.7222222222222222


### Predict output for sentences

In [20]:
def pred_input(sentence):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(sentence)
    filtered_sentence = [lemmatizer.lemmatize(w.lower()) for w in word_tokens if not w.lower() in stop_words and w.isalpha() and len(w)>2]
    sentence_vector = text_to_vector(filtered_sentence)
    sentence_vector = np.reshape(sentence_vector,(1,8,300))
    x_sent = loaded_model.predict(sentence_vector)
    y_sent_pred = svc_model.predict(x_sent)
    return y_sent_pred


In [22]:
sent = '''Life is a highway
Car on a highway
Their faces were clouded with sadness.
The container leaked gas
stamp fruit extract the juice.
He leaked information
We rotate crops
lay a responsibility on someone.
The White House sits on Pennsylvania Avenue.
The bicycle looped around the tree.
The earth is rotating on its axis
She traced the circumstances of her birth.
I can not digest all this information.
The government floated the ruble for a few months.
They taxed him failure to appear in court.'''
for sen in sent.split('\n'):
    print(pred_input(sen), sen)

[1] Life is a highway
[0] Car on a highway
[0] Their faces were clouded with sadness.
[0] The container leaked gas
[0] stamp fruit extract the juice.
[0] He leaked information
[1] We rotate crops
[1] lay a responsibility on someone.
[1] The White House sits on Pennsylvania Avenue.
[1] The bicycle looped around the tree.
[0] The earth is rotating on its axis
[0] She traced the circumstances of her birth.
[1] I can not digest all this information.
[0] The government floated the ruble for a few months.
[1] They taxed him failure to appear in court.
