In [2]:
import torch
import os
import argparse
from torch.utils.data import Dataset, DataLoader
import torchtext
from collections import Counter
import numpy as np
import pandas as pd
import pickle

from numpy import array
from numpy import argmax
from tensorflow.keras.utils import to_categorical

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
sw = stopwords.words('english') 


class Tokenizer:

    def __init__(self, file, threshold=5):
        self.file = file
        self.data = pd.read_csv(file)
        self.threshold = threshold

    def preprocess(self):
        tokenizer = torchtext.data.utils.get_tokenizer('spacy', language='en')
        tokens = []
        sentence_list=[]
        for text in self.data['text'].tolist():
            tokens.append(tokenizer(text))
            sentence_list.append(text.split('.'))

        self.data['sentences_list'] = sentence_list
        counter = Counter()
        for line in tokens:
            for word in line:
                counter[word] += 1

        mapper = {word[0]: idx+1 for idx,
                  word in enumerate(counter.most_common())}
        inverse_mapper = {idx+1: word[0] for idx,
                          word in enumerate(counter.most_common())}

        # sos_idx = len(counter_threshold.keys())
        # eos_idx = len(counter_threshold.keys()) + 1
        other_idx = len(counter.keys())

        mapped_tokens = []

        for line in tokens:
            mapped_line = []
            for word in line:
              # map words to their mappings and to other otherwise
                mapped_line.append(mapper.get(word, other_idx))
            mapped_tokens.append(mapped_line)

        return mapped_tokens, inverse_mapper




def similarity_paragraph(data):
    # data = self.data
    sim_list = []
    for para in data['sentences_list'].tolist():
      sim = 200
      start = para[0]
      para = para[1:]
      for sent in para:            
        # tokenization
        X_list = word_tokenize(start) 
        Y_list = word_tokenize(sent)
          
        # sw contains the list of stopwords
        l1 =[];l2 =[]
          
        # remove stop words from the string
        X_set = {w for w in X_list if not w in sw} 
        Y_set = {w for w in Y_list if not w in sw}
          
        # form a set containing keywords of both strings 
        rvector = X_set.union(Y_set) 
        for w in rvector:
            if w in X_set: l1.append(1) # create a vector
            else: l1.append(0)
            if w in Y_set: l2.append(1)
            else: l2.append(0)
        c = 0
          
        # cosine formula 
        for i in range(len(rvector)):
            c+= l1[i]*l2[i]
        try:
          cosine = c / float((sum(l1)*sum(l2))**0.5)
          if sim > cosine:
            sim = cosine
        except:
          sim += 0
          
        start = sent
      
      # sim = sim/(len(para)+1)
      sim_list.append(sim)
    
    data['similarity'] = sim_list

    return data
          
          # print("similarity: ", cosine)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


The data from the GCDC Corpus consists of 4 sets of data each containing 1000 training entries and 200 test entries. Due to lack of data, we train on the 4 training sets and 3 of the test sets and use a single test set to test the results. We also compare the results obtained by each set.


In [None]:
# training data
data1 = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/Clinton_train.csv')
data2 = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/Yahoo_train.csv')
data3 = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/Yelp_train.csv')
data4 = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/Enron_train.csv')

data5 = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/Yahoo_test.csv')
data6 = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/Yelp_test.csv')
data7 = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/Enron_test.csv')
data8 = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/Enron_test.csv')


In [None]:
data = pd.concat([data1, data2, data3, data4, data5, data6, data7])

In [None]:
data.to_csv('/content/gdrive/MyDrive/NLP-Project/GCDC_Corpus_v2/GCDC_rerelease/new_train.csv')

In [None]:
train = Tokenizer("/content/gdrive/MyDrive/GCDC_rerelease/new_train.csv")
test = Tokenizer("/content/gdrive/MyDrive/GCDC_rerelease/Clinton_test.csv")

In [None]:
train = Tokenizer("/content/gdrive/MyDrive/NLP-Project/GCDC_Corpus_v2/GCDC_rerelease/new_train.csv")
test = Tokenizer("/content/gdrive/MyDrive/NLP-Project/GCDC_Corpus_v2/GCDC_rerelease/Clinton_test.csv")

In [None]:
lst = array(train.data['labelA'])
encoded = to_categorical(lst)

In [None]:
lst = array(test.data['labelA'])
t_encoded = to_categorical(lst)

In [None]:
train_mapping, inv_train_mapping = train.preprocess()
test_mapping, inv_test_mapping = test.preprocess()

Similarity Function finds the similarity of adjacent sentences in the paragraph and returns the least of all the similarity values.

In [None]:
train.data = similarity_paragraph(train.data)
test.data = similarity_paragraph(test.data)

In [None]:
train.data['encoding'] = train_mapping
test.data['encoding'] = test_mapping

In [None]:
train.data['labelA'][4]

1

# LSTM with 4600/200 train/test(Clinton) split 3-way multi-classifier

In [None]:
np.random.seed(7)
X_train = sequence.pad_sequences(train.data['encoding'],maxlen = 500)
y_train = encoded
X_test = sequence.pad_sequences(test.data['encoding'],maxlen=500)
y_test = t_encoded

In [None]:
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(40000,embedding_vector_length,input_length = 500))
model.add(LSTM(32,dropout=0.2, return_sequences = True ))
model.add(LSTM(32))
model.add(Dense(4,activation = 'softmax'))
model.compile(loss ='categorical_crossentropy', optimizer = 'adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train,y_train, epochs = 15, batch_size=23)

scores = model.evaluate(X_test, y_test, verbose =0)
print("Accuracy: ",(scores[1]*100))

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 500, 32)           1280000   
                                                                 
 lstm_16 (LSTM)              (None, 500, 32)           8320      
                                                                 
 lstm_17 (LSTM)              (None, 32)                8320      
                                                                 
 dense_8 (Dense)             (None, 4)                 132       
                                                                 
Total params: 1,296,772
Trainable params: 1,296,772
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
import pickle
filename = 'lstm_binary_min_sim_clinton.sav'
pickle.dump(model,open(filename,'wb'))





INFO:tensorflow:Assets written to: ram://a18a4b47-e74c-4578-82bb-4804458e4475/assets


INFO:tensorflow:Assets written to: ram://a18a4b47-e74c-4578-82bb-4804458e4475/assets


# LSTM with 4600/200 train/test(Clinton) split binary classifier with minimum similarity function

In [None]:
coh_bin = []
for i in range(4600):
  if train.data['labelA'].tolist()[i] >=2:
    coh_bin.append(1)
  else:
    coh_bin.append(0)
train.data['bin_coh']= coh_bin


coh_bin=[]
for i in range(200):
  if test.data['labelA'].tolist()[i] >=2:
    coh_bin.append(1)
  else:
    coh_bin.append(0)

test.data['bin_coh']=coh_bin

In [None]:
lst = array(train.data['bin_coh'])
encoded = to_categorical(lst)

lst = array(test.data['bin_coh'])
t_encoded = to_categorical(lst)



In [None]:
np.random.seed(7)
X_train = sequence.pad_sequences(train.data['encoding'],maxlen = 500)
y_train = encoded#train.data['h_e']
X_test = sequence.pad_sequences(test.data['encoding'],maxlen=500)
y_test = t_encoded#test.data['h_e']

# X_train = np.append(train.data['similarity'][:,np.newaxis], X_train, axis=1)
# X_test = np.append(test.data['similarity'][:,np.newaxis],X_test, axis=1)



In [None]:
type(X_train)

X_train = np.append(train.data['similarity'][:,np.newaxis], X_train, axis=1)
X_test = np.append(test.data['similarity'][:,np.newaxis],X_test, axis=1)

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [None]:
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(40000,embedding_vector_length,input_length = 501))
model.add(LSTM(32,dropout=0.2, return_sequences = True ))
model.add(LSTM(32))
model.add(Dense(2,activation = 'softmax'))
model.compile(loss ='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train,y_train, epochs = 15 , batch_size=23)

scores = model.evaluate(X_test, y_test, verbose =0)
print("Accuracy: ",(scores[1]*100))

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 501, 32)           1280000   
                                                                 
 lstm_10 (LSTM)              (None, 501, 32)           8320      
                                                                 
 lstm_11 (LSTM)              (None, 32)                8320      
                                                                 
 dense_5 (Dense)             (None, 2)                 66        
                                                                 
Total params: 1,296,706
Trainable params: 1,296,706
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## Trying different Hyperparameters to fine-tune the results

In [None]:
np.random.seed(7)
X_train = sequence.pad_sequences(train.data['encoding'],maxlen = 256)
y_train = encoded#train.data['h_e']
X_test = sequence.pad_sequences(test.data['encoding'],maxlen=256)
y_test = t_encoded#test.data['h_e']
X_train = np.append(train.data['similarity'][:,np.newaxis], X_train, axis=1)
X_test = np.append(test.data['similarity'][:,np.newaxis],X_test, axis=1)
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(40000,embedding_vector_length,input_length = 257))
model.add(LSTM(32,dropout=0.2, return_sequences = True ))
model.add(LSTM(32))
model.add(Dense(2,activation = 'softmax'))
model.compile(loss ='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train,y_train, epochs = 10 , batch_size=23)

scores = model.evaluate(X_test, y_test, verbose =0)
print("Accuracy: ",(scores[1]*100))

  
  import sys


Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_15 (Embedding)    (None, 257, 32)           1280000   
                                                                 
 lstm_30 (LSTM)              (None, 257, 32)           8320      
                                                                 
 lstm_31 (LSTM)              (None, 32)                8320      
                                                                 
 dense_15 (Dense)            (None, 2)                 66        
                                                                 
Total params: 1,296,706
Trainable params: 1,296,706
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy:  65.49999713897705


In [None]:
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(40000,embedding_vector_length,input_length = 500))
model.add(LSTM(32,dropout=0.2, return_sequences = True ))
model.add(LSTM(32))
model.add(Dense(2,activation = 'softmax'))
model.compile(loss ='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train,y_train, epochs = 4 , batch_size=23)

scores = model.evaluate(X_test, y_test, verbose =0)
print("Accuracy: ",(scores[1]*100))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 32)           1280000   
                                                                 
 lstm_2 (LSTM)               (None, 500, 32)           8320      
                                                                 
 lstm_3 (LSTM)               (None, 32)                8320      
                                                                 
 dense_1 (Dense)             (None, 2)                 66        
                                                                 
Total params: 1,296,706
Trainable params: 1,296,706
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Accuracy:  53.50000262260437


# Implementing the top models on the Wikipedia-CNN Dataset

In [15]:
new_data = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/gitCorpus.csv')

In [None]:
new_data

In [16]:
new_train = Tokenizer('/content/gdrive/MyDrive/GCDC_rerelease/gitCorpus.csv')


In [17]:
new_train.data = new_train.data.drop(['label','to_be_replaced','train','file_id','replace_with','sen_position'],axis = 1)
new_train.data

Unnamed: 0,ctx,ctx-replaced
0,"Estramustine (INN, USAN, BAN) (brand names Emc...","Estramustine (INN, USAN, BAN) (brand names Emc..."
1,L² Puppis (also known as HD 56096) is a giant ...,L² Puppis (also known as HD 56096) is a giant ...
2,David John (Davy/Davey) Gunn (1887-1955) was a...,David John (Davy/Davey) Gunn (1887-1955) was a...
3,Olivia Hussey (born Olivia Osuna; 17 April 195...,Olivia Hussey (born Olivia Osuna; 17 April 195...
4,"Tailapa II, or Taila, (r.973–997 CE) (or Ahava...","Tailapa II, or Taila, (r.973–997 CE) (or Ahava..."
...,...,...
179079,"-LRB- CNN -RRB- If you listen to rock or pop, ...",
179080,"-LRB- CNN -RRB- -- Pirates have struck again, ...",
179081,-LRB- CNN -RRB- -- Russia will begin the const...,
179082,-LRB- CNN -RRB- -- The man police say kidnappe...,


In [18]:
pos_data = new_train.data[['ctx']].copy()
label = [1]*len(pos_data)
pos_data = pos_data.rename(columns = {'ctx':'text'})
pos_data['label']=label
pos_data
neg_data = new_train.data[['ctx-replaced']].copy()
neg_data = neg_data.rename(columns = {'ctx-replaced':'text'})
# neg_data = neg_data[neg_data['text'] != '' and neg_data['text' != None]]
neg_data = neg_data.dropna()
label = [0]*len(neg_data)
neg_data['label'] = label
neg_data
new_train.data = pd.concat([pos_data,neg_data])
new_train.data


Unnamed: 0,text,label
0,"Estramustine (INN, USAN, BAN) (brand names Emc...",1
1,L² Puppis (also known as HD 56096) is a giant ...,1
2,David John (Davy/Davey) Gunn (1887-1955) was a...,1
3,Olivia Hussey (born Olivia Osuna; 17 April 195...,1
4,"Tailapa II, or Taila, (r.973–997 CE) (or Ahava...",1
...,...,...
142301,"-LRB- Fast Company -RRB- -- For years, employ...",0
142302,Paris -LRB- CNN -RRB- -- France will start wit...,0
142303,-LRB- CNN -RRB- -- Pinterest is the breakout s...,0
142304,New York -LRB- CNN -RRB- -- Officer Rafael Ram...,0


In [19]:

new_train_mapping, inv_new_train_mapping = new_train.preprocess()
new_train.data['encoding'] = new_train_mapping

new_train.data = similarity_paragraph(new_train.data)


In [20]:
train_split = new_train.data.sample(frac = 0.8, random_state=200)
test_split = new_train.data.drop(train_split.index)



In [39]:
train_split

Unnamed: 0,text,label,sentences_list,encoding,similarity
157493,-LRB- CNN -RRB- -- World football 's governing...,1,[-LRB- CNN -RRB- -- World football 's governin...,"[30, 31, 32, 27, 137, 227, 15, 3497, 447, 2491...",0.100000
61962,Turrilitidae is a family of extinct heteromor...,1,[Turrilitidae is a family of extinct heteromo...,"[337218, 6, 10, 8, 151, 4, 3604, 337219, 26839...",0.000000
94326,Digidogheadlock is the eighth album by Japanes...,1,[Digidogheadlock is the eighth album by Japane...,"[394473, 10, 2, 2980, 105, 21, 643, 207, 12, 8...",0.091287
118888,"London, England -LRB- CNN -RRB- -- The Britis...",1,"[London, England -LRB- CNN -RRB- -- The Briti...","[282, 1, 6, 279, 30, 31, 32, 27, 12, 189, 2699...",0.000000
154898,-LRB- CNN -RRB- -- Inside the Charles Manson r...,1,[-LRB- CNN -RRB- -- Inside the Charles Manson ...,"[30, 31, 32, 27, 6245, 2, 966, 12012, 1312, 26...",0.109109
...,...,...,...,...,...
108315,-LRB- CNN -RRB- -- Nurse Kaci Hickox and her b...,1,[-LRB- CNN -RRB- -- Nurse Kaci Hickox and her ...,"[30, 31, 32, 27, 20800, 52617, 34480, 5, 57, 5...",0.108465
131663,"Cairo, Egypt -LRB- CNN -RRB- -- Authorities i...",1,"[Cairo, Egypt -LRB- CNN -RRB- -- Authorities ...","[2850, 1, 6, 1261, 30, 31, 32, 27, 2114, 7, 12...",0.120386
155532,-LRB- CNN -RRB- -- Pontiac lovers are feeling ...,1,[-LRB- CNN -RRB- -- Pontiac lovers are feeling...,"[30, 31, 32, 27, 20349, 9075, 35, 3812, 28369,...",0.070014
115363,Russian Prime Minister Dmitry Medvedev has sig...,1,[Russian Prime Minister Dmitry Medvedev has si...,"[555, 949, 483, 14132, 13378, 33, 920, 8, 9333...",0.000000


In [40]:
test_split

Unnamed: 0,text,label,sentences_list,encoding,similarity
33,The swimming competitions at the 2016 Summer O...,1,[The swimming competitions at the 2016 Summer ...,"[12, 4239, 4005, 26, 2, 441, 1786, 1244, 7, 27...",0.157135
56,The Orlando Shakespeare Theater is a theater c...,1,[The Orlando Shakespeare Theater is a theater ...,"[12, 4408, 6544, 5467, 10, 8, 2773, 174, 149, ...",0.077152
67,"Histocompatibility, or tissue compatibility, i...",1,"[Histocompatibility, or tissue compatibility, ...","[126974, 1, 42, 5326, 22566, 1, 10, 2, 1173, 4...",0.000000
86,"William Morris (January 1, 1861 – January 11, ...",1,"[William Morris (January 1, 1861 – January 11,...","[647, 5238, 20, 246, 160, 1, 6907, 179, 246, 4...",0.000000
97,Alphonse Areola (born 27 February 1993) is a F...,1,[Alphonse Areola (born 27 February 1993) is a ...,"[26818, 126992, 20, 96, 788, 328, 1122, 19, 10...",0.000000
...,...,...,...,...,...
142202,Tokyo -LRB- CNN -RRB- -- Japanese Prime Minist...,0,[Tokyo -LRB- CNN -RRB- -- Japanese Prime Minis...,"[2162, 30, 31, 32, 27, 643, 949, 483, 34468, 1...",0.000000
142231,-LRB- CNN -RRB- -- With a first name that mean...,0,[-LRB- CNN -RRB- -- With a first name that mea...,"[30, 31, 32, 27, 624, 8, 50, 124, 23, 793, 14,...",0.133333
142269,-LRB- CNN -RRB- -- As about 2 % of babies born...,0,[-LRB- CNN -RRB- -- As about 2 % of babies bor...,"[30, 31, 32, 27, 218, 67, 204, 337, 4, 7112, 9...",0.055048
142302,Paris -LRB- CNN -RRB- -- France will start wit...,0,[Paris -LRB- CNN -RRB- -- France will start wi...,"[926, 30, 31, 32, 27, 503, 68, 764, 14857, 980...",0.000000


In [41]:
train_split.to_csv('/content/gdrive/MyDrive/GCDC_rerelease/train_split.csv')
test_split.to_csv('/content/gdrive/MyDrive/GCDC_rerelease/test_split.csv')

In [21]:
total_split = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/train_split.csv')
test_split = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/test_split.csv')

# train_split = total_split.sample(frac = 0.9, random_state=200)
# validation_split = total_split.drop(train_split.index)

In [None]:
np.random.seed(7)
X_train = sequence.pad_sequences(train_split['encoding'],maxlen = 500)
y_train = train_split['label']
y_train = tf.one_hot(y_train,depth = 2)
X_test = sequence.pad_sequences(test_split['encoding'],maxlen=500)
y_test = test_split['label']
y_test = tf.one_hot(y_test,depth = 2)

In [22]:
np.random.seed(7)
X_train = sequence.pad_sequences(train_split['encoding'],maxlen = 500)
y_train = train_split['label']
y_train = tf.one_hot(y_train,depth = 2)
X_test = sequence.pad_sequences(test_split['encoding'],maxlen=500)
y_test = test_split['label']
y_test = tf.one_hot(y_test,depth = 2)
# val_x = sequence.pad_sequences(validation_split['encoding'],maxlen=500)
# val_y = validation_split['label']
# val_y = tf.one_hot(val_y, depth = 2)

ValueError: ignored

## Binary Classification with 30 epochs

In [None]:
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(40000,embedding_vector_length,input_length = 500))
model.add(LSTM(32,dropout=0.2, return_sequences = True ))
model.add(LSTM(32))
model.add(Dense(2,activation = 'softmax'))
model.compile(loss ='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath='/content/gdrive/MyDrive/GCDC_rerelease/checkpoints', save_weights_only=True, monitor='val_accuracy', mode='max', save_best_only=True)
print(model.summary())
model.fit(X_train,y_train, epochs = 30, batch_size=500,callbacks = [model_checkpoint_callback])
scores = model.evaluate(X_test, y_test, verbose =0)
print("Accuracy: ",(scores[1]*100))

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 500, 32)           1280000   
                                                                 
 lstm_18 (LSTM)              (None, 500, 32)           8320      
                                                                 
 lstm_19 (LSTM)              (None, 32)                8320      
                                                                 
 dense_9 (Dense)             (None, 2)                 66        
                                                                 
Total params: 1,296,706
Trainable params: 1,296,706
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30


## Binary Classification with 20 Epochs

In [None]:
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(40000,embedding_vector_length,input_length = 500))
model.add(LSTM(32,dropout=0.2, return_sequences = True ))
model.add(LSTM(32))
model.add(Dense(2,activation = 'softmax'))
model.compile(loss ='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath='/content/gdrive/MyDrive/GCDC_rerelease/checkpoints', save_weights_only=True, monitor='val_accuracy', mode='max', save_best_only=True)
print(model.summary())
model.fit(X_train,y_train, epochs = 20, batch_size=82,callbacks = [model_checkpoint_callback])
scores = model.evaluate(X_test, y_test, verbose =0)
print("Accuracy: ",(scores[1]*100))

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 500, 32)           1280000   
                                                                 
 lstm_20 (LSTM)              (None, 500, 32)           8320      
                                                                 
 lstm_21 (LSTM)              (None, 32)                8320      
                                                                 
 dense_10 (Dense)            (None, 2)                 66        
                                                                 
Total params: 1,296,706
Trainable params: 1,296,706
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20

In [None]:
model.save('/content/gdrive/MyDrive/GCDC_rerelease/new_data_lstm_model')



INFO:tensorflow:Assets written to: /content/gdrive/MyDrive/GCDC_rerelease/new_data_lstm_model/assets


INFO:tensorflow:Assets written to: /content/gdrive/MyDrive/GCDC_rerelease/new_data_lstm_model/assets


## Binary Classification with Minimum Similarity Function

In [None]:
np.random.seed(7)
X_train = sequence.pad_sequences(train_split['encoding'],maxlen = 500)
y_train = train_split['label']
y_train = tf.one_hot(y_train,depth = 2)
X_test = sequence.pad_sequences(test_split['encoding'],maxlen=500)
y_test = test_split['label']
y_test = tf.one_hot(y_test,depth = 2)

val_x = sequence.pad_sequences(validation_split['encoding'],maxlen=500)
val_y = validation_split['label']
val_y = tf.one_hot(val_y, depth = 2)


X_train = np.append(train_split['similarity'][:,np.newaxis], X_train, axis=1)
X_test = np.append(test_split['similarity'][:,np.newaxis],X_test, axis=1)

embedding_vector_length = 32
model = Sequential()
model.add(Embedding(400000,embedding_vector_length,input_length = 501))
model.add(LSTM(32,dropout=0.2, return_sequences = True ))
model.add(LSTM(32))
model.add(Dense(2,activation = 'softmax'))
model.compile(loss ='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath='/content/gdrive/MyDrive/GCDC_rerelease/checkpoints', save_weights_only=True, monitor='val_accuracy', mode='max', save_best_only=True)

print(model.summary())
model.fit(X_train,y_train, epochs = 20 , batch_size=82,validation_data = (val_x,val_y), callbacks = model_checkpoint_callback)
# model.fit(X_train,y_train, epochs = 20 , batch_size=82)

scores = model.evaluate(X_test, y_test, verbose =0)
print("Accuracy: ",(scores[1]*100))





# Comparison of Test Data in GCDC Corpus

In [None]:
data1 = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/Clinton_train.csv')
data2 = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/Yahoo_train.csv')
data3 = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/Yelp_train.csv')
data4 = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/Enron_train.csv')

data5 = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/Yahoo_test.csv')
data6 = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/Yelp_test.csv')
data7 = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/Enron_test.csv')
data8 = pd.read_csv('/content/gdrive/MyDrive/GCDC_rerelease/Clinton_test.csv')

In [None]:
data_t = pd.concat([data1,data2,data3,data4,data5,data6,data7])

data_t.to_csv('/content/gdrive/MyDrive/GCDC_rerelease/temp_train.csv')


In [None]:
train = Tokenizer('/content/gdrive/MyDrive/GCDC_rerelease/temp_train.csv')
test = Tokenizer("/content/gdrive/MyDrive/GCDC_rerelease/Clinton_test.csv")

In [None]:
lst = array(train.data['labelA'])
encoded = to_categorical(lst)
lst = array(test.data['labelA'])
t_encoded = to_categorical(lst)

train_mapping, inv_train_mapping = train.preprocess()
test_mapping, inv_test_mapping = test.preprocess()

In [None]:
train.data = similarity_paragraph(train.data)
test.data = similarity_paragraph(test.data)

train.data['encoding'] = train_mapping
test.data['encoding'] = test_mapping

In [None]:
coh_bin = []
for i in range(4600):
  if train.data['labelA'].tolist()[i] >=2:
    coh_bin.append(1)
  else:
    coh_bin.append(0)
train.data['bin_coh']= coh_bin


coh_bin=[]
for i in range(200):
  if test.data['labelA'].tolist()[i] >=2:
    coh_bin.append(1)
  else:
    coh_bin.append(0)

test.data['bin_coh']=coh_bin

In [None]:
lst = array(train.data['bin_coh'])
encoded = to_categorical(lst)

lst = array(test.data['bin_coh'])
t_encoded = to_categorical(lst)


In [None]:
np.random.seed(7)
X_train = sequence.pad_sequences(train.data['encoding'],maxlen = 500)
y_train = encoded#train.data['h_e']
X_test = sequence.pad_sequences(test.data['encoding'],maxlen=500)
y_test = t_encoded#test.data['h_e']

X_train = np.append(train.data['similarity'][:,np.newaxis], X_train, axis=1)
X_test = np.append(test.data['similarity'][:,np.newaxis],X_test, axis=1)

  import sys
  


## Clinton Test Data

In [None]:
#Clinton_Test_Data

embedding_vector_length = 32
model = Sequential()
model.add(Embedding(40000,embedding_vector_length,input_length = 501))
model.add(LSTM(32,dropout=0.2, return_sequences = True ))
model.add(LSTM(32))
model.add(Dense(2,activation = 'softmax'))
model.compile(loss ='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train,y_train, epochs = 15 , batch_size=23)

scores = model.evaluate(X_test, y_test, verbose =0)
print("Accuracy: ",(scores[1]*100))

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 501, 32)           1280000   
                                                                 
 lstm_12 (LSTM)              (None, 501, 32)           8320      
                                                                 
 lstm_13 (LSTM)              (None, 32)                8320      
                                                                 
 dense_6 (Dense)             (None, 2)                 66        
                                                                 
Total params: 1,296,706
Trainable params: 1,296,706
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## Enron Test Data

In [None]:
#Enron_Test_Data

embedding_vector_length = 32
model = Sequential()
model.add(Embedding(40000,embedding_vector_length,input_length = 501))
model.add(LSTM(32,dropout=0.2, return_sequences = True ))
model.add(LSTM(32))
model.add(Dense(2,activation = 'softmax'))
model.compile(loss ='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train,y_train, epochs = 15 , batch_size=23)

scores = model.evaluate(X_test, y_test, verbose =0)
print("Accuracy: ",(scores[1]*100))

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 501, 32)           1280000   
                                                                 
 lstm_10 (LSTM)              (None, 501, 32)           8320      
                                                                 
 lstm_11 (LSTM)              (None, 32)                8320      
                                                                 
 dense_5 (Dense)             (None, 2)                 66        
                                                                 
Total params: 1,296,706
Trainable params: 1,296,706
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## Yelp  Test Data

In [None]:
#Yelp_Test_Data

embedding_vector_length = 32
model = Sequential()
model.add(Embedding(40000,embedding_vector_length,input_length = 501))
model.add(LSTM(32,dropout=0.2, return_sequences = True ))
model.add(LSTM(32))
model.add(Dense(2,activation = 'softmax'))
model.compile(loss ='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train,y_train, epochs = 15 , batch_size=23)

scores = model.evaluate(X_test, y_test, verbose =0)
print("Accuracy: ",(scores[1]*100))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 501, 32)           1280000   
                                                                 
 lstm_2 (LSTM)               (None, 501, 32)           8320      
                                                                 
 lstm_3 (LSTM)               (None, 32)                8320      
                                                                 
 dense_1 (Dense)             (None, 2)                 66        
                                                                 
Total params: 1,296,706
Trainable params: 1,296,706
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## Yahoo Test Data

In [None]:
#Yahoo_Test_Data

embedding_vector_length = 32
model = Sequential()
model.add(Embedding(40000,embedding_vector_length,input_length = 501))
model.add(LSTM(32,dropout=0.2, return_sequences = True ))
model.add(LSTM(32))
model.add(Dense(2,activation = 'softmax'))
model.compile(loss ='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train,y_train, epochs = 15 , batch_size=23)

scores = model.evaluate(X_test, y_test, verbose =0)
print("Accuracy: ",(scores[1]*100))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 501, 32)           1280000   
                                                                 
 lstm (LSTM)                 (None, 501, 32)           8320      
                                                                 
 lstm_1 (LSTM)               (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 2)                 66        
                                                                 
Total params: 1,296,706
Trainable params: 1,296,706
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Ac