### MSCI 598 - Final Project
### Gaurav Mudbhatkal - 20747018

In [1]:
## for data
import json
import pandas as pd
import numpy as np
## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
## for processing
import re
import nltk
## for bag-of-words
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K

In [2]:
from feature_engineering import refuting_features, polarity_features, hand_features, gen_or_load_feats
from feature_engineering import word_overlap_features
from feature_engineering import clean, normalize_word, get_tokenized_lemmas, remove_stopwords
from utils.dataset import DataSet
from utils.generate_test_splits import kfold_split, get_stances_for_folds
from utils.score import report_score, LABELS, score_submission

from utils.system import parse_params, check_version

#### Data Processing

In [3]:
d = DataSet()
folds,hold_out = kfold_split(d,n_folds=10)

#dict of headline, bodyid and stance 
fold_stances, hold_out_stances = get_stances_for_folds(d,folds,hold_out)

Reading dataset
Total stances: 49972
Total bodies: 1683


In [4]:
from tqdm import tqdm

def generate_features(stances,dataset,name):
    h, b, y = [],[],[]

    for stance in stances:
        y.append(LABELS.index(stance['Stance']))
        h.append(stance['Headline'])
        b.append(dataset.articles[stance['Body ID']])

    X_overlap = gen_or_load_feats(word_overlap_features, h, b, "features/overlap."+name+".npy")
    X_refuting = gen_or_load_feats(refuting_features, h, b, "features/refuting."+name+".npy")
    X_polarity = gen_or_load_feats(polarity_features, h, b, "features/polarity."+name+".npy")
    X_hand = gen_or_load_feats(hand_features, h, b, "features/hand."+name+".npy")

    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
    return X,y

def clean_data(stances, dataset):
    h, b, y = [], [], []
    for stance in stances:
        y.append(LABELS.index(stance['Stance']))
        h.append(stance['Headline'])
        b.append(dataset.articles[stance['Body ID']])
    X = []
    clean_headlines = []
    clean_bodies = []
    for i, (headline, body) in tqdm(enumerate(zip(h, b))):
        clean_headline = clean(headline)
        clean_body = clean(body)
        clean_headline = get_tokenized_lemmas(clean_headline)
        clean_body = get_tokenized_lemmas(clean_body)
        clean_headlines.append(clean_headline)
        clean_bodies.append(clean_body)
    return clean_headlines, clean_bodies, y

clean_headlines_folds = dict()
clean_bodies_folds = dict()
labels = dict()

print("Cleaning data")
for fold in fold_stances:
    clean_headlines_folds[fold], clean_bodies_folds[fold], labels[fold] = clean_data(fold_stances[fold],d)

Cleaning data


4124it [00:16, 248.59it/s]
4663it [00:22, 208.67it/s]
3783it [00:09, 383.16it/s]
3388it [00:10, 314.41it/s]
3644it [00:12, 299.38it/s]
4644it [00:15, 302.62it/s]
3848it [00:16, 239.27it/s]
4273it [00:15, 276.52it/s]
4039it [00:12, 312.94it/s]
3944it [00:11, 333.47it/s]


In [5]:
print("Finding features")
# feature vectors for each fold
Xs = dict()
ys = dict()
for fold in fold_stances:
    Xs[fold],ys[fold] = generate_features(fold_stances[fold],d,str(fold))

Finding features


In [6]:
ids = list(range(len(folds)))

remaining_features_temp = []
for i in ids:
    remaining_features_temp.append(Xs[i])
remaining_features = [item for sublist in remaining_features_temp for item in sublist]

clean_headlines_temp = []
for i in ids:
    clean_headlines_temp.append(clean_headlines_folds[i])
clean_headlines = [item for sublist in clean_headlines_temp for item in sublist]

clean_bodies_temp = []
for i in ids:
    clean_bodies_temp.append(clean_headlines_folds[i])
clean_bodies = [item for sublist in clean_bodies_temp for item in sublist]

stances_temp = []
for i in ids:
    stances_temp.append(labels[i])
stances = [item for sublist in stances_temp for item in sublist]

clean_headlines_sent = []
for headline in clean_headlines:
    clean_headlines_sent.append(" ".join(headline))
    
clean_bodies_sent = []
for body in clean_bodies:
    clean_bodies_sent.append(" ".join(body))

In [7]:
# Tf-Idf 
vectorizer_total = feature_extraction.text.TfidfVectorizer(max_features=8000, ngram_range=(1,2), stop_words='english')

In [8]:
vectorizer_total.fit(clean_headlines_sent+clean_bodies_sent)
vectorized_headlines = vectorizer_total.transform(clean_headlines_sent)
vectorized_bodies = vectorizer_total.transform(clean_bodies_sent)

In [9]:
from scipy import spatial
cosine_sims = []
for headline, body in list(zip(clean_headlines_sent, clean_bodies_sent)):
    vectorized_headline = vectorizer_total.transform([headline]).toarray()
    vectorized_body = vectorizer_total.transform([body]).toarray()
#     print(vectorized_headline.shape)
#     print(vectorized_body.shape)
    cosine_sim = 1 - spatial.distance.cosine(vectorized_headline, vectorized_body) # for similarity, 1-score
    cosine_sims.append(cosine_sim)

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [10]:
from keras.utils.np_utils import to_categorical   
from scipy.sparse import hstack
cosine_similarity = np.array(cosine_sims).reshape(-1,1)

X_train = np.c_[np.array(remaining_features), cosine_similarity]
# X_train_arr = X_train.toarray()
y_train = to_categorical(stances, num_classes=4)

In [11]:
# Load the competition dataset
competition_dataset = DataSet("competition_test")

clean_headlines_comp, clean_bodies_comp, labels_comp = clean_data(competition_dataset.stances,competition_dataset)
X_competition, y_competition = generate_features(competition_dataset.stances, competition_dataset, "competition")

clean_headlines_sent_comp = []
for headline in clean_headlines_comp:
    clean_headlines_sent_comp.append(" ".join(headline))
    
clean_bodies_sent_comp = []
for body in clean_bodies_comp:
    clean_bodies_sent_comp.append(" ".join(body))

vectorized_headlines_comp = vectorizer_total.transform(clean_headlines_sent_comp)
vectorized_bodies_comp = vectorizer_total.transform(clean_bodies_sent_comp)

cosine_sims_comp = []
for headline, body in list(zip(clean_headlines_sent_comp, clean_bodies_sent_comp)):
    vectorized_headline = vectorizer_total.transform([headline]).toarray()
    vectorized_body = vectorizer_total.transform([body]).toarray()
    cosine_sim = 1 - spatial.distance.cosine(vectorized_headline, vectorized_body) # similarity: 1-score
    cosine_sims_comp.append(cosine_sim)

cosine_similarity_comp = np.array(cosine_sims_comp).reshape(-1,1)

# X_comp = hstack((vectorized_headlines_comp, X_competition, cosine_similarity_comp, vectorized_bodies_comp))
X_comp = np.c_[X_competition, cosine_similarity_comp]
# X_comp_arr = X_comp.toarray()
y_comp = to_categorical(labels_comp, num_classes=4)

Reading dataset
Total stances: 25413
Total bodies: 904


25413it [01:22, 308.67it/s]


#### Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X_train, stances)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# classifier.fit(X_train,stances)
predicted_logreg = [LABELS[int(a)] for a in clf.predict(X_comp)]
actual_logreg = [LABELS[int(a)] for a in labels_comp]

# score_submission(actual, predicted)
report_score(actual_logreg, predicted_logreg)

 # write predicted labels to file
with open("logreg_predictions.csv","w") as f:
    for prediction in predicted_logreg:
        f.write(prediction + "\n")

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    262    |     5     |   1441    |    195    |
-------------------------------------------------------------
| disagree  |    80     |     0     |    465    |    152    |
-------------------------------------------------------------
|  discuss  |    341    |     2     |   3708    |    413    |
-------------------------------------------------------------
| unrelated |    82     |     0     |   1108    |   17159   |
-------------------------------------------------------------
Score: 8843.25 out of 11651.25	(75.89958158995816%)


In [14]:
from sklearn.metrics import precision_score, recall_score
precision_score(actual_logreg, predicted_logreg, average='macro'), recall_score(actual_logreg, predicted_logreg, average='macro')

(0.4629230300628946, 0.47586721058518794)

#### Feed Forward Neural Network

In [18]:
model = models.Sequential()
model.add(layers.Dense(100, activation='relu'))
model.add(layers.Dropout(0.6))
model.add(layers.Dense(64,  activation='relu'))
# model.add(layers.Dense(32, activation='softmax'))
model.add(layers.Dropout(0.1))
model.add(layers.Dense(4, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=100, batch_size=500)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [20]:
predicted_dense = [LABELS[int(np.argmax(a))] for a in model.predict(X_comp)]
actual_dense = [LABELS[int(a)] for a in labels_comp]

# score_submission(actual, predicted)
report_score(actual_dense, predicted_dense)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    123    |     2     |   1580    |    198    |
-------------------------------------------------------------
| disagree  |    22     |     0     |    506    |    169    |
-------------------------------------------------------------
|  discuss  |    138    |     0     |   3900    |    426    |
-------------------------------------------------------------
| unrelated |    14     |     0     |   1120    |   17215   |
-------------------------------------------------------------
Score: 8888.75 out of 11651.25	(76.29009762900976%)


76.29009762900976

In [None]:
from tensorflow.keras.utils import plot_model
plot_model(model, to_file='model_dense.png', show_shapes=True, show_layer_names=True)

In [21]:
from sklearn.metrics import precision_score, recall_score
precision_score(actual_dense, predicted_dense, average='macro'), recall_score(actual_dense, predicted_dense, average='macro')

(0.4797343507787215, 0.46912224202304054)

In [26]:
 # write predicted labels to file
with open("dense_predictions.csv","w") as f:
    for prediction in predicted_dense:
        f.write(prediction + "\n")