In [None]:
#import necessary modules
import pandas as pd
import os
import re
import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
import numpy as np
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the training and competition-test CSVs (stances and bodies) into DataFrames.
train_h, train_b, test_h, test_b = (
    pd.read_csv(csv_path)
    for csv_path in (
        'train_stances.csv',
        'train_bodies.csv',
        'competition_test_stances.csv',
        'competition_test_bodies.csv',
    )
)

In [None]:
# Join a headlines frame with an article-body frame on their shared 'Body ID' column.
def merge(d1, d2):
  """Inner-join ``d1`` and ``d2`` on the 'Body ID' column.

  Rows whose 'Body ID' does not appear in both frames are dropped.
  Returns the merged DataFrame; neither input is modified.
  """
  join_key = ['Body ID']
  merged = pd.merge(left=d1, right=d2, how='inner', left_on=join_key, right_on=join_key)
  return merged

In [None]:
# strip everything that is not an ASCII letter and lowercase the result
def clean(s):
  """Return ``str(s)`` with every non-letter character replaced by a space, lowercased.

  Each replaced character becomes exactly one space, so runs of punctuation
  or digits turn into runs of spaces.
  """
  text = str(s)
  letters_only = re.sub("[^a-zA-Z]", " ", text)
  return letters_only.lower()

# single WordNet lemmatizer instance, created once and reused for every word
_wnl = nltk.WordNetLemmatizer()

# lemmatizing
def normalize_word(w):
  """Lemmatize ``w`` with the shared WordNet lemmatizer and return the lemma lowercased."""
  lemma = _wnl.lemmatize(w)
  return lemma.lower()

# tokenizing
def get_tokenized_lemmas(s):
  """Split ``s`` into tokens with ``nltk.word_tokenize`` and return the normalized form of each token."""
  tokens = nltk.word_tokenize(s)
  return list(map(normalize_word, tokens))

# removing stopwords
def remove_stopwords(l):
    """Return the items of ``l`` that are not in scikit-learn's ENGLISH_STOP_WORDS, in their original order."""
    kept = []
    for word in l:
        if word in feature_extraction.text.ENGLISH_STOP_WORDS:
            continue
        kept.append(word)
    return kept

# do all the above preprocessing and put back into sentences
def preprocess(data, title):
  """Normalize the text column ``title`` of ``data`` in place.

  Every entry goes through the whole pipeline, each stage consuming the
  output of the previous one:
  clean -> get_tokenized_lemmas -> remove_stopwords -> join with single spaces.

  Args:
    data: DataFrame holding the text column; it is modified in place.
    title: name of the column to preprocess.

  Returns:
    None. ``data[title]`` is overwritten with the processed strings.
  """
  content = []
  for line in data[title]:
    # Stop words are filtered after tokenizing because remove_stopwords
    # expects a list of words; given a raw string it would iterate characters.
    tokens = remove_stopwords(get_tokenized_lemmas(clean(line)))
    content.append(' '.join(tokens))
  data[title] = content


In [None]:
# run the text preprocessing on every headline and article-body column we need
for frame, column in (
    (train_h, 'Headline'),
    (train_b, 'articleBody'),
    (test_h, 'Headline'),
    (test_b, 'articleBody'),
):
  preprocess(frame, column)


In [None]:
# attach each headline to its article body, separately for the train and test splits
train_data = merge(train_h, train_b)
test_data = merge(test_h, test_b)

In [None]:
# corpus for the tf / tf-idf vectorizers: every training headline followed by every training body
sentences = [*train_data['Headline'], *train_data['articleBody']]

In [None]:
# Vectorizes the headline and body text and measures the similarity between them.
# Returns one feature row per input row: headline tf vector, body tf vector, tf-idf cosine similarity.
def tf_data(df):
    """Build the feature matrix for ``df``.

    Two vectorizers (term-frequency only and tf-idf, each capped at 2000
    features) are fitted on the module-level ``sentences`` list on every call.
    For each row the features are the headline tf vector, the body tf vector
    and the cosine similarity between the headline and body tf-idf vectors.

    Args:
        df: DataFrame with 'Headline' and 'articleBody' text columns.

    Returns:
        NumPy array holding one feature row per row of ``df``.
    """
    tf_vec = TfidfVectorizer(max_features=2000, use_idf=False).fit(sentences)
    tfidf_vec = TfidfVectorizer(max_features=2000, use_idf=True).fit(sentences)

    rows = []
    for headline, article in zip(df['Headline'], df['articleBody']):
        # transform() on a one-element list already yields a single-row matrix
        head_tf = tf_vec.transform([headline]).toarray()
        body_tf = tf_vec.transform([article]).toarray()
        head_tfidf = tfidf_vec.transform([headline]).toarray()
        body_tfidf = tfidf_vec.transform([article]).toarray()
        # cosine similarity of the two tf-idf vectors (a 1x1 array)
        similarity = cosine_similarity(head_tfidf, body_tfidf)
        # concatenate the three parts side by side, then flatten to 1-D
        rows.append(np.squeeze(np.hstack([head_tf, body_tf, similarity])))
    return np.array(rows)

In [None]:
# feature matrices for the train and test splits, computed in that order
X_train, X_test = map(tf_data, (train_data, test_data))

In [None]:
# replace the stances with numerical values
# The encoded column is assigned back explicitly: an in-place replace() on
# `train_data.Stance` is a chained update that pandas Copy-on-Write does not
# propagate to train_data, and recent pandas accepts `inplace` as keyword only.
STANCE_TO_ID = {'unrelated': 1, 'agree': 2, 'disagree': 3, 'discuss': 4}

# Values that are already encoded pass through unchanged, so re-running this cell is safe.
train_data['Stance'] = train_data['Stance'].map(lambda label: STANCE_TO_ID.get(label, label))

# astype(int) gives the classifier an integer target and fails loudly
# if any label was not covered by the mapping above.
y_train = train_data['Stance'].astype(int)

In [None]:
# Random forests are stochastic; a fixed seed makes the fitted model and its
# predictions reproducible across Restart & Run All.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
prediction = model.predict(X_test)

In [None]:
# Decode the numeric predictions back into stance labels. The codes mirror the
# encoding applied to the training labels: unrelated=1, agree=2, disagree=3, discuss=4.
ID_TO_STANCE = {1: 'unrelated', 2: 'agree', 3: 'disagree', 4: 'discuss'}

# A direct lookup raises KeyError on an unexpected code instead of silently
# skipping the row, which would shift every later prediction against the gold labels.
stances = [ID_TO_STANCE[code] for code in prediction]

In [None]:
def get_accuracy(real, test):
  """Print the fraction of positions at which ``real`` and ``test`` hold equal values.

  Both arguments are indexed with the integers 0 .. len(real) - 1.
  The ratio is printed; nothing is returned.
  """
  total = len(real)
  correct = sum(1 for i in range(total) if real[i] == test[i])
  print( correct/total)

In [None]:
# wrap the decoded stance labels in a one-column frame
predictions_df = pd.DataFrame({'Stance': stances})

# plain accuracy of the predicted stances against the gold test stances
get_accuracy(test_data['Stance'], predictions_df['Stance'])


In [None]:
def score_submission(gold_labels, test_labels):
    """Compute the weighted stance score of ``test_labels`` against ``gold_labels``.

    Points awarded per position:
      * exact match on 'unrelated'                      -> 0.25
      * exact match on any other stance                 -> 0.75
      * mismatch where gold is not 'unrelated' and the
        prediction is 'agree', 'disagree' or 'discuss'  -> 0.25
      * anything else                                   -> 0

    Args:
        gold_labels: sequence of true stance labels.
        test_labels: sequence of predicted stance labels, same length.

    Returns:
        The total score as a float (0.0 for empty input).

    Raises:
        ValueError: if the two sequences differ in length.
    """
    # Label spellings match the ones used everywhere else in this notebook.
    related = ('agree', 'disagree', 'discuss')

    if len(gold_labels) != len(test_labels):
        raise ValueError("gold_labels and test_labels must have the same length")

    score = 0.0
    # zip pairs values by position, independent of any pandas index labels
    for gold, predicted in zip(gold_labels, test_labels):
        if gold == predicted:
            score += 0.25 if gold == 'unrelated' else 0.75
        elif gold != 'unrelated' and predicted in related:
            # wrong stance, but correctly recognised as related to the headline
            score += 0.25

    return score

In [None]:
# weighted score of this model's predictions on the test split
score_submission(gold_labels=test_data['Stance'], test_labels=predictions_df['Stance'])