# Data Loading/Prep

In [1]:
import numpy as np
import pandas as pd

In [2]:
TEST_BODIES_PATH = "data/fnc-1/competition_test_bodies.csv"
TEST_STANCES_PATH = "data/fnc-1/competition_test_stances.csv"

bodies = pd.read_csv(TEST_BODIES_PATH, names=['Body ID', 'articleBody'], header=0)
stances = pd.read_csv(TEST_STANCES_PATH, names=['Headline', 'Body ID', 'Stance'], header=0)
stance_idx = {}
for headline in stances['Headline'].values:
    if headline not in stance_idx.keys():
        stance_idx[headline] = len(stance_idx.keys())
stances['Head ID'] = [stance_idx[head] for head in stances['Headline'].values]
# TODO: load predictions by ALBERT

In [3]:
df = pd.merge(bodies, stances, on='Body ID')
# TODO: merge predictions by ALBERT
display(df.sample(n=5))
print(df.shape)

Unnamed: 0,Body ID,articleBody,Headline,Stance,Head ID
21458,2243,Israel's Army Radio substantiated earlier clai...,Israeli right welcomes Sisi's offer to a Pales...,agree,664
16267,1782,POLICE are investigating claims a doctor took ...,Joan Rivers' 'selfie' doctor 'DENIES performin...,discuss,336
5772,765,As traditions go this may be the most unbeliev...,President of Argentina adopts Jewish boy to st...,agree,358
4493,579,The 80-year-old and bride-to-be Afton Elaine B...,"Report: HP to split into two companies, one fo...",unrelated,448
9228,1158,Microsoft Corp. is in serious discussions to b...,Microsoft Near Deal to Buy Minecraft Maker Mojang,agree,210


(25413, 5)


In [4]:
# Filter out only related pairs
df_rel = df.loc[df['Stance'] != 'unrelated']
# TEMP: set predictions as stances
df_rel['Predictions'] = df_rel.loc[:,'Stance'].copy()
display(df_rel.sample(n=5))
print(df_rel.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rel['Predictions'] = df_rel.loc[:,'Stance'].copy()


Unnamed: 0,Body ID,articleBody,Headline,Stance,Head ID,Predictions
15286,1703,A 19-year-old woman in Poland woke up during b...,Teenager wakes up while undergoing BRAIN SURGE...,agree,199,agree
11927,1391,The Islamic State of Syria and Iraq has report...,ISIS Beheads American Journalist Steven Sotlof...,discuss,554,discuss
9107,1135,Claim: Mitt Romney said Michelle Obama should ...,Did Mitt Romney Call Michelle Obama “First Wom...,discuss,831,discuss
23051,2366,A new video appears to show the execution of S...,Steven Sotloff 'beheaded by Islamic State',discuss,377,discuss
6568,876,Even bears can’t stand Justin Bieber’s music.\...,Justin Bieber saves man from bear attack,agree,376,agree


(7064, 6)


# Data Transformation

In [5]:
import nltk
import sklearn
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
import string
import gensim
import gensim.downloader as api

# Download if not installed already
# nltk.download('stopwords')



In [8]:
# load word2vec model
wv = api.load('word2vec-google-news-300')
# stopwords
sw = stopwords
# punctuation
punct = set(string.punctuation)
# stemmer
stemmer = PorterStemmer()

In [9]:
embeddings = []

for headline in tqdm(df_rel['Headline'].values):
    tokens = word_tokenize(headline)
    # Doc vec is average of summed word vectors
    doc_vec = None
    n = len(tokens)
    for token in tokens:
        vec = wv[token]
        if doc_vec == None:
            doc_vec = vec
        else:
            doc_vec += vec
    doc_vec /= n
    embeddings.append(doc_vec)
    
        
        
### Tokenization and lemmatization    
#     new_headline = []
#     for token in tokens:
#         token = token.lower()
#         if token not in punct and token not in sw:
#             new_headline.append(stemmer.stem(token))
#     tokenized_headlines.append(new_headline)
# 'th' means tokenized headlines
df_rel['embeddings'] = embeddings
display(df_rel[['Headline', 'th']].sample(n=5))

  0%|          | 0/7064 [00:00<?, ?it/s]

KeyError: "Key 'El-Sisi' not present"