In [1]:
import pandas as pd
import numpy as np
import sklearn
import os
from tqdm import tqdm, trange
import re

In [2]:
# Directory that holds the training CSV files, relative to the notebook.
data_path = "./fnc-1/"
# Read both tables with identical settings (first CSV row is the header):
# article bodies first, then the headline/stance pairs.
bodies_train, headlines_train = (
    pd.read_csv(os.path.join(data_path, csv_name), header=0)
    for csv_name in ("train_bodies.csv", "train_stances.csv")
)

In [3]:
# (rows, columns) of the bodies table, then of the headlines table, one per line.
print(bodies_train.shape, headlines_train.shape, sep="\n")

(1683, 2)
(49972, 3)


In [4]:
# Preview the first rows of the article-body table ("Body ID", "articleBody").
bodies_train.head()

Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [5]:
# Preview the first rows of the headline table ("Headline", "Body ID", "Stance").
headlines_train.head()

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree


In [6]:
# Spot check: every headline row paired with the article whose Body ID is 712,
# showing that one body is shared by many headlines with different stances.
headlines_train.loc[headlines_train["Body ID"] == 712]

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1787,Seth Rogen to Play Apple’s Steve Wozniak,712,discuss
3974,Mexico police find mass grave near site 43 stu...,712,unrelated
4936,Mexico Says Missing Students Not Found In Firs...,712,unrelated
5210,New iOS 8 bug can delete all of your iCloud do...,712,unrelated
5863,Return of the Mac: Seth Rogen in talks to star...,712,discuss
6199,Seth Rogen Is Woz,712,discuss
6756,Mexico finds 4 more graves at site of suspecte...,712,unrelated
7526,Are missing students in mass graves found near...,712,unrelated
9003,Mexico prosecutor: Students not in 1st mass gr...,712,unrelated


In [7]:
# Same preview as before: headlines_train has not been modified since it was loaded.
headlines_train.head()

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree


In [8]:
def add_bodies_to_headline_df(headlines_train, bodies_train):
    """Attach each headline's article text as a new ``"Body"`` column, in place.

    Reference helper for sanity-checking; a left ``merge`` on ``"Body ID"``
    produces the same pairing and is the path the notebook actually uses.

    Args:
        headlines_train: DataFrame with a ``"Body ID"`` column. It is mutated:
            a ``"Body"`` column is added (or overwritten).
        bodies_train: DataFrame with ``"Body ID"`` and ``"articleBody"`` columns.

    Returns:
        None. The result is written to ``headlines_train["Body"]``. Rows whose
        ``"Body ID"`` has no match in ``bodies_train`` get a missing value.
    """
    # The previous row-by-row loop assigned a whole filtered DataFrame to the
    # entire "Body" column on every iteration, so no row ever received its own
    # article text. Build a Body ID -> text lookup once and map it per row.
    body_text_by_id = (
        bodies_train
        # Series.map needs a unique index; keep the first text for a repeated ID.
        .drop_duplicates(subset="Body ID")
        .set_index("Body ID")["articleBody"]
    )
    headlines_train["Body"] = headlines_train["Body ID"].map(body_text_by_id)
#add_bodies_to_headline_df(headlines_train, bodies_train)
#headlines_train.head()

In [9]:
# Left join: keep every headline row and attach the article text that shares
# its "Body ID".
temp = pd.merge(
    headlines_train,
    bodies_train,
    how="left",
    on="Body ID",
)

In [10]:
# Check the join result: each headline row should now carry an "articleBody" value.
temp.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,Hundreds of Palestinians were evacuated from t...
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,30-year-old Moscow resident was hospitalized w...
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,(Reuters) - A Canadian soldier was shot at the...
4,Spider burrowed through tourist's stomach and ...,1923,disagree,"Fear not arachnophobes, the story of Bunbury's..."


In [11]:
# Model inputs: the two raw text columns (headline and the joined article body).
X_train = temp[["Headline", "articleBody"]]

In [12]:
# Target labels: the stance string for each headline/body pair, row-aligned with X_train
# because both are taken from the same `temp` frame.
y_train = temp["Stance"]
y_train.head()

0    unrelated
1        agree
2    unrelated
3    unrelated
4     disagree
Name: Stance, dtype: object

In [13]:
# Shapes of the feature frame and the label series, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(49972, 2)
(49972,)


In [14]:
## START BASELINE FEATURE RE-WRITE
import nltk
# Single WordNet lemmatizer instance, created once and reused by normalize_word().
_wnl = nltk.WordNetLemmatizer()

def normalize_word(w):
    """Return the lowercased lemma of a single token.

    The token is lemmatized with the shared ``_wnl`` lemmatizer first and
    lowercased afterwards.
    """
    lemma = _wnl.lemmatize(w)
    return lemma.lower()


def get_tokenized_lemmas(s):
    """Tokenize ``s`` with ``nltk.word_tokenize`` and normalize every token.

    Returns a list with one ``normalize_word`` result per token, in order.
    """
    return list(map(normalize_word, nltk.word_tokenize(s)))


def clean(s):
    """Normalize a string to lowercase word characters separated by single spaces.

    Every maximal run of ``\\w`` characters (letters, digits, underscore) is
    kept; everything between runs, including punctuation and surrounding
    whitespace, collapses to one space. An input with no word characters
    yields the empty string.
    """
    words = re.findall(r'\w+', s, flags=re.UNICODE)
    joined = " ".join(words)
    return joined.lower()


def remove_stopwords(l):
    """Return the tokens of ``l`` that are not English stop words.

    Args:
        l: Iterable of token strings. Matching is exact, so tokens should
            already be lowercased — TODO confirm callers lowercase first.

    Returns:
        A new list with the stop words removed, original order preserved.
    """
    # Imported locally so the function is self-contained: the visible import
    # cell only does `import sklearn`, which does not bind the bare name
    # `feature_extraction` that this function previously referenced.
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

    return [w for w in l if w not in ENGLISH_STOP_WORDS]


def word_overlap_features(row):
    """Jaccard overlap between the headline's and the body's lemma sets.

    Both texts are passed through ``clean`` and ``get_tokenized_lemmas``; the
    feature is ``|headline ∩ body| / |headline ∪ body|`` over the distinct
    resulting tokens.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with string values
            under the ``"Headline"`` and ``"articleBody"`` keys.

    Returns:
        A float in ``[0.0, 1.0]``. Returns ``0.0`` when neither text yields
        any token, instead of dividing by zero.
    """
    headline_tokens = set(get_tokenized_lemmas(clean(row["Headline"])))
    body_tokens = set(get_tokenized_lemmas(clean(row["articleBody"])))

    all_tokens = headline_tokens | body_tokens
    if not all_tokens:
        # Both texts are empty after cleaning: there is nothing to overlap.
        return 0.0

    shared_tokens = headline_tokens & body_tokens
    return len(shared_tokens) / float(len(all_tokens))

In [15]:
# Preview the feature frame before computing the overlap feature.
X_train.head()

Unnamed: 0,Headline,articleBody
0,Police find mass graves with at least '15 bodi...,Danny Boyle is directing the untitled film\n\n...
1,Hundreds of Palestinians flee floods in Gaza a...,Hundreds of Palestinians were evacuated from t...
2,"Christian Bale passes on role of Steve Jobs, a...",30-year-old Moscow resident was hospitalized w...
3,HBO and Apple in Talks for $15/Month Apple TV ...,(Reuters) - A Canadian soldier was shot at the...
4,Spider burrowed through tourist's stomach and ...,"Fear not arachnophobes, the story of Bunbury's..."


In [16]:
# Compute the overlap feature for every row; the result is only displayed, not stored.
X_train.apply(word_overlap_features, axis=1)

0        0.014085
1        0.046083
2        0.030303
3        0.028169
4        0.032727
5        0.074236
6        0.004608
7        0.011111
8        0.038462
9        0.026846
10       0.000000
11       0.044898
12       0.017094
13       0.011111
14       0.050000
15       0.041885
16       0.014368
17       0.054264
18       0.019231
19       0.031746
20       0.010526
21       0.025000
22       0.015228
23       0.025641
24       0.054054
25       0.064103
26       0.005882
27       0.000000
28       0.011583
29       0.081081
           ...   
49942    0.010638
49943    0.005076
49944    0.015015
49945    0.010791
49946    0.026432
49947    0.018382
49948    0.028302
49949    0.041667
49950    0.025478
49951    0.013889
49952    0.000000
49953    0.008621
49954    0.008197
49955    0.011696
49956    0.037288
49957    0.008850
49958    0.008174
49959    0.010638
49960    0.018182
49961    0.004494
49962    0.047619
49963    0.000000
49964    0.016760
49965    0.011494
49966    0

In [17]:
# The apply() result above was not assigned, so X_train still holds only the two text columns.
X_train.head()

Unnamed: 0,Headline,articleBody
0,Police find mass graves with at least '15 bodi...,Danny Boyle is directing the untitled film\n\n...
1,Hundreds of Palestinians flee floods in Gaza a...,Hundreds of Palestinians were evacuated from t...
2,"Christian Bale passes on role of Steve Jobs, a...",30-year-old Moscow resident was hospitalized w...
3,HBO and Apple in Talks for $15/Month Apple TV ...,(Reuters) - A Canadian soldier was shot at the...
4,Spider burrowed through tourist's stomach and ...,"Fear not arachnophobes, the story of Bunbury's..."
