In [1]:
import pandas as pd
import numpy as np
import sklearn
import os
from tqdm import tqdm, trange, tqdm_notebook
import re
import nltk
from multiprocessing import cpu_count, Pool
from functools import partial
import ipywidgets
import random

In [2]:
# This script will:
# i) Read in the train and test .csv files.
# ii) From the train files, create a train set and a dev set. These will share no Body IDs (articles) in common.
# iii) Re-create the features present in the official fnc-1-baseline for all sets (train, dev/holdout, and test), and
#     write all of these out as .h5 files. You can then load these to build more features/models upon.

In [3]:
# Location of the FNC-1 CSV files, and the share of article bodies held out for the dev set.
data_path = "./fnc-1/"
proportion_for_holdout_dev_set = 0.2

# Article bodies and the headline/stance rows; the two tables are later joined on "Body ID".
train_bodies_csv = os.path.join(data_path, "train_bodies.csv")
train_stances_csv = os.path.join(data_path, "train_stances.csv")
bodies_train = pd.read_csv(train_bodies_csv, header=0)
headlines_train = pd.read_csv(train_stances_csv, header=0)

In [4]:
# Quick sanity check on the load: (rows, columns) of each raw table.
for raw_table in (bodies_train, headlines_train):
    print(raw_table.shape)

(1683, 2)
(49972, 3)


In [5]:
# Attach the article text to every headline row.
# validate="many_to_one" makes pandas raise a MergeError if a Body ID occurs more than
# once in bodies_train, instead of silently duplicating headline rows.
Xy_train = headlines_train.merge(
    bodies_train, on=["Body ID"], how="left", validate="many_to_one"
)
print(Xy_train.shape)

(49972, 4)


In [6]:
# Usage: pass Xy_train (or any df with a "Body ID" column, and a y column as well).
# Returns two dfs in the same format, split so that no Body ID is shared between the sets.
def disjoint_train_test_split(Xy, frac_for_test_set=proportion_for_holdout_dev_set, random_state=42):
    """Split ``Xy`` into train and test frames that share no "Body ID" value.

    The unique Body IDs are shuffled with a ``random.Random`` seeded by
    ``random_state``; the first ``int((1 - frac_for_test_set) * n_ids)`` IDs go
    to train and the rest to test. Rows follow their Body ID, so the row-level
    proportions only approximate ``frac_for_test_set``.

    Parameters
    ----------
    Xy : pandas.DataFrame
        Frame with a "Body ID" column.
    frac_for_test_set : float
        Fraction of unique Body IDs assigned to the test frame, in [0, 1].
    random_state : int
        Seed for the shuffle.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Row subsets of ``Xy`` (original index preserved).

    Raises
    ------
    ValueError
        If ``frac_for_test_set`` is outside [0, 1].
    """
    if not 0 <= frac_for_test_set <= 1:
        raise ValueError("frac_for_test_set must be between 0 and 1")

    r = random.Random(random_state)
    # list(set(...)) is kept as the pre-shuffle ordering so that a given seed
    # keeps producing the same split as before.
    Xy_IDs = list(set(Xy["Body ID"]))
    print("Total unique IDs: "+str(len(Xy_IDs)))
    r.shuffle(Xy_IDs)

    num_IDs_for_train = int((1-frac_for_test_set)*len(Xy_IDs))
    train_IDs = Xy_IDs[:num_IDs_for_train]
    test_IDs = Xy_IDs[num_IDs_for_train:]

    # Filter the frame that was passed in (previously this read the global
    # Xy_train, so the function silently ignored its own argument).
    body_ids = Xy["Body ID"]
    train_df = Xy[body_ids.isin(train_IDs)]
    test_df = Xy[body_ids.isin(test_IDs)]
    print("# instances in train: "+str(train_df.shape[0]))
    print("# instances in test: "+str(test_df.shape[0]))

    # The two frames must be totally disjoint in Body ID.
    shared_ids = set(train_df["Body ID"].unique()) & set(test_df["Body ID"].unique())
    assert len(shared_ids) == 0
    return train_df, test_df

In [7]:
# Split once so that train and dev share no article bodies.
Xy_train, Xy_dev = disjoint_train_test_split(Xy_train, frac_for_test_set=proportion_for_holdout_dev_set)

# We will also keep the Body ID in both df's in case we want to merge them back when modelling
# (e.g. to filter out a specific class), and they have been shuffled.
# .copy() gives the feature frames their own data: later cells assign new columns to them,
# and assigning into a slice of Xy_* triggers pandas' SettingWithCopyWarning.
X_train = Xy_train[["Headline", "articleBody", "Body ID"]].copy()
y_train = Xy_train[["Stance", "Body ID"]]

X_dev = Xy_dev[["Headline", "articleBody", "Body ID"]].copy()
y_dev = Xy_dev[["Stance", "Body ID"]]

# Pass the HDF key by keyword; newer pandas deprecates positional arguments after the path.
y_train.to_hdf('y_train_disjoint.h5', key='df')
y_dev.to_hdf('y_dev_disjoint.h5', key='df')

# Purposefully make these inaccessible so we do not risk leaking data from y.
del Xy_train
del Xy_dev
del y_train
del y_dev

Total unique IDs: 1683
# instances in train: 39437
# instances in test: 10535


In [8]:
# We re-create the features from the official baseline for X_train and then X_dev

In [9]:
## Helper functions. You should call gen_all_features(X_df) to create the features.
# You will need approx. 14GB of memory since we cache columns and copy the strings rather than referring to them
# by ID.
# Single lemmatizer instance, created once and reused by normalize_word.
_wnl = nltk.WordNetLemmatizer()


def normalize_word(w):
    """Return the WordNet lemma of ``w``, lower-cased (lemmatize first, then lower)."""
    lemma = _wnl.lemmatize(w)
    return lemma.lower()


def get_tokenized_lemmas(s):
    """Tokenize ``s`` with ``nltk.word_tokenize`` and return the normalized form of each token."""
    lemmas = []
    for token in nltk.word_tokenize(s):
        lemmas.append(normalize_word(token))
    return lemmas


def clean(s):
    """Return ``s`` reduced to its word-character runs, single-space separated and lower-cased.

    Anything that is not matched by ``\\w+`` (punctuation, whitespace runs) acts
    as a separator and is dropped.
    """
    word_runs = re.findall(r'\w+', s, flags=re.UNICODE)
    joined = " ".join(word_runs)
    return joined.lower()


def remove_stopwords(l):
    """Return the tokens of ``l`` that are not in scikit-learn's English stop-word set.

    Order and duplicates of the remaining tokens are preserved.
    """
    # Import the constant explicitly: a bare `import sklearn` does not guarantee
    # that the `sklearn.feature_extraction.text` submodule has been loaded, so
    # attribute access through the top-level package can fail.
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

    return [w for w in l if w not in ENGLISH_STOP_WORDS]

def word_overlap_features_2(df):
    """Jaccard overlap between the headline and body lemma sets of one row.

    ``df`` is a single row (anything indexable by column name) holding the
    cached ``___clean_headline_tokenized_lemmas`` and
    ``___clean_body_tokenized_lemmas`` token lists.

    Returns
    -------
    float
        ``|headline & body| / |headline | body|``, or ``0.0`` when both token
        lists are empty (previously this raised ZeroDivisionError).
    """
    clean_headline = set(df["___clean_headline_tokenized_lemmas"])
    clean_body = set(df["___clean_body_tokenized_lemmas"])
    union = clean_headline | clean_body
    if not union:
        # No tokens on either side: define the overlap as zero.
        return 0.0
    return len(clean_headline & clean_body) / float(len(union))

def refuting_features_adder(df):
    """Append one 0/1 column per refuting word, flagging its presence in the headline.

    Reads the cached ``___clean_headline_tokenized_lemmas`` column (a token list
    per row) and returns a new frame: ``df`` plus columns named
    ``wrf_hl_<word>``, in the order of the word list below. ``df`` itself is
    not modified.
    """
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        # 'refute',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]
    clean_headline = df["___clean_headline_tokenized_lemmas"]

    # Build the whole indicator matrix in one go instead of allocating a
    # pd.Series per row inside .apply(); the reshape keeps the (rows, words)
    # shape well-defined even when the frame has no rows.
    flags = np.array(
        [[1 if word in hl else 0 for word in _refuting_words] for hl in clean_headline],
        dtype=np.int64,
    ).reshape(len(clean_headline), len(_refuting_words))

    features = pd.DataFrame(
        flags,
        index=clean_headline.index,  # align with df's own index for the concat
        columns=["wrf_hl_"+ref_word for ref_word in _refuting_words],
    )
    return pd.concat([df, features], axis=1)


def polarity_features_adder(df):
    """Append ``polar_hl`` and ``polar_body``: parity of refuting-word occurrences.

    For each row, every token of the cached headline / body lemma lists that
    appears in the refuting-word list is counted (repeats included) and the
    count modulo 2 is stored. Returns a new frame; ``df`` is not modified.
    """
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]

    def calculate_polarity(tokens):
        # 1 when an odd number of refuting tokens occur, 0 otherwise.
        hits = 0
        for token in tokens:
            if token in _refuting_words:
                hits += 1
        return hits % 2

    headline_polarity = (
        df["___clean_headline_tokenized_lemmas"].apply(calculate_polarity).to_frame("polar_hl")
    )
    body_polarity = (
        df["___clean_body_tokenized_lemmas"].apply(calculate_polarity).to_frame("polar_body")
    )
    return pd.concat([df, headline_polarity, body_polarity], axis=1)

## START hand_features
def binary_co_occurrence(row):
    """Count headline tokens found in the body, overall and in its first 255 characters.

    The headline is split on single spaces and each token is checked with a
    substring test against the cleaned body text, so a token also counts when
    it occurs inside a longer word. Repeated headline tokens are counted each time.

    Returns a two-element ``pd.Series``: (hits in whole body, hits in body[:255]).
    """
    body_text = row["___clean_body"]
    body_start = body_text[:255]
    headline_tokens = row["___clean_headline"].split(" ")

    total_hits = sum(1 for token in headline_tokens if token in body_text)
    early_hits = sum(1 for token in headline_tokens if token in body_start)
    return pd.Series((total_hits, early_hits))

def binary_co_occurence_stops(row):
    """Count non-stopword headline tokens that occur (as substrings) in the cleaned body.

    Same idea as the plain co-occurrence count, except that stopwords in the
    headline are dropped first. The baseline's companion "early" counter is
    intentionally not computed here because it adds no new information.
    """
    body_text = row["___clean_body"]
    stopless_tokens = remove_stopwords(row["___clean_headline"].split(" "))
    return sum(1 for token in stopless_tokens if token in body_text)


In [10]:
def chargrams(input, n):
    """Return every length-``n`` sliding window of ``input``, left to right.

    Yields an empty list when ``input`` is shorter than ``n``.
    """
    last_start = len(input) - n + 1
    return [input[start:start + n] for start in range(last_start)]

def ngrams(input, n):
    """Split ``input`` on single spaces and return every run of ``n`` consecutive words.

    Each n-gram is a list of words; the result is empty when there are fewer
    than ``n`` words.
    """
    words = input.split(' ')
    window_count = len(words) - n + 1
    return [words[start:start + n] for start in range(window_count)]

def append_chargrams(row, size=None):
    """Count headline character-grams of length ``size`` that occur in the cleaned body.

    Stopwords are removed from the headline, the remaining words are re-joined
    with spaces, and every ``size``-character window is tested as a substring of
    the body, of its first 255 characters, and of its first 100 characters.

    NOTE: each window is re-joined with a space between its characters (a window
    "ab" is searched for as "a b") before the substring test.
    NOTE(review): presumably kept this way to mirror the upstream baseline
    features — confirm before changing.

    Returns a three-element ``pd.Series``: (hits, early hits, first hits).
    """
    body_text = row["___clean_body"]
    stopless_headline = " ".join(remove_stopwords(row["___clean_headline"].split()))
    grams = [' '.join(window) for window in chargrams(stopless_headline, size)]

    search_regions = (body_text, body_text[:255], body_text[:100])
    counts = tuple(
        sum(1 for gram in grams if gram in region)
        for region in search_regions
    )
    return pd.Series(counts)

def append_ngrams(row, size=None):
    """Count headline word n-grams of length ``size`` that occur in the cleaned body.

    Each n-gram is joined back into a space-separated phrase and tested as a
    substring of the whole body and of its first 255 characters.

    Returns a two-element ``pd.Series``: (hits, early hits).
    """
    body_text = row["___clean_body"]
    body_start = body_text[:255]
    phrases = [' '.join(words) for words in ngrams(row["___clean_headline"], size)]

    total_hits = sum(1 for phrase in phrases if phrase in body_text)
    early_hits = sum(1 for phrase in phrases if phrase in body_start)
    return pd.Series((total_hits, early_hits))

In [11]:
# Multiprocessing code copied from: http://blog.adeel.io/2016/11/06/parallelize-pandas-map-or-apply/
cores = cpu_count() #Number of CPU cores on your system
partitions = cores #Define as many partitions as you want
def parallelize(data, func):
    """Apply ``func`` to contiguous row chunks of ``data`` in a process pool and re-join them.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Object to split row-wise.
    func : callable
        Picklable function taking one chunk and returning a pandas object.

    Returns
    -------
    The ``pd.concat`` of the per-chunk results, in the original row order.
    """
    n_rows = len(data)
    # Never create more chunks than rows, so no worker receives an empty chunk
    # (one empty chunk is still produced when data itself is empty).
    n_chunks = max(1, min(partitions, n_rows))

    # Slice positionally with .iloc rather than np.array_split, which relies on
    # swapaxes for pandas objects. Chunk sizes follow array_split's rule: the
    # first `extra` chunks get one additional row.
    base, extra = divmod(n_rows, n_chunks)
    data_split = []
    start = 0
    for chunk_no in range(n_chunks):
        stop = start + base + (1 if chunk_no < extra else 0)
        data_split.append(data.iloc[start:stop])
        start = stop

    # The context manager tears the pool down even if func raises in a worker,
    # so no worker processes are leaked on error.
    with Pool(min(cores, n_chunks)) as pool:
        results = pool.map(func, data_split)
    return pd.concat(results)
def word_overlap_features_parallel_helper(df):
    """Pool worker: compute the word-overlap feature for every row of a DataFrame chunk."""
    return df.apply(word_overlap_features_2, axis=1)
def clean_helper(df):
    """Pool worker: apply ``clean`` element-wise to the chunk it is given."""
    cleaned = df.apply(clean)
    return cleaned
def get_tokenized_lemmas_helper(df):
    """Pool worker: apply ``get_tokenized_lemmas`` element-wise to the chunk it is given."""
    lemmatized = df.apply(get_tokenized_lemmas)
    return lemmatized
def chargram_helper(df, input_size=None):
    """Pool worker: compute the chargram hit counts of length ``input_size`` for each row."""
    def _row_chargrams(row):
        return append_chargrams(row, input_size)

    return df.apply(_row_chargrams, axis=1)
def ngram_helper(df, input_size=None):
    """Pool worker: compute the word n-gram hit counts of length ``input_size`` for each row."""
    def _row_ngrams(row):
        return append_ngrams(row, input_size)

    return df.apply(_row_ngrams, axis=1)

def add_cached_columns(df):
    """Add the cached cleaned-text and lemma columns to ``df`` in place (returns None).

    Column names starting with three underscores (___....) are cached
    intermediate values that only exist to speed up feature computation.
    For both the headline and the body, the cleaned string is stored first and
    its tokenized lemmas are then derived from that cleaned column.
    """
    column_plan = (
        ("Headline", "___clean_headline", "___clean_headline_tokenized_lemmas"),
        ("articleBody", "___clean_body", "___clean_body_tokenized_lemmas"),
    )
    for raw_col, clean_col, lemma_col in column_plan:
        df[clean_col] = parallelize(df[raw_col], clean_helper)
        df[lemma_col] = parallelize(df[clean_col], get_tokenized_lemmas_helper)
    

def gen_all_features(df):
    """Return a copy of ``df`` extended with the cached columns and all baseline features.

    ``df`` must contain the "Headline" and "articleBody" columns. The returned
    frame adds, in order:

    * the cached ``___clean_*`` text and lemma columns,
    * ``bin_count``, ``bin_count_early``, ``bin_count_stopless``,
    * ``word_overlap_features``,
    * one ``wrf_hl_<word>`` column per refuting word,
    * ``polar_hl``, ``polar_body``,
    * ``ngram_<n>_hits`` / ``ngram_<n>_early_hits`` for n in 2..6,
    * ``chargram_<n>_hits`` / ``_early_hits`` / ``_first_hits`` for n in 2, 8, 4, 16.

    The frame passed in is left untouched.
    """
    # Work on a private copy: add_cached_columns assigns columns in place, and
    # the caller's frame is typically a column slice of a larger frame, which is
    # what used to trigger pandas' SettingWithCopyWarning here.
    df = df.copy()

    print("Adding cached columns...")
    add_cached_columns(df)

    print("Adding co-occurrences...")
    co_occurrences = df.apply(binary_co_occurrence, axis=1)
    co_occurrences.columns = ["bin_count", "bin_count_early"]
    df = pd.concat([df, co_occurrences], axis=1)
    df["bin_count_stopless"] = df.apply(binary_co_occurence_stops, axis=1)

    print("Adding word_overlap_features....")
    df["word_overlap_features"] = parallelize(df, word_overlap_features_parallel_helper)

    print("Adding refuting_features....")
    df = refuting_features_adder(df)

    print("Adding polarity_features....")
    df = polarity_features_adder(df)

    print("Adding ngrams...")
    ngram_sizes = [2, 3, 4, 5, 6]
    for ngram_size in tqdm_notebook(ngram_sizes, total=len(ngram_sizes)):
        # partial() keeps the worker picklable while fixing the n-gram length.
        helper_fn = partial(ngram_helper, input_size=ngram_size)
        temp = parallelize(df, helper_fn)
        temp.columns = ["ngram_"+str(ngram_size)+"_hits", "ngram_"+str(ngram_size)+"_early_hits"]
        df = pd.concat([df, temp], axis=1)

    print("Adding chargrams...")
    chargram_sizes = [2, 8, 4, 16]
    for chargram_size in tqdm_notebook(chargram_sizes, total=len(chargram_sizes)):
        helper_fn = partial(chargram_helper, input_size=chargram_size)
        temp = parallelize(df, helper_fn)
        temp.columns = ["chargram_"+str(chargram_size)+"_hits", "chargram_"+str(chargram_size)+"_early_hits", "chargram_"+str(chargram_size)+"_first_hits"]
        df = pd.concat([df, temp], axis=1)

    return df

In [12]:
%time X_train = gen_all_features(X_train)

Adding cached columns...
Adding co-occurrences...
Adding word_overlap_features....
Adding refuting_features....
Adding polarity_features....
Adding ngrams...


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


Adding chargrams...


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


CPU times: user 1min 23s, sys: 2min 35s, total: 3min 59s
Wall time: 11min 32s


In [13]:
%time X_dev = gen_all_features(X_dev)

Adding cached columns...
Adding co-occurrences...
Adding word_overlap_features....
Adding refuting_features....
Adding polarity_features....
Adding ngrams...


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


Adding chargrams...


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


CPU times: user 24.2 s, sys: 3min 8s, total: 3min 32s
Wall time: 4min 21s


In [14]:
X_train.head()

Unnamed: 0,Headline,articleBody,Body ID,___clean_headline,___clean_headline_tokenized_lemmas,___clean_body,___clean_body_tokenized_lemmas,bin_count,bin_count_early,bin_count_stopless,...,chargram_2_first_hits,chargram_8_hits,chargram_8_early_hits,chargram_8_first_hits,chargram_4_hits,chargram_4_early_hits,chargram_4_first_hits,chargram_16_hits,chargram_16_early_hits,chargram_16_first_hits
0,Police find mass graves with at least '15 bodi...,Danny Boyle is directing the untitled film\n\n...,712,police find mass graves with at least 15 bodie...,"[police, find, mass, graf, with, at, least, 15...",danny boyle is directing the untitled film set...,"[danny, boyle, is, directing, the, untitled, f...",2,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Christian Bale passes on role of Steve Jobs, a...",30-year-old Moscow resident was hospitalized w...,137,christian bale passes on role of steve jobs ac...,"[christian, bale, pass, on, role, of, steve, j...",30 year old moscow resident was hospitalized w...,"[30, year, old, moscow, resident, wa, hospital...",5,4,1,...,1,0,0,0,0,0,0,0,0,0
5,'Nasa Confirms Earth Will Experience 6 Days of...,Thousands of people have been duped by a fake ...,154,nasa confirms earth will experience 6 days of ...,"[nasa, confirms, earth, will, experience, 6, d...",thousands of people have been duped by a fake ...,"[thousand, of, people, have, been, duped, by, ...",17,15,14,...,3,0,0,0,0,0,0,0,0,0
6,Accused Boston Marathon Bomber Severely Injure...,A British fighter who travelled to Iraq to sto...,962,accused boston marathon bomber severely injure...,"[accused, boston, marathon, bomber, severely, ...",a british fighter who travelled to iraq to sto...,"[a, british, fighter, who, travelled, to, iraq...",4,1,1,...,2,0,0,0,0,0,0,0,0,0
7,Identity of ISIS terrorist known as 'Jihadi Jo...,"Adding to Apple's iOS 8 launch troubles, a rep...",2033,identity of isis terrorist known as jihadi joh...,"[identity, of, isi, terrorist, known, a, jihad...",adding to apple s ios 8 launch troubles a repo...,"[adding, to, apple, s, io, 8, launch, trouble,...",2,1,0,...,2,0,0,0,0,0,0,0,0,0


In [15]:
X_dev.head()

Unnamed: 0,Headline,articleBody,Body ID,___clean_headline,___clean_headline_tokenized_lemmas,___clean_body,___clean_body_tokenized_lemmas,bin_count,bin_count_early,bin_count_stopless,...,chargram_2_first_hits,chargram_8_hits,chargram_8_early_hits,chargram_8_first_hits,chargram_4_hits,chargram_4_early_hits,chargram_4_first_hits,chargram_16_hits,chargram_16_early_hits,chargram_16_first_hits
1,Hundreds of Palestinians flee floods in Gaza a...,Hundreds of Palestinians were evacuated from t...,158,hundreds of palestinians flee floods in gaza a...,"[hundred, of, palestinian, flee, flood, in, ga...",hundreds of palestinians were evacuated from t...,"[hundred, of, palestinian, were, evacuated, fr...",10,7,7,...,3,0,0,0,0,0,0,0,0,0
3,HBO and Apple in Talks for $15/Month Apple TV ...,(Reuters) - A Canadian soldier was shot at the...,1034,hbo and apple in talks for 15 month apple tv s...,"[hbo, and, apple, in, talk, for, 15, month, ap...",reuters a canadian soldier was shot at the can...,"[reuters, a, canadian, soldier, wa, shot, at, ...",3,3,0,...,1,0,0,0,0,0,0,0,0,0
4,Spider burrowed through tourist's stomach and ...,"Fear not arachnophobes, the story of Bunbury's...",1923,spider burrowed through tourist s stomach and ...,"[spider, burrowed, through, tourist, s, stomac...",fear not arachnophobes the story of bunbury s ...,"[fear, not, arachnophobes, the, story, of, bun...",9,5,4,...,4,0,0,0,0,0,0,0,0,0
12,Kidnapped Nigerian schoolgirls: Government cla...,No one has died more times than Fidel Castro.\...,1003,kidnapped nigerian schoolgirls government clai...,"[kidnapped, nigerian, schoolgirl, government, ...",no one has died more times than fidel castro b...,"[no, one, ha, died, more, time, than, fidel, c...",2,2,0,...,3,0,0,0,0,0,0,0,0,0
13,"No, that high school kid didn't make $72 milli...",The video was one of those viral sensations th...,2132,no that high school kid didn t make 72 million...,"[no, that, high, school, kid, didn, t, make, 7...",the video was one of those viral sensations th...,"[the, video, wa, one, of, those, viral, sensat...",4,2,2,...,1,0,0,0,0,0,0,0,0,0


In [16]:
# Persist the train/dev feature frames. The HDF key is passed by keyword because
# newer pandas deprecates positional arguments after the path.
X_train.to_hdf('X_train_disjoint-allfeatures.h5', key='df')
X_dev.to_hdf('X_dev_disjoint-allfeatures.h5', key='df')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->['Headline', 'articleBody', '___clean_headline', '___clean_headline_tokenized_lemmas', '___clean_body', '___clean_body_tokenized_lemmas']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [18]:
# Competition test set: article bodies and the headline/stance rows that reference them.
test_bodies_csv = os.path.join(data_path, "competition_test_bodies.csv")
test_stances_csv = os.path.join(data_path, "competition_test_stances.csv")
bodies_test = pd.read_csv(test_bodies_csv, header=0)
headlines_test = pd.read_csv(test_stances_csv, header=0)

In [19]:
# Attach the article text to every test headline row. validate="many_to_one" makes
# pandas raise a MergeError if a Body ID is duplicated in bodies_test, instead of
# silently duplicating headline rows.
Xy_test = headlines_test.merge(
    bodies_test, on=["Body ID"], how="left", validate="many_to_one"
)

In [20]:
Xy_test.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Ferguson riots: Pregnant woman loses eye after...,2008,unrelated,A RESPECTED senior French police officer inves...
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,unrelated,Dave Morin's social networking company Path is...
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,unrelated,A bereaved Afghan mother took revenge on the T...
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,unrelated,Hewlett-Packard is officially splitting in two...
4,Argentina's President Adopts Boy to End Werewo...,37,unrelated,An airline passenger headed to Dallas was remo...


In [29]:
# Separate labels from inputs; "Body ID" is kept on both sides.
y_test = Xy_test[["Stance", "Body ID"]]
# .copy() so that columns assigned to X_test later do not write into a slice of Xy_test
# (which triggers pandas' SettingWithCopyWarning).
X_test = Xy_test[["Headline", "articleBody", "Body ID"]].copy()

In [24]:
# Persist the test labels, then drop them from the namespace so they cannot leak
# into feature generation. The HDF key is passed by keyword because newer pandas
# deprecates positional arguments after the path.
y_test.to_hdf('y_TEST_disjoint.h5', key='df')
del y_test

In [27]:
%time X_test = gen_all_features(X_test)

Adding cached columns...
Adding co-occurrences...
Adding word_overlap_features....
Adding refuting_features....
Adding polarity_features....
Adding ngrams...


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


Adding chargrams...


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


CPU times: user 59.2 s, sys: 4min 33s, total: 5min 32s
Wall time: 7min 5s


In [28]:
# Persist the test feature frame; the HDF key is passed by keyword because newer
# pandas deprecates positional arguments after the path.
X_test.to_hdf('X_TEST-allfeatures.h5', key='df')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->['Headline', 'articleBody', '___clean_headline', '___clean_headline_tokenized_lemmas', '___clean_body', '___clean_body_tokenized_lemmas']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)
