In [3]:
import pandas as pd
import numpy as np
import sklearn
import os
from tqdm import tqdm, trange, tqdm_notebook
import re
import nltk
from multiprocessing import cpu_count, Pool
from functools import partial
import ipywidgets

In [8]:
data_path = "./fnc-1/"
bodies_train = pd.read_csv(os.path.join(data_path, "train_bodies.csv"), header=0)
headlines_train = pd.read_csv(os.path.join(data_path, "train_stances.csv"), header=0)

In [10]:
print(bodies_train.shape)
print(headlines_train.shape)

(1683, 2)
(49972, 3)
(904, 2)
(25413, 3)


In [12]:
bodies_train.head()

Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [14]:
headlines_train.head()

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree


In [17]:
headlines_train[headlines_train["Body ID"] == 712]

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1787,Seth Rogen to Play Apple’s Steve Wozniak,712,discuss
3974,Mexico police find mass grave near site 43 stu...,712,unrelated
4936,Mexico Says Missing Students Not Found In Firs...,712,unrelated
5210,New iOS 8 bug can delete all of your iCloud do...,712,unrelated
5863,Return of the Mac: Seth Rogen in talks to star...,712,discuss
6199,Seth Rogen Is Woz,712,discuss
6756,Mexico finds 4 more graves at site of suspecte...,712,unrelated
7526,Are missing students in mass graves found near...,712,unrelated
9003,Mexico prosecutor: Students not in 1st mass gr...,712,unrelated


In [19]:
temp = headlines_train.merge(bodies_train, on=["Body ID"], how="left")
X_train = temp[["Headline", "articleBody"]] # TODO include stance here and re-run
y_train = temp["Stance"]
assert X_train.shape[0] == y_train.shape[0]

In [20]:
# Write out y_train already
y_train.to_hdf('y_train_full.h5','df')

In [21]:
# Object no longer used
del y_train

In [22]:
# TEMPORARY LIMIT JUST FOR TESTING TODO REMOVE
#X_train = X_train[:10]

In [23]:
## START BASELINE FEATURE RE-WRITE
_wnl = nltk.WordNetLemmatizer()

def normalize_word(w):
    return _wnl.lemmatize(w).lower()


def get_tokenized_lemmas(s):
    return [normalize_word(t) for t in nltk.word_tokenize(s)]


def clean(s):
    # Cleans a string: Lowercasing, trimming, removing non-alphanumeric
    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()


def remove_stopwords(l):
    # Removes stopwords from a list of tokens
    return [w for w in l if w not in sklearn.feature_extraction.text.ENGLISH_STOP_WORDS]

def word_overlap_features_2(df):
    clean_headline = set(df["___clean_headline_tokenized_lemmas"])
    clean_body = set(df["___clean_body_tokenized_lemmas"])
    feature = len(clean_headline & clean_body)/float(len(clean_headline | clean_body))
    return feature

def refuting_features_adder(df):
    # Returns 1/0 if each of these words is present in headline
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        # 'refute',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]
    clean_headline = df["___clean_headline_tokenized_lemmas"]
    features = clean_headline.apply(lambda hl: pd.Series([1 if word in hl else 0 for word in _refuting_words]))
    features.columns = ["wrf_hl_"+ref_word for ref_word in _refuting_words]
    return pd.concat([df, features], axis=1)


def polarity_features_adder(df):
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]
    def calculate_polarity(tokens):
        return sum([t in _refuting_words for t in tokens]) % 2
    
    clean_headline = df["___clean_headline_tokenized_lemmas"]
    clean_body = df["___clean_body_tokenized_lemmas"]
    
    
    headline_polarity = pd.DataFrame(clean_headline.apply(calculate_polarity))
    headline_polarity.columns = ["polar_hl"]
    
    body_polarity = pd.DataFrame(clean_body.apply(calculate_polarity))
    body_polarity.columns = ["polar_body"]
    
    df = pd.concat([df, headline_polarity, body_polarity], axis=1)
    return df

## START hand_features
def binary_co_occurrence(row):
    # Count how many times a token in the title
    # appears in the body text.
    bin_count = 0
    bin_count_early = 0
    for headline_token in row["___clean_headline"].split(" "):
        if headline_token in row["___clean_body"]:
            bin_count += 1
        if headline_token in row["___clean_body"][:255]:
            bin_count_early += 1
    return pd.Series((bin_count, bin_count_early))

def binary_co_occurence_stops(row):
        # Count how many times a token in the title
        # appears in the body text. Stopwords in the title
        # are ignored.
        bin_count_stopless = 0
        #bin_count_early = 0
        for headline_token in remove_stopwords(row["___clean_headline"].split(" ")):
            if headline_token in row["___clean_body"]:
                bin_count_stopless += 1
                #bin_count_early += 1 # TODO why are these values the same in the baseline??? bizarre... deleting
        return bin_count_stopless


In [24]:
def chargrams(input, n):
    output = []
    for i in range(len(input) - n + 1):
        output.append(input[i:i + n])
    return output

def ngrams(input, n):
    input = input.split(' ')
    output = []
    for i in range(len(input) - n + 1):
        output.append(input[i:i + n])
    return output

def append_chargrams(row, size=None):
    grams = [' '.join(x) for x in chargrams(" ".join(remove_stopwords(row["___clean_headline"].split())), size)]
    grams_hits = 0
    grams_early_hits = 0
    grams_first_hits = 0
    for gram in grams:
        if gram in row["___clean_body"]:
            grams_hits += 1
        if gram in row["___clean_body"][:255]:
            grams_early_hits += 1
        if gram in row["___clean_body"][:100]:
            grams_first_hits += 1
    return pd.Series((grams_hits, grams_early_hits, grams_first_hits))

def append_ngrams(row, size=None):
    grams = [' '.join(x) for x in ngrams(row["___clean_headline"], size)]
    grams_hits = 0
    grams_early_hits = 0
    for gram in grams:
        if gram in row["___clean_body"]:
            grams_hits += 1
        if gram in row["___clean_body"][:255]:
            grams_early_hits += 1
    return pd.Series((grams_hits, grams_early_hits))

In [14]:
# Multiprocessing code copied from: http://blog.adeel.io/2016/11/06/parallelize-pandas-map-or-apply/
cores = cpu_count() #Number of CPU cores on your system
partitions = cores #Define as many partitions as you want
def parallelize(data, func):
    data_split = np.array_split(data, partitions)
    pool = Pool(cores)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data
def word_overlap_features_parallel_helper(df):
        return df.apply(lambda row: word_overlap_features_2(row), axis=1)
def clean_helper(df):
        return df.apply(clean)
def get_tokenized_lemmas_helper(df):
        return df.apply(get_tokenized_lemmas)
def chargram_helper(df, input_size=None):
    return df.apply(lambda row: append_chargrams(row, input_size), axis=1)
def ngram_helper(df, input_size=None):
    return df.apply(lambda row: append_ngrams(row, input_size), axis=1)

def add_cached_columns(df):
    # column names starting with 3 underscores (___....) are cached intermediate
    # values only used to speed-up feature computation

    df["___clean_headline"] = parallelize(df["Headline"], clean_helper)
    df["___clean_headline_tokenized_lemmas"] = parallelize(df["___clean_headline"], get_tokenized_lemmas_helper)
    
    df["___clean_body"] = parallelize(df["articleBody"], clean_helper)
    df["___clean_body_tokenized_lemmas"] = parallelize(df["___clean_body"], get_tokenized_lemmas_helper)
    

def gen_all_features(df):
    print("Adding cached columns...")
    add_cached_columns(X_train)
    
    print("Adding co-occurrences...")
    co_occurrences = df.apply(binary_co_occurrence, axis=1)
    co_occurrences.columns = ["bin_count", "bin_count_early"]
    df = pd.concat([df, co_occurrences], axis=1)
    df["bin_count_stopless"] = df.apply(binary_co_occurence_stops, axis=1)
    
    # Note: As far as I can tell, the SettingWithCopy warning the following call raises
    # is just a false positive. Usage is actually ok.
    print("Adding word_overlap_features....")
    df["word_overlap_features"] = parallelize(df, word_overlap_features_parallel_helper)
    
    print("Adding refuting_features....")
    df = refuting_features_adder(df)
    
    print("Adding polarity_features....")
    df = polarity_features_adder(df)
    
    print("Adding ngrams...")
    ngram_sizes = [2, 3, 4, 5, 6]
    for ngram_size in tqdm_notebook(ngram_sizes, total=len(ngram_sizes)):
        helper_fn = partial(ngram_helper, input_size=ngram_size)
        temp = parallelize(df, helper_fn)
        temp.columns = ["ngram_"+str(ngram_size)+"_hits", "ngram_"+str(ngram_size)+"_early_hits"]
        df = pd.concat([df, temp], axis=1)
    
    print("Adding chargrams...")
    chargram_sizes = [2, 8, 4, 16]
    for chargram_size in tqdm_notebook(chargram_sizes, total=len(chargram_sizes)):
        helper_fn = partial(chargram_helper, input_size=chargram_size)
        temp = parallelize(df, helper_fn)
        temp.columns = ["chargram_"+str(chargram_size)+"_hits", "chargram_"+str(chargram_size)+"_early_hits", "chargram_"+str(chargram_size)+"_first_hits"]
        df = pd.concat([df, temp], axis=1)
    
    return df
%time X_train = gen_all_features(X_train)

Adding cached columns...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Process ForkPoolWorker-26:
Process ForkPoolWorker-27:
Process ForkPoolWorker-30:
Process ForkPoolWorker-31:
Process ForkPoolWorker-29:
Process ForkPoolWorker-32:
Process ForkPoolWorker-25:
Process ForkPoolWorker-28:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/gui/anaconda3/envs/nlu4/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/gui/anaconda3/envs/nlu4/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/gui/anaconda3/envs/nlu4/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/gui/anaconda3/envs/nlu4/lib/python3.6/multiprocessing/process.py", line

KeyboardInterrupt: 

In [None]:
%time X_test = gen_all_features(X_test)

In [15]:
X_train.head()

Unnamed: 0,Headline,articleBody,___clean_headline,___clean_headline_tokenized_lemmas,___clean_body
0,Police find mass graves with at least '15 bodi...,Danny Boyle is directing the untitled film\n\n...,police find mass graves with at least 15 bodie...,"[police, find, mass, graf, with, at, least, 15...",danny boyle is directing the untitled film set...
1,Hundreds of Palestinians flee floods in Gaza a...,Hundreds of Palestinians were evacuated from t...,hundreds of palestinians flee floods in gaza a...,"[hundred, of, palestinian, flee, flood, in, ga...",hundreds of palestinians were evacuated from t...
2,"Christian Bale passes on role of Steve Jobs, a...",30-year-old Moscow resident was hospitalized w...,christian bale passes on role of steve jobs ac...,"[christian, bale, pass, on, role, of, steve, j...",30 year old moscow resident was hospitalized w...
3,HBO and Apple in Talks for $15/Month Apple TV ...,(Reuters) - A Canadian soldier was shot at the...,hbo and apple in talks for 15 month apple tv s...,"[hbo, and, apple, in, talk, for, 15, month, ap...",reuters a canadian soldier was shot at the can...
4,Spider burrowed through tourist's stomach and ...,"Fear not arachnophobes, the story of Bunbury's...",spider burrowed through tourist s stomach and ...,"[spider, burrowed, through, tourist, s, stomac...",fear not arachnophobes the story of bunbury s ...


In [16]:
X_train.to_hdf('X_train_full_allfeatures.h5','df')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->['Headline', 'articleBody', '___clean_headline', '___clean_headline_tokenized_lemmas', '___clean_body']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [25]:
### NOW GENERATE FEATURES FOR TEST SET

In [26]:
data_path = "./fnc-1/"
bodies_test = pd.read_csv(os.path.join(data_path, "competition_test_bodies.csv"), header=0)
headlines_test = pd.read_csv(os.path.join(data_path, "competition_test_stances.csv"), header=0)

In [27]:
print(bodies_test.shape)
print(headlines_test.shape)

(904, 2)
(25413, 3)


In [28]:
bodies_test.head()

Unnamed: 0,Body ID,articleBody
0,1,Al-Sisi has denied Israeli reports stating tha...
1,2,A bereaved Afghan mother took revenge on the T...
2,3,CNBC is reporting Tesla has chosen Nevada as t...
3,12,A 4-inch version of the iPhone 6 is said to be...
4,19,GR editor’s Note\n\nThere are no reports in th...


In [29]:
headlines_test.head()

Unnamed: 0,Headline,Body ID,Stance
0,Ferguson riots: Pregnant woman loses eye after...,2008,unrelated
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,unrelated
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,unrelated
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,unrelated
4,Argentina's President Adopts Boy to End Werewo...,37,unrelated


In [30]:
temp = headlines_test.merge(bodies_test, on=["Body ID"], how="left")
X_test = temp[["Headline", "articleBody"]] # TODO include stance here and re-run
y_test = temp["Stance"]
assert X_test.shape[0] == y_test.shape[0]

In [31]:
y_test.to_hdf('y_test_full.h5', 'df')

In [32]:
del y_test