In [1]:
import pandas as pd
import numpy as np
import sklearn
import os
from tqdm import tqdm, trange, tqdm_notebook
import re
import nltk
from multiprocessing import cpu_count, Pool
from functools import partial

In [2]:
# Directory holding the FNC-1 training CSVs.
data_path = "./fnc-1/"
# Read the article bodies and the headline/stance table; the first CSV row is the header.
bodies_train, headlines_train = (
    pd.read_csv(os.path.join(data_path, csv_name), header=0)
    for csv_name in ("train_bodies.csv", "train_stances.csv")
)

In [None]:
# Sanity check: (rows, columns) of the bodies table, then of the headline/stance table.
print(bodies_train.shape)
print(headlines_train.shape)

(1683, 2)
(49972, 3)


In [None]:
# Preview the first rows of the article-body table.
bodies_train.head()

In [None]:
# Preview the first rows of the headline/stance table.
headlines_train.head()

In [None]:
# Show every headline row whose "Body ID" is 712.
headlines_train[headlines_train["Body ID"] == 712]

In [None]:
# Preview the headline/stance table again (same call as the earlier preview cell).
headlines_train.head()

In [None]:
# Attach each article body to its headline rows. The left join keeps every
# headline row, even one whose "Body ID" has no match in bodies_train.
temp = headlines_train.merge(bodies_train, on=["Body ID"], how="left")
# Take an explicit copy: new columns are assigned onto X_train later, and
# assigning onto a column slice of `temp` would raise SettingWithCopyWarning.
X_train = temp[["Headline", "articleBody"]].copy()

In [None]:
# Temporary row limit for quick test runs (currently disabled). TODO: remove.
#X_train = X_train[:10]

In [None]:
## START BASELINE FEATURE RE-WRITE
# Shared WordNet lemmatizer instance, created once and reused by normalize_word.
_wnl = nltk.WordNetLemmatizer()

def normalize_word(w):
    """Return the lemma of ``w`` in lower case.

    The token is lemmatised exactly as given; lower-casing is applied to the
    resulting lemma afterwards.
    """
    lemma = _wnl.lemmatize(w)
    return lemma.lower()


def get_tokenized_lemmas(s):
    """Tokenise ``s`` with ``nltk.word_tokenize`` and normalise every token.

    Returns a list with one ``normalize_word`` result per token, in order.
    """
    return list(map(normalize_word, nltk.word_tokenize(s)))


def clean(s):
    """Normalise a string to lower-cased word runs separated by single spaces.

    Every maximal run of ``\\w`` characters (letters, digits, underscore) is
    kept; everything between runs collapses to one space, and leading/trailing
    separators disappear.
    """
    word_runs = re.findall(r'\w+', s, flags=re.UNICODE)
    joined = " ".join(word_runs)
    return joined.lower()


def remove_stopwords(l, stopwords=None):
    """Return the tokens of ``l`` that are not stop words, in original order.

    Parameters
    ----------
    l : iterable of str
        Tokens to filter.
    stopwords : container of str, optional
        Words to drop. Defaults to scikit-learn's ``ENGLISH_STOP_WORDS``.

    Returns
    -------
    list of str
    """
    if stopwords is None:
        # The notebook only runs `import sklearn`, which never binds the bare
        # name `feature_extraction`; the original lookup raised NameError.
        # Import the stop-word set explicitly where it is needed.
        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stopwords
    return [w for w in l if w not in stopwords]

def word_overlap_features_2(df):
    """Jaccard overlap between the headline and body lemma sets of one row.

    ``df`` is a single row (anything indexable by column name) holding the
    cached ``___clean_headline_tokenized_lemmas`` and
    ``___clean_body_tokenized_lemmas`` token lists.

    Returns ``|headline & body| / |headline | body|`` as a float, or ``0.0``
    when both token lists are empty (the original raised ZeroDivisionError).
    """
    clean_headline = set(df["___clean_headline_tokenized_lemmas"])
    clean_body = set(df["___clean_body_tokenized_lemmas"])
    union = clean_headline | clean_body
    if not union:
        # Nothing to compare on either side: define the overlap as zero.
        return 0.0
    return len(clean_headline & clean_body) / float(len(union))

def refuting_features_adder(df):
    """Append one 0/1 column per refuting word found in the headline lemmas.

    For every word in the fixed refuting-word list, a column
    ``wrf_hl_<word>`` is added that is 1 when the word is present in the row's
    ``___clean_headline_tokenized_lemmas`` and 0 otherwise.

    Returns a new frame: ``df`` with the indicator columns concatenated on the
    right. ``df`` itself is not modified.
    """
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        # 'refute',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]
    clean_headline = df["___clean_headline_tokenized_lemmas"]
    # Build the whole indicator matrix in one go instead of allocating a
    # pd.Series per row. The reshape keeps the (0, n_words) shape for an empty
    # frame, where the per-row apply returned a column-less result.
    flags = np.array(
        [[1 if word in hl else 0 for word in _refuting_words] for hl in clean_headline],
        dtype=int,
    ).reshape(len(clean_headline), len(_refuting_words))
    features = pd.DataFrame(
        flags,
        index=clean_headline.index,
        columns=["wrf_hl_" + ref_word for ref_word in _refuting_words],
    )
    return pd.concat([df, features], axis=1)


def polarity_features_adder(df):
    """Append refuting-word parity columns for headline and body.

    ``polar_hl`` and ``polar_body`` hold, per row, the number of tokens that
    are refuting words taken modulo 2 (1 for an odd count, 0 for an even one).
    The counts come from the cached ``___clean_headline_tokenized_lemmas`` and
    ``___clean_body_tokenized_lemmas`` columns.

    Returns a new frame with the two columns concatenated on the right.
    """
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]

    def calculate_polarity(tokens):
        # Count refuting tokens (repeats included) and keep only the parity.
        hits = 0
        for t in tokens:
            if t in _refuting_words:
                hits += 1
        return hits % 2

    headline_polarity = (
        df["___clean_headline_tokenized_lemmas"]
        .apply(calculate_polarity)
        .to_frame(name="polar_hl")
    )
    body_polarity = (
        df["___clean_body_tokenized_lemmas"]
        .apply(calculate_polarity)
        .to_frame(name="polar_body")
    )
    return pd.concat([df, headline_polarity, body_polarity], axis=1)

## START hand_features
def binary_co_occurrence(row):
    """Count headline tokens that occur in the body text.

    The headline (``___clean_headline``) is split on single spaces; each token
    is checked with a substring test against the full ``___clean_body`` and
    against its first 255 characters.

    Returns a two-element ``pd.Series``: (hits in full body, hits in the
    first 255 characters).
    """
    body = row["___clean_body"]
    body_start = body[:255]
    headline_tokens = row["___clean_headline"].split(" ")
    bin_count = sum(1 for token in headline_tokens if token in body)
    bin_count_early = sum(1 for token in headline_tokens if token in body_start)
    return pd.Series((bin_count, bin_count_early))

def binary_co_occurence_stops(row):
    """Count non-stopword headline tokens that occur in the body text.

    The headline (``___clean_headline``) is split on single spaces, stop words
    are dropped via ``remove_stopwords``, and each remaining token is checked
    with a substring test against ``___clean_body``.

    Only the full-body count is returned; no separate early-window count is
    computed here.
    """
    body = row["___clean_body"]
    kept_tokens = remove_stopwords(row["___clean_headline"].split(" "))
    return sum(1 for token in kept_tokens if token in body)


In [None]:
def chargrams(input, n):
    """Return every length-``n`` sliding window over ``input``, left to right.

    For a string this yields its character n-grams. An ``input`` shorter than
    ``n`` yields an empty list.
    """
    return [input[start:start + n] for start in range(len(input) - n + 1)]

def ngrams(input, n):
    """Return the word n-grams of ``input`` as lists of ``n`` tokens.

    ``input`` is split on single spaces; each result is a list of ``n``
    consecutive tokens. Fewer than ``n`` tokens yields an empty list.
    """
    words = input.split(' ')
    return [words[start:start + n] for start in range(len(words) - n + 1)]

def append_chargrams(row, size=None):
    """Count headline character n-grams that occur in the body text.

    Stop words are removed from the whitespace-split ``___clean_headline``,
    the rest is re-joined with spaces, and ``chargrams`` of length ``size``
    are taken from that string. ``size`` must be an integer; the ``None``
    default is only a placeholder.

    Returns a three-element ``pd.Series``: hits in the full ``___clean_body``,
    in its first 255 characters, and in its first 100 characters.
    """
    headline = " ".join(remove_stopwords(row["___clean_headline"].split()))
    # NOTE(review): each chargram is already a string, so ' '.join() inserts a
    # space between its characters ("ab" -> "a b") before the substring test.
    # Kept as-is to preserve the existing feature values -- confirm whether
    # this is intended.
    grams = [' '.join(piece) for piece in chargrams(headline, size)]

    body = row["___clean_body"]
    windows = (body, body[:255], body[:100])
    hit_counts = tuple(
        sum(1 for gram in grams if gram in window) for window in windows
    )
    return pd.Series(hit_counts)

def append_ngrams(row, size=None):
    """Count headline word n-grams that occur in the body text.

    Word n-grams of length ``size`` are taken from ``___clean_headline`` via
    ``ngrams``, re-joined with single spaces, and checked with a substring
    test. ``size`` must be an integer; the ``None`` default is only a
    placeholder.

    Returns a two-element ``pd.Series``: hits in the full ``___clean_body``
    and hits in its first 255 characters.
    """
    phrases = [' '.join(words) for words in ngrams(row["___clean_headline"], size)]
    body = row["___clean_body"]
    body_start = body[:255]
    grams_hits = sum(1 for phrase in phrases if phrase in body)
    grams_early_hits = sum(1 for phrase in phrases if phrase in body_start)
    return pd.Series((grams_hits, grams_early_hits))

In [None]:
# Multiprocessing code copied from: http://blog.adeel.io/2016/11/06/parallelize-pandas-map-or-apply/
cores = cpu_count() #Number of CPU cores on your system
partitions = cores #Define as many partitions as you want
def parallelize(data, func):
    """Apply ``func`` to row-wise chunks of ``data`` in worker processes.

    Parameters
    ----------
    data : pandas Series or DataFrame
        Split by position into up to ``partitions`` contiguous chunks.
    func : callable
        Receives one chunk and returns a pandas object. It is sent to worker
        processes, so it must be picklable (e.g. a module-level function or a
        ``functools.partial`` of one).

    Returns
    -------
    The per-chunk results concatenated in chunk order with ``pd.concat``.
    Empty ``data`` is handed straight to ``func`` without starting a pool.
    """
    if len(data) == 0:
        # Nothing to distribute; pd.concat would also reject an empty list.
        return func(data)

    # Split positions rather than the pandas object itself: np.array_split on
    # a DataFrame goes through a deprecated pandas code path. The chunk sizes
    # are the same ones np.array_split would produce on the data.
    position_groups = np.array_split(np.arange(len(data)), partitions)
    # Skip empty groups (more partitions than rows) so func never sees an
    # empty chunk.
    chunks = [
        data.iloc[group[0]:group[-1] + 1]
        for group in position_groups
        if len(group)
    ]

    pool = Pool(cores)
    try:
        results = pool.map(func, chunks)
    finally:
        # Always release the worker processes, even when func raises.
        pool.close()
        pool.join()
    return pd.concat(results)
def word_overlap_features_parallel_helper(df):
    """Chunk worker: compute ``word_overlap_features_2`` for every row of ``df``."""
    return df.apply(word_overlap_features_2, axis=1)
def clean_helper(df):
    """Chunk worker: run ``clean`` over ``df`` via ``.apply``.

    Despite the parameter name, the caller in this notebook passes a chunk of
    a single text column (a Series).
    """
    cleaned = df.apply(clean)
    return cleaned
def get_tokenized_lemmas_helper(df):
    """Chunk worker: run ``get_tokenized_lemmas`` over ``df`` via ``.apply``.

    Despite the parameter name, the caller in this notebook passes a chunk of
    a single cleaned-text column (a Series).
    """
    lemmas = df.apply(get_tokenized_lemmas)
    return lemmas
def chargram_helper(df, input_size=None):
    """Chunk worker: run ``append_chargrams`` with ``size=input_size`` on every row of ``df``."""
    return df.apply(append_chargrams, axis=1, args=(input_size,))
def ngram_helper(df, input_size=None):
    """Chunk worker: run ``append_ngrams`` with ``size=input_size`` on every row of ``df``."""
    return df.apply(append_ngrams, axis=1, args=(input_size,))

def add_cached_columns(df):
    """Add the cached text columns to ``df`` in place (returns ``None``).

    Column names starting with three underscores (``___...``) are cached
    intermediate values that exist only to speed up feature computation.
    For each raw text column, a cleaned-text column is computed first and a
    tokenised-lemma column is then derived from it.
    """
    # (raw text column, cleaned-text column, lemma-list column)
    cache_plan = (
        ("Headline", "___clean_headline", "___clean_headline_tokenized_lemmas"),
        ("articleBody", "___clean_body", "___clean_body_tokenized_lemmas"),
    )
    for raw_col, clean_col, lemma_col in cache_plan:
        df[clean_col] = parallelize(df[raw_col], clean_helper)
        df[lemma_col] = parallelize(df[clean_col], get_tokenized_lemmas_helper)
    

def gen_all_features(df):
    """Compute every hand-crafted feature for ``df`` and return the result.

    ``df`` must provide the ``Headline`` and ``articleBody`` columns. The
    returned frame contains those columns, the cached ``___...`` intermediate
    columns, and the feature columns: co-occurrence counts, word overlap,
    refuting-word indicators, polarity, word n-gram hits and character n-gram
    hits. Progress messages are printed along the way.

    The input frame is not modified; all work happens on a copy.
    """
    # Work on a private copy so the caller's frame is left untouched and the
    # column assignments below never hit a slice of another frame.
    df = df.copy()

    print("Adding cached columns...")
    # Operate on the argument. The original passed the global X_train here,
    # which only worked when the caller happened to pass that same object.
    add_cached_columns(df)

    print("Adding co-occurrences...")
    co_occurrences = df.apply(binary_co_occurrence, axis=1)
    co_occurrences.columns = ["bin_count", "bin_count_early"]
    df = pd.concat([df, co_occurrences], axis=1)
    df["bin_count_stopless"] = df.apply(binary_co_occurence_stops, axis=1)

    print("Adding word_overlap_features....")
    df["word_overlap_features"] = parallelize(df, word_overlap_features_parallel_helper)

    print("Adding refuting_features....")
    df = refuting_features_adder(df)

    print("Adding polarity_features....")
    df = polarity_features_adder(df)

    print("Adding ngrams...")
    ngram_sizes = [2, 3, 4, 5, 6]
    for ngram_size in tqdm_notebook(ngram_sizes, total=len(ngram_sizes)):
        # partial() of a module-level helper stays picklable for the worker pool.
        helper_fn = partial(ngram_helper, input_size=ngram_size)
        temp = parallelize(df, helper_fn)
        temp.columns = ["ngram_"+str(ngram_size)+"_hits", "ngram_"+str(ngram_size)+"_early_hits"]
        df = pd.concat([df, temp], axis=1)

    print("Adding chargrams...")
    chargram_sizes = [2, 8, 4, 16]
    for chargram_size in tqdm_notebook(chargram_sizes, total=len(chargram_sizes)):
        helper_fn = partial(chargram_helper, input_size=chargram_size)
        temp = parallelize(df, helper_fn)
        temp.columns = ["chargram_"+str(chargram_size)+"_hits", "chargram_"+str(chargram_size)+"_early_hits", "chargram_"+str(chargram_size)+"_first_hits"]
        df = pd.concat([df, temp], axis=1)

    return df
# Build the full feature table and time the run. This rebinds X_train to the
# augmented frame, so re-running the cell feeds already-augmented data back in
# and appends the feature columns a second time.
%time X_train = gen_all_features(X_train)

In [None]:
# Preview the first rows of the generated feature table.
X_train.head()

In [None]:
# Persist the feature table to HDF5 under the key 'df'. The key is passed by
# keyword because pandas deprecates positional arguments after the path.
X_train.to_hdf('X_train_full_allfeatures.h5', key='df')