In [1]:
import pandas as pd
import numpy as np
import sklearn
import os
from tqdm import tqdm, trange
import re

In [2]:
data_path = "./fnc-1/"
bodies_train = pd.read_csv(os.path.join(data_path, "train_bodies.csv"), header=0)
headlines_train = pd.read_csv(os.path.join(data_path, "train_stances.csv"), header=0)

In [3]:
print(bodies_train.shape)
print(headlines_train.shape)

(1683, 2)
(49972, 3)


In [4]:
bodies_train.head()

Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [5]:
headlines_train.head()

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree


In [6]:
headlines_train[headlines_train["Body ID"] == 712]

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1787,Seth Rogen to Play Apple’s Steve Wozniak,712,discuss
3974,Mexico police find mass grave near site 43 stu...,712,unrelated
4936,Mexico Says Missing Students Not Found In Firs...,712,unrelated
5210,New iOS 8 bug can delete all of your iCloud do...,712,unrelated
5863,Return of the Mac: Seth Rogen in talks to star...,712,discuss
6199,Seth Rogen Is Woz,712,discuss
6756,Mexico finds 4 more graves at site of suspecte...,712,unrelated
7526,Are missing students in mass graves found near...,712,unrelated
9003,Mexico prosecutor: Students not in 1st mass gr...,712,unrelated


In [7]:
headlines_train.head()

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree


In [8]:
temp = headlines_train.merge(bodies_train, on=["Body ID"], how="left")
X_train = temp[["Headline", "articleBody"]]

In [9]:
# TEMPORARY LIMIT JUST FOR TESTING TODO REMOVE
#X_train = X_train[:500]

In [10]:
## START BASELINE FEATURE RE-WRITE
import nltk
_wnl = nltk.WordNetLemmatizer()

def normalize_word(w):
    return _wnl.lemmatize(w).lower()


def get_tokenized_lemmas(s):
    return [normalize_word(t) for t in nltk.word_tokenize(s)]


def clean(s):
    # Cleans a string: Lowercasing, trimming, removing non-alphanumeric
    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()


def remove_stopwords(l):
    # Removes stopwords from a list of tokens
    return [w for w in l if w not in feature_extraction.text.ENGLISH_STOP_WORDS]


def word_overlap_features(row):
    clean_headline = clean(row["Headline"])
    clean_body = clean(row["articleBody"])
    clean_headline = get_tokenized_lemmas(clean_headline)
    clean_body = get_tokenized_lemmas(clean_body)
    # feature is 1 real number -- fraction of token shared in headline and body (lemmatized, cleaned)
    feature = len(set(clean_headline).intersection(clean_body)) / float(len(set(clean_headline).union(clean_body)))
    return feature

def word_overlap_features_2(df):
    clean_headline = set(df["___clean_headline_tokenized_lemmas"])
    clean_body = set(df["___clean_body_tokenized_lemmas"])
    feature = len(clean_headline & clean_body)/float(len(clean_headline | clean_body))
    return feature

def refuting_features_adder(df):
    # Returns 1/0 if each of these words is present in headline
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        # 'refute',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]
    #clean_headline = df["___clean_headline"]
    #clean_headline = clean_headline.apply(get_tokenized_lemmas)
    clean_headline = df["___clean_headline_tokenized_lemmas"]
    features = clean_headline.apply(lambda hl: pd.Series([1 if word in hl else 0 for word in _refuting_words]))
    features.columns = ["wrf_hl_"+ref_word for ref_word in _refuting_words]
    return pd.concat([df, features], axis=1)


def polarity_features_adder(df):
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]
    def calculate_polarity(tokens):
        return sum([t in _refuting_words for t in tokens]) % 2
    
    #clean_headline = df["___clean_headline"]
    #clean_body = df["___clean_body"]
    
    #clean_headline = clean_headline.apply(get_tokenized_lemmas)
    clean_headline = df["___clean_headline_tokenized_lemmas"]
    #clean_body = clean_body.apply(get_tokenized_lemmas)
    clean_body = df["___clean_body_tokenized_lemmas"]
    
    
    headline_polarity = pd.DataFrame(clean_headline.apply(calculate_polarity))
    headline_polarity.columns = ["polar_hl"]
    
    body_polarity = pd.DataFrame(clean_body.apply(calculate_polarity))
    body_polarity.columns = ["polar_body"]
    
    df = pd.concat([df, headline_polarity, body_polarity], axis=1)
    return df

In [None]:
from multiprocessing import cpu_count, Pool#, Parallel
# Multiprocessing code copied from: http://blog.adeel.io/2016/11/06/parallelize-pandas-map-or-apply/
cores = cpu_count() #Number of CPU cores on your system
partitions = cores #Define as many partitions as you want
def parallelize(data, func):
    data_split = np.array_split(data, partitions)
    pool = Pool(cores)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data
def word_overlap_features_parallel_helper(df):
        return df.apply(lambda row: word_overlap_features_2(row), axis=1)
def clean_helper(df):
        return df.apply(clean)
def get_tokenized_lemmas_helper(df):
        return df.apply(get_tokenized_lemmas)

def add_cached_columns(df):
    # column names starting with 3 underscores (___....) are cached intermediate
    # values only used to speed-up feature computation

    #df["___clean_headline"] = df["Headline"].apply(clean)
    df["___clean_headline"] = parallelize(df["Headline"], clean_helper)
    #df["___clean_headline_tokenized_lemmas"] = df["___clean_headline"].apply(get_tokenized_lemmas)
    df["___clean_headline_tokenized_lemmas"] = parallelize(df["___clean_headline"], get_tokenized_lemmas_helper)
    
    #df["___clean_body"] = df["articleBody"].apply(clean)
    df["___clean_body"] = parallelize(df["articleBody"], clean_helper)
    #df["___clean_body_tokenized_lemmas"] = df["___clean_body"].apply(get_tokenized_lemmas)
    df["___clean_body_tokenized_lemmas"] = parallelize(df["___clean_body"], get_tokenized_lemmas_helper)
    


def gen_all_features(df):
    print("Adding cached columns...")
    add_cached_columns(X_train)
    
    
    # Note: As far as I can tell, the SettingWithCopy warning the following call raises
    # is just a false positive. Usage is actually ok.
    print("Adding word_overlap_features....")
    df["word_overlap_features"] = parallelize(df, word_overlap_features_parallel_helper)
    
    print("Adding refuting_features....")
    df = refuting_features_adder(df)
    
    print("Adding polarity_features....")
    df = polarity_features_adder(df)
    return df
%time X_train = gen_all_features(X_train)

Adding cached columns...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Adding word_overlap_features....


In [None]:
X_train.head()