Load dataset

In [1]:
import pandas as pd
import csv

In [2]:
class DataSet():
    def __init__(self, name="train", path="dataset"):
        self.path = path

        print("Reading dataset in %s/" %path)
        bodies = name+"_bodies.csv"
        stances = name+"_stances.csv"

        print("Loading files %s, %s" %(bodies,stances))
        self.data = self.read(bodies,stances)

    def read(self,bodies,stances):
        train_bodies = pd.read_csv(self.path + "/" + bodies)
        train_stances = pd.read_csv(self.path + "/" + stances)
        merged = train_bodies.merge(train_stances,left_on='Body ID',right_on='Body ID',how='outer')
        return merged
    
    def print(self):
        print(self.data.head())

In [3]:
training = DataSet()
test = DataSet(name="competition_test")

Reading dataset in dataset/
Loading files train_bodies.csv, train_stances.csv
Reading dataset in dataset/
Loading files competition_test_bodies.csv, competition_test_stances.csv


In [4]:
print(training.data.head())

   Body ID                                        articleBody  \
0        0  A small meteorite crashed into a wooded area i...   
1        0  A small meteorite crashed into a wooded area i...   
2        0  A small meteorite crashed into a wooded area i...   
3        0  A small meteorite crashed into a wooded area i...   
4        0  A small meteorite crashed into a wooded area i...   

                                            Headline     Stance  
0  Soldier shot, Parliament locked down after gun...  unrelated  
1  Tourist dubbed ‘Spider Man’ after spider burro...  unrelated  
2  Luke Somers 'killed in failed rescue attempt i...  unrelated  
3   BREAKING: Soldier shot at War Memorial in Ottawa  unrelated  
4  Giant 8ft 9in catfish weighing 19 stone caught...  unrelated  


Cleaning the text and tokenizing

In [5]:
import os
import re
import nltk
import numpy as np
from sklearn import feature_extraction
from tqdm import tqdm

_wnl = nltk.WordNetLemmatizer() 

def normalize_word(w):
    return _wnl.lemmatize(w).lower()

def get_tokenized_lemmas(s):
    return [normalize_word(t) for t in nltk.word_tokenize(s)]

def clean(s):
    # Cleans a string: Lowercasing, trimming, removing non-alphanumeric
    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()

def remove_stopwords(l):
    # Removes stopwords from a list of tokens
    return [w for w in l if w not in feature_extraction.text.ENGLISH_STOP_WORDS]

def get_clean_tokens(s):
    s = clean(s)
    tokens = get_tokenized_lemmas(s)
    clean_tokens = remove_stopwords(tokens)
    return clean_tokens

In [6]:
training.data['Headline tokens'] = training.data['Headline'].map(get_clean_tokens)

In [7]:
print(training.data.head())

   Body ID                                        articleBody  \
0        0  A small meteorite crashed into a wooded area i...   
1        0  A small meteorite crashed into a wooded area i...   
2        0  A small meteorite crashed into a wooded area i...   
3        0  A small meteorite crashed into a wooded area i...   
4        0  A small meteorite crashed into a wooded area i...   

                                            Headline     Stance  \
0  Soldier shot, Parliament locked down after gun...  unrelated   
1  Tourist dubbed ‘Spider Man’ after spider burro...  unrelated   
2  Luke Somers 'killed in failed rescue attempt i...  unrelated   
3   BREAKING: Soldier shot at War Memorial in Ottawa  unrelated   
4  Giant 8ft 9in catfish weighing 19 stone caught...  unrelated   

                                     Headline tokens  
0  [soldier, shot, parliament, locked, gunfire, e...  
1  [tourist, dubbed, spider, man, spider, burrow,...  
2  [luke, somers, killed, failed, rescue,

In [8]:
training.data['articleBody tokens'] = training.data['articleBody'].map(get_clean_tokens)

In [9]:
#training.data.to_pickle("trainingdatatokens.pkl")

In [10]:
# Reload the dataframe because it takes a long time to transform the body :-) 
training.data = pd.read_pickle("trainingdatatokens.pkl")

In [11]:
print(training.data.head())

   Body ID                                        articleBody  \
0        0  A small meteorite crashed into a wooded area i...   
1        0  A small meteorite crashed into a wooded area i...   
2        0  A small meteorite crashed into a wooded area i...   
3        0  A small meteorite crashed into a wooded area i...   
4        0  A small meteorite crashed into a wooded area i...   

                                            Headline     Stance  \
0  Soldier shot, Parliament locked down after gun...  unrelated   
1  Tourist dubbed ‘Spider Man’ after spider burro...  unrelated   
2  Luke Somers 'killed in failed rescue attempt i...  unrelated   
3   BREAKING: Soldier shot at War Memorial in Ottawa  unrelated   
4  Giant 8ft 9in catfish weighing 19 stone caught...  unrelated   

                                     Headline tokens  \
0  [soldier, shot, parliament, locked, gunfire, e...   
1  [tourist, dubbed, spider, man, spider, burrow,...   
2  [luke, somers, killed, failed, resc

In [12]:
from collections import Counter

min_len = 2 #tunable parameter

count_words = Counter()
for tokens in training.data['articleBody tokens'].values:
    for token in tokens:
        if len(token)>min_len:
            count_words[token]+=1

In [13]:
count_words.most_common(10)

[('said', 134112),
 ('state', 44137),
 ('video', 43653),
 ('year', 42425),
 ('report', 40928),
 ('apple', 40879),
 ('time', 37308),
 ('isi', 36373),
 ('people', 35930),
 ('told', 35424)]

Vectorize: from text to a vector space

In [14]:
vocabulary = training.data['articleBody tokens'].values + training.data['Headline tokens'].values 

In [15]:
listed_docs = list(training.data['articleBody'].values)
listed_docs2 = list(training.data['Headline'].values)

Vectorize - fit: Fit vocabulary means generate the vector space for the words

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(2,2), min_df=5)
vectorizer.fit(listed_docs)
vectorizer.fit(listed_docs2)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(2, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

Vectorize - transform: Represent a document on the vector space that was fit previously

In [18]:
training.data['tfidf headlines'] = vectorizer.transform(listed_docs2)

In [19]:
#training.data['tfidf bodies'] = vectorizer.transform(listed_docs)

In [20]:
print(training.data.head())

   Body ID                                        articleBody  \
0        0  A small meteorite crashed into a wooded area i...   
1        0  A small meteorite crashed into a wooded area i...   
2        0  A small meteorite crashed into a wooded area i...   
3        0  A small meteorite crashed into a wooded area i...   
4        0  A small meteorite crashed into a wooded area i...   

                                            Headline     Stance  \
0  Soldier shot, Parliament locked down after gun...  unrelated   
1  Tourist dubbed ‘Spider Man’ after spider burro...  unrelated   
2  Luke Somers 'killed in failed rescue attempt i...  unrelated   
3   BREAKING: Soldier shot at War Memorial in Ottawa  unrelated   
4  Giant 8ft 9in catfish weighing 19 stone caught...  unrelated   

                                     Headline tokens  \
0  [soldier, shot, parliament, locked, gunfire, e...   
1  [tourist, dubbed, spider, man, spider, burrow,...   
2  [luke, somers, killed, failed, resc

Load data just to make it less heavy

In [21]:
#training.data.to_pickle('data_tfidf.pkl')

In [22]:
training.data = pd.read_pickle("data_tfidf.pkl")

How does the vector of a document looks like?

In [23]:
print(listed_docs[0])

A small meteorite crashed into a wooded area in Nicaragua's capital of Managua overnight, the government said Sunday. Residents reported hearing a mysterious boom that left a 16-foot deep crater near the city's airport, the Associated Press reports. 

Government spokeswoman Rosario Murillo said a committee formed by the government to study the event determined it was a "relatively small" meteorite that "appears to have come off an asteroid that was passing close to Earth." House-sized asteroid 2014 RC, which measured 60 feet in diameter, skimmed the Earth this weekend, ABC News reports. 
Murillo said Nicaragua will ask international experts to help local scientists in understanding what happened.

The crater left by the meteorite had a radius of 39 feet and a depth of 16 feet,  said Humberto Saballos, a volcanologist with the Nicaraguan Institute of Territorial Studies who was on the committee. He said it is still not clear if the meteorite disintegrated or was buried.

Humberto Garcia

In [24]:
a = vectorizer.transform([listed_docs[0]])
print(a)

  (0, 6540)	0.270362121695911
  (0, 6328)	0.40880455331186605
  (0, 4678)	0.20606023018732983
  (0, 4582)	0.23969388029179453
  (0, 4532)	0.24399338064741213
  (0, 3150)	0.25560080851563965
  (0, 2570)	0.2450334731861741
  (0, 1830)	0.2450334731861741
  (0, 1653)	0.47938776058358906
  (0, 1127)	0.21618092366193067
  (0, 310)	0.2798814335000886
  (0, 68)	0.2450334731861741


Measuring similarity between two documents

In [25]:
from sklearn.metrics.pairwise import cosine_similarity
a = vectorizer.transform([listed_docs[0]])
b = vectorizer.transform([listed_docs[200]])
print(cosine_similarity(a,b))

[[0.]]


Find the headline closest to the first body

In [26]:
# find the headline closest to the first body:
body = vectorizer.transform([listed_docs[0]])
sim_new = 0
sim_old = 0
for indx, headline in enumerate(listed_docs2):
    head_vectorized = vectorizer.transform([headline])
    sim_new = cosine_similarity(body,head_vectorized)
    if sim_new > sim_old:
        sim_old = sim_new
        most_similar_indx = indx

In [27]:
print("Similarity: %f" %sim_old)
print("Most similar: %s" %listed_docs2[most_similar_indx])
print("Match for: %s" %listed_docs[0])

Similarity: 0.271787
Most similar: Meteor Leaves 40-Foot Crater Near Managua's Airport
Match for: A small meteorite crashed into a wooded area in Nicaragua's capital of Managua overnight, the government said Sunday. Residents reported hearing a mysterious boom that left a 16-foot deep crater near the city's airport, the Associated Press reports. 

Government spokeswoman Rosario Murillo said a committee formed by the government to study the event determined it was a "relatively small" meteorite that "appears to have come off an asteroid that was passing close to Earth." House-sized asteroid 2014 RC, which measured 60 feet in diameter, skimmed the Earth this weekend, ABC News reports. 
Murillo said Nicaragua will ask international experts to help local scientists in understanding what happened.

The crater left by the meteorite had a radius of 39 feet and a depth of 16 feet,  said Humberto Saballos, a volcanologist with the Nicaraguan Institute of Territorial Studies who was on the commi

# Reintroduce the task

Ok, so let's do some fake news detection.

Given a headline and a body of text, we want to say whether these two are:

* unrelated
* agree with each other
* disagree with each other
* discuss each other


# Features ?

First, we have to generate features.

Let's check:

* Word overlap (between headline and body)
* Refuting features: words which are refuting
* Polarity features: words which contain polarity
* n-gram
* char-grams
* co-occurences

In [28]:
def word_overlap_features(headlines, bodies):
    # Computes the percentage of overlap between the headline and the body of text (numerical)
    X = []
    for headline, body in zip(headlines, bodies):
        clean_headline = clean(headline)
        clean_body = clean(body)
        clean_headline = get_tokenized_lemmas(clean_headline)
        clean_body = get_tokenized_lemmas(clean_body)
        features = [
            len(set(clean_headline).intersection(clean_body)) / float(len(set(clean_headline).union(clean_body)))]
        X.append(features)
    return X


In [29]:
word_overlap_features(list(training.data['Headline'].values[0:10]), list(training.data['articleBody'].values[0:10]))

[[0.005376344086021506],
 [0.0],
 [0.00546448087431694],
 [0.01098901098901099],
 [0.04736842105263158],
 [0.010582010582010581],
 [0.016483516483516484],
 [0.027472527472527472],
 [0.016129032258064516],
 [0.011049723756906077]]

In [30]:
def calculate_polarity(tokens):
    return sum([t in _refuting_words for t in tokens]) % 2
    
def refuting_features(headlines, bodies):
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]
    X = []
    for headline, body in zip(headlines, bodies):
        clean_headline = clean(headline)
        clean_body = clean(body)
        clean_headline = get_tokenized_lemmas(clean_headline)
        clean_body = get_tokenized_lemmas(clean_body)
        features = []
        features = [1 if word in clean_headline else 0 for word in _refuting_words] #how many refutting words in the headline
        features += [1 if word in clean_body else 0 for word in _refuting_words]
        #print(features)
        X.append(features)
    return X

In [31]:
refuting_features(list(training.data['Headline'].values)[0:1], list(training.data['articleBody'].values)[0:1])

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]]

In [32]:
def polarity_features(headlines, bodies):
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]

    def calculate_polarity(text):
        tokens = get_tokenized_lemmas(text)
        return sum([t in _refuting_words for t in tokens]) % 2
    
    X = []
    for headline, body in zip(headlines, bodies):
        clean_headline = clean(headline)
        clean_body = clean(body)
        features = []
        features.append(calculate_polarity(clean_headline))
        features.append(calculate_polarity(clean_body))
        X.append(features)
        #print(features)
    return np.array(X)

In [33]:
polarity_features(list(training.data['Headline'].values)[0:10], list(training.data['articleBody'].values)[0:10])

array([[0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 1],
       [0, 1],
       [0, 1]])

In [34]:
def ngrams(input, n):
    input = input.split(' ')
    output = []
    for i in range(len(input) - n + 1):
        output.append(input[i:i + n])
    return output

def chargrams(input, n):
    output = []
    for i in range(len(input) - n + 1):
        output.append(input[i:i + n])
    return output

def append_chargrams(features, text_headline, text_body, size):
    grams = [' '.join(x) for x in chargrams(" ".join(remove_stopwords(text_headline.split())), size)]
    grams_hits = 0
    grams_early_hits = 0
    grams_first_hits = 0
    for gram in grams:
        if gram in text_body:
            grams_hits += 1
        if gram in text_body[:255]:
            grams_early_hits += 1
        if gram in text_body[:100]:
            grams_first_hits += 1
    features.append(grams_hits)
    features.append(grams_early_hits)
    features.append(grams_first_hits)
    return features

def append_ngrams(features, text_headline, text_body, size):
    grams = [' '.join(x) for x in ngrams(text_headline, size)]
    grams_hits = 0
    grams_early_hits = 0
    for gram in grams:
        if gram in text_body:
            grams_hits += 1
        if gram in text_body[:255]:
            grams_early_hits += 1
    features.append(grams_hits)
    features.append(grams_early_hits)
    return features

def hand_features(headlines, bodies):
    def binary_co_occurence(headline, body):
        # Count how many times a token in the title
        # appears in the body text.
        bin_count = 0
        bin_count_early = 0
        for headline_token in clean(headline).split(" "):
            if headline_token in clean(body):
                bin_count += 1
            if headline_token in clean(body)[:255]:
                bin_count_early += 1
        return [bin_count, bin_count_early]

    def binary_co_occurence_stops(headline, body):
        # Count how many times a token in the title
        # appears in the body text. Stopwords in the title
        # are ignored.
        bin_count = 0
        bin_count_early = 0
        for headline_token in remove_stopwords(clean(headline).split(" ")):
            if headline_token in clean(body):
                bin_count += 1
                bin_count_early += 1
        return [bin_count, bin_count_early]

    def count_grams(headline, body):
        # Count how many times an n-gram of the title
        # appears in the entire body, and intro paragraph
        clean_body = clean(body)
        clean_headline = clean(headline)
        features = []
        features = append_chargrams(features, clean_headline, clean_body, 2)
        features = append_chargrams(features, clean_headline, clean_body, 8)
        features = append_chargrams(features, clean_headline, clean_body, 4)
        features = append_chargrams(features, clean_headline, clean_body, 16)
        features = append_ngrams(features, clean_headline, clean_body, 2)
        features = append_ngrams(features, clean_headline, clean_body, 3)
        features = append_ngrams(features, clean_headline, clean_body, 4)
        features = append_ngrams(features, clean_headline, clean_body, 5)
        features = append_ngrams(features, clean_headline, clean_body, 6)
        return features

    X = []
    for headline, body in zip(headlines, bodies):
        X.append(binary_co_occurence(headline, body)
                 + binary_co_occurence_stops(headline, body)
                 + count_grams(headline, body))

    return X

In [35]:
hand_features(list(training.data['Headline'].values)[0:1], list(training.data['articleBody'].values)[0:1])

[[1,
  1,
  0,
  0,
  18,
  4,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]]

In [36]:
def gen_or_load_feats(feat_fn, headlines, bodies, feature_file):
    if not os.path.isfile(feature_file):
        feats = feat_fn(headlines, bodies)
        np.save(feature_file, feats)

    return np.load(feature_file)

In [37]:
LABELSINT = {'agree':0, 'disagree':1, 'discuss':2, 'unrelated':3}

def generate_features(dataset,name):
    h, b, y = [],[],[]

    y = [LABELSINT[label] for label in list(dataset['Stance'].values)]
    h = list(dataset['Headline'].values)
    b = list(dataset['articleBody'].values)

    X_overlap = gen_or_load_feats(word_overlap_features, h, b, "features/overlap."+name+".npy")
    X_refuting = gen_or_load_feats(refuting_features, h, b, "features/refuting."+name+".npy")
    X_polarity = gen_or_load_feats(polarity_features, h, b, "features/polarity."+name+".npy")
    X_hand = gen_or_load_feats(hand_features, h, b, "features/hand."+name+".npy")
    
    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
    return X,y

In [38]:
F = generate_features(training.data,'train')

# Classification
Now, we choose our "favourite" classification algorithm

In [39]:
from utils.dependencies import *

In [40]:
from sklearn.ensemble import GradientBoostingClassifier

LABELS = ['agree', 'disagree', 'discuss', 'unrelated']
LABELS_RELATED = ['unrelated','related']
RELATED = LABELS[0:3]
#check_version()
#parse_params()
#Load the training dataset and generate folds
d = DataSet("train")
folds,hold_out = kfold_split(d,n_folds=10)
fold_stances, hold_out_stances = get_stances_for_folds(d,folds,hold_out)

# Load the competition dataset
competition_dataset = DataSet("competition_test")
X_competition, y_competition = generate_features(competition_dataset.data, "competition")

training = DataSet()
test = DataSet(name="competition_test")
Xs = dict()
ys = dict()

# Load/Precompute all features now
X_holdout,y_holdout = generate_features(hold_out_stances,"holdout")
for fold in fold_stances:
    Xs[fold],ys[fold] = generate_features(fold_stances[fold],str(fold))

best_score = 0
best_fold = None

# Classifier for each fold
for fold in fold_stances:
    ids = list(range(len(folds)))
    del ids[fold]

    X_train = np.vstack(tuple([Xs[i] for i in ids]))
    y_train = np.hstack(tuple([ys[i] for i in ids]))

    X_test = Xs[fold]
    y_test = ys[fold]

    clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)
    clf.fit(X_train, y_train)

    predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
    actual = [LABELS[int(a)] for a in y_test]

    fold_score, _ = score_submission(actual, predicted)
    max_fold_score, _ = score_submission(actual, actual)

    score = fold_score/float(max_fold_score)

    print("Score for fold "+ str(fold) + " was - " + str(score))
    if score > best_score:
        best_score = score
        best_fold = clf


#Run on Holdout set and report the final score on the holdout set
predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
actual = [LABELS[int(a)] for a in y_holdout]

print("Scores on the dev set")
report_score(actual,predicted)
print("")
print("")

#Run on competition dataset
predicted = [LABELS[int(a)] for a in best_fold.predict(X_competition)]
actual = [LABELS[int(a)] for a in y_competition]

print("Scores on the test set")
report_score(actual,predicted)

Reading dataset in dataset/
Loading files train_bodies.csv, train_stances.csv
Reading dataset in dataset/
Loading files competition_test_bodies.csv, competition_test_stances.csv
Reading dataset in dataset/
Loading files train_bodies.csv, train_stances.csv
Reading dataset in dataset/
Loading files competition_test_bodies.csv, competition_test_stances.csv
      Iter       Train Loss   Remaining Time 
         1       34833.0286            1.18m
         2       31155.3700            1.12m
         3       28188.2279            1.10m
         4       25762.4758            1.08m
         5       23754.6905            1.07m
         6       22083.6100            1.07m
         7       20689.5452            1.06m
         8       19500.9426            1.05m
         9       18515.5698            1.04m
        10       17669.3099            1.03m
        20       13606.0571           57.67s
        30       12451.4618           53.91s
        40       11978.3661           50.42s
        50   

75.13142366698852

In [44]:
#Run on competition dataset
predicted = [LABELS[int(a)] for a in best_fold.predict(X_competition)]
actual = [LABELS[int(a)] for a in y_competition]

print("Scores on the test set")
report_score(actual,predicted)

Scores on the test set
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    205    |    15     |   1395    |    288    |
-------------------------------------------------------------
| disagree  |    52     |    31     |    363    |    251    |
-------------------------------------------------------------
|  discuss  |    252    |    35     |   3494    |    683    |
-------------------------------------------------------------
| unrelated |    13     |     1     |    352    |   17983   |
-------------------------------------------------------------
Score: 8753.75 out of 11651.25	(75.13142366698852%)


75.13142366698852