# Data Prep

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
import string
from tqdm.notebook import tqdm
# Fetch the NLTK stop-word corpus; NLTK reports "already up-to-date" when a local copy exists.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielvangelder/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# All inputs live in one directory; change DATA_DIR to relocate the dataset.
DATA_DIR = "data/fnc-1"

# Competition test split: article bodies and their labelled headline/body stances.
TEST_BODIES_PATH = f"{DATA_DIR}/competition_test_bodies.csv"
TEST_STANCES_PATH = f"{DATA_DIR}/competition_test_stances.csv"
# Training split, same two-file layout as the test split.
TRAIN_BODIES_PATH = f"{DATA_DIR}/train_bodies.csv"
TRAIN_STANCES_PATH = f"{DATA_DIR}/train_stances.csv"
# Model outputs that are compared against the true test stances.
ALBERT_PREDICTIONS = f"{DATA_DIR}/golden_labels_2.csv"
BASELINE_PREDICTIONS = f"{DATA_DIR}/baseline_output.csv"



In [3]:
def create_merged_df(bodies_path, stances_path):
    """Load a bodies CSV and a stances CSV and join them on 'Body ID'.

    The first row of each file is treated as a header and replaced by fixed
    column names, so the files' own header spelling does not matter.

    Parameters
    ----------
    bodies_path : path to a CSV with two columns (body id, article text).
    stances_path : path to a CSV with three columns (headline, body id, stance).

    Returns
    -------
    pandas.DataFrame with columns 'Body ID', 'articleBody', 'Headline' and
    'Stance'; one row per stance whose body id is present in the bodies file
    (inner join).
    """
    body_columns = ['Body ID', 'articleBody']
    stance_columns = ['Headline', 'Body ID', 'Stance']
    body_frame = pd.read_csv(bodies_path, header=0, names=body_columns)
    stance_frame = pd.read_csv(stances_path, header=0, names=stance_columns)
    return body_frame.merge(stance_frame, on='Body ID')


# A prediction row is identified by its (headline, body id) pair.
prediction_keys = ['Headline', 'Body ID']

# Both prediction files share the stance-file layout; the baseline's stance
# column is named 'Stance_baseline' right away so it cannot clash on merge.
albert = pd.read_csv(ALBERT_PREDICTIONS, header=0, names=prediction_keys + ['Stance'])
baseline = pd.read_csv(BASELINE_PREDICTIONS, header=0, names=prediction_keys + ['Stance_baseline'])

# Test pairs with the true stance ('Stance_true') next to both models'
# predictions ('Stance_albert', 'Stance_baseline').
test_res = (
    create_merged_df(TEST_BODIES_PATH, TEST_STANCES_PATH)
    .merge(albert, on=prediction_keys, suffixes=['_true', '_albert'])
    .merge(baseline, on=prediction_keys)
)
train = create_merged_df(TRAIN_BODIES_PATH, TRAIN_STANCES_PATH)
# display(test_res)
# display(train)

In [5]:
# Per-headline accuracy of ALBERT vs. the baseline, restricted to pairs whose
# true stance is not 'unrelated'.
test_res_rel = test_res.loc[test_res['Stance_true'] != 'unrelated']

# One row per (headline, body) pair with a boolean per model: did it predict
# the true stance?
correct = test_res_rel.assign(
    correct_albert=test_res_rel['Stance_true'] == test_res_rel['Stance_albert'],
    correct_base=test_res_rel['Stance_true'] == test_res_rel['Stance_baseline'],
)[['articleBody', 'Headline', 'Stance_true', 'correct_albert', 'correct_base']]

# Summing the booleans gives the number of correct predictions per headline.
correct_count = (
    correct[['Headline', 'correct_albert', 'correct_base']]
    .groupby(['Headline'])
    .sum()
    .sort_values('correct_albert', ascending=False)
    .reset_index()
)

# Number of related pairs per headline, counted in one pass instead of
# re-filtering the frame once per headline.
rel_headlines = set(test_res_rel['Headline'].values)
pair_count = test_res_rel['Headline'].value_counts().to_dict()
pairs_per_headline = correct_count['Headline'].map(pair_count)

# Column meanings:
#   missed_*   - related pairs the model got wrong
#   total      - related pairs for the headline
#   prop_*     - fraction of related pairs the model got right
#   difference - ALBERT correct minus baseline correct (positive: ALBERT ahead)
grouped_res = correct_count.assign(
    missed_albert=pairs_per_headline - correct_count['correct_albert'],
    missed_base=pairs_per_headline - correct_count['correct_base'],
    total=pairs_per_headline,
    prop_albert=correct_count['correct_albert'] / pairs_per_headline,
    prop_base=correct_count['correct_base'] / pairs_per_headline,
    difference=correct_count['correct_albert'] - correct_count['correct_base'],
)
display(grouped_res)
# display(grouped_res.loc[grouped_res['total'] > 20].sort_values('prop_albert', ascending=False))



Unnamed: 0,Headline,correct_albert,correct_base,missed_albert,missed_base,total,prop_albert,prop_base,difference
0,"Source: Joan Rivers' doc did biopsy, selfie",126,193,75,8,201,0.626866,0.960199,-67
1,Charles Manson’s fiancee allegedly wanted to m...,81,81,0,0,81,1.000000,1.000000,0
2,Joan Rivers Personal Doctor Allegedly Took A S...,55,82,35,8,90,0.611111,0.911111,-27
3,Disturbed aunt cuts off nephew’s penis after h...,41,41,16,16,57,0.719298,0.719298,0
4,Joan Rivers’ Doctor Snapped Selfie During Thro...,36,36,0,0,36,1.000000,1.000000,0
...,...,...,...,...,...,...,...,...,...
888,Son's Asinine Questions Inspire Priceless Dad-...,0,1,1,0,1,0.000000,1.000000,-1
889,Here's What We Know About ISIS's Alleged Organ...,0,6,7,1,7,0.000000,0.857143,-6
890,9 Things You Need To Know About The Climate Ch...,0,2,5,3,5,0.000000,0.400000,-2
891,Heart specialist questions benefits of high-in...,0,0,4,4,4,0.000000,0.000000,0


In [14]:
# Split headlines by ALBERT's per-headline accuracy: more than 99% of related
# pairs correct ("success") vs. fewer than 1% correct ("failed").
albert_accuracy = grouped_res['prop_albert']
headlines_success = grouped_res.loc[albert_accuracy > 0.99, 'Headline'].values
headlines_failed = grouped_res.loc[albert_accuracy < 0.01, 'Headline'].values

# Topic Modelling

In [8]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Fetch the WordNet corpus — presumably required by the WordNetLemmatizer used in lemmatize_stemming; confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/danielvangelder/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [60]:
# Filled by preprocess(): maps each stem to the most recently seen original
# token that produced it, so stems can be shown as readable words later.
token_dict = {}

stemmer = SnowballStemmer('english')
# Built once and reused; previously a new lemmatizer was created for every token.
lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the result.

    Parameters
    ----------
    text : str
        One token (not a whole sentence).

    Returns
    -------
    str
        The Snowball (English) stem of the verb-lemmatized token.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))
def preprocess(text):
    """Tokenize a text and return its list of lemmatized, stemmed tokens.

    Tokens that are gensim stop words or have three characters or fewer are
    dropped. As a side effect, the module-level ``token_dict`` is updated so
    that each returned stem maps to the original token it was derived from
    (a later token with the same stem overwrites an earlier one).

    Parameters
    ----------
    text : str
        Raw text, e.g. a headline.

    Returns
    -------
    list of str
        Stems in order of appearance.
    """
    stopword_set = gensim.parsing.preprocessing.STOPWORDS
    stems = []
    for raw_token in gensim.utils.simple_preprocess(text):
        if len(raw_token) <= 3 or raw_token in stopword_set:
            continue
        stem = lemmatize_stemming(raw_token)
        stems.append(stem)
        token_dict[stem] = raw_token
    return stems

In [61]:
# Sanity check: show one headline before and after preprocessing.
doc_sample = headlines_success[0]
words = doc_sample.split(' ')  # naive whitespace split, for comparison only
print('original document: ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Charles', 'Manson’s', 'fiancee', 'allegedly', 'wanted', 'to', 'marry', 'him', 'for', 'his', 'corpse']


 tokenized and lemmatized document: 
['charl', 'manson', 'fiance', 'alleg', 'want', 'marri', 'corps']


In [62]:
# Token lists for both headline groups (success first, so token_dict is
# filled in the same order as before).
headlines_success_proc = [preprocess(headline) for headline in headlines_success]
headlines_failed_proc = [preprocess(headline) for headline in headlines_failed]

In [63]:
# Spot-check one headline from each group next to its processed tokens.
for raw_headlines, processed_headlines in (
    (headlines_success, headlines_success_proc),
    (headlines_failed, headlines_failed_proc),
):
    print(raw_headlines[5], processed_headlines[5])

Journalist Steven Sotloff reportedly executed by ISIS ['journalist', 'steven', 'sotloff', 'report', 'execut', 'isi']
‘You Were My Guy’: Trump Tells Macron He Supported Him During French Election ['trump', 'tell', 'macron', 'support', 'french', 'elect']


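A very crude tokenizer indeed: stemming mangles names (e.g. "ISIS" becomes "isi"), and stop words and tokens of three characters or fewer (e.g. "Guy") are dropped entirely.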

In [64]:
def _dictionary_and_bow(tokenized_docs):
    """Build a pruned gensim dictionary and the matching bag-of-words corpus.

    The dictionary is filtered with no_below=3, no_above=0.5 and
    keep_n=100000 before the documents are converted with doc2bow.

    Returns
    -------
    (gensim.corpora.Dictionary, list)
        The filtered dictionary and one doc2bow result per input document.
    """
    vocabulary = gensim.corpora.Dictionary(tokenized_docs)
    vocabulary.filter_extremes(no_below=3, no_above=0.5, keep_n=100000)
    return vocabulary, [vocabulary.doc2bow(doc) for doc in tokenized_docs]


# Each headline group gets its own vocabulary and corpus.
dictionary_success, bow_corpus_success = _dictionary_and_bow(headlines_success_proc)
dictionary_failed, bow_corpus_failed = _dictionary_and_bow(headlines_failed_proc)

In [65]:
# Inspect the bag-of-words entries of one "success" headline.
t = 99
for word_id, occurrences in bow_corpus_success[t]:
    print("Word {} (\"{}\") appears {} time.".format(
        word_id, dictionary_success[word_id], occurrences))

Word 17 ("kill") appears 1 time.
Word 20 ("isi") appears 1 time.
Word 54 ("baghdadi") appears 1 time.
Word 55 ("bakr") appears 1 time.
Word 57 ("leader") appears 1 time.
Word 59 ("airstrik") appears 1 time.
Word 61 ("wound") appears 1 time.


In [66]:
# LDA training is stochastic. A fixed seed keeps the topics comparable across
# kernel restarts; with several workers the result may still not be
# bit-for-bit identical.
LDA_NUM_TOPICS = 10
LDA_RANDOM_STATE = 42

print("Topic modelling success...")
# The misspelled name `lda_model_sucess` is kept because later cells use it.
lda_model_sucess = gensim.models.LdaMulticore(
    bow_corpus_success,
    num_topics=LDA_NUM_TOPICS,
    id2word=dictionary_success,
    passes=2,
    workers=4,
    random_state=LDA_RANDOM_STATE,
)
print("Topic modelling failed...")
lda_model_failed = gensim.models.LdaMulticore(
    bow_corpus_failed,
    num_topics=LDA_NUM_TOPICS,
    id2word=dictionary_failed,
    passes=2,
    workers=4,
    random_state=LDA_RANDOM_STATE,
)
print("DONE")

Topic modelling success...
Topic modelling failed...
DONE


In [68]:
# Print the 10 top terms of each of the 10 "success" topics, translating each
# stem back to an original token through token_dict.
# (lda_model_sucess.print_topics(-1) shows the raw stems with their weights.)
for topic_id in range(10):
    top_terms = lda_model_sucess.show_topic(topic_id, topn=10)
    print([token_dict[stem] for stem, _weight in top_terms])

['note', 'suicide', 'transgender', 'woman', 'teen', 'blaming', 'parents', 'christian', 'madonna', 'voters']
['saudi', 'penis', 'acid', 'woman', 'arabia', 'airline', 'boyfriend', 'girl', 'pours', 'tape']
['apple', 'watch', 'iphone', 'inch', 'launch', 'rumors', 'health', 'features', 'march', 'christmas']
['afghan', 'missing', 'reports', 'soldiers', 'border', 'canadian', 'bakr', 'baghdadi', 'killed', 'apple']
['jihadi', 'isis', 'airstrikes', 'john', 'reports', 'injured', 'nevada', 'tesla', 'year', 'letter']
['reports', 'apple', 'blackberry', 'lenovo', 'rivers', 'joan', 'selfie', 'doctor', 'watch', 'corpse']
['reports', 'split', 'companies', 'attacked', 'wounded', 'selfie', 'doctor', 'rivers', 'joan', 'packard']
['reports', 'isis', 'million', 'sell', 'sotloff', 'beheads', 'apple', 'steven', 'foley', 'james']
['claims', 'apple', 'fighters', 'watch', 'display', 'screens', 'supplying', 'samsung', 'photos', 'rumors']
['reports', 'apple', 'plan', 'music', 'expansion', 'beats', 'vegas', 'isis', 

In [69]:
# Print the 10 top terms of each of the 10 "failed" topics, translating each
# stem back to an original token through token_dict.
# (lda_model_failed.print_topics(-1) shows the raw stems with their weights.)
for topic_id in range(10):
    top_terms = lda_model_failed.show_topic(topic_id, topn=10)
    print([token_dict[stem] for stem, _weight in top_terms])

['lost', 'year', 'billed', 'missing', 'birthday', 'friend', 'party', 'ferguson', 'shot', 'woman']
['climate', 'change', 'hackers', 'sony', 'reports', 'hoax', 'stolen', 'beheads', 'isis', 'claims']
['threatened', 'kushner', 'isis', 'families', 'syria', 'channel', 'denies', 'wanted', 'state', 'wedding']
['airstrikes', 'president', 'intensity', 'killed', 'reports', 'jihadi', 'argentina', 'werewolf', 'isis', 'russian']
['penis', 'woman', 'reports', 'isis', 'claims', 'getting', 'stolen', 'boob', 'iraqi', 'year']
['women', 'plan', 'apple', 'separate', 'birthday', 'billed', 'party', 'missing', 'friend', 'photos']
['woman', 'breast', 'surgery', 'state', 'watch', 'apple', 'claims', 'weapons', 'told', 'denies']
['isis', 'organ', 'harvesting', 'wedding', 'operation', 'boob', 'finance', 'iraqi', 'fake', 'accuses']
['stop', 'saturated', 'heart', 'wikileaks', 'rich', 'seth', 'streets', 'weapons', 'claws', 'plan']
['killed', 'reports', 'revealed', 'woman', 'fake', 'breast', 'buried', 'grave', 'zombie