In [1]:
!unzip results_fulltext.zip

Archive:  results_fulltext.zip
  inflating: fulltext_1nn.json       
  inflating: fulltext_in_batch.json  
  inflating: fulltext_knn.json       
  inflating: fulltext_mst.json       


In [2]:
import json

In [3]:
with open('fulltext_mst.json') as f:
  results = json.load(f)

In [None]:
results.keys()

dict_keys(['n_entities', 'n_mentions', 'knn_mentions', 'accuracy', 'failure', 'success'])

In [None]:
results['n_mentions']

5202

In [None]:
results['knn_mentions']

2

In [None]:
results['accuracy']

'16.782006920415224 %'

In [4]:
success = results['success']
failure = results['failure']

In [None]:
success[0]

{'mention_gold_cui': '272',
 'mention_gold_cui_name': 'PolitiFact Prior to Benghazi were there 13 attacks on embassies and 60 deaths under President George W Bush',
 'mention_id': 791407718267514880,
 'mention_name': 'remember when person said we didnt lose a single person in libya yeah so do i hashtag url hashtag',
 'predicted_cui': '272',
 'predicted_name': 'PolitiFact Prior to Benghazi were there 13 attacks on embassies and 60 deaths under President George W Bush'}

In [None]:
len(success)

873

In [None]:
# Check how many success and failure are Politifact or Snopes
snopes_success_count = sum([1 if 'Snopes' in x['mention_gold_cui_name'] else 0 for x in success])
snopes_failure_count = sum([1 if 'Snopes' in x['mention_gold_cui_name'] else 0 for x in failure])
politifact_success_count = sum([1 if 'PolitiFact' in x['mention_gold_cui_name'] else 0 for x in success])
politifact_failure_count = sum([1 if 'PolitiFact' in x['mention_gold_cui_name'] else 0 for x in failure])

In [None]:
print("Snopes Success Rate: %.3f" % (snopes_success_count/(snopes_success_count+snopes_failure_count)))
print("PolitiFact Success Rate: %.3f" % (politifact_success_count/(politifact_success_count+politifact_failure_count)))

Snopes Success Rate: 0.144
PolitiFact Success Rate: 0.139


In [None]:
success_cuis = list(set([x['mention_gold_cui'] for x in success]))
total_cuis = list(set([x['mention_gold_cui'] for x in success])) + list(set([x['mention_gold_cui'] for x in failure]))
print("No. of unique CUIs succesfully predicted: %d" % len(success_cuis))
print("Total no. of unique CUIs: %d" % len(total_cuis))

No. of unique CUIs succesfully predicted: 343
Total no. of unique CUIs: 1576


In [None]:
success_titles = list(set([x['mention_gold_cui_name'] for x in success]))
failure_titles = list(set([x['mention_gold_cui_name'] for x in failure]))
failure_mentions = [x['mention_name'] for x in failure]
success_mentions = [x['mention_name'] for x in success]

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize.casual import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
nltk.download('wordnet')
wnl = WordNetLemmatizer()
tokenizer = TweetTokenizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
success_mentions_lemmatized = [" ".join([wnl.lemmatize(x) for x in tokenizer.tokenize(y)]) for y in success_mentions]
failure_mentions_lemmatized = [" ".join([wnl.lemmatize(x) for x in tokenizer.tokenize(y)]) for y in failure_mentions]

In [None]:
vectorizer = CountVectorizer(stop_words='english')
success_features = vectorizer.fit_transform(success_mentions_lemmatized)
failure_features = vectorizer.fit_transform(failure_mentions_lemmatized)

In [None]:
topic_model = LatentDirichletAllocation()

<WordListCorpusReader in '.../corpora/stopwords' (not loaded yet)>

In [None]:
success[20:50]

[{'mention_gold_cui': '279',
  'mention_gold_cui_name': 'PolitiFact Did Obama administration approve bump stocks',
  'mention_id': 966694930268667904,
  'mention_name': 'user he listened to them and announced he wats to ban bump stocks that person made legal up the age to own a rifle to number and stronger background checks more security at schools what have libs done nothing',
  'predicted_cui': '279',
  'predicted_name': 'PolitiFact Did Obama administration approve bump stocks'},
 {'mention_gold_cui': '279',
  'mention_gold_cui_name': 'PolitiFact Did Obama administration approve bump stocks',
  'mention_id': 967166231588540416,
  'mention_name': 'of course we should ban bump stocks it should have been done a long time ago governor of ohio user on hashtag url',
  'predicted_cui': '279',
  'predicted_name': 'PolitiFact Did Obama administration approve bump stocks'},
 {'mention_gold_cui': '279',
  'mention_gold_cui_name': 'PolitiFact Did Obama administration approve bump stocks',
  'men

In [None]:
failure[20:50]

[{'mention_gold_cui': '2816',
  'mention_gold_cui_name': 'FACT CHECK Maxine Waters Fears a Trumpled War After Nuclear Attack by North Korea',
  'mention_id': 923574207304404992,
  'mention_name': 'user taking things out of context dhe fears nk nuking america she is then saying in a way its person who is going number make that happen',
  'predicted_cui': '272',
  'predicted_name': 'PolitiFact Prior to Benghazi were there 13 attacks on embassies and 60 deaths under President George W Bush'},
 {'mention_gold_cui': '155',
  'mention_gold_cui_name': 'FACT CHECK Hillary Clinton Gave 20 Percent of United States Uranium to Russia in Exchange for Clinton Foundation Donations',
  'mention_id': 986961901245657088,
  'mention_name': 'i love when people say person had no scandals what do they call fast and furious uranium number benghazi irs targeting nsa spying these scandals are real scandals unlike washed up porn stars looking to get rich off of falsely accusing president person',
  'predicted_c

In [None]:
from collections import Counter

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

In [None]:
success_titles = [x['mention_gold_cui_name'] for x in success]
c = Counter(success_titles)

In [None]:
c.most_common(10)

[('FACT CHECK Hillary Clinton Gave 20 Percent of United States Uranium to Russia in Exchange for Clinton Foundation Donations',
  165),
 ('PolitiFact David Hogg not at school during shooting Bloggers spread misinformation',
  27),
 ('FACT CHECK Did Democrats Refuse to Stand for a Navy SEALs Widow', 20),
 ('Are Donald Trumps Make America Great Again Caps Made in China Snopescom',
  15),
 ('PolitiFact Factchecking Donald Trumps claim Hillary Clinton started Obama birther movement',
  15),
 ('PolitiFact Mostly False 18 US school shootings so far in 2018 and 18 in rest of the world over past 20 years',
  13),
 ('PolitiFact No the 9th Circuit isnt the most overturned court in the country as Hannity says',
  11),
 ('PolitiFact Fake news posts blame Puerto Ricos truck drivers for refusing to ship relief supplies',
  11),
 ('Was the Texas Church Shooter an Antifa Member Who Vowed to Start Civil War Snopescom',
  10),
 ('No American Flags Present at the Democratic National Convention Snopescom'

In [None]:
failure_titles = [x['mention_gold_cui_name'] for x in failure]
c = Counter(failure_titles)

In [None]:
c.most_common(10)

[('FACT CHECK Hillary Clinton Gave 20 Percent of United States Uranium to Russia in Exchange for Clinton Foundation Donations',
  195),
 ('Hillary Clinton Kissed by Former Klan Member Snopescom', 73),
 ('PolitiFact Did Hillary Clinton start the Obama birther movement', 68),
 ('Is Green Party Candidate Jill Stein AntiVaccine Snopescom', 65),
 ('Donald Trump Condoned Flag Burning Snopescom', 50),
 ('Did Hillary Clinton Say Democratic Voters Are Stupid Snopescom', 45),
 ('PolitiFact Why comparing Trumps and Obamas immigration restrictions is flawed',
  40),
 ('PolitiFact Unfit to serve Congressman isnt even real', 39),
 ('Obama Encouraged Illegal Aliens to Vote Snopescom', 38),
 ('PolitiFact In a nuclear claim Donald Trump says Hillary Clinton gave up onefifth of US uranium to Russia',
  35)]

In [None]:
total_titles = success_titles + failure_titles
total_counts = Counter(total_titles)

In [None]:
new_dict = {k:c[k]/total_counts[k] for k in c.keys() if k in total_counts}

In [None]:
len(c.keys())

343

In [None]:
len(total_counts.keys())

1376

In [None]:
often_correct = [x[0] for x in sorted(new_dict.items(), key=lambda item: item[1], reverse=True) if x[1] > 0.5]

In [None]:
rarely_correct = [x[0] for x in sorted(new_dict.items(), key=lambda item: item[1], reverse=False) if x[1] < 0.3]

In [None]:
never_correct = [x[0] for x in total_counts.items() if x[0] not in c.keys()]

In [None]:
len(never_correct)

1033

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])

In [None]:
often_correct_titles = " ".join(often_correct)

In [None]:
doc_correct = nlp(often_correct_titles)

In [None]:
entities = []
for ent in doc.ents:
  entities.extend(ent.text.lower().split(' '))

In [None]:
success_entities = Counter(entities)

In [None]:
num_entities = sum(success_entities.values())

In [None]:
for key in success_entities:
  success_entities[key] /= num_entities

In [None]:
success_entities.most_common(20)

[('trump', 0.04128440366972477),
 ('snopescom', 0.034403669724770644),
 ('donald', 0.02981651376146789),
 ('politifact', 0.027522935779816515),
 ('the', 0.027522935779816515),
 ('trumps', 0.013761467889908258),
 ('us', 0.013761467889908258),
 ('a', 0.011467889908256881),
 ('hillary', 0.011467889908256881),
 ('clinton', 0.011467889908256881),
 ('of', 0.009174311926605505),
 ('false', 0.009174311926605505),
 ('white', 0.009174311926605505),
 ('obama', 0.006880733944954129),
 ('fact', 0.006880733944954129),
 ('iran', 0.006880733944954129),
 ('first', 0.006880733944954129),
 ('united', 0.006880733944954129),
 ('house', 0.006880733944954129),
 ('china', 0.006880733944954129)]

In [None]:
never_correct_titles = " ".join(never_correct)
doc = nlp(never_correct_titles)
entities = []
for ent in doc.ents:
  entities.extend(ent.text.lower().split(' '))

In [None]:
failure_entities = Counter(entities)

In [None]:
num_entities = sum(failure_entities.values())

In [None]:
for key in failure_entities:
  failure_entities[key] /= num_entities

In [None]:
failure_entities.most_common(20)

[('politifact', 0.04521845401772075),
 ('trump', 0.041857622975863124),
 ('clinton', 0.03635808127100519),
 ('donald', 0.03299725022914757),
 ('snopescom', 0.03238619003971891),
 ('hillary', 0.026886648334860985),
 ('the', 0.026886648334860985),
 ('trumps', 0.012832263978001834),
 ('obama', 0.011304613504430187),
 ('us', 0.010388023220287198),
 ('a', 0.008860372746715552),
 ('check', 0.007332722273143905),
 ('sanders', 0.006721662083715246),
 ('fact', 0.0058050717995722576),
 ('bernie', 0.0058050717995722576),
 ('democrats', 0.005499541704857928),
 ('of', 0.005499541704857928),
 ('gop', 0.00488848151542927),
 ('john', 0.00458295142071494),
 ('million', 0.00458295142071494)]

In [None]:
with open('fulltext_in_batch.json') as f:
  results = json.load(f)

In [None]:
predicted_cui = [x['predicted_cui'] for x in results['failure']] + [x['predicted_cui'] for x in results['success']]

In [None]:
Counter(predicted_cui).most_common(30)

[('1163', 119),
 ('155', 88),
 ('2885', 43),
 ('1064', 34),
 ('123', 31),
 ('2118', 28),
 ('2600', 28),
 ('1074', 28),
 ('1040', 25),
 ('2822', 24),
 ('1047', 23),
 ('1090', 22),
 ('44', 21),
 ('3008', 19),
 ('2594', 18),
 ('2922', 18),
 ('3163', 18),
 ('168', 17),
 ('57', 17),
 ('23', 17),
 ('1182', 17),
 ('1592', 17),
 ('1207', 16),
 ('1304', 16),
 ('1479', 16),
 ('93', 15),
 ('1244', 15),
 ('1268', 15),
 ('1449', 15),
 ('3391', 15)]

In [None]:
predicted_cui = [x['predicted_cui'] for x in failure] + [x['predicted_cui'] for x in success]

In [None]:
Counter(predicted_cui).most_common(30)

[('155', 196),
 ('1308', 164),
 ('1545', 136),
 ('1064', 127),
 ('235', 102),
 ('2015', 96),
 ('365', 82),
 ('1280', 82),
 ('3398', 79),
 ('280', 75),
 ('2487', 73),
 ('1557', 66),
 ('311', 62),
 ('1527', 61),
 ('1719', 59),
 ('1779', 59),
 ('2882', 59),
 ('1162', 56),
 ('1268', 56),
 ('1114', 54),
 ('2885', 54),
 ('1215', 50),
 ('2428', 50),
 ('746', 46),
 ('1040', 46),
 ('1157', 46),
 ('1922', 45),
 ('1269', 43),
 ('1352', 43),
 ('2146', 43)]

In [None]:
doc_correct = nlp(often_correct_titles)
doc_incorrect = nlp(never_correct_titles)

In [None]:
success_mentions_lemmatized = " ".join([" ".join([wnl.lemmatize(x).lower() for x in tokenizer.tokenize(y)]) for y in success_mentions])
failure_mentions_lemmatized = " ".join([" ".join([wnl.lemmatize(x).lower() for x in tokenizer.tokenize(y)]) for y in failure_mentions])

In [None]:
success_titles_lemmatized = " ".join([" ".join([wnl.lemmatize(x).lower() for x in tokenizer.tokenize(y)]) for y in success_titles])
failure_titles_lemmatized = " ".join([" ".join([wnl.lemmatize(x).lower() for x in tokenizer.tokenize(y)]) for y in failure_titles])

In [None]:
word_overlaps = []
for x in success:
  mention = x['mention_name']
  fact_title = x['mention_gold_cui_name']
  mention_lemmatized = set([wnl.lemmatize(x).lower() for x in tokenizer.tokenize(mention)])
  fact_title_lemmatized = set([wnl.lemmatize(x).lower() for x in tokenizer.tokenize(fact_title)])
  word_overlap = len(mention_lemmatized.intersection(fact_title_lemmatized))/len(mention_lemmatized.union(fact_title_lemmatized))
  word_overlaps.append(word_overlap)

In [None]:
word_overlaps_failure = []
for x in failure:
  mention = x['mention_name']
  fact_title = x['mention_gold_cui_name']
  mention_lemmatized = set([wnl.lemmatize(x).lower() for x in tokenizer.tokenize(mention)])
  fact_title_lemmatized = set([wnl.lemmatize(x).lower() for x in tokenizer.tokenize(fact_title)])
  word_overlap = len(mention_lemmatized.intersection(fact_title_lemmatized))/len(mention_lemmatized.union(fact_title_lemmatized))
  word_overlaps_failure.append(word_overlap)

In [None]:
sum(word_overlaps)/len(word_overlaps)

0.12509481152426433

In [None]:
sum(word_overlaps_failure)/len(word_overlaps_failure)

0.04257807016520341

In [None]:
doc_success = nlp(success_mentions_lemmatized)
doc_failure = nlp(failure_mentions_lemmatized)

In [None]:
len(doc_success.ents)/len(doc_success)

0.05809956757955428

In [None]:
len(doc_failure.ents)/len(doc_failure)

0.036761195454979474

In [5]:
import pandas as pd

In [9]:
df = pd.DataFrame.from_dict(failure)

In [12]:
pd.set_option('display.max_colwidth', None)

In [16]:
pd.set_option('display.max_rows', None)

In [31]:
df.iloc[[1051,3224,1031,200,1547,899], :]

Unnamed: 0,mention_id,mention_name,mention_gold_cui,mention_gold_cui_name,predicted_name,predicted_cui
1051,773677160322117632,user originated from clintons campaign tho iirc,1047,PolitiFact Did Hillary Clinton start the Obama birther movement,PolitiFact Factchecking Donald Trumps claim Hillary Clinton started Obama birther movement,1064
3224,843948028050509824,user psychologist know the diff btw make believe proof too url,57,Did Hillary Clinton Say Democratic Voters Are Stupid Snopescom,New Study Shows Liberals Have a Lower Average IQ Than Conservatives Snopescom,2104
1031,835956212076249088,user the birther movement was started by hillarys campaign during the number primaries you should blame hillary,44,Did Clinton Supporters Start the Birther Movement Snopescom,PolitiFact Factchecking Donald Trumps claim Hillary Clinton started Obama birther movement,1064
200,837051377868124160,user popular vote tells you where the majority of americans are no mandate,93,Did Trump Win 3084 of 3141 Counties in 2016 While Clinton Won Only 57 Snopescom,PolitiFact Mike Pence says Donald Trump won most counties by a Republican since Ronald Reagan,311
1547,855905954407120896,user nothing was more calming than when he smoothly told everyone about giving number in cash to the mullahs of iran but hey its cool right,1311,PolitiFact No Donald Trump we are not giving Iran 150 billion for nothing,PolitiFact Donald Trumps Mostly False claim that 400 million payment to Iran was ransom,1269
899,845913265515352064,user person doesnt lie about russia cnn does it was hillary that sold putin number our uranium research it,1163,PolitiFact In a nuclear claim Donald Trump says Hillary Clinton gave up onefifth of US uranium to Russia,FACT CHECK Hillary Clinton Gave 20 Percent of United States Uranium to Russia in Exchange for Clinton Foundation Donations,155


In [32]:
df.sample(frac=0.3)[['mention_gold_cui_name', 'predicted_name']]

Unnamed: 0,mention_gold_cui_name,predicted_name
3728,FACT CHECK Did 58 Scientific Papers Published in 2017 Say Global Warming is a Myth,FACT CHECK PeerReviewed Study Proves All Recent Global Warming Fabricated by Climatologists
1050,PolitiFact Did Hillary Clinton start the Obama birther movement,PolitiFact Factchecking Donald Trumps claim Hillary Clinton started Obama birther movement
680,PolitiFact Richard Cordray told Ohio board to give permit to progun group for rally Kucinich says,PolitiFact Mark Pocan claim Seven years later Scott Walker has not met his jobs promise from first campaign
1206,PolitiFact State Sen Stephen Martin says Democratic Party created the Ku Klux Klan,FACT CHECK Did Adolf Hitler Say That Nazis Are Mortal Enemies of the Present Capitalist Economic System
1857,Is Green Party Candidate Jill Stein AntiVaccine Snopescom,Was a Woman Arrested for Making Fur Coats with the Neighbors Cats Snopescom
1864,FACT CHECK Did Tomi Lahren Say That Victims of Mass Shootings Dont Believe in God Enough,Was a Woman Arrested for Making Fur Coats with the Neighbors Cats Snopescom
1490,FACT CHECK Did Puerto Ricos Teamsters Union Go on Strike During Hurricane Maria Relief Efforts,PolitiFact Fake news posts blame Puerto Ricos truck drivers for refusing to ship relief supplies
1585,PolitiFact Headlines that say GOP bill makes sexual assault a preexisting condition are misleading,Did CocaCola Invent the Modern Image of Santa Claus Snopescom
3667,Bernie Sanders 1972 Essay on Rape Snopescom,Slutwalk Image Repurposed as Womens March Photograph Snopescom
493,FACT CHECK Did Trump Tweet in 2015 That the President Should Be Shot Out of a Cannon if the Dow Joans Tanks,Russian Billboard Features Trump and Putin Snopescom
