In [1]:
import pandas as pd
import numpy as np
import string
import re
import requests
import time
import csv
import json

from SPARQLWrapper import SPARQLWrapper, JSON, XML
from bs4 import BeautifulSoup

### Get PolitiFact info

In [2]:
# Read the claims export and drop every row whose organization is 'snopes'.
file = 'Data/c4.csv'
df_claims = pd.read_csv(file, sep=",")
not_snopes = df_claims['organization'].ne('snopes')
df_claims = df_claims.loc[not_snopes]

# Preview the first rows of the filtered frame.
df_claims.head()

Unnamed: 0,id,text,author,date,keywords,organization,rating
0,http://data.gesis.org/claimskg/creative_work/9...,California’s Prop 55 'prevents $4 billion in n...,Yes on 55,03/26/2055,"education,state finances",politifact,MIXTURE
1,http://data.gesis.org/claimskg/creative_work/e...,Says an election clerk in Green Bay refused to...,Our Democracy 2020,03/26/2020,"bipartisanship,city government,education,elect...",politifact,MIXTURE
2,http://data.gesis.org/claimskg/creative_work/0...,Says state schools superintendent candidate Do...,One Wisconsin Now,12/03/2019,"corporations,education,small business,state bu...",politifact,MIXTURE
3,http://data.gesis.org/claimskg/creative_work/f...,"'The Walton family, which owns Wal-Mart, contr...",One Wisconsin Now,11/27/2019,"corporations,economy,income,labor,wealth",politifact,TRUE
4,http://data.gesis.org/claimskg/creative_work/c...,In his first meeting with University of Wiscon...,One Wisconsin Now,11/09/2019,"education,state budget",politifact,FALSE


In [3]:
# Distinct values of the author column, and how many there are.
authors = pd.unique(df_claims["author"])
print(authors.size)

2076


In [22]:
# Turn every string author name into the slug later appended to the PolitiFact
# personalities URL: strip punctuation, lowercase, drop filler words, hyphenate.
# Non-string entries of `authors` are skipped, so `links` can be shorter than `authors`.
translator = str.maketrans('', '', string.punctuation)
FILLER_WORDS = (" on ", " of ", " for ", " a ", " the ")
LEADING_WORDS = ('for ', 'the ')


def _author_to_slug(name):
    """Return the lowercase, hyphen-separated slug for one author name."""
    slug = name.translate(translator).lower()
    # Filler words are removed one after another, in this fixed order.
    for filler in FILLER_WORDS:
        slug = slug.replace(filler, " ")
    # A leading 'for ' is dropped first, then a leading 'the '.
    for lead in LEADING_WORDS:
        if slug.startswith(lead):
            slug = slug[len(lead):]
    return slug.strip().replace(" ", "-")


links = [_author_to_slug(author) for author in authors if type(author) is str]

In [24]:
def _collapse_text(element):
    """Return an element's text with blank runs collapsed, or '' when element is None."""
    if element is None:
        return ''
    text = element.get_text(separator=' ')
    # Strip every line, split it on double spaces, and keep the non-empty pieces.
    phrases = (
        phrase.strip()
        for line in text.splitlines()
        for phrase in line.strip().split("  ")
    )
    return ' '.join(phrase for phrase in phrases if phrase)


def get_description(suffix):
    """Fetch the title and bio text of a PolitiFact personality page.

    Parameters
    ----------
    suffix : str
        Slug appended to ``https://www.politifact.com/personalities/``.

    Returns
    -------
    list or None
        ``[title, description]`` when the page answers with HTTP 200, otherwise
        ``None``. An element that is missing from the page yields an empty
        string instead of raising. A title matching ``Our .* feeds`` is
        replaced by an empty string.

    Raises
    ------
    Whatever ``requests.get`` raises (connection errors, or a timeout after
    30 seconds without a response) is propagated to the caller.
    """
    url = 'https://www.politifact.com/personalities/' + suffix
    # A timeout keeps one unresponsive request from hanging the whole crawl.
    req = requests.get(url, timeout=30)
    if req.status_code != 200:
        return None

    soup = BeautifulSoup(req.text, "lxml")
    # soup.find returns None when the element is absent; _collapse_text maps that to ''.
    description = _collapse_text(soup.find('div', {'class' : 'scorecard__bio-text'}))
    title = _collapse_text(soup.find('h3', {'class' : 'scorecard__title'}))
    if re.search("Our .* feeds", title) is not None:
        title = ''
    return [title, description]

In [25]:
# One entry per author, each starting out as None until a description is scraped.
authors_dict = dict.fromkeys(authors)

In [26]:
# `links` only holds slugs for the string entries of `authors` (non-string
# entries are skipped when the slugs are built), so position i in `links` is not
# necessarily position i in `authors`. Pair every slug with the string author it
# was derived from instead of indexing both sequences with the same counter.
string_authors = [author for author in authors if type(author) is str]
assert len(string_authors) == len(links), "links is out of sync with authors"

for author, link in zip(string_authors, links):
    description = get_description(link)
    if description is not None:
        authors_dict[author] = description
    time.sleep(0.3)  # pause between consecutive requests

In [33]:
#SAVE AS CSV
# One row per author; authors without scraped data get an empty title and description.
header = ["author","title","description"]
rows = [header]

for key, value in authors_dict.items():
    if value is not None:
        rows.append([key, value[0], value[1]])
    else:
        rows.append([key, '', ''])

with open('Data/authors.csv','w',newline='',encoding="utf-8") as writeFile:
    writer = csv.writer(writeFile,delimiter=',')
    writer.writerows(rows)
    # No explicit close(): the with-block closes the file on exit.

### Get Wikipedia info

In [4]:
# Reload the author table (author, title, description) from Data/authors.csv.
file = 'Data/authors.csv'
df_authors = pd.read_csv(file,sep=",")

# Preview the first rows.
df_authors.head()

Unnamed: 0,author,title,description
0,Yes on 55,Organization from California,The Yes on 55 campaign supports Proposition 55...
1,Our Democracy 2020,,Our Democracy 2020 is a Madison-based coalitio...
2,One Wisconsin Now,Organization from Wisconsin,One Wisconsin Now is a liberal advocacy group ...
3,Secure America Now,,"Secure America Now is an independent, non-prof..."
4,Vote NO on 8,None from Florida,The Vote NO on 8 group is urging Florida resid...


In [5]:
file = 'Data/output_full_17072019.csv'
# low_memory=False makes pandas parse each column in one pass, so its dtype is
# inferred from the whole file instead of chunk by chunk.
# NOTE(review): the truncated warning in this cell's saved output looks like the
# tail of a pandas DtypeWarning (mixed types) - confirm on re-run.
df_new = pd.read_csv(file, sep=",", low_memory=False)
df_new.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,Unnamed: 0,claimReview_author,claimReview_author_name,claimReview_author_url,claimReview_claimReviewed,claimReview_datePublished,claimReview_source,claimReview_url,creativeWork_author_name,creativeWork_author_sameAs,...,extra_entities_body,extra_entities_claimReview_claimReviewed,extra_entities_keywords,extra_refered_links,extra_tags,extra_title,rating_alternateName,rating_bestRating,rating_ratingValue,rating_worstRating
0,0,,factcheck_aap,,"120,000 Tasmanians - a quarter of our populati...",2019-07-08,factcheck_aap,https://factcheck.aap.com.au/news-media-claims...,"Anna Reynolds, Lord Mayor of Hobart, and Counc...",,...,"[{""id"" : 170584"",""""begin"": 76,""end"": 88,""entit...","[{""id"" : 3536675"",""""begin"": 36,""end"": 46,""enti...",[],"#h.p_yottUsUom2Qk,https://www.google.com/url?q...",,"Do a quarter of Tasmanians - or 120,000 people...",Ambiguous,,,
1,1,,factcheck_aap,,Over the census period numbers [of homeless pe...,2019-07-09,factcheck_aap,https://factcheck.aap.com.au/news-media-claims...,Assistant Minister for Community Housing and H...,,...,"[{""id"" : 6889"",""""begin"": 23,""end"": 29,""entity""...","[{""id"" : 6889"",""""begin"": 9,""end"": 15,""entity"":...",[],"#h.p_yottUsUom2Qk,https://www.google.com/url?q...",,Has the increase in the number of homeless peo...,Misleading,,,
2,2,,factcheck_aap,,Since 2014 approximately $1.6 billion in healt...,2019-07-03,factcheck_aap,https://factcheck.aap.com.au/news-media-claims...,Independent federal MP Andrew Wilkie.,,...,"[{""id"" : 962277"",""""begin"": 67,""end"": 70,""entit...","[{""id"" : 962277"",""""begin"": 53,""end"": 56,""entit...",[],"#h.p_yottUsUom2Qk,https://www.google.com/url?q...",,Has the Tasmanian government been diverting $1...,True,,,
3,3,,factcheck_aap,,We’ve still got 90 per cent of the state in so...,2019-07-01,factcheck_aap,https://factcheck.aap.com.au/news-media-claims...,NSW Farmers Association CEO Peter Arkle.,,...,"[{""id"" : 37335"",""""begin"": 72,""end"": 79,""entity...","[{""id"" : 37335"",""""begin"": 58,""end"": 65,""entity...",[],"#h.p_yottUsUom2Qk,https://www.google.com/url?q...",,Is 90 per cent of NSW drought affected and 30 ...,Ambiguous,,,
4,4,,factcheck_aap,,"It has been a significant, unusual and sustain...",2019-06-21,factcheck_aap,https://factcheck.aap.com.au/news-media-claims...,WA Tourism Council chief executive Evan Hall.,,...,"[{""id"" : 33613"",""""begin"": 96,""end"": 113,""entit...","[{""id"" : 33613"",""""begin"": 82,""end"": 99,""entity...",[],"#h.p_yottUsUom2Qk,https://www.google.com/url?q...",,Has there been a significant and sustained dow...,Mostly True,,,


In [6]:
# Rows of df_new whose claim author is one of the authors found in df_claims.
# isin() performs the membership test in one vectorised pass instead of scanning
# the `authors` array once per row. notna() keeps rows with a missing author name
# excluded, as they were with the elementwise `in` test (NaN never compares equal).
author_names = df_new['creativeWork_author_name']
is_known_author = author_names.isin(authors) & author_names.notna()
ix = df_new.index[is_known_author].tolist()
df_filtered = df_new.loc[ix]

In [7]:
def load_json(js):
    """Decode an entity-annotation string after repairing its quoting.

    Two literal substitutions run before decoding: the stray quotes around the
    comma that precedes the begin key are removed, then every remaining doubled
    double-quote is turned into a backslash-escaped quote.
    """
    repaired = js.replace('",""begin"', ',"begin"')
    repaired = repaired.replace('""', '\\"')
    return json.loads(repaired)

def get_wikipedia_url(dictionary):
    """Build the English Wikipedia URL for an entity record.

    Every space in the record's 'entity' value becomes an underscore, and the
    result is appended to https://en.wikipedia.org/wiki/.
    """
    page_name = '_'.join(dictionary['entity'].split(' '))
    return "https://en.wikipedia.org/wiki/" + page_name

In [8]:
# Add (or reset) the 'wikipedia' column; an empty string means "no link found".
df_authors['wikipedia'] = ''

In [9]:
# For every author, take the first df_new row carrying that author name and, when
# its author-entity annotation lists exactly one entity, store that entity's
# Wikipedia URL. `i` counts the authors that received a link.
i = 0

for idx in df_authors.index:
    claims = df_new.loc[df_new['creativeWork_author_name'] == df_authors.at[idx,'author']]
    if claims.empty:
        continue
    js = claims['extra_entities_author'].iloc[0]
    # A cell that is not a string (e.g. a blank cell loaded as NaN) has no
    # .strip(); treat it like an empty annotation instead of crashing.
    if not isinstance(js, str) or js.strip() == '[]':
        continue
    entities = load_json(js)
    # Only an unambiguous annotation (exactly one entity) is used.
    if len(entities) == 1:
        df_authors.at[idx,'wikipedia'] = get_wikipedia_url(entities[0])
        i = i + 1

### Author classification

In [10]:
# Add (or reset) the 'type' column; an empty string means "not classified yet".
df_authors['type'] = ''

In [11]:
# Classify each author from its PolitiFact title, falling back to keywords in the
# author name. Authors matching neither keep an empty type.

# (type, title keywords) pairs, checked in this order; title matching is case-sensitive.
TITLE_TYPE_KEYWORDS = (
    ('Democrat', ('Democrat',)),
    ('Republican', ('Republican',)),
    ('Organization', ('Organization',)),
    ('Journalist', ('Columnist', 'Talk show host', 'Newsmaker', 'Journalist')),
    ('Political', ('Libertarian', 'Green', 'Independent', 'Tea Party')),
)
# Substrings of the lowercased author name that mark an organization.
ORGANIZATION_NAME_KEYWORDS = (
    'alliance', 'foundation', 'association', 'union', 'fund', 'united',
    '.org', 'club', 'committee', 'council', 'department', 'party',
)


def _classify_author(title, author):
    """Return the type implied by the title or the author name, '' when neither matches.

    Values that are not strings (such as the NaN of an empty CSV cell) are
    skipped via isinstance rather than compared by identity with np.nan.
    """
    if isinstance(title, str):
        for author_type, keywords in TITLE_TYPE_KEYWORDS:
            if any(keyword in title for keyword in keywords):
                return author_type
    if isinstance(author, str):
        lowered = author.lower()
        if any(keyword in lowered for keyword in ORGANIZATION_NAME_KEYWORDS):
            return 'Organization'
    return ''


for idx in df_authors.index:
    df_authors.at[idx,'type'] = _classify_author(
        df_authors.at[idx,'title'], df_authors.at[idx,'author'])

In [12]:
# Authors that received a type from the PolitiFact title/name heuristic
# (1207 rows in the saved output).
has_type = df_authors['type'] != ''
ix = df_authors.index[has_type].tolist()
df_authors.loc[ix].shape

(1207, 5)

In [13]:
# Authors with a Wikipedia link but no type from the PolitiFact heuristic
# (604 rows in the saved output).
untyped_with_link = (df_authors['type'] == '') & (df_authors['wikipedia'] != '')
ix = df_authors.index[untyped_with_link].tolist()
df_authors.loc[ix].shape

(604, 5)

In [14]:
def get_political_party(entity):
    """Look up the dbo:party of a DBpedia person resource.

    Sends a SPARQL query to http://dbpedia.org/sparql selecting the distinct
    dbo:party values of the resource named ``entity``, which must also be typed
    dbo:Person.

    Returns the party value string when exactly one binding comes back,
    otherwise None.
    """
    query_template = "\n".join([
        "",
        "    prefix dbr: <http://dbpedia.org/resource/>",
        "    prefix dbo: <http://dbpedia.org/ontology/>",
        "",
        "    select distinct ?party where ",
        "    {",
        "    <http://dbpedia.org/resource/%s> dbo:party ?party.",
        "    <http://dbpedia.org/resource/%s> a dbo:Person. ",
        "    } ",
        "    ",
    ])
    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setQuery(query_template % (entity, entity))
    endpoint.setReturnFormat(JSON)
    bindings = endpoint.query().convert()["results"]["bindings"]

    # Zero or several parties are both treated as "unknown".
    if len(bindings) != 1:
        return None
    return bindings[0]["party"]["value"]

In [15]:
# Manual check of the DBpedia party lookup for one entity.
# Another entity name to try: Don_Balfour_(politician)
get_political_party('Barack_Obama')

'http://dbpedia.org/resource/Democratic_Party_(United_States)'

In [16]:
# Use the DBpedia party of each author listed in `ix` as its type.
for i in ix:
    # The entity name is whatever follows the 30-character Wikipedia URL prefix.
    entity = df_authors.at[i,'wikipedia'][len("https://en.wikipedia.org/wiki/"):]
    party = get_political_party(entity)
    if party is None:
        continue

    if 'Democratic' in party:
        party_type = 'Democrat'
    elif 'Republican' in party:
        party_type = 'Republican'
    else:
        # Any other party is recorded as 'Political'.
        party_type = 'Political'
    df_authors.at[i,'type'] = party_type

In [17]:
# Authors with a Wikipedia link that are still untyped after the party lookup
# (428 rows in the saved output).
untyped_with_link = (df_authors['type'] == '') & (df_authors['wikipedia'] != '')
ix = df_authors.index[untyped_with_link].tolist()
df_authors.loc[ix].shape

(428, 5)

In [18]:
# Show the rows selected by `ix` (linked authors that still have no type).
display(df_authors.loc[ix])

Unnamed: 0,author,title,description,wikipedia,type
3,Secure America Now,,"Secure America Now is an independent, non-prof...",https://en.wikipedia.org/wiki/Secure_America_Now,
5,Yes on 8,None from Florida,"""Yes on 8 Florida"" is a group urging residents...",https://en.wikipedia.org/wiki/California_Propo...,
11,Glenn Beck,The PolitiFact scorecard,Glenn Beck is the host of The Glenn Beck Progr...,https://en.wikipedia.org/wiki/Glenn_Beck,
14,Michael M. Crow,,Michael M. Crow is Arizona State University's ...,https://en.wikipedia.org/wiki/Michael_M._Crow,
20,Sean Hannity,The PolitiFact scorecard,Sean Hannity is a conservative political comme...,https://en.wikipedia.org/wiki/Sean_Hannity,
24,Rush Limbaugh,The PolitiFact scorecard,Rush Limbaugh is a conservative radio talk sho...,https://en.wikipedia.org/wiki/Rush_Limbaugh,
31,League of Conservation Voters,,"The Washington, D.C.-based League of Conservat...",https://en.wikipedia.org/wiki/League_of_Conser...,
36,Chain email,The PolitiFact scorecard,"Chain emails circulate on the Internet, usuall...",https://en.wikipedia.org/wiki/Chain_letter,
39,Mike Gallagher,,Republican Mike Gallagher of Green Bay was ele...,https://en.wikipedia.org/wiki/Mike_Gallagher_(...,
59,EmpowerED Georgia,None from Georgia,EmpowerED Georgia is an organization of an est...,https://en.wikipedia.org/wiki/Georgia_(U.S._st...,


In [19]:
#is organization
def is_organization(entity):
    """Ask DBpedia whether the resource named ``entity`` is typed dbo:Organisation.

    Sends a SPARQL ASK query to http://dbpedia.org/sparql and returns the
    'boolean' field of the converted response.
    """
    ask_template = "\n".join([
        "",
        "    prefix dbr: <http://dbpedia.org/resource/>",
        "    prefix dbo: <http://dbpedia.org/ontology/>",
        "",
        "    ask where ",
        "    {",
        "    <http://dbpedia.org/resource/%s> a dbo:Organisation. ",
        "    } ",
        "    ",
    ])
    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setQuery(ask_template % entity)
    endpoint.setReturnFormat(JSON)
    answer = endpoint.query().convert()
    return answer['boolean']

In [20]:
# Mark every author in `ix` whose DBpedia resource is an organisation.
for i in ix:
    # The entity name is whatever follows the 30-character Wikipedia URL prefix.
    entity = df_authors.at[i,'wikipedia'][len("https://en.wikipedia.org/wiki/"):]
    if is_organization(entity):
        df_authors.at[i,'type'] = 'Organization'

In [21]:
# Authors with a Wikipedia link that are still untyped after the organisation
# check (354 rows in the saved output).
untyped_with_link = (df_authors['type'] == '') & (df_authors['wikipedia'] != '')
ix = df_authors.index[untyped_with_link].tolist()
df_authors.loc[ix].shape

(354, 5)

In [22]:
# for i in ix:
#     print(df_authors.at[i,'author'])

In [23]:
#is person
def is_person(entity):
    """Ask DBpedia whether the resource named ``entity`` is typed dbo:Person.

    Sends a SPARQL ASK query to http://dbpedia.org/sparql and returns the
    'boolean' field of the converted response.
    """
    ask_template = "\n".join([
        "",
        "    prefix dbr: <http://dbpedia.org/resource/>",
        "    prefix dbo: <http://dbpedia.org/ontology/>",
        "",
        "    ask where ",
        "    {",
        "    <http://dbpedia.org/resource/%s> a dbo:Person. ",
        "    } ",
        "    ",
    ])
    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setQuery(ask_template % entity)
    endpoint.setReturnFormat(JSON)
    answer = endpoint.query().convert()
    return answer['boolean']

In [24]:
# Mark every author in `ix` whose DBpedia resource is a person.
for i in ix:
    # The entity name is whatever follows the 30-character Wikipedia URL prefix.
    entity = df_authors.at[i,'wikipedia'][len("https://en.wikipedia.org/wiki/"):]
    if is_person(entity):
        df_authors.at[i,'type'] = 'Person'

In [25]:
# Every author that still has no type, whether or not it has a Wikipedia link.
untyped = df_authors['type'] == ''
ix = df_authors.index[untyped].tolist()
df_authors.loc[ix].shape

(393, 5)

In [26]:
# Print name and description of every author in `ix` for manual inspection.
for i in ix:
    author_name = df_authors.at[i,'author']
    author_bio = df_authors.at[i,'description']
    print('entity: %s, description: %s\n' % (author_name, author_bio))

entity: Our Democracy 2020, description: Our Democracy 2020 is a Madison-based coalition that formed in 2015 to push for voting rights and other government and election reforms.

entity: Secure America Now, description: Secure America Now is an independent, non-profit group that claims 2 million members of all political stripes, united behind "policies that will protect our nation against terrorist infiltration, attack and capitulation to our enemies."

entity: Vote NO on 8, description: The Vote NO on 8 group is urging Florida residents to vote against Amendment 8, the class size amendment.

entity: Yes on 8, description: "Yes on 8 Florida" is a group urging residents to vote in favor of Amendment 8, which would change a 2002 amendment about class sizes.

entity: Wisconsin Jobs Now, description: Wisconsin Jobs Now is an advocacy group involved on jobs and education issues. It describes itself as "a non-profit organization committed to fighting income inequality from the bottom up and 

In [28]:
# All authors that ended up with a type (1683 rows in the saved output).
has_type = df_authors['type'] != ''
ix = df_authors.index[has_type].tolist()
df_authors.loc[ix].shape

(1683, 5)

In [30]:
# Export (author, type) for every index label in `ix` to Data/authors_classified.csv.
header = ["author","type"]
rows = [header]
rows.extend([df_authors.at[i,'author'], df_authors.at[i,'type']] for i in ix)

with open('Data/authors_classified.csv','w',newline='',encoding="utf-8") as writeFile:
    writer = csv.writer(writeFile,delimiter=',')
    writer.writerows(rows)
    # No explicit close(): the with-block closes the file on exit.

### Read author dictionary

In [36]:
#Read authors dictionary
import csv

# Map each author name (first column) to its type (second column).
# The file is written with encoding="utf-8", so it is read back with the same
# encoding instead of the platform default.
with open('Data/authors_classified.csv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip header row; the default avoids StopIteration on an empty file.
    authors_dict = {row[0]: row[1] for row in reader}