In [37]:
import pickle
import nltk
import csv

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from rdflib import Graph

### Load Data

In [1]:
# Build an empty RDF graph and populate it from the ClaimsKG Turtle file.
g = Graph()
g.parse(source="Data/claimskg_v1.0.ttl", format="turtle")

6606074

### Save graph as PKL

In [5]:
# Serialize the parsed graph to disk so it can be reloaded with pickle later
# instead of parsing the Turtle file again.
filename = 'Data/ClaimsKG.pkl'
# Open the target inside a context manager: the handle is flushed and closed
# as soon as the dump finishes (or fails), rather than being left dangling.
with open(filename, 'wb') as pkl_file:
    pickle.dump(g, pkl_file)

### Load PKL

In [2]:
# Restore the previously pickled graph.
# NOTE: unpickling can execute arbitrary code, so only load a file that was
# produced locally by this notebook.
# The context manager closes the file handle once the load completes.
with open("Data/ClaimsKG.pkl", 'rb') as pkl_file:
    g2 = pickle.load(pkl_file)

### Execute Query

In [8]:
# Ask the graph for up to 100 distinct keyword strings attached to
# schema:CreativeWork claims.
qres = g2.query(
    """PREFIX schema: <http://schema.org/>
       PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

       SELECT DISTINCT ?keywords WHERE {
          ?claim a schema:CreativeWork.
          ?claim schema:keywords ?keywords
       } LIMIT 100""")

# The SELECT projects a single variable, so every row has exactly one entry;
# print that entry on its own line.
for row in qres:
    print(row[0])

ASP Article, barack obama
Health Care,Immigration,Public Health
Children
Energy,Environment
Corporations,Economy,Gays and Lesbians,State Finances,States
Economy,Workers
Bankruptcy,Debt,Federal Budget,Financial Regulation,Retirement
Crime,Criminal Justice,Economy,Sexuality
Income,Small Business,Taxes
People
Elections,Immigration,Redistricting
Federal Budget,Government Efficiency
Guns
ASP Article, Not Necessarily The News
Job Accomplishments,Stimulus
Candidate Biography,Legal Issues
fast food rumors, food, italy, mcdonald's
Economy,Gays and Lesbians,Sexuality,States
Economy,Jobs,Workers
Elections
Education,Poverty,State Budget,State government spending ,Taxes,Urban
Elections,Transparency
Crime
ASP Article
Debates,Elections
Candidate Biography,Pundits
emergency alert system, false flag, fcc, FEMA, hoboken train crash, viral videos
barack obama, national anthem
Crime,Jobs
Trade
cancer, health, reader suggestion, vaccination
Health Care
Candidate Biography,Corrections and Updates
ASP Articl

### Lemmatize keywords and save as CSV

In [21]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is inspected: J -> adjective, N -> noun,
    V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    _, pos_tag = nltk.pos_tag([word])[0]
    initial = pos_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are both treated as nouns.
    return wordnet.NOUN

def lemmatize_sentence(sentence):
    """Lemmatize each token of ``sentence`` and return one normalized string.

    The text is split with ``nltk.word_tokenize``; every token is lemmatized
    using the part of speech reported by ``get_wordnet_pos``. The lemmas are
    joined with single spaces, then lower-cased and stripped of surrounding
    whitespace.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in nltk.word_tokenize(sentence):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    joined = " ".join(lemmas)
    return joined.lower().strip()

In [54]:
def get_claims(g):
    """Run the claim-export SPARQL query against ``g`` and return its result.

    Each result row binds ?claim, ?text, ?date, ?keywords, ?orgName and
    ?ratingName. ?date sits in an OPTIONAL clause, so it may be unbound.
    Only ratings authored by the claimskg organization are matched, and rows
    are ordered by date (descending) and then by claim (ascending).

    ``g`` must provide a ``query(str)`` method; whatever that method returns
    is handed back unchanged.
    """
    claims_query = """
    PREFIX schema: <http://schema.org/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT DISTINCT ?claim ?text ?date ?keywords ?orgName ?ratingName WHERE {
        ?claimReview schema:itemReviewed ?claim.
        ?claimReview schema:reviewRating ?rating.
        ?claimReview schema:author ?org.
        ?org schema:name ?orgName.
        ?rating schema:author <http://data.gesis.org/claimskg/organization/claimskg>.
        ?rating schema:alternateName ?ratingName.
        ?claim a schema:CreativeWork.
        ?claim schema:keywords ?keywords.
        ?claim schema:text ?text
        OPTIONAL {?claim schema:datePublished ?date}.
    } ORDER BY DESC (?date) ASC(?claim)
    """
    return g.query(claims_query)

In [55]:
# Run the claim-export query against the graph restored from the pickle file.
qres = get_claims(g2)

In [57]:
# Flatten the query result into CSV rows: the header first, then one row per
# result binding.
header = ["uri","date","claim","keywords","organization","rating"]
rows = [header]

for row in qres:
    # ?date comes from an OPTIONAL clause in the query, so it can be missing.
    # Test with `is None` (identity) rather than `== None`, which would go
    # through the term's own equality method.
    if row.date is None:
        date = ''
    else:
        date = row.date.value.strftime("%m/%d/%Y")

    # Lemmatize each comma-separated keyword independently, then re-join.
    keywords = ",".join(lemmatize_sentence(x) for x in row.keywords.split(','))

    rows.append([
        row.claim,
        date,
        row.text.value,
        keywords,
        row.orgName.value,
        row.ratingName.value,
    ])

# newline='' leaves line-ending handling to the csv module. The with-block
# closes the file on exit, so no explicit close() call is needed.
with open('Data/data.csv','w',newline='',encoding="utf-8") as writeFile:
    writer = csv.writer(writeFile,delimiter=';')
    writer.writerows(rows)