### Imports

In [1]:
import pickle
import pandas as pd
import numpy as np
import collections
import nltk

from rdflib import Graph
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from matplotlib import pyplot as plt
from sklearn.preprocessing import normalize

from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

from sklearn.metrics.pairwise import cosine_similarity

#nltk.download('wordnet')

### Load Data

In [2]:
# Load the pickled ClaimsKG graph that every query helper below reads via the global `g`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load a pickle that comes from a trusted source.
# The context manager closes the file handle as soon as the graph is loaded.
with open("Data/ClaimsKG.pkl", 'rb') as graph_file:
    g = pickle.load(graph_file)

### Lemmatization

In [3]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing `word`.

    The word is tagged in isolation with nltk.pos_tag and the first letter of
    the resulting tag picks the constant: J -> adjective, V -> verb,
    R -> adverb. Nouns and every other tag fall back to noun.
    """
    pos_tag = nltk.pos_tag([word])[0][1]
    initial = pos_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and any unrecognised tag are both treated as nouns.
    return wordnet.NOUN

def lemmatize_sentence(sentence):
    """Lemmatize every token of `sentence` and return the normalized text.

    Each token from nltk.word_tokenize is lemmatized with the part of speech
    chosen by get_wordnet_pos. The lemmas are joined with single spaces, then
    the result is lower-cased and stripped of surrounding whitespace.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in nltk.word_tokenize(sentence):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    joined = " ".join(lemmas)
    return joined.lower().strip()

### Get Vocabulary

In [4]:
def get_claims():
    """Query the global graph `g` for every claim and its keywords.

    Returns the result of g.query for distinct (?claim, ?keywords) pairs,
    where ?claim is typed schema:CreativeWork and has a schema:keywords value.
    """
    sparql = """PREFIX schema: <http://schema.org/>
       PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

       SELECT DISTINCT ?claim ?keywords WHERE {
           ?claim a schema:CreativeWork.
           ?claim schema:keywords ?keywords
       }"""
    return g.query(sparql)

def get_claims_with_authors():
    """Query the global graph `g` for claims, their keywords and reviewer names.

    Returns the result of g.query for distinct (?claim, ?keywords, ?author)
    rows, where ?author is the schema:name of the schema:author of a claim
    review whose schema:itemReviewed is the claim.
    """
    sparql = """PREFIX schema: <http://schema.org/>
       PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
       SELECT DISTINCT ?claim ?keywords ?author WHERE {
            ?claimReview schema:itemReviewed ?claim.
            ?claimReview schema:author ?org.
            ?org schema:name ?author.
            ?claim a schema:CreativeWork.
            ?claim schema:keywords ?keywords
       }"""
    return g.query(sparql)

def get_claims_with_ratings():
    """Query the global graph `g` for claims, their keywords and rating names.

    Returns the result of g.query for distinct (?claim, ?keywords,
    ?ratingName) rows. Only ratings whose schema:author is the claimskg
    organization are matched; ?ratingName is the rating's
    schema:alternateName.
    """
    sparql = """PREFIX schema: <http://schema.org/>
       PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

       SELECT DISTINCT ?claim ?keywords ?ratingName WHERE {
           ?claimReview schema:itemReviewed ?claim.
           ?claimReview schema:reviewRating ?rating.
           ?rating schema:author <http://data.gesis.org/claimskg/organization/claimskg>.
	       ?rating schema:alternateName ?ratingName.
	       ?claim a schema:CreativeWork.
	       ?claim schema:keywords ?keywords
    }"""
    return g.query(sparql)

In [5]:
# Count how many times each lemmatized keyword appears across all claim rows.
qres = get_claims()

keywordDict = {}
for claim_row in qres:
    # Keywords are stored as one comma-separated string per claim.
    for raw_keyword in claim_row.keywords.split(','):
        lemma = lemmatize_sentence(raw_keyword)
        keywordDict[lemma] = keywordDict.get(lemma, 0) + 1

print("There's a total of %d keywords after lemmatizing" % len(keywordDict))

There's a total of 8883 keywords after lemmatizing


In [6]:
# Minimum number of occurrences a keyword needs in order to be kept
n_occurrences = 50

# Order the keywords by ascending occurrence count and keep only those
# that reach the threshold
sorted_dict = collections.OrderedDict(
    (keyword, count)
    for keyword, count in sorted(keywordDict.items(), key=lambda item: item[1])
    if count >= n_occurrences
)

keywordList = list(sorted_dict.keys())
print("There's a total of %d keywords after removing keywords that occur in less than %d claims" % (len(sorted_dict),n_occurrences))

There's a total of 159 keywords after removing keywords that occur in less than 50 claims


In [7]:
# List every retained keyword with its occurrence count, least frequent first.
for keyword_and_count in sorted_dict.items():
    print("%s: %d" % keyword_and_count)

reader suggestion: 50
tourism: 50
employment: 51
sona: 51
space: 51
statistic: 52
melania trump: 52
gun: 53
satire: 53
ebola: 53
fact check: 53
trump: 53
cpc: 54
recreation: 54
russia: 54
woman: 55
alcohol: 55
message machine 2014: 56
racism: 57
the last line of defense: 58
your news wire: 59
gun control: 59
election: 59
america 's last line of defense: 59
pensions: 60
conspiracy theory: 60
fauxtography: 62
jacob zuma: 62
state government spending: 63
president trump: 63
something for nothing: 63
weather: 64
oil spill: 65
television: 66
afghanistan: 67
pop culture: 67
crusader habit: 67
radio & tv: 68
animals: 69
county budget: 71
israel: 74
county government: 76
viral video: 77
bernie sander: 77
new hampshire 2012: 77
marriage: 77
human rights: 77
debates: 81
sexuality: 82
natural phenomena: 82
music: 84
development: 86
wealth: 88
population: 88
congressional rules: 88
movies: 91
infrastructure: 91
hoax: 91
china: 93
florida: 96
welfare: 97
world news daily report: 101
muslim: 101
sma

In [8]:
def get_keyword_provenance(sorted_dict):
    """Map each keyword to the names of the reviewers whose claims use it.

    Args:
        sorted_dict: Iterable of lemmatized keywords (e.g. a dict keyed by
            keyword). Only these keywords are tracked.

    Returns:
        dict: keyword -> ", "-separated author names in the order they were
        first encountered, or None when no row mentions the keyword.
    """
    authors_by_keyword = {keyword: [] for keyword in sorted_dict}

    for row in get_claims_with_authors():
        for raw_keyword in row.keywords.split(','):
            seen_authors = authors_by_keyword.get(lemmatize_sentence(raw_keyword))
            if seen_authors is None:
                # Keyword is not one of the tracked ones.
                continue
            author_name = row.author.value
            # Compare whole names. A substring search on the joined string
            # would wrongly skip an author whose name is contained in one
            # that is already recorded.
            if author_name not in seen_authors:
                seen_authors.append(author_name)

    return {
        keyword: ', '.join(names) if names else None
        for keyword, names in authors_by_keyword.items()
    }

In [9]:
# For every retained keyword, show which reviewers published claims using it.
organization_dict = get_keyword_provenance(sorted_dict)

for keyword_and_sources in organization_dict.items():
    print("%s: %s" % keyword_and_sources)

reader suggestion: africacheck
tourism: africacheck, politifact, snopes
employment: africacheck
sona: africacheck
space: politifact, snopes
statistic: africacheck
melania trump: snopes
gun: snopes, africacheck
satire: snopes
ebola: snopes, politifact, africacheck
fact check: snopes
trump: snopes
cpc: factscan
recreation: politifact
russia: snopes
woman: africacheck, snopes
alcohol: politifact, africacheck, snopes
message machine 2014: politifact
racism: snopes
the last line of defense: snopes
your news wire: snopes
gun control: snopes
election: africacheck, snopes, factscan
america 's last line of defense: snopes
pensions: politifact
conspiracy theory: snopes
fauxtography: snopes
jacob zuma: africacheck, snopes
state government spending: politifact
president trump: snopes
something for nothing: snopes
weather: politifact, snopes
oil spill: politifact
television: snopes
afghanistan: politifact, snopes
pop culture: politifact
crusader habit: snopes
radio & tv: snopes
animals: politifact


# Co-occurrence Clustering

### Build Matrix

In [10]:
# Keyword-by-keyword co-occurrence counts: matrix[i, j] is incremented once
# for every claim row in which keywordList[i] and keywordList[j] both appear.
n = len(keywordList)
matrix = np.zeros((n,n))

# Position of each keyword in keywordList, built once so the loops below do
# not scan the list for every lookup. setdefault keeps the first position,
# which is what keywordList.index() would return.
keyword_index = dict()
for position, keyword in enumerate(keywordList):
    keyword_index.setdefault(keyword, position)

qres = get_claims()

for row in qres:
    keywords = [lemmatize_sentence(x) for x in row.keywords.split(',')]
    # Keywords that did not pass the frequency filter have no row/column.
    indices = [keyword_index[k] for k in keywords if k in keyword_index]
    for i1 in indices:
        for i2 in indices:
            # Skip pairing a keyword with itself; the diagonal stays zero.
            if i1 != i2:
                matrix[i1, i2] += 1

In [11]:
# Persist the co-occurrence matrix with the keywords as the CSV header row.
# A forward slash is used, as in the load cell: "Data\matrix.csv" contained the
# invalid escape sequence "\m" and only worked as a directory path on Windows.
np.savetxt("Data/matrix.csv", matrix, delimiter=",", header=",".join(keywordList),comments="")

### KMeans Clustering

In [12]:
def get_clusters(labels,keywordList):
    """Group keywords by cluster label.

    Args:
        labels: Sequence of cluster labels, one per keyword; labels that are
            not greater than -1 are ignored.
        keywordList: Keywords, positionally aligned with `labels`.

    Returns:
        dict: cluster label -> ", "-separated string of the keywords carrying
        that label, in their original order.
    """
    members = dict()
    for position, label in enumerate(labels):
        if label > -1:
            members.setdefault(label, []).append(keywordList[position])
    return {label: ", ".join(words) for label, words in members.items()}

def get_cluster_distribution(clusters):
    """Tally, per cluster, how its keywords are distributed over claim ratings.

    Args:
        clusters: dict mapping the cluster indices 0..len(clusters)-1 to a
            comma-separated keyword string (the shape get_clusters returns).

    Returns:
        dict: cluster index -> {"TRUE", "FALSE", "MIXTURE", "OTHER", "TOTAL"}
        counters. A counter is bumped once for every keyword of every rated
        claim row that belongs to the cluster, so a row with several keywords
        in one cluster is counted several times.

    Raises:
        KeyError: if a row carries a rating name other than the four above.
    """
    cluster_ids = range(0, len(clusters))
    cluster_dist = {
        x: {"TRUE": 0, "FALSE": 0, "MIXTURE": 0, "OTHER": 0, "TOTAL": 0}
        for x in cluster_ids
    }

    # Parse every cluster's keyword string once, instead of re-splitting it
    # for each keyword of each claim.
    cluster_keywords = {
        x: {kw.strip() for kw in clusters[x].split(',')} for x in cluster_ids
    }

    for row in get_claims_with_ratings():
        rating = row.ratingName
        for raw_keyword in row.keywords.split(','):
            keyword = lemmatize_sentence(raw_keyword)
            for x in cluster_ids:
                if keyword in cluster_keywords[x]:
                    cluster_dist[x]["TOTAL"] += 1
                    cluster_dist[x][rating.value] += 1
    return cluster_dist

def print_clusters(clusters, cluster_dist):
    """Print each cluster's rating counters followed by its keywords.

    Args:
        clusters: dict mapping the indices 0..len(clusters)-1 to the
            cluster's keyword string.
        cluster_dist: dict mapping the same indices to a dict with the
            "TOTAL", "TRUE", "FALSE", "MIXTURE" and "OTHER" counters.
    """
    header = "Cluster %d (Total: %d, True: %d, False: %d, Mixture: %d, Other: %d)"
    counter_order = ("TOTAL", "TRUE", "FALSE", "MIXTURE", "OTHER")

    for cluster_id in range(len(clusters)):
        stats = cluster_dist[cluster_id]
        counts = tuple(stats[label] for label in counter_order)
        print(header % ((cluster_id,) + counts))
        print("{ %s } \n" % clusters[cluster_id])
        
def get_claim_count(clusters):
    """Count, per cluster, how often its keywords occur across all claims.

    Args:
        clusters: dict mapping the cluster indices 0..len(clusters)-1 to a
            comma-separated keyword string (the shape get_clusters returns).

    Returns:
        dict: cluster index -> number of (claim row, keyword) pairs whose
        keyword belongs to the cluster. A row with several keywords in the
        same cluster contributes once per keyword.
    """
    cluster_ids = range(0, len(clusters))
    count = {x: 0 for x in cluster_ids}

    # Parse every cluster's keyword string once, instead of re-splitting it
    # for each keyword of each claim.
    cluster_keywords = {
        x: {kw.strip() for kw in clusters[x].split(',')} for x in cluster_ids
    }

    for row in get_claims():
        for raw_keyword in row.keywords.split(','):
            keyword = lemmatize_sentence(raw_keyword)
            for x in cluster_ids:
                if keyword in cluster_keywords[x]:
                    count[x] += 1
    return count

In [13]:
# K-means on the rows of the co-occurrence matrix (one row per keyword).
# random_state is fixed because k-means starts from randomly chosen centroids;
# without it the clusters printed below differ on every run.
kmeans = KMeans(n_clusters=20, random_state=42).fit(matrix)
clusters = get_clusters(kmeans.labels_,keywordList)
clust_dist = get_cluster_distribution(clusters)
print_clusters(clusters,clust_dist)

Cluster 0 (Total: 2751, True: 387, False: 678, Mixture: 1640, Other: 46)
{ debt, corporations, stimulus, income, labor, deficit, poverty, message machine 2012, states, job accomplishments } 

Cluster 1 (Total: 4986, True: 722, False: 1353, Mixture: 2820, Other: 91)
{ small business, retirement, unions, drugs, medicaid, families, regulation, city government, social security, government efficiency, message machine 2010, pundits, government regulation, voting record, homeland security, children, women, congress, corrections and updates, transportation, abortion } 

Cluster 2 (Total: 1714, True: 297, False: 372, Mixture: 975, Other: 70)
{ economy } 

Cluster 3 (Total: 1037, True: 157, False: 225, Mixture: 634, Other: 21)
{ jobs } 

Cluster 4 (Total: 979, True: 141, False: 237, Mixture: 586, Other: 15)
{ state budget } 

Cluster 5 (Total: 3310, True: 456, False: 1897, Mixture: 326, Other: 631)
{ asp article } 

Cluster 6 (Total: 1719, True: 186, False: 539, Mixture: 973, Other: 21)
{ health

### Ward Hierarchical Clustering

In [14]:
# Agglomerative clustering (scikit-learn's default linkage is Ward) on the
# rows of the co-occurrence matrix, cut at 20 clusters.
ward = AgglomerativeClustering(n_clusters=20)
ward.fit(matrix)

clusters = get_clusters(ward.labels_, keywordList)
clust_dist = get_cluster_distribution(clusters)
print_clusters(clusters, clust_dist)

  return linkage(y, method='ward', metric='euclidean')


Cluster 0 (Total: 3534, True: 471, False: 986, Mixture: 2020, Other: 57)
{ debt, corporations, government efficiency, stimulus, message machine 2010, pundits, government regulation, voting record, deficit, message machine 2012, congress, corrections and updates, transportation } 

Cluster 1 (Total: 11358, True: 1397, False: 4705, Mixture: 4261, Other: 995)
{ reader suggestion, tourism, employment, sona, space, statistic, melania trump, gun, satire, ebola, trump, cpc, recreation, russia, woman, alcohol, message machine 2014, racism, the last line of defense, your news wire, gun control, election, america 's last line of defense, pensions, conspiracy theory, fauxtography, jacob zuma, state government spending, president trump, weather, oil spill, afghanistan, pop culture, animals, county budget, israel, county government, viral video, bernie sander, new hampshire 2012, marriage, human rights, debates, sexuality, development, wealth, population, congressional rules, infrastructure, china,

### Cosine similarity

In [15]:
# Pairwise cosine similarity between the keyword rows of the co-occurrence
# matrix; with a single argument the rows are compared against themselves.
cs_matrix = cosine_similarity(matrix)

In [16]:
# Ward with Cosine Similarity Matrix
# The similarity rows are passed to fit() as ordinary feature vectors.
ward = AgglomerativeClustering(n_clusters=20)
ward.fit(cs_matrix)

clusters = get_clusters(ward.labels_, keywordList)
clust_dist = get_cluster_distribution(clusters)
print_clusters(clusters, clust_dist)

Cluster 0 (Total: 1064, True: 174, False: 292, Mixture: 565, Other: 33)
{ recreation, animals, infrastructure, science, agriculture, housing, regulation, government regulation } 

Cluster 1 (Total: 5777, True: 881, False: 1327, Mixture: 3391, Other: 178)
{ pensions, state government spending, county budget, county government, wealth, retirement, unions, city budget, city government, labor, state finances, transportation, state budget, education, economy } 

Cluster 2 (Total: 413, True: 42, False: 271, Mixture: 68, Other: 32)
{ space, ebola, fact check, television, radio & tv, facebook } 

Cluster 3 (Total: 953, True: 131, False: 313, Mixture: 77, Other: 432)
{ reader suggestion, tourism, employment, sona, statistic, woman, election, jacob zuma, development, government, water, health } 

Cluster 4 (Total: 1490, True: 231, False: 387, Mixture: 849, Other: 23)
{ sexuality, population, diversity, children, women, legal issues } 

Cluster 5 (Total: 3484, True: 468, False: 960, Mixture: 1990

In [17]:
#Kmeans with Cosine Similarity
# random_state is fixed because k-means starts from randomly chosen centroids;
# without it the clusters printed below differ on every run.
kmeans = KMeans(n_clusters=20, random_state=42).fit(cs_matrix)
clusters = get_clusters(kmeans.labels_,keywordList)
clust_dist = get_cluster_distribution(clusters)
print_clusters(clusters,clust_dist)

Cluster 0 (Total: 3605, True: 501, False: 1055, Mixture: 1992, Other: 57)
{ population, technology, diversity, polls and public opinion, government regulation, children, women, congress, corrections and updates, legal issues, candidate biography } 

Cluster 1 (Total: 1324, True: 144, False: 752, Mixture: 286, Other: 142)
{ your news wire, fauxtography, bernie sander, world news daily report, muslim, donald trump } 

Cluster 2 (Total: 2821, True: 433, False: 789, Mixture: 1523, Other: 76)
{ alcohol, sexuality, marijuana, drugs, civil rights, supreme court, public safety, criminal justice, guns, crime } 

Cluster 3 (Total: 424, True: 85, False: 116, Mixture: 161, Other: 62)
{ tourism, development, agriculture, housing } 

Cluster 4 (Total: 1581, True: 209, False: 362, Mixture: 967, Other: 43)
{ financial regulation, trade, stimulus, jobs } 

Cluster 5 (Total: 1227, True: 137, False: 819, Mixture: 131, Other: 140)
{ something for nothing, crusader habit, natural phenomena, music, movies, 

# Occurrence Clustering

### Build matrix

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

def dummy_tokenizer(docs):
    """Return the argument unchanged.

    Passed to TfidfVectorizer as both `tokenizer` and `preprocessor` so that
    documents which are already lists of keyword tokens are used as they are.
    """
    return docs

def tokenize_docs():
    """Build one token list per claim row from its comma-separated keywords.

    Returns:
        list: for every row returned by get_claims(), the list of its
        keywords after lemmatize_sentence has been applied to each.
    """
    return [
        [lemmatize_sentence(raw_keyword) for raw_keyword in row.keywords.split(',')]
        for row in get_claims()
    ]

In [19]:
# Vectorization: TF-IDF weights per (claim, keyword). The documents are
# already token lists, so the tokenizer and preprocessor hooks are identity
# functions; min_df=50 drops keywords found in fewer than 50 documents.
docs = tokenize_docs()
vectorizer = TfidfVectorizer(
    min_df=50,
    tokenizer=dummy_tokenizer,
    preprocessor=dummy_tokenizer,
)
doc_kw = vectorizer.fit_transform(docs)

# Flip the axes: keywords become the data points, documents the features
kw_doc = doc_kw.transpose()

# Vocabulary ordered by column index, so vlist[i] names row i of kw_doc
vocab = collections.OrderedDict(
    sorted(vectorizer.vocabulary_.items(), key=lambda term_and_column: term_and_column[1])
)
vlist = list(vocab.keys())

In [20]:
# Agglomerative clustering of the keyword rows of the TF-IDF matrix into 20
# clusters; the sparse matrix is converted to a dense array before fitting.
dense = kw_doc.toarray()
ward = AgglomerativeClustering(n_clusters=20)
ward.fit(dense)

clusters = get_clusters(ward.labels_, vlist)
clust_dist = get_cluster_distribution(clusters)
print_clusters(clusters, clust_dist)

Cluster 0 (Total: 21159, True: 2849, False: 7144, Mixture: 9949, Other: 1217)
{ afghanistan, agriculture, alcohol, america 's last line of defense, animal, animals, barack obama, bernie sander, bipartisanship, campaign finance, children, china, city budget, city government, civil rights, climate change, congress, congressional rules, conspiracy theory, corporations, corrections and updates, county budget, county government, cpc, criminal justice, crusader habit, debates, debt, deficit, development, diversity, drugs, ebola, election, election 2016, employment, ethics, facebook, fact check, families, fauxtography, financial regulation, florida, gays and lesbians, government, government efficiency, government regulation, gun, gun control, health, hillary clinton, hoax, homeland security, housing, human rights, income, infrastructure, iraq, islam, israel, jacob zuma, job accomplishments, labor, legal issues, marijuana, marriage, medicaid, medicare, melania trump, message machine 2010, mess

In [21]:
# Same agglomerative clustering of the TF-IDF keyword rows, cut at 5 clusters.
dense = kw_doc.toarray()
ward = AgglomerativeClustering(n_clusters=5)
ward.fit(dense)

clusters = get_clusters(ward.labels_, vlist)
clust_dist = get_cluster_distribution(clusters)
print_clusters(clusters, clust_dist)

Cluster 0 (Total: 33361, True: 4453, False: 11396, Mixture: 15867, Other: 1645)
{ abortion, afghanistan, agriculture, alcohol, america 's last line of defense, animal, animals, barack obama, bernie sander, bipartisanship, campaign finance, candidate biography, children, china, city budget, city government, civil rights, climate change, congress, congressional rules, conspiracy theory, corporations, corrections and updates, county budget, county government, cpc, crime, criminal justice, crusader habit, debates, debt, deficit, development, diversity, donald trump, drugs, ebola, education, election, election 2016, elections, employment, energy, environment, ethics, facebook, fact check, fake news, families, fauxtography, federal budget, financial regulation, florida, foreign policy, gays and lesbians, government, government efficiency, government regulation, gun, gun control, guns, health, hillary clinton, history, hoax, homeland security, housing, human rights, immigration, income, infra