# Importing packages and loading file

In [3]:
import os
import pandas as pd
from tqdm import tqdm

import spacy
# Load the "en_core_web_sm" spaCy pipeline once; it is reused below to find named entities.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Relative location of the labelled news CSV, assembled from its path components
# so the separator matches the operating system.
path_parts = ("..", "data", "labelled_data", "fake_or_real_news.csv")
file = os.path.join(*path_parts)

In [5]:
# Read the labelled news CSV into a DataFrame; its "label" and "text" columns are used below.
data = pd.read_csv(file)

In [6]:
# Texts of the rows labelled "REAL", selected in a single .loc step
# (row mask + column). Despite the name, the result is a Series of texts.
real_df = data.loc[data["label"] == "REAL", "text"]

# Extract entities -> DON'T RUN BELOW

In [7]:
# One list of PERSON entity strings per post, in the same order as real_df.
post_entities = []

########## TAKES A LONG TIME!

# Stream the posts through spaCy with nlp.pipe, which processes texts in batches
# instead of calling nlp() once per post. tqdm needs `total` because nlp.pipe
# returns a generator with no length.
for doc in tqdm(nlp.pipe(real_df), total=len(real_df)):
    # keep the text of every named entity labelled PERSON (repeated mentions are kept)
    post_entities.append(
        [entity.text for entity in doc.ents if entity.label_ == "PERSON"]
    )

100%|██████████| 3171/3171 [18:06<00:00,  2.92it/s]


In [24]:
# Inspect the PERSON entities extracted from the first post.
post_entities[0]

['John F. Kerry',
 'Kerry',
 'Laurent Fabius',
 'Francois Hollande',
 'Kerry',
 'Obama',
 'Kerry',
 'Obama',
 'Kerry',
 'Benjamin Netanyahu',
 'Jane Hartley',
 'Victoria Nuland',
 'Eric H. Holder Jr.',
 'Kerry',
 'Narendra Modi',
 'Kerry']

# Extract edgelists using itertools.combinations

In [13]:
from itertools import combinations

In [14]:
# Flat list of co-occurrence edges over all documents in post_entities.
# Every pair of entity mentions within a document becomes one edge, stored in
# alphabetical order so that (A, B) and (B, A) are the same edge.
# Repeated mentions are not deduplicated, so a name mentioned several times
# also produces pairs of that name with itself.
edgelist = [
    tuple(sorted(pair))
    for entities in post_entities
    for pair in combinations(entities, 2)
]

In [15]:
# Example of what is done above: combinations(items, 2) yields every possible
# pair of items, which is what is computed within each document.
list(combinations([1,2,3,4,5],2))

[(1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (2, 3),
 (2, 4),
 (2, 5),
 (3, 4),
 (3, 5),
 (4, 5)]

In [16]:
# Inspect the first 10 edges.
edgelist[:10]

[('John F. Kerry', 'Kerry'),
 ('John F. Kerry', 'Laurent Fabius'),
 ('Francois Hollande', 'John F. Kerry'),
 ('John F. Kerry', 'Kerry'),
 ('John F. Kerry', 'Obama'),
 ('John F. Kerry', 'Kerry'),
 ('John F. Kerry', 'Obama'),
 ('John F. Kerry', 'Kerry'),
 ('Benjamin Netanyahu', 'John F. Kerry'),
 ('Jane Hartley', 'John F. Kerry')]

In [17]:
len(edgelist) # ~1.5 million edges (duplicates included)

1508000

# Counting edges

In [18]:
from collections import Counter

In [29]:
Counter(edgelist).most_common(5) # return the 5 most common edges

[(('Clinton', 'Clinton'), 74059),
 (('Clinton', 'Trump'), 30544),
 (('Trump', 'Trump'), 21864),
 (('Clinton', 'Hillary Clinton'), 14208),
 (('Clinton', 'Obama'), 14138)]

In [20]:
# Collapse duplicate edges into (nodeA, nodeB, weight) triples, where weight is
# the number of times that pair occurs in edgelist.
counted_edges = [
    (source, target, count)
    for (source, target), count in Counter(edgelist).items()
]

In [34]:
# Inspect the first 3 weighted edges.
counted_edges[:3]

[('John F. Kerry', 'Kerry', 22),
 ('John F. Kerry', 'Laurent Fabius', 2),
 ('Francois Hollande', 'John F. Kerry', 1)]

In [23]:
# Number of distinct edges after counting duplicates.
len(counted_edges)

286424

# Create dataframe

In [37]:
# Tabulate the weighted edges: one row per distinct pair, with its count as "weight".
edge_columns = ["nodeA", "nodeB", "weight"]
edges_df = pd.DataFrame(data=counted_edges, columns=edge_columns)

In [39]:
# Inspect 5 randomly chosen rows (no random_state is given, so the rows differ between runs).
edges_df.sample(5)

Unnamed: 0,nodeA,nodeB,weight
168836,Daniel Webster,Elizabeth Warren,2
282749,Bill Clinton,Plan A.,3
193870,Khalid Sheikh Mohammed,Valerie Jarrett,1
157215,Josh,Mark Berman,1
99546,Barack Obama,Bob Livingston,6


In [46]:
# Keep only the heaviest edges. The filter is computed once, stored, and then
# displayed, so the frame shown is exactly the frame used to build the graph.
MIN_EDGE_WEIGHT = 8000  # exclusive lower bound on an edge's weight
filtered_df = edges_df[edges_df["weight"] > MIN_EDGE_WEIGHT]
filtered_df

            nodeA            nodeB  weight
31          Obama            Obama    9438
52        Clinton  Hillary Clinton   14208
55   Donald Trump            Trump    9218
59        Clinton     Donald Trump    9172
65        Clinton            Trump   30544
66          Trump            Trump   21864
67           Cruz            Trump    9975
81        Clinton          Clinton   74059
84           Cruz             Cruz   10206
148       Clinton            Obama   14138
412          Bush             Bush   12400


NameError: name 'filtered_df' is not defined

In [51]:
import networkx as nx
import matplotlib.pyplot as plt

In [52]:
# Build a graph from the filtered edge table: nodeA/nodeB are the endpoints and
# "weight" is carried along as an edge attribute.
G = nx.from_pandas_edgelist(
    filtered_df,
    source="nodeA",
    target="nodeB",
    edge_attr=["weight"],
)

# Plotting with pygraphviz doesn't work on Windows

In [None]:
# Use this instead:
# https://networkx.org/documentation/stable/reference/drawing.html
# Or matplotlib

# Centrality measures for finding important nodes

In [56]:
# Compute two centrality scores for every node in G; each call returns a dict
# keyed by node name.
# NOTE(review): no `weight` argument is passed, so the "weight" edge attribute is
# presumably ignored by both measures; confirm whether weighted centrality is intended.
bc_metric = nx.betweenness_centrality(G)
ev_metric = nx.eigenvector_centrality(G)

In [58]:
# Betweenness centrality per node.
bc_metric

{'Obama': 0.0,
 'Clinton': 0.4666666666666667,
 'Hillary Clinton': 0.0,
 'Donald Trump': 0.0,
 'Trump': 0.26666666666666666,
 'Cruz': 0.0,
 'Bush': 0.0}

In [62]:
# Eigenvector centrality per node.
ev_metric

{'Obama': 0.27866589297905636,
 'Clinton': 0.6217238696493992,
 'Hillary Clinton': 0.1924203532943302,
 'Donald Trump': 0.3634417992887106,
 'Trump': 0.5525806832969451,
 'Cruz': 0.24767681483282325,
 'Bush': 2.753248468007037e-06}

In [83]:
# One row per node with its betweenness centrality, built column by column
# from the keys and values of bc_metric.
importance_df = pd.DataFrame(
    {
        "node": list(bc_metric.keys()),
        "betweenness": list(bc_metric.values()),
    }
)

In [84]:
# Look up each node's eigenvector centrality by node name rather than by position,
# so the values stay aligned with the "node" column even if bc_metric and ev_metric
# are not ordered identically. A node missing from ev_metric would get NaN.
importance_df["eigenvector"] = importance_df["node"].map(ev_metric)

In [85]:
# Show both centrality measures side by side for every node.
importance_df

Unnamed: 0,node,betweenness,eigenvector
0,Obama,0.0,0.278666
1,Clinton,0.466667,0.621724
2,Hillary Clinton,0.0,0.19242
3,Donald Trump,0.0,0.363442
4,Trump,0.266667,0.552581
5,Cruz,0.0,0.247677
6,Bush,0.0,3e-06
