# Build Networks from Articles Datasets

## Setup

In [1]:
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
import matplotlib.pyplot as plt

from IPython.display import set_matplotlib_formats
%matplotlib inline
set_matplotlib_formats("svg")

In [3]:
import tqdm
import json

In [4]:
import tldextract

In [5]:
def url_to_domain(url):
    domain = tldextract.extract(url)
    domain = domain.domain + "." + domain.suffix
    return domain

In [6]:
def normalize_url(url):
    url = url.split("?")[0]
    url = url.split("://")[-1] if url[:4] == "http" else url
    if not url: return None
    url = url[:-1] if url[-1] == "/" else url
    return url

In [7]:
def parselist(s):
    s = s[2:-2]
    s = s.split("', '")
    return s

In [28]:
def flatten(l):
    return [x for y in l for x in y]

## Load Data

In [11]:
articles_data_1 = pd.read_csv("../data/articles_collect/aylien_covid_articles_content_selected_1.csv")
articles_data_2 = pd.read_csv("../data/articles_collect/aylien_covid_articles_content_selected_2.csv")
articles_data_3 = pd.read_csv("../data/articles_collect/coaid_covid_articles_content.csv")

In [12]:
articles_data = pd.concat([articles_data_1, articles_data_2, articles_data_3])
articles_data = articles_data.drop_duplicates(subset=["url"])
articles_data = articles_data.reset_index(drop=True)

In [13]:
len(articles_data)

63813

In [14]:
domains = sorted(list(set(articles_data.domain.tolist())))
domains = domains
print(domains)
print(f"n domains: {len(domains)}")

['acs.org', 'androidcentral.com', 'aol.com', 'apa.org', 'apnews.com', 'apple.com', 'aptv.org', 'archdaily.com', 'archive.org', 'arizona.edu', 'avclub.com', 'azcentral.com', 'bgr.com', 'billboard.com', 'bizjournals.com', 'bloomberg.com', 'breitbart.com', 'brobible.com', 'buffalonews.com', 'businessinsider.com', 'bustle.com', 'buzzfeed.com', 'buzzorange.com', 'c-span.org', 'ca.gov', 'cbslocal.com', 'cbsnews.com', 'cbssports.com', 'cdc.gov', 'chicagotribune.com', 'chron.com', 'cisco.com', 'cnbc.com', 'cnet.com', 'cnn.com', 'complex.com', 'cool3c.com', 'cosmopolitan.com', 'cuny.edu', 'dailycaller.com', 'deadline.com', 'delta.com', 'denverpost.com', 'digitaltrends.com', 'ed.gov', 'eonline.com', 'epochtimes.com', 'ew.com', 'fastcompany.com', 'fivethirtyeight.com', 'forbes.com', 'foxbusiness.com', 'foxnews.com', 'go.com', 'google.com', 'gsmarena.com', 'harvard.edu', 'hbr.org', 'healthline.com', 'hindawi.com', 'huffingtonpost.com', 'informationisbeautiful.net', 'intel.com', 'investopedia.com',

## Check Data Issues

In [15]:
articles_per_domain = {d: 0 for d in domains}
for i, row in tqdm.tqdm(articles_data.iterrows(), total=len(articles_data)):
    articles_per_domain[row.domain] += 1

100%|██████████| 63813/63813 [00:05<00:00, 12579.91it/s]


In [16]:
nolink_ind = [i for i,l in enumerate(articles_data.links) if l == "[]"]
nolink_domains = articles_data.domain[nolink_ind].tolist()

In [17]:
proportion_nolink_bydomain = {d: (nolink_domains.count(d)/articles_per_domain[d]) if articles_per_domain[d] != 0 else 1.0 for d in domains}

In [18]:
baddomains = {k:v for k,v in proportion_nolink_bydomain.items() if v > 0.4}
baddomains_list = list(baddomains.keys())
baddomains

{'apa.org': 1.0,
 'bizjournals.com': 1.0,
 'bloomberg.com': 0.998109640831758,
 'cdc.gov': 0.46037099494097805,
 'hindawi.com': 1.0,
 'huffingtonpost.com': 1.0,
 'latimes.com': 1.0,
 'newsweek.com': 1.0,
 'nydailynews.com': 0.483974358974359,
 'princeton.edu': 1.0,
 'rev.com': 0.5,
 'texas.gov': 1.0,
 'thewheelerreport.com': 1.0,
 'weather.gov': 1.0}

## Explore

In [45]:
keywords = flatten([parselist(l) for l in articles_data.keywords.tolist()])
keywords = [k.lower().replace("#", "").strip() for k in keywords]
keywords_unique = sorted(list(set(keywords)))

In [46]:
print(len(keywords))
print(len(keywords_unique))

2628087
190385


In [47]:
# keyword_counts = {k: keywords.count(k) for k in keywords_unique}
# print(keyword_counts)

In [68]:
keywords_important = ["covid", "covid-19", "coronavirus", "pandemic", "cdc", "center for disease control", "nih", "flu", "hospital", "healthcare", "vaccine", "mask", "lock-down", "ventilator", "ppe", "quarantine", "social distancing", "social distance", "antibody", "pcr", "epidemic", "n95", "johns hopkins", "csse", "fauci"]

In [69]:
keywords_important_counts = {k: keywords.count(k) for k in keywords_important}
print(keywords_important_counts)

{'covid': 4135, 'covid-19': 30376, 'coronavirus': 62508, 'pandemic': 34300, 'cdc': 3869, 'center for disease control': 0, 'nih': 306, 'flu': 0, 'hospital': 5651, 'healthcare': 2789, 'vaccine': 2982, 'mask': 1326, 'lock-down': 3, 'ventilator': 763, 'ppe': 827, 'quarantine': 6151, 'social distancing': 339, 'social distance': 653, 'antibody': 680, 'pcr': 128, 'epidemic': 2128, 'n95': 0, 'johns hopkins': 8, 'csse': 5, 'fauci': 730}


In [135]:
keywords.count("fox news")

2964

In [136]:
keyword_clusters = {
    "sources": {"cdc", "who", "un", "fda", "nih", "nhs", "fauci"},
    "healthcare": {"hospital", "healthcare", "nurses", "ventilator"},
    "ppe": {"mask", "ppe"},
    "pandemic-response": {"social distance", "social distancing", "lock-down", "quarantine", "contact-tracing"},
    "vaccines": {"vaccine", "fda", "pfizer", "moderna", "astrazeneca"},
    "testing": {"antibody", "pcr", "testing"},
    "jhu": {"johns hopkins university", "johns hopkins", "csse"},
    "economy": {"stimulus", "relief", "economy", "jobs"},
    "metrics": {"cases", "deaths"},
    "symptoms": {"symptoms"},
    "trump": {"trump"},
    "china": {"china", "wuhan"},
    "conspiracy-theories": {"hydroxychloroquine", "bleach", "herd immunity", "conspiracy"},
}

In [132]:
source_keywords = {
    "cdc": "cdc.gov",
    "nytimes": "nytimes.com", "new york times": "nytimes.com", "the times": "nytimes.com",
    "fox news": "foxnews.com",
    "cnn": "cnn.com",
    "nbc": "nbcnews.com",
}

In [66]:
articles_data["source_links"] = ["[]" for i in range(len(articles_data))]
for i,row in tqdm.tqdm(articles_data.iterrows(), total=len(articles_data)):
    f = []
    for k in source_keywords.keys():
        try:
            if row.content.find(k) > 0:
                f.append(k)
        except:
            continue
    articles_data.loc[i, "source_links"] = str(f)

100%|██████████| 63813/63813 [02:36<00:00, 408.74it/s]


## Network Generators

In [19]:
def generate_network_articlelevel_directlinks():
    G = nx.DiGraph()
    
    article_urls = [normalize_url(l) for l in articles_data.url.tolist()]
    article_domains = articles_data.domain.tolist()
    for i in range(len(article_urls)):
        G.add_node(article_urls[i], domain=article_domains[i])
    
    article_urls_set = set(article_urls)
    for i, row in tqdm.tqdm(articles_data.iterrows(), total=len(articles_data)):
        url = normalize_url(row.url)
        for l in parselist(row.links):
            l = normalize_url(l)
            if l is None: continue
            if l in article_urls_set:
                G.add_edge(url, l)

    return G

In [22]:
def generate_network_articlelevel_alllinks():
    G = nx.DiGraph()
    
    article_urls = [normalize_url(l) for l in articles_data.url.tolist()]
    article_domains = articles_data.domain.tolist()
    for i in range(len(article_urls)):
        G.add_node(article_urls[i], domain=article_domains[i])
    
    article_urls_set = set(article_urls)
    for i, row in tqdm.tqdm(articles_data.iterrows(), total=len(articles_data)):
        url = normalize_url(row.url)
        for l in parselist(row.links):
            l = normalize_url(l)
            if l is None: continue
            if l in article_urls_set:
                G.add_edge(url, l)

    return G

In [137]:
def generate_network_articlelevel_keywordclusters():
    G = nx.DiGraph()
    
    article_urls = [normalize_url(l) for l in articles_data.url.tolist()]
    article_domains = articles_data.domain.tolist()
    for i in range(len(article_urls)):
        G.add_node(article_urls[i], domain=article_domains[i])
    
    article_urls_set = set(article_urls)
    for i, row in tqdm.tqdm(articles_data.iterrows(), total=len(articles_data)):
        url = normalize_url(row.url)
        for l in parselist(row.links):
            l = normalize_url(l)
            if l is None: continue
            if l in article_urls_set:
                G.add_edge(url, l)

    return G

## Generated Networks

In [20]:
graph = generate_network_articlelevel_directlinks()
print(f"|V|={graph.number_of_nodes()}, |E|={graph.number_of_edges()}")

100%|██████████| 63813/63813 [00:07<00:00, 8191.59it/s]


|V|=63514, |E|=9789


In [23]:
graph = generate_network_articlelevel_alllinks()
print(f"|V|={graph.number_of_nodes()}, |E|={graph.number_of_edges()}")

100%|██████████| 63813/63813 [00:08<00:00, 7486.67it/s]

|V|=63514, |E|=9789



