In [11]:
# Standard library
import pickle
import re

# Third-party
import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.stem import PorterStemmer
from nltk.text import Text
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from unidecode import unidecode

# spaCy pipeline used by clean_document for entity tagging.
nlp = spacy.load("en_core_web_lg")

# NLTK resources requested by this notebook.
# NOTE(review): newer NLTK releases may look up "punkt_tab" rather than "punkt"
# for word_tokenize -- confirm against the installed version.
for nltk_resource in ("punkt", "wordnet", "omw-1.4"):
    nltk.download(nltk_resource)

stemmer = PorterStemmer()

# Keywords that mark a text as COVID-related; always compared in lower case.
covid_words = {"covid", "coronavirus", "pandemic"}

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\19083\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\19083\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\19083\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
# Load only the article columns used downstream. Passing `columns=` lets the
# parquet reader skip every other column instead of loading the whole file and
# discarding most of it afterwards.
df_a = pd.read_parquet(
    "parquets/articles.parquet",
    columns=["sourcedomain_id", "date", "title", "content"],
)

In [33]:
def covid_docs(df):
    """Select COVID-related articles published by politically polarised outlets.

    Steps:
      1. Read ``CSVs/politics.csv`` and, per ``fips``, compute the percentile
         rank (``trump_20_pct``) and the logistic transform (``trump_odds``) of
         ``logodds_Trump20``. Rows whose log-odds are not numeric (for example
         the literal ``None``) are dropped.
      2. Join those values onto the outlets listed in ``CSVs/outlets.csv``.
      3. Build ``content_c`` from ``content`` (newlines become spaces, anything
         that is not an ASCII letter or whitespace is removed, whitespace runs
         are collapsed) and keep articles whose ``content_c`` mentions
         "covid", "coronavirus" or "pandemic" (case-insensitive).
      4. Keep articles whose outlet has ``trump_20_pct`` at least 0.25 away
         from 0.5, and add a boolean ``trump`` column (``trump_20_pct > 0.5``).

    Parameters
    ----------
    df : pandas.DataFrame
        Articles with at least ``sourcedomain_id`` and ``content`` columns.
        The frame is not modified, so the function can be called repeatedly.

    Returns
    -------
    pandas.DataFrame
        The article columns of ``df`` plus ``content_c``, ``fips``, ``state``,
        ``city``, ``logodds_Trump20``, ``trump_20_pct``, ``trump_odds`` and
        ``trump``.
    """
    outlets = pd.read_csv("CSVs/outlets.csv")
    politics = pd.read_csv("CSVs/politics.csv")

    # Coerce instead of string-matching "None": depending on the pandas version
    # read_csv either keeps "None" as text or has already turned it into NaN,
    # and a float column has no .str accessor.
    p1 = politics[["fips", "logodds_Trump20"]].copy()
    p1["logodds_Trump20"] = pd.to_numeric(p1["logodds_Trump20"], errors="coerce")
    p1 = p1.dropna(subset=["logodds_Trump20"]).astype(
        {"fips": "int", "logodds_Trump20": "float"}
    )

    p1["trump_20_pct"] = p1["logodds_Trump20"].rank(pct=True)
    p1["trump_odds"] = np.exp(p1.logodds_Trump20) / (1 + np.exp(p1.logodds_Trump20))

    op = outlets.merge(p1, on="fips", how="inner")[
        [
            "sourcedomain_id",
            "fips",
            "state",
            "city",
            "logodds_Trump20",
            "trump_20_pct",
            "trump_odds",
        ]
    ]

    # Work on a re-indexed copy so the caller's frame is left untouched.
    covid_df = df.reset_index(drop=True)

    non_letters = re.compile(r"[^a-zA-Z\s]")
    covid_df["content_c"] = [
        non_letters.sub("", doc.replace("\n", " "))
        # Missing article bodies become "" and are filtered out just below.
        for doc in covid_df["content"].fillna("")
    ]

    mentions_covid = covid_df["content_c"].str.contains(
        "covid|coronavirus|pandemic", case=False
    )
    # .copy() so the column assignment below targets an independent frame.
    covid_df = covid_df[mentions_covid].copy()

    whitespace_run = re.compile(r"\s+")
    covid_df["content_c"] = [
        whitespace_run.sub(" ", doc) for doc in covid_df["content_c"]
    ]

    covid_df = covid_df.merge(op, on="sourcedomain_id", how="inner")

    is_polar = (covid_df["trump_20_pct"] - 0.5).abs() >= 0.25
    polar_covid_df = covid_df.loc[is_polar].copy()
    polar_covid_df["trump"] = polar_covid_df["trump_20_pct"] > 0.5
    return polar_covid_df


def get_concordance_df(polar_covid_df, keywords=None, width=300):
    """Extract keyword-in-context windows around COVID terms for each article.

    Each article's ``content_c`` is tokenised and wrapped in an NLTK ``Text``;
    for every keyword, each concordance hit becomes one output row carrying the
    article's ``trump`` label and ``trump_20_pct``.

    Parameters
    ----------
    polar_covid_df : pandas.DataFrame
        Must provide ``content_c``, ``trump`` and ``trump_20_pct`` columns.
    keywords : iterable of str, optional
        Terms to search for. Defaults to the module-level ``covid_words``.
    width : int, optional
        Passed to ``Text.concordance_list`` as the context window width.

    Returns
    -------
    pandas.DataFrame
        Columns ``line``, ``trump``, ``trump_20_pct``; one row per hit.

    Notes
    -----
    NOTE(review): ``concordance_list`` also takes a ``lines`` limit (believed
    to default to 25), so articles with many hits may be truncated -- confirm
    whether that cap is intended.
    """
    # Sorted so the row order is reproducible: iterating a set of strings can
    # yield a different order in every interpreter session.
    search_terms = sorted(covid_words if keywords is None else keywords)

    records = []
    rows = zip(
        polar_covid_df["content_c"],
        polar_covid_df["trump"],
        polar_covid_df["trump_20_pct"],
    )
    # Column-wise iteration avoids building a Series per row (as iterrows does);
    # total= gives tqdm a length so it can show real progress.
    for content, label, tpc in tqdm(rows, total=len(polar_covid_df)):
        text = Text(word_tokenize(content))
        for term in search_terms:
            for hit in text.concordance_list(term, width=width):
                # Drop the first and last whitespace-delimited token of the
                # window (presumably because the window edge can cut a word).
                trimmed = " ".join(hit.line.split()[1:-1])
                records.append([trimmed, label, tpc])

    return pd.DataFrame(records, columns=["line", "trump", "trump_20_pct"])


def clean_document(doc):
    """Parse a text with spaCy and strip place, person and organisation tokens.

    A token is removed when its entity type is GPE, PERSON or ORG, unless its
    lower-cased text is one of ``covid_words``. The surviving token texts are
    joined with single spaces and lower-cased.

    Returns a ``(parsed, cleaned)`` pair: the object returned by ``nlp`` and
    the cleaned string.
    """
    masked_entity_types = {"GPE", "PERSON", "ORG"}
    parsed = nlp(doc)

    kept_texts = []
    for token in parsed:
        is_masked_entity = token.ent_type_ in masked_entity_types
        if is_masked_entity and token.text.lower() not in covid_words:
            continue
        kept_texts.append(token.text)

    cleaned = " ".join(kept_texts).lower()
    return parsed, cleaned


def get_clean_docs(cl):
    """Run ``clean_document`` over every entry of ``cl.line``.

    Parameters
    ----------
    cl : pandas.DataFrame
        Frame with a ``line`` column of texts. It is left unmodified, so
        re-running a cell that calls this function cannot change what an
        earlier cell already holds or saves.

    Returns
    -------
    (pandas.DataFrame, list)
        A copy of ``cl`` with an added ``clean_lines`` column, and the list of
        parsed documents returned by ``clean_document`` in row order.
    """
    clean_docs = []
    nlp_docs = []
    for line in tqdm(cl.line):
        nlp_doc, clean_doc = clean_document(line)
        clean_docs.append(clean_doc)
        nlp_docs.append(nlp_doc)
    # assign() returns a new frame instead of writing into the caller's one.
    return cl.assign(clean_lines=clean_docs), nlp_docs

In [29]:
# NOTE: this cell needs `df_a` to already have the `trump` column (and the
# `line` column used by later cells), i.e. to be the result of the two steps
# below, which are kept commented out as in the original notebook:
# df_a = covid_docs(df_a)
# df_a = get_concordance_df(df_a)

# Draw the same number of rows from each `trump` group. A fixed seed makes the
# balanced sample reproducible across kernel restarts.
SAMPLE_PER_GROUP = 50000
SAMPLE_SEED = 0
df_a_100k = df_a.groupby("trump").apply(
    lambda group: group.sample(SAMPLE_PER_GROUP, random_state=SAMPLE_SEED)
)

In [34]:
# Clean every sampled line with spaCy: yields the frame with a `clean_lines`
# column and the list of parsed docs. The recorded run below took roughly
# 2h48m for 100,000 lines.
df_a_100k_c, nlp_100k_c = get_clean_docs(df_a_100k)

100%|██████████| 100000/100000 [2:48:16<00:00,  9.90it/s]      


In [31]:
# Persist the balanced sample.
# NOTE(review): the execution counts show this cell was run before the cleaning
# cell that now sits above it; running top to bottom reverses that order --
# confirm which state of `df_a_100k` this file is meant to capture.
df_a_100k.to_parquet("covid_mentions_equal.parquet")

In [24]:
# Spot check: length of the `line` value stored under index label 90000.
# NOTE(review): only works while `df_a` holds a frame with a `line` column.
len(df_a.line[90000])

291

In [36]:
# Save the frame returned by get_clean_docs (includes the `clean_lines` column).
df_a_100k_c.to_parquet("covid_mentions_equal_clean.parquet")

In [38]:
import pickle

In [39]:
# Pickle the list of parsed docs returned by get_clean_docs.
# NOTE(review): reading this back needs pickle.load, which should only ever be
# used on files from a trusted source.
with open("nlp_docs_equal.pkl", "wb") as f:
    pickle.dump(nlp_100k_c, f)

In [8]:
# Display `df_a`. The recorded output below has the columns produced by
# covid_docs, so it was captured while `df_a` held that result.
df_a

Unnamed: 0,sourcedomain_id,date,title,content,content_c,fips,state,city,logodds_Trump20,trump_20_pct,trump_odds,trump
0,andalusiastarnews-andalusiastarnews.com,2020-04-04,Remember When: Shehan’s Polio story,Dan Shehan’s “My Polio Story” is continued in ...,Dan Shehans My Polio Story is continued in thi...,1039,Alabama,Andalusia,1.680054,0.984127,0.842912,True
1,andalusiastarnews-andalusiastarnews.com,2020-04-04,Governor issues stay at home order,Governor Kay Ivey announced that a stay at hom...,Governor Kay Ivey announced that a stay at hom...,1039,Alabama,Andalusia,1.680054,0.984127,0.842912,True
2,andalusiastarnews-andalusiastarnews.com,2020-04-03,"Local banks await final rules, guidance for pa...",As the novel coronavirus (COVID-19) disease co...,As the novel coronavirus COVID disease continu...,1039,Alabama,Andalusia,1.680054,0.984127,0.842912,True
3,andalusiastarnews-andalusiastarnews.com,2020-04-06,Changes in place for SNAP,The COVID-19 pandemic has left thousands of Al...,The COVID pandemic has left thousands of Alaba...,1039,Alabama,Andalusia,1.680054,0.984127,0.842912,True
4,andalusiastarnews-andalusiastarnews.com,2020-04-06,Local boy teaches how to make masks without se...,Andalusia Elementary School second grader Blan...,Andalusia Elementary School second grader Blan...,1039,Alabama,Andalusia,1.680054,0.984127,0.842912,True
...,...,...,...,...,...,...,...,...,...,...,...,...
536258,everettindependent-everettindependent.com,2021-04-21,"Bishop Brown, Patti Cheever Focus Radio Show o...","It’s Monday night on Broadway, and upstairs in...",Its Monday night on Broadway and upstairs in t...,25017,Massachusetts,Everett,-0.987032,0.083333,0.271499,False
536259,everettindependent-everettindependent.com,2021-04-21,"Without Visits, Traditional Process, Seniors T...",Choosing a college is typically a rite of pass...,Choosing a college is typically a rite of pass...,25017,Massachusetts,Everett,-0.987032,0.083333,0.271499,False
536260,everettindependent-everettindependent.com,2021-04-21,DiBiaso Leads CM to Mass. Football’s Top Ranking,Everett resident John DiBiaso finds himself in...,Everett resident John DiBiaso finds himself in...,25017,Massachusetts,Everett,-0.987032,0.083333,0.271499,False
536261,everettindependent-everettindependent.com,2021-04-28,COVID-19 Cases Decline Significantly from Last...,The numbers of COVID-19 cases in Everett has d...,The numbers of COVID cases in Everett has decr...,25017,Massachusetts,Everett,-0.987032,0.083333,0.271499,False


In [217]:
# NOTE(review): `cl` is not assigned by any top-level cell visible in this
# notebook (it only exists as a local inside the functions above), so this cell
# relies on leftover kernel state and will fail on a fresh kernel -- confirm
# where `cl` should come from.
cl.to_parquet("covid_mentions.parquet")

In [219]:
import pickle

# NOTE(review): `nlp_docs` is not assigned by any top-level cell visible in
# this notebook (only as a local inside get_clean_docs), so this cell relies on
# leftover kernel state -- confirm where it should come from.
with open("nlp_docs.pkl", "wb") as f:
    pickle.dump(nlp_docs, f)