**Notebook to pre-process the texts of articles for analysis in a separate notebook.**

In [7]:
import pandas as pd
# pd.set_option('display.max_colwidth',None)
import numpy as np
from datetime import datetime

import dateutil.parser as dparser
#from natsort import natsorted
import glob, os
from tqdm.notebook import tqdm

from textblob import TextBlob
# import texthero as hero
import spacy
nlp = spacy.load("en_core_web_sm")
from spacy.lang.en.stop_words import STOP_WORDS
nlp.Defaults.stop_words |= {'the','we','she','he','said','it','like'}
import textdescriptives as td
nlp.add_pipe('textdescriptives')
# nlp.add_pipe("spacytextblob")

from collections import Counter
import math

from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# keywords
# from multi_rake import Rake

import json
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
tokenizer = Tokenizer(filters='', lower=False)

from ast import literal_eval

**1. Import and setup data**

In [2]:
# need to start with text data (will be saved as encoded later)
# path of the article CSV, relative to this notebook
csv_text = '../../data/current_articles.csv'
# csv_text = 'test_articles_min.csv'

# load the articles; the rest of the notebook reads and modifies this frame as `df`
df = pd.read_csv(csv_text)
# df.drop('subject', axis=1, inplace=True)
# show one random row as a quick sanity check of the loaded columns
df.sample()

Unnamed: 0.1,Unnamed: 0,uniqueID,date,full_text,source,label,clean_text,lemmas,readability,grade_level,pos_all,entities_all
1632,1632,https://www.kansan.com/news/douglas-county-exp...,2021-02-17,Douglas County health officials updated the co...,UDK,Douglas County expands mass gathering limit in...,douglas county health officials updated county...,douglas county health official update the coun...,58.995309,11.724359,"[('Douglas', 'PROPN'), ('County', 'PROPN'), ('...","[('Douglas County', 'GPE'), ('Wednesday', 'DAT..."


In [3]:
# Drop bookkeeping columns: every index-dump column pandas creates when a CSV
# that was saved with its index is read back ('Unnamed: 0', 'Unnamed: 0.1', ...)
# plus the unused 'subject' column.
df = df[[c for c in df.columns if not str(c).startswith('Unnamed') and c != 'subject']]

In [4]:
df.count()

uniqueID        3810
date            3810
full_text       3810
source          3810
label           3810
clean_text      3810
lemmas          3810
readability     3810
grade_level     3810
pos_all         3810
entities_all    3810
dtype: int64

In [6]:
df.head()

Unnamed: 0,url,date,full_text,source,title,clean_text,lemmas,readability,grade_level
0,https://www2.ljworld.com/news/general-news/202...,2021-04-29,People who received only one COVID-19 vaccine ...,LJW,Health department will notify residents about ...,people received covid-19 vaccine dose douglas ...,people who receive only one covid-19 vaccine d...,70.034286,11.722857
1,https://www2.ljworld.com/news/general-news/202...,2021-04-28,"Douglas County reported 9,194 cases of COVID-1...",LJW,Douglas County reports 13 new COVID-19 cases s...,"douglas county reported 9,194 cases covid-19 w...","douglas county report 9,194 case of covid-19 a...",77.144488,7.207055
2,https://www2.ljworld.com/news/general-news/202...,2021-04-28,Douglas County hosted its last mass COVID-19 v...,LJW,‘Stronger together’: Douglas County hosts fina...,douglas county hosted mass covid-19 vaccinatio...,douglas county host its last mass covid-19 vac...,70.905222,10.810947
3,https://www2.ljworld.com/news/state-region/202...,2021-04-28,Doctors are reporting that more parents are re...,LJW,Doctors: More parents are refusing COVID tests...,doctors reporting parents refusing sick childr...,doctor be report that more parent be refuse to...,70.375453,8.939178
4,https://www2.ljworld.com/news/state-region/202...,2021-04-27,Fewer than five of Kansas’ 105 counties still ...,LJW,Kansas soon could have just 3 counties with ma...,fewer kansas counties require masks place long...,few than five of kansas county still require m...,68.44157,12.089564


In [None]:
# df = df.sample(20)

**2. Clean text and do some preprocessing**

In [13]:
# reset the token-level columns; preprocess() below fills them with per-article lists
df['pos_all'] = None
df['entities_all'] = None

In [14]:
def clean_tok(t):
    """Tidy a token or entity string.

    Trims surrounding whitespace, turns curly apostrophes into straight ones
    and removes every ' - ' (space-hyphen-space) sequence.
    """
    cleaned = t.strip()
    for old, new in (("’", "'"), (" - ", "")):
        cleaned = cleaned.replace(old, new)
    return cleaned

def preprocess(df):
    """Run every article through spaCy and store the derived features on ``df``.

    Each ``full_text`` value is parsed with the module-level ``nlp`` pipeline and
    these columns are added or overwritten:

    * ``clean_text``   - lower-cased tokens without stop words, punctuation,
      digits or whitespace tokens
    * ``lemmas``       - lower-cased lemmas without punctuation, digits or
      whitespace tokens
    * ``grade_level``  - ``automated_readability_index`` from textdescriptives
    * ``readability``  - ``flesch_reading_ease`` from textdescriptives
    * ``pos_all``      - list of ``(token, POS tag)`` tuples, PUNCT/SPACE excluded
    * ``entities_all`` - list of ``(entity text, entity label)`` tuples

    Rows whose ``full_text`` is not a string (e.g. NaN after ``read_csv``) get
    empty text/list values and NaN scores instead of raising inside spaCy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``full_text`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns listed above filled in.
    """
    clean_texts, lemma_texts = [], []
    grade_levels, readabilities = [], []
    pos_rows, entity_rows = [], []

    # total= gives tqdm a known length so it renders a real progress bar
    for text in tqdm(df['full_text'], total=len(df)):

        # blank/missing articles: record empty results rather than crash in nlp()
        if not isinstance(text, str):
            clean_texts.append('')
            lemma_texts.append('')
            grade_levels.append(float('nan'))
            readabilities.append(float('nan'))
            pos_rows.append([])
            entity_rows.append([])
            continue

        # call spaCy for cleanup and lemmas - better than textblob's method
        doc = nlp(text)
        content_tokens = [w for w in doc
                          if not w.is_punct and not w.is_digit and not w.is_space]

        # the split()/join round-trip collapses any whitespace left inside tokens
        clean_texts.append(' '.join(' '.join(
            w.text.lower() for w in content_tokens
            if w.text.lower() not in STOP_WORDS).split()))
        lemma_texts.append(' '.join(w.lemma_.lower() for w in content_tokens))

        # textdescriptives to get readability score
        grade_levels.append(doc._.readability['automated_readability_index'])
        readabilities.append(doc._.readability['flesch_reading_ease'])

        # pos and ner
        pos_rows.append([(clean_tok(t.text), t.pos_) for t in doc
                         if t.pos_ not in ['PUNCT','SPACE']])
        entity_rows.append([(clean_tok(ent.text), ent.label_) for ent in doc.ents])

    # assign whole columns at once: works whether or not the columns already
    # exist and does not depend on the index being unique
    df['clean_text'] = clean_texts
    df['lemmas'] = lemma_texts
    df['grade_level'] = grade_levels
    df['readability'] = readabilities
    # explicit object Series so each cell holds one Python list
    df['pos_all'] = pd.Series(pos_rows, index=df.index, dtype=object)
    df['entities_all'] = pd.Series(entity_rows, index=df.index, dtype=object)

    return df

df = preprocess(df)

0it [00:00, ?it/s]

In [15]:
df.sample()

Unnamed: 0,url,date,full_text,source,title,clean_text,lemmas,readability,grade_level,pos_all,entities_all
1587,http://www.theindianleader.com/2020/04/13/when...,2020-04-13,Haskell Indian Nations University recently cam...,The Indian Leader,When Home Isn’t “Home”,haskell indian nations university recently cam...,haskell indian nations university recently com...,60.257391,11.793145,"[(Haskell, PROPN), (Indian, PROPN), (Nations, ...","[(Haskell Indian Nations University, ORG), (we..."


In [17]:
# harmonise column names: 'url' becomes 'uniqueID' and 'title' becomes 'label'
column_renames = {'url':'uniqueID','title':'label'}
df = df.rename(columns=column_renames)
df.head(1)

Unnamed: 0,uniqueID,date,full_text,source,label,clean_text,lemmas,readability,grade_level,pos_all,entities_all
0,https://www2.ljworld.com/news/general-news/202...,2021-04-29,People who received only one COVID-19 vaccine ...,LJW,Health department will notify residents about ...,people received covid-19 vaccine dose douglas ...,people who receive only one covid-19 vaccine d...,65.771286,13.822857,"[(People, NOUN), (who, PRON), (received, VERB)...","[(only one, CARDINAL), (Douglas County, GPE), ..."


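**Holding – RAKE keywords (parked: needs the commented-out `multi_rake` import and a `month` column on `df` before it can run)**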

In [None]:
# NOTE(review): this holding cell is not runnable as the notebook stands:
# `Rake` comes from `multi_rake`, whose import is commented out in the imports
# cell, and `df` has no 'month' column at this point. Restore both first.

# top keywords per (month, source) group, one frame per group and min_freq
kwd_frames = []

# keywords to omit from results
kwd_to_omit = ['contact ku','contact reporter']
omit_pattern = '|'.join(kwd_to_omit)

# column layout of the per-group keyword table
kwd_columns = ['keyword','month','source','rake_score','min_freq']

# set keyword parameters: sweep the minimum keyword frequency 3..5
for i in range(3,6):

    rake = Rake(min_chars=3, max_words=5, min_freq=i)

    for n,g in tqdm(df.groupby(['month','source'])):

        # join articles with a space so the last word of one article is not
        # fused with the first word of the next
        group_text = ' '.join(g.full_text)

        # build the table in one go; DataFrame.append was removed in pandas 2.0
        # and appending row by row is quadratic anyway
        rake_df = pd.DataFrame(
            [{'keyword':keyword,
              'month':n[0],
              'source':n[1],
              'rake_score':score,
              'min_freq':i} for keyword, score in rake.apply(group_text)],
            columns=kwd_columns)

        # filter df to exclude certain phrases
        rake_df = rake_df[~rake_df.keyword.str.contains(omit_pattern)]

        # keep the first ten keywords of each group
        kwd_frames.append(rake_df[:10])

# pd.concat raises on an empty list, so fall back to an empty frame
kwd_df = pd.concat(kwd_frames, ignore_index=True) if kwd_frames else pd.DataFrame()

**3. Save text-based csv**

In [23]:
# Keep articles dated before 2022-02-01 (string comparison on the date column).
# .copy() makes the result an independent frame, so the column assignments in
# the encoding step below do not raise SettingWithCopyWarning on a slice.
df = df[df.date<'2022-02-01'].copy()

In [5]:
# df.to_csv('../data/current_articles.csv')
# save the text version of the articles; this overwrites the file loaded in step 1 (same path)
# NOTE(review): the index is written too, so re-reading this file yields an
# 'Unnamed: 0' column — confirm whether index=False is wanted here
df.to_csv('../../data/current_articles.csv')

**4. Encode to numeric and save to Streamlit directory**

In [8]:
# encode for website
# make corpus and fit: all text columns, concatenated column after column
corpus = [
    text
    for column in ('full_text', 'clean_text', 'lemmas', 'label')
    for text in df[column].tolist()
]

# if you need minimal version
#df = df[['url','date','full_text','source','title']]
#corpus = df['full_text'].tolist() + df['title'].tolist()

# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer
tokenizer.fit_on_texts(corpus)

# write tokenizer to json file for later re-use
tokenizer_json = tokenizer.to_json()
with open('../data/tokenizer.json', 'w', encoding='utf-8') as f:
    json.dump(tokenizer_json, f, ensure_ascii=False)


def _encode(text):
    """Turn one text into its list of integer token ids with the fitted tokenizer."""
    return tokenizer.texts_to_sequences([text])[0]


# replace every text column with its encoded form
# (minimal version would be ['title','full_text'])
for column in ['label','full_text','clean_text','lemmas']:
    df[column] = df[column].apply(_encode)

df.to_csv('../data/current_articles.csv', index=None)
df.head()

Unnamed: 0,uniqueID,date,full_text,source,label,clean_text,lemmas,readability,grade_level,pos_all,entities_all
0,https://www2.ljworld.com/news/general-news/202...,2021-04-29,"[2750, 41, 337, 176, 76, 87, 63, 318, 16, 5, 1...",LJW,"[249, 66, 22, 2445, 199, 47, 4535, 3763, 66696...","[19, 337, 29, 63, 318, 61, 24, 925, 188, 561, ...","[19, 41, 164, 176, 76, 29, 63, 318, 16, 5, 61,...",65.771286,13.822857,"[('People', 'NOUN'), ('who', 'PRON'), ('receiv...","[('only one', 'CARDINAL'), ('Douglas County', ..."
1,https://www2.ljworld.com/news/general-news/202...,2021-04-28,"[160, 132, 196, 27408, 50, 3, 87, 15, 3, 1413,...",LJW,"[160, 132, 1300, 2984, 37, 87, 50, 151, 18175,...","[61, 24, 196, 27408, 50, 29, 169, 174, 50, 152...","[61, 24, 171, 27408, 62, 3, 29, 15, 3, 169, 33...",77.144488,7.207055,"[('Douglas', 'PROPN'), ('County', 'PROPN'), ('...","[('Douglas County', 'GPE'), ('9,194', 'CARDINA..."
2,https://www2.ljworld.com/news/general-news/202...,2021-04-28,"[160, 132, 1515, 84, 144, 925, 87, 188, 285, 1...",LJW,"[102723, 102724, 160, 132, 4945, 737, 925, 87,...","[61, 24, 1515, 925, 29, 188, 285, 169, 479, 14...","[61, 24, 839, 84, 144, 925, 29, 188, 285, 11, ...",69.255847,11.623447,"[('Douglas', 'PROPN'), ('County', 'PROPN'), ('...","[('Douglas County', 'GPE'), ('Wednesday', 'DAT..."
3,https://www2.ljworld.com/news/state-region/202...,2021-04-28,"[14420, 39, 1294, 9, 45, 634, 39, 3777, 2, 10,...",LJW,"[102725, 2460, 634, 39, 3777, 1045, 489, 8, 35...","[2218, 1294, 634, 3777, 833, 317, 325, 77, 137...","[1213, 6, 171, 9, 45, 739, 6, 2312, 2, 10, 35,...",69.117276,9.558969,"[('Doctors', 'NOUN'), ('are', 'AUX'), ('report...","[('Kansas City', 'GPE'), ('Overland Park', 'GP..."
4,https://www2.ljworld.com/news/state-region/202...,2021-04-27,"[18068, 85, 498, 3, 1672, 4567, 532, 167, 266,...",LJW,"[64, 598, 125, 10, 116, 1777, 532, 12, 95, 575]","[1410, 21, 532, 266, 251, 179, 1201, 388, 24, ...","[469, 85, 498, 3, 21, 24, 167, 266, 95, 4, 89,...",66.986736,12.792539,"[('Fewer', 'ADJ'), ('than', 'ADP'), ('five', '...","[('Fewer than five', 'CARDINAL'), ('Kansas', '..."


In [None]:
df.count()

**5. Update COVID case count csv**

In [None]:
# update cases csv: download the NYT county-level data and save a full copy
cv_data = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv')
cv_data.to_csv('../streamlit/data/us-counties-full.csv', index=False)

# keep only the Douglas County, Kansas rows and save them separately
in_douglas_ks = cv_data['county'].eq('Douglas') & cv_data['state'].eq('Kansas')
cv_data = cv_data.loc[in_douglas_ks]
cv_data.to_csv('../streamlit/data/us-counties-douglas-ks.csv', index=False)

cv_data.head()

In [None]:
# cv_data = pd.read_csv('../streamlit/data/us-counties-douglas-ks.csv')

# cv_data['month'] = cv_data['date'].apply(lambda x: x[:7])
# cv_data  = cv_data[(cv_data['month'] >= '2020-06') & (cv_data['month'] <= '2021-01')]

# # cv_data['date'] = pd.to_datetime(cv_data['date'])
# cv_data = cv_data.set_index('date')
# cv_data = cv_data.resample('M')

In [None]:
# len(cv_data)

In [None]:
# cv_data['cases'].iloc[0]