In [206]:
# RUN THIS CHUNK FOR ENGLISH NEWS / MULTIPLE SPREADSHEETS
import collections
import os
import numpy as np
import pandas as pd
import re
import string

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

docs_location = 'Downloads/cv_us_20191201_to_20200320/'
content_full = []
# nltk.download('stopwords')
# nltk.download('wordnet')
stopwords = stopwords.words('english')
stopwords = stopwords + ['coronavirus', 'covid', 'corona']
lemmatizer = WordNetLemmatizer()
universal_count = collections.Counter()
dfs = []

# ^^ Here we're just importing libraries and instantiating some global variables.
dont_stem = {'Sanders'}
special_words = [
'donald trump',
'white house',
'social distancing',
'stock market',
'new york',
'joe biden',
'bernie sanders',
'world health organization',
'face mask',
'mike pence',
'vice president',
'hong kong',
'united states',
'diamond princess',
'new hampshire',
'whats happening',
'lunar year',
'los angeles',
'san francisco',
'elissa slotkin']

urls_mainstream = [
    'nytimes.com',
    'washingtonpost.com',
    'usatoday.com',
    'wsj.com',
    'newsweek.com',
    'nbcnews.com',
    'cbsnews.com',
    'abcnews.go.com',
    'cnn.com',
    'pbs.org',
    'npr.org',
    'latimes.com',
    'chicagotribune.com'
]

urls_conservative = [
    'foxnews.com',
    'breitbart.com',
    'newsmax.com',
    'theblaze.com',
    'dailycaller.com',
    'drudgereport.com'
]

urls_liberal = [
    'msnbc.com',
    'motherjones.com',
    'theatlantic.com',
    'huffingtonpost.com',
    'vox.com',
    'slate.com',
    'buzzfeednews.com',
    'dailykos.com'
]

def get_category_of_news_outlet(url):
    for url_m in urls_mainstream:
        if url_m in url:
            return 'mainstream'
    for url_c in urls_conservative:
        if url_c in url:
            return 'conservative'
    for url_l in urls_liberal:
        if url_l in url:
            return 'liberal'
    return None

def substitute_special_words(content):
    content = content.translate(str.maketrans('', '', ''.join(punctuation_no_underscore) + string.digits))
    content = content.lower()
    for w in special_words:
        if w in content:
            content = re.sub(w,  '_'.join(w.split()), content)
            
    return content

punctuation_no_underscore = set(string.punctuation)
punctuation_no_underscore.add('’')
punctuation_no_underscore.add('”')
punctuation_no_underscore.remove('_')

# here we go through a directory containing all the Excel spreadsheets
for doc in os.listdir(docs_location):
    print(doc)
    df = pd.read_excel(os.path.join(docs_location + doc))
    # pd.read_excel seems to want to grab the header line, so we make sure to ignore that; 
    # column 'Unnamed: 3' is the actual article content.
    content_not_clean_yet = df['Unnamed: 3'][1:]
    # we remove punctuation here
    content_not_clean_yet = [substitute_special_words(c) for c in content_not_clean_yet]
    content_no_punctuation = [[word for word in c.split() if re.match('[a-zA-Z0-9]+', word)] for c in content_not_clean_yet]
    # here we remove stopwords
    content_no_stopwords = [
        [c for c in content if c not in stopwords]
        for content in content_no_punctuation
        
    ]
    # I'm using gensim's native lemmatizer to lemmatize our content
    content_lemmatized = [
        [lemmatizer.lemmatize(c) for c in content \
         if c != 'sanders' and c not in special_words] 
    for content in content_no_stopwords]
    # here we get rid of short words
    content_full = [[c for c in content if len(c) > 2] for content in content_lemmatized]
    # get dates (splitting up by week of year)
    # the timedelta is necessary to align so that Sunday is the first day of the week.
    dates_full = list(df['Unnamed: 1'][1:].apply(lambda b: pd.to_datetime(b + pd.Timedelta(days=1)).week))
    # and get string dates so we can confirm we're getting correct weeks
    str_times = list(df['Unnamed: 1'][1:].apply(lambda b: pd.to_datetime(b).strftime('%Y%m%d')))
    df = df[1:]
    df['dates_full'] = dates_full
    df['content_full'] = content_full
    df['dt_str'] = str_times
    df['political_leaning'] = df['Unnamed: 2'].apply(get_category_of_news_outlet)
    dfs.append(df)
    
# df_final contains all data.
df_final = pd.concat(dfs)

Posts from 2020-03-03 to 2020-03-09.xls
Posts from 2020-04-18 to 2020-04-18.xls
Posts from 2020-04-10 to 2020-04-11.xls
Posts from 2020-04-19 to 2020-04-21.xls
Posts from 2020-04-15 to 2020-04-17.xls
Posts from 2020-03-13 to 2020-03-16.xls
Posts from 2020-02-14 to 2020-03-02.xls
Posts from 2020-03-28 to 2020-03-30.xls
Posts from 2020-04-12 to 2020-04-14.xls
Posts from 2020-03-10 to 2020-03-12.xls
Posts from 2020-03-31 to 2020-04-01.xls
Posts from 2020-03-19 to 2020-03-20.xls
Posts from 2020-03-21 to 2020-03-23.xls
Posts from 2020-04-22 to 2020-04-23.xls
Posts from 2019-12-01 to 2020-02-13.xls
Posts from 2020-03-24 to 2020-03-25.xls
Posts from 2020-04-02 to 2020-04-04.xls
Posts from 2020-04-08 to 2020-04-09.xls
Posts from 2020-04-24 to 2020-04-25.xls
Posts from 2020-03-26 to 2020-03-27.xls
Posts from 2020-03-17 to 2020-03-18.xls
Posts from 2020-04-05 to 2020-04-07.xls


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [142]:
# RUN THIS CHUNK FOR CHINESE NEWS / SINGLE SPREADSHEET
import collections
import os
import pandas as pd
import re
import string
import numpy as np


docs_location = 'Downloads/chinese_news/' # PUT THE FOLDER WHERE YOU ARE KEEPING THE SPREADSHEET HERE
doc = os.listdir(docs_location)[0]
df = pd.read_excel(os.path.join(docs_location + doc))
content_series = df['content_seg'].apply(lambda b: [i.strip() for i in b.split()])
# here we get rid of short words
content_full = [[c for c in content if len(c) >= 2] for content in content_series]
# get dates (splitting up by week of year)
# dates_full = list(df['Unnamed: 1'][1:].apply(lambda b: pd.to_datetime(b).week))
# inserting a dummy value because we want to look at all days, even though they're not the same week.
dates_full = np.zeros(len(content_full))
# and get string dates so we can confirm we're getting correct weeks
str_times = list(df['Date (GMT)'].apply(lambda b: pd.to_datetime(b).strftime('%Y%m%d')))
df['dates_full'] = dates_full
df['content_full'] = content_full
df['dt_str'] = str_times
df_final = df

KeyboardInterrupt: 

In [207]:
def get_topic_proportions(corpus, model, best_n_clusters):
    """
    Given a corpus and a model and a number of topics, 
    get the topic probability distribution for each document in the corpus 
    and use it to get the average topic proportions in that corpus for the model
    """
    group_topic_proba = np.zeros(best_n_clusters)
    topics = model[corpus]
    for td in topics:
        try:
            group_topic_proba = group_topic_proba + np.array([t[1] for t in td])
        except IndexError as e:
            print(len(td), len(group_topic_proba))
            print(group_topic_proba, td)
            print(e)
            print()
    z = group_topic_proba / sum(group_topic_proba)
    return z

In [208]:
from gensim import models, corpora
mallet_path = 'Downloads/mallet-2.0.8/bin/mallet'
def do_topics(content_full, best_n_clusters):
    """
    Given a processed corpus and a number of topics, compute an LDA model with that number of topics
    based on the given corpus.
    
    content_full should be a list of lists of words, where each list of words corresponds to a processed article.
    best_n_clusters should be an integer - specifically, the number of topics you're looking for.
    
    this will return an LDA model (gensim object right now) and a corpus object (dict --> BOW for each article)
    """
    dictionary = corpora.Dictionary(content_full)
    # We filter out rare or overly common words here; 
    # note that since the number of documents per week is changing a lot, 
    # I make the filtering dependent on number of documents in the corpus.
    dictionary.filter_extremes(no_below=int(len(content_full)/50), no_above=0.5)
    corpus = [dictionary.doc2bow(text) for text in content_full]
    lda_model = models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=best_n_clusters, optimize_interval=10, id2word=dictionary)
    topic_keywords = []
    for idx, topic in lda_model.show_topics(num_topics=10, num_words=20, formatted=False):
        print('Topic: {} \nWords: {}'.format(idx, [w[0] for w in topic]))
        for my_word in [w[0] for w in topic]:
            print(my_word)
        topic_keywords.append([w[0] for w in topic])
    p = get_topic_proportions(corpus, lda_model, best_n_clusters)
    final = pd.DataFrame()
    for w_idx in range(len(topic_keywords[0])):
        final['word_{}'.format(str(w_idx))] = [topic_keywords[i][w_idx] for i in range(len(topic_keywords))]
    final['proportions'] = p
    return lda_model, p, final

In [210]:
models_all = {}
proportions_all = {}
best_n_clusters = 10
writer = pd.ExcelWriter('us_topics_aligned_0.xlsx') # or your preferred filename
pol_leaning = set()
full_dfs = []
for index, gr in df_final.groupby(['dates_full', 'political_leaning']):
    print(index)
    print(len(gr))
    dts = set(list(gr['dt_str']))
    print(dts)
    pol_leaning.add(index[1])
    my_model, my_p, my_df = do_topics(gr['content_full'], best_n_clusters) 
    full_dfs.append(my_df.T)
    ls_thing = ["n = " + str(len(gr['content_full']))]
    full_dfs.append(pd.Series(ls_thing))
    if len(pol_leaning) == 3:
        df_full = pd.concat(full_dfs)
        df_full.to_excel(writer, sheet_name=min(list(dts)) + ' - ' + max(list(dts)))
        pol_leaning = set()
        full_dfs = []
    print()
    print('---------------------------------------------')
    print()
    print()
    print()
writer.save() # make sure you run this line; otherwise the data doesn't get written to the sheet!!

(2, 'conservative')
4
{'20200111', '20200109'}
(2, 'liberal')
1
{'20200109'}
(2, 'mainstream')
36
{'20200110', '20200111', '20200106', '20200109', '20200108', '20200107'}
(3, 'conservative')
16
{'20200115', '20200117', '20200114', '20200113', '20200116', '20200118'}
(3, 'liberal')
3
{'20200117'}
(3, 'mainstream')
77
{'20200112', '20200115', '20200117', '20200114', '20200113', '20200116', '20200118'}
(4, 'conservative')
136
{'20200125', '20200119', '20200120', '20200123', '20200124', '20200121', '20200122'}
(4, 'liberal')
82
{'20200125', '20200120', '20200123', '20200124', '20200121', '20200122'}
(4, 'mainstream')
1323
{'20200125', '20200120', '20200119', '20200123', '20200124', '20200121', '20200122'}
(5, 'conservative')
346
{'20200129', '20200128', '20200201', '20200130', '20200127', '20200131', '20200126'}
(5, 'liberal')
165
{'20200129', '20200128', '20200201', '20200130', '20200127', '20200131', '20200126'}
(5, 'mainstream')
2652
{'20200129', '20200128', '20200201', '20200130', '202

Topic: 0 
Words: ['mother', 'jones', 'newsletter', 'crisis', 'reporting', 'indispensable', 'subscribe', 'state', 'week', 'wednesday', 'published', 'american', 'year', 'month', 'place', 'sign', 'election', 'country', 'show', 'part']
mother
jones
newsletter
crisis
reporting
indispensable
subscribe
state
week
wednesday
published
american
year
month
place
sign
election
country
show
part
Topic: 1 
Words: ['people', 'pandemic', 'worker', 'health', 'life', 'virus', 'new_york', 'hospital', 'american', 'spread', 'center', 'test', 'mask', 'million', 'country', 'april', 'city', 'medical', 'patient', 'week']
people
pandemic
worker
health
life
virus
new_york
hospital
american
spread
center
test
mask
million
country
april
city
medical
patient
week
Topic: 2 
Words: ['free', 'slate', 'reader', 'making', 'coverage', 'support', 'april', 'journalism', 'subscribe', 'image', 'start', 'trial', 'popular', 'politics', 'recently', 'work', 'news', 'skip', 'note', 'getty']
free
slate
reader
making
coverage
suppo


---------------------------------------------



(17, 'liberal')
2240
{'20200420', '20200425', '20200424', '20200422', '20200421', '20200419', '20200423'}
Topic: 0 
Words: ['trump', 'good', 'day', 'year', 'thing', 'show', 'feel', 'time', 'election', 'dont', 'campaign', 'today', 'news', 'people', 'note', 'change', 'post', 'new_york', 'point', 'make']
trump
good
day
year
thing
show
feel
time
election
dont
campaign
today
news
people
note
change
post
new_york
point
make
Topic: 1 
Words: ['state', 'governor', 'order', 'republican', 'business', 'pandemic', 'gov', 'georgia', 'protest', 'stayathome', 'local', 'health', 'florida', 'federal', 'home', 'economy', 'people', 'reopen', 'told', 'monday']
state
governor
order
republican
business
pandemic
gov
georgia
protest
stayathome
local
health
florida
federal
home
economy
people
reopen
told
monday
Topic: 2 
Words: ['business', 'senate', 'small', 'leader', 'bill', 'house', 'program', 'billion', 'majority', 'pandemic', 'republican', 'response', 'con