## Topic Modeling on Newspaper Articles 

* Collects and cleans articles from the New York Times, New York Post, Washington Post, Los Angeles Times, Chicago Tribune and Boston Globe that mention immigrants and were published between December 15, 2017 and January 15, 2018, then fits topic models to them and visualizes the resulting topics.

## Collecting the data

In [None]:
import eventregistry
from eventregistry import *
import os

# Read the Event Registry API key from the EVENTREGISTRY_API_KEY environment
# variable so a real key never has to be written into the notebook. The original
# "api-Key" placeholder is kept as the fallback, so behaviour is unchanged when
# the variable is not set.
er = EventRegistry(apiKey = os.environ.get("EVENTREGISTRY_API_KEY", "api-Key"))

In [None]:
# Download the articles from the outlets treated as liberal-leaning
libSources = ["New York Times", "The Washington Post", "The Boston Globe"]
libSourceUris = list(map(er.getNewsSourceUri, libSources))

# Articles mentioning "immigrants"/"immigrant" from any of the outlets above,
# within the 2017-12-15 .. 2018-01-15 window
qLib = QueryArticlesIter(
    keywords = QueryItems.OR(["immigrants", "immigrant"]),
    sourceUri = QueryItems.OR(libSourceUris),
    dateStart = datetime.date(2017, 12, 15),
    dateEnd = datetime.date(2018, 1, 15))

# Request location info with every article and fetch in batches of 100
libReturnInfo = ReturnInfo(articleInfo = ArticleInfoFlags(location = True))

# Drain the result iterator into a plain list, sorted by date
artLib = list(qLib.execQuery(er,
                             sortBy = "date",
                             returnInfo = libReturnInfo,
                             articleBatchSize = 100))

In [None]:
# Persist the liberal-leaning articles as JSON in the file 'liberal1'
import json

with open('liberal1', 'w') as libFile:
    libFile.write(json.dumps(artLib))

In [None]:
# Download the articles from the outlets treated as conservative-leaning
consSources = ["New York Post", "Chicago Tribune", "Los Angeles Times"]
consSourceUris = list(map(er.getNewsSourceUri, consSources))

# Articles mentioning "immigrants"/"immigrant" from any of the outlets above,
# within the 2017-12-15 .. 2018-01-15 window
qCons = QueryArticlesIter(
    keywords = QueryItems.OR(["immigrants", "immigrant"]),
    sourceUri = QueryItems.OR(consSourceUris),
    dateStart = datetime.date(2017, 12, 15),
    dateEnd = datetime.date(2018, 1, 15))

# Request location info with every article and fetch in batches of 100
consReturnInfo = ReturnInfo(articleInfo = ArticleInfoFlags(location = True))

# Drain the result iterator into a plain list, sorted by date
artCons = list(qCons.execQuery(er,
                               sortBy = "date",
                               returnInfo = consReturnInfo,
                               articleBatchSize = 100))

In [None]:
# Saving the conservative-leaning articles as JSON in the file 'conservative'.
# Fix: the original dumped `artLib` here, which would have written the liberal
# articles into the conservative file; the list built by the conservative
# query is `artCons`.
import json

with open('conservative', 'w') as fout:
    json.dump(artCons, fout)

## Preparing corpora

In [1]:
# Reload the saved liberal-leaning articles and print the first record
import json

with open('liberal1') as libFile:
    artLib = json.load(libFile)

print(artLib[0])

{u'body': u'The Women\'s March a year ago aimed to launch a movement of women from all walks of life who would continue their activism long after they had gone home.\n\nIn many ways, that goal has been realized. In the wake of the march on Washington -- and simultaneous marches in more than 600 towns and cities across the country -- thousands of women threw themselves into activism for the first time in their lives, especially in red states where the events provided a rare chance to build a network of like-minded people.\n\nIn Texas, emails collected by the organizers of the Women\'s March in Austin are being repurposed to promote candidates who support abortion rights. In Arkansas, Gwen Combs, the elementary schoolteacher who organized the Little Rock march, is now running for Congress. Thousands of women in October attended a convention in Detroit training them on everything from lobbying elected officials to confronting white supremacy.\n\nBut as the movement evolves, differing prio

In [2]:
# Number of articles loaded from 'liberal1' (shown as the cell output)
len(artLib)

559

In [3]:
# Reload the saved conservative-leaning articles and print the first record
with open('conservative') as consFile:
    artCons = json.load(consFile)

print(artCons[0])

{u'body': u'President Donald Trump lashed out at Democratic U.S. Sen Dick Durbin on Monday, taking to Twitter to say Illinois\' senior senator "totally misrepresented what was said" at a meeting on immigration during which Trump was accused of using vulgar language to describe certain countries.\n\nTrump said in his tweet that Durbin "blew" efforts to reach a deal on immigration, including addressing the fate of so-called Dreamers who came to the U.S. illegally as children under a policy known as the Deferred Action for Childhood Arrivals, or DACA.\n\n"Senator Dicky Durbin totally misrepresented what was said at the DACA meeting. Deals can\'t get made when there is no trust! Durbin blew DACA and is hurting our Military," Trump said in the Tweet posted Monday afternoon.\n\nDurbin had criticized Trump for using what he said was "hate-filled, vile and racist" language when talking about immigrants from Haiti and Africa during an Oval Office meeting last week. Durbin said Trump referred to

In [4]:
# Number of articles loaded from 'conservative' (shown as the cell output)
len(artCons)

290

In [5]:
# Separating the text fields for liberal and conservative articles.
# Each article is a dict whose text lives under the 'body' key. Look it up by
# name: the original `.values()[0]` only worked because 'body' happened to be
# the first value in the dict's iteration order, and it fails outright on
# Python 3, where `dict.values()` cannot be indexed.
textLib = [article['body'] for article in artLib]

textCons = [article['body'] for article in artCons]

In [6]:
# Word tokenizing, removing punctuation, capitalization, numbers and stopwords
# Stemming
import nltk
from nltk.corpus import stopwords

tokenizer = nltk.RegexpTokenizer(r'\w+')
porter = nltk.PorterStemmer()

# Build the stopword collection once, as a set. The original evaluated
# stopwords.words('english') for every single token and then searched the
# returned list linearly, repeating the same work for each token.
english_stopwords = set(stopwords.words('english'))


def clean_and_stem(text):
    """Turn one article body into a list of stemmed tokens.

    Steps, in order: lower-case the text, split it with the ``\\w+`` regexp
    tokenizer, keep only purely alphabetic tokens (``str.isalpha``), drop
    English stopwords, and Porter-stem what is left.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    list
        Stemmed tokens in their original order (duplicates preserved).
    """
    words = [s for s in tokenizer.tokenize(text.lower()) if s.isalpha()]
    return [porter.stem(w) for w in words if w not in english_stopwords]


# One token list per article, in the same order as textLib / textCons
textLib_tok = [clean_and_stem(text) for text in textLib]

textCons_tok = [clean_and_stem(text) for text in textCons]

In [7]:
# Drop tokens that occur in fewer than 10 articles (steps 1 and 2 of 5)

# Step 1: reduce every article to its distinct tokens
uniqueLib = [list(set(doc)) for doc in textLib_tok]
uniqueCons = [list(set(doc)) for doc in textCons_tok]

# Step 2: pool the per-article tokens into one list of distinct tokens per corpus
uniqueLib_flat = list(set(tok for doc in uniqueLib for tok in doc))
uniqueCons_flat = list(set(tok for doc in uniqueCons for tok in doc))

In [8]:
#3. for each token in metalist, count in how many articles it appears in list of articles
# Every per-article list in uniqueLib / uniqueCons was built from a set, so a
# token occurs at most once per article. Tallying all tokens across those lists
# therefore yields the number of articles containing each token in one pass,
# instead of scanning every article list once per vocabulary entry.
from collections import Counter

docfreqLib = Counter(tok for doc in uniqueLib for tok in doc)
# Counts aligned position-by-position with uniqueLib_flat
countsLib = [docfreqLib[tok] for tok in uniqueLib_flat]

docfreqCons = Counter(tok for doc in uniqueCons for tok in doc)
# Counts aligned position-by-position with uniqueCons_flat
countsCons = [docfreqCons[tok] for tok in uniqueCons_flat]

In [9]:
#4. create list with tokens whose frequency count across articles is >= 10
tok_freqLib = dict(zip(uniqueLib_flat, countsLib))
tok_freqLib10 = {k: v for k, v in tok_freqLib.items() if v >= 10}  # author's note: 2994
# remove tokens that consist of only 1 or 2 letters
tokeepLib = [t for t in tok_freqLib10.keys() if len(t) > 2]

tok_freqCons = dict(zip(uniqueCons_flat, countsCons))
tok_freqCons10 = {k: v for k, v in tok_freqCons.items() if v >= 10}  # author's note: 1790
tokeepCons = [t for t in tok_freqCons10.keys() if len(t) > 2]

#5. for each article keep tokens in metalist of tokens with frequency >= 10
# Membership is tested against sets: the original searched the tokeep* lists
# linearly for every token of every article. tokeepLib / tokeepCons themselves
# remain lists, as before.
keepLib = set(tokeepLib)
textLib_tok10 = [[t for t in doc if t in keepLib] for doc in textLib_tok]

keepCons = set(tokeepCons)
textCons_tok10 = [[t for t in doc if t in keepCons] for doc in textCons_tok]

## Modeling topics

In [10]:
# Build the gensim dictionaries and document-term matrices
import gensim
from gensim import corpora

# One term dictionary per corpus, built from the filtered token lists
term_dict_Lib = corpora.Dictionary(textLib_tok10)
term_dict_Cons = corpora.Dictionary(textCons_tok10)

# Document-term matrices: each article passed through its dictionary's doc2bow
doc_term_mat_Lib = list(map(term_dict_Lib.doc2bow, textLib_tok10))
doc_term_mat_Cons = list(map(term_dict_Cons.doc2bow, textCons_tok10))

In [11]:
# Fit one LDA topic model per corpus
Lda = gensim.models.ldamodel.LdaModel

# Hyper-parameters shared by both models; only the corpus, dictionary and seed differ
lda_settings = dict(num_topics=15, passes=50, iterations=200)

ldamodel_Lib = Lda(doc_term_mat_Lib,
                   id2word=term_dict_Lib,
                   random_state=235,
                   **lda_settings)
ldamodel_Cons = Lda(doc_term_mat_Cons,
                    id2word=term_dict_Cons,
                    random_state=15,
                    **lda_settings)

In [12]:
# Displaying topics
# Topics of the model fitted on the liberal corpus (ldamodel_Lib), shown as the cell output
ldamodel_Lib.print_topics()

[(0,
  u'0.027*"percent" + 0.024*"american" + 0.022*"trump" + 0.012*"poll" + 0.012*"year" + 0.009*"support" + 0.007*"nation" + 0.007*"countri" + 0.007*"found" + 0.007*"major"'),
 (1,
  u'0.020*"state" + 0.013*"said" + 0.012*"polic" + 0.012*"feder" + 0.011*"immigr" + 0.010*"censu" + 0.010*"california" + 0.010*"citi" + 0.009*"crime" + 0.008*"report"'),
 (2,
  u'0.024*"republican" + 0.019*"democrat" + 0.014*"tax" + 0.013*"senat" + 0.011*"bill" + 0.011*"said" + 0.011*"hous" + 0.010*"year" + 0.010*"would" + 0.009*"vote"'),
 (3,
  u'0.025*"immigr" + 0.018*"said" + 0.015*"state" + 0.014*"unit" + 0.011*"countri" + 0.009*"administr" + 0.009*"year" + 0.008*"trump" + 0.008*"peopl" + 0.008*"would"'),
 (4,
  u'0.024*"said" + 0.012*"compani" + 0.011*"year" + 0.011*"new" + 0.010*"health" + 0.009*"state" + 0.009*"fund" + 0.009*"care" + 0.009*"march" + 0.009*"organ"'),
 (5,
  u'0.017*"christma" + 0.012*"music" + 0.011*"dress" + 0.010*"fashion" + 0.009*"nativ" + 0.009*"year" + 0.008*"one" + 0.007*"first

In [13]:
# Visualizing topic distances, overall term frequencies, term frequencies by topic and conditional topic distribution given salient terms
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.gensim_models` -- confirm against the installed version.
import pyLDAvis
import pyLDAvis.gensim
# Prepare the visualization data from the liberal model, its document-term matrix and its dictionary
visData_lib = pyLDAvis.gensim.prepare(ldamodel_Lib, doc_term_mat_Lib, term_dict_Lib)
# Last expression of the cell: renders the visualization as the cell output
pyLDAvis.display(visData_lib)

In [14]:
# Topics of the model fitted on the conservative corpus (ldamodel_Cons), shown as the cell output
ldamodel_Cons.print_topics()

[(0,
  u'0.025*"countri" + 0.021*"salvador" + 0.021*"statu" + 0.020*"salvadoran" + 0.018*"protect" + 0.018*"temporari" + 0.017*"said" + 0.016*"administr" + 0.013*"immigr" + 0.012*"legal"'),
 (1,
  u'0.021*"california" + 0.019*"said" + 0.016*"marijuana" + 0.014*"brown" + 0.014*"state" + 0.011*"student" + 0.010*"spanish" + 0.010*"santa" + 0.010*"latino" + 0.009*"one"'),
 (2,
  u'0.034*"immigr" + 0.025*"said" + 0.014*"law" + 0.013*"feder" + 0.011*"year" + 0.010*"state" + 0.010*"administr" + 0.010*"legal" + 0.010*"case" + 0.009*"enforc"'),
 (3,
  u'0.043*"trump" + 0.023*"presid" + 0.008*"white" + 0.008*"year" + 0.008*"bannon" + 0.007*"time" + 0.006*"american" + 0.006*"state" + 0.006*"countri" + 0.005*"new"'),
 (4,
  u'0.023*"border" + 0.015*"hire" + 0.013*"tax" + 0.012*"immigr" + 0.011*"credit" + 0.011*"year" + 0.010*"incom" + 0.010*"agent" + 0.009*"million" + 0.008*"state"'),
 (5,
  u'0.019*"democrat" + 0.018*"immigr" + 0.016*"trump" + 0.015*"republican" + 0.014*"hous" + 0.014*"said" + 0.

In [15]:
# Same visualization for the conservative corpus; this cell contains no imports of its own,
# so `pyLDAvis` and `pyLDAvis.gensim` must already have been imported in the kernel.
visData_Cons = pyLDAvis.gensim.prepare(ldamodel_Cons, doc_term_mat_Cons, term_dict_Cons)
# Last expression of the cell: renders the visualization as the cell output
pyLDAvis.display(visData_Cons)