# Text  Visualization

In [1]:
import pandas as pd
import numpy as np
import gensim, spacy
from gensim.utils import simple_preprocess
import nltk

import altair as alt



In [2]:
df = pd.read_csv('VoxData.csv', header=0).dropna()

In [3]:
print(df.shape)

(6903, 8)


In [4]:
df.head()

Unnamed: 0,title,author,category,published_date,updated_on,slug,blurb,body
0,Every year of a prison term makes a couple 32 ...,Dara Lind,Criminal Justice,2014-05-29 12:30:05,2014-05-29 12:30:07,http://www.vox.com/2014/5/29/5756646/every-yea...,But even a short jail stay can strain a marria...,A new study by criminologists Sonja Siennick a...
1,Making sense of Donald Trump,John Patty,Mischiefs of Faction,2016-01-12 19:50:08,2016-01-12 19:50:09,http://www.vox.com/mischiefs-of-faction/2016/1...,Social science predicted that it can't predict...,The current fight for the GOP presidential nom...
2,Acting white: the most insidious myth about bl...,JenÃ©e Desmond-Harris,Race in America,2015-03-04 13:40:02,2015-05-04 02:51:51,http://www.vox.com/2015/3/4/8138739/acting-whi...,This popular theory about how African-American...,You've probably heard it before: too many blac...
3,Hillary Clintonâ€™s pitch: Tim Kaine will be t...,Dylan Matthews,Hillary Clinton,2016-07-23 21:23:13,2016-07-25 15:56:38,http://www.vox.com/2016/7/23/12263516/tim-kain...,He's not Tom Perez or Cory Booker. But...,"To many on the left, Tim Kaine’s selection as ..."
4,"Democratic debate 2015: start time, schedule, ...",Andrew Prokop,Debates,2015-11-13 16:20:02,2015-11-14 23:47:28,http://www.vox.com/2015/11/13/9728432/democrat...,The three remaining candidates will debate in ...,The horrific attacks in Paris will loom large ...


# Text with metadata

In [5]:
''' distribution of authors'''
author_count = df.groupby('author').size().reset_index(name='count')

alt.Chart(author_count).mark_bar().encode(
    x = alt.X('author:N',  sort='-y'),
    y = alt.Y('count:Q'),
)


In [6]:
''' distribution of categories '''
cat_count = df.groupby('category').size().reset_index(name='count')

alt.Chart(cat_count).mark_bar().encode(
    x = alt.X('category:N',  sort='-y'),
    y = alt.Y('count:Q'),
)


In [18]:
df[df['category']=='Politics & Policy']['body'].iloc[0]

' It was February 24, 2009. Bobby Jindal, then in his first term as governor of Louisiana and a rising star in the Republican Party, was tapped to deliver the GOP response to President Barack Obama’s joint address to Congress. Obama’s speech focused on economic recovery and health-care reform and, despite its grim focus, was greeted with applause and standing ovations. Jindal’s response, televised immediately after, was supposed to be his big political "coming out."\xa0If the speech went well, many thought it would provide a jumping-off point for a possible presidential run in 2012. Instead, Jindal became the object of bipartisan ridicule. Jon Stewart skewered his Mr. Rogers–esque delivery — Jindal\'s over-enunciation and earnest stare seemed more appropriate for addressing 5-year-olds than adults. Republican strategist David Johnson characterized the speech simply as a "flop." Even Fox News criticized Jindal’s delivery as "amateurish." To be fair, such responses are often derided. In 

# Processing for Filtering and Machine Learning Models

## Tokenization

In [19]:
''' we focus on the category of culture here '''
processed = df[df['category']=='Politics & Policy'].reset_index(drop=True)

In [20]:
processed

Unnamed: 0,title,author,category,published_date,updated_on,slug,blurb,body
0,How one speech changed the course of Republica...,Tez Clark,Politics & Policy,2015-06-22 17:45:02,2016-02-15 13:12:50,http://www.vox.com/2015/6/22/8824553/bobby-jin...,Louisiana governor Bobby Jindal's disastrous s...,"It was February 24, 2009. Bobby Jindal, then ..."
1,Ted Cruz is sorry he ruined his colleagues' we...,Dara Lind,Politics & Policy,2014-12-16 21:10:02,2014-12-16 21:10:02,http://www.vox.com/2014/12/16/7404457/cruz-unp...,Especially after they sunk his vote.,"Last weekend, Ted Cruz forced the Senate to st..."
2,Did Loretta Lynch just set a time bomb for Oba...,Dara Lind,Politics & Policy,2015-01-28 20:23:07,2015-01-28 20:32:06,http://www.vox.com/2015/1/28/7929095/lynch-imm...,"Her legal standard for ""prosecutorial discreti...","At her confirmation hearing Wednesday, Attorne..."
3,Internet mob justice is random and severe. So ...,German Lopez,Politics & Policy,2015-07-31 15:40:02,2015-08-06 05:03:11,http://www.vox.com/2015/7/31/9078777/criminal-...,It's like a random lightning bolt from the sky.,Today's criminal justice system shares a big p...
4,Tax credits to pay for child care is the next ...,Matthew Yglesias,Politics & Policy,2015-09-08 11:00:01,2015-09-08 11:00:02,http://www.vox.com/2015/9/8/9262901/high-quali...,A top progressive think tank proposes Obamacar...,Parents of young children often struggle with ...
...,...,...,...,...,...,...,...,...
410,Clinton seizes the center on race and guns in ...,Jonathan Allen,Politics & Policy,2015-06-20 18:20:02,2015-06-22 16:43:42,http://www.vox.com/2015/6/20/8818611/clinton-C...,Hillary Clinton focused on racism and guns in ...,Hillary Clinton is not seen as honest and trus...
411,Why the federal government is forgiving millio...,Libby Nelson,Politics & Policy,2015-06-08 21:50:02,2015-06-08 22:21:10,http://www.vox.com/2015/6/8/8748535/corinthian...,Students at now-bankrupt Corinthian Colleges c...,Some students from the for-profit Corinthian ...
412,2 school shootings in a day is shocking. But t...,German Lopez,Politics & Policy,2015-10-09 21:30:52,2015-10-10 03:52:54,http://www.vox.com/2015/10/9/9489599/school-sh...,"America's gun problem goes much, much further ...",America has been horrified by what seems like ...
413,Obamaâ€™s dialogue with Marilynne Robinson is ...,Ezra Klein,Politics & Policy,2015-10-15 17:40:02,2016-02-15 13:25:47,http://www.vox.com/2015/10/15/9542015/obama-ma...,"""We had this idea that why donâ€™t I just have...","According to a September 2015 CNN/ORC poll, 29..."


In [21]:
''' first trial of tokenization using simple_preprocess '''
data_words = gensim.utils.simple_preprocess(processed['body'][0])

In [22]:
data_words

['it',
 'was',
 'february',
 'bobby',
 'jindal',
 'then',
 'in',
 'his',
 'first',
 'term',
 'as',
 'governor',
 'of',
 'louisiana',
 'and',
 'rising',
 'star',
 'in',
 'the',
 'republican',
 'party',
 'was',
 'tapped',
 'to',
 'deliver',
 'the',
 'gop',
 'response',
 'to',
 'president',
 'barack',
 'obama',
 'joint',
 'address',
 'to',
 'congress',
 'obama',
 'speech',
 'focused',
 'on',
 'economic',
 'recovery',
 'and',
 'health',
 'care',
 'reform',
 'and',
 'despite',
 'its',
 'grim',
 'focus',
 'was',
 'greeted',
 'with',
 'applause',
 'and',
 'standing',
 'ovations',
 'jindal',
 'response',
 'televised',
 'immediately',
 'after',
 'was',
 'supposed',
 'to',
 'be',
 'his',
 'big',
 'political',
 'coming',
 'out',
 'if',
 'the',
 'speech',
 'went',
 'well',
 'many',
 'thought',
 'it',
 'would',
 'provide',
 'jumping',
 'off',
 'point',
 'for',
 'possible',
 'presidential',
 'run',
 'in',
 'instead',
 'jindal',
 'became',
 'the',
 'object',
 'of',
 'bipartisan',
 'ridicule',
 'jon',

In [23]:
data_word_list = [simple_preprocess(sentence) for sentence in processed['body']]

In [24]:
print("length of data_word_list: " , len(data_word_list))
print("length of data_word_list[0]: " , len(data_word_list[0]))

length of data_word_list:  415
length of data_word_list[0]:  283


In [25]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/junyuan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
#NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['com', 'from', 'subject', 're', 'edu', 'use', 'not', 'would', 
                   'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 
                   'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 
                   'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 
                   'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])


In [27]:
data_words = [[word for word in doc if word not in stop_words] for doc in data_word_list]

In [28]:
print("length of data_words: " , len(data_words))
print("length of data_words[0]: " , len(data_words[0]))

length of data_words:  415
length of data_words[0]:  176


## Stemming

In [29]:
#Stemming using porter Stemming Algorithm
from gensim.parsing.porter import PorterStemmer
p = PorterStemmer()

data_ready = []

for text in data_words:
    data_stemmed = p.stem_documents(text)
    data_ready.append(data_stemmed)
# data_ready

In [30]:
len(data_ready)

415

## Lemmatization

In [106]:
'''
too slow, do not run here
'''


'''
# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
nlp = spacy.load('en', disable=['parser', 'ner'])
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
for sent in data_words:
    # Parse the sentence using the loaded 'en' model object `nlp`. Extract the lemma for each token and join
    doc = nlp(" ".join(sent)) 
    data_ready.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
# remove stopwords once more after lemmatization
data_ready = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in data_ready]

'''

'\n# Initialize spacy \'en\' model, keeping only tagger component needed for lemmatization\nnlp = spacy.load(\'en\', disable=[\'parser\', \'ner\'])\nallowed_postags=[\'NOUN\', \'ADJ\', \'VERB\', \'ADV\']\nfor sent in data_words:\n    # Parse the sentence using the loaded \'en\' model object `nlp`. Extract the lemma for each token and join\n    doc = nlp(" ".join(sent)) \n    data_ready.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])\n# remove stopwords once more after lemmatization\ndata_ready = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in data_ready]\n\n'

## Construct TF-IDF

In [31]:
import gensim.corpora as corpora
from gensim.sklearn_api import TfIdfTransformer

In [32]:
# Create Dictionary
id2word = corpora.Dictionary(data_ready)

model = TfIdfTransformer(dictionary=id2word)

In [33]:
# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]

num_docs = id2word.num_docs
num_terms = len(id2word.keys())

In [34]:
for doc in corpus[:1]:
    print([[id, id2word[id], freq] for id, freq in doc])

[[0, 'address', 3], [1, 'adult', 1], [2, 'amateurish', 1], [3, 'anyth', 1], [4, 'applaus', 1], [5, 'appropri', 1], [6, 'arkansa', 1], [7, 'awai', 1], [8, 'bad', 1], [9, 'barack', 1], [10, 'becam', 1], [11, 'big', 1], [12, 'bill', 2], [13, 'bipartisan', 1], [14, 'bobbi', 1], [15, 'budget', 1], [16, 'care', 1], [17, 'career', 1], [18, 'character', 1], [19, 'clinch', 1], [20, 'clinton', 3], [21, 'collaps', 1], [22, 'come', 1], [23, 'congress', 1], [24, 'correct', 2], [25, 'critic', 1], [26, 'david', 1], [27, 'debat', 1], [28, 'deliv', 1], [29, 'deliveri', 3], [30, 'democrat', 3], [31, 'derid', 1], [32, 'despit', 1], [33, 'earlier', 1], [34, 'earnest', 1], [35, 'econom', 1], [36, 'embarrass', 1], [37, 'emce', 1], [38, 'enter', 1], [39, 'enunci', 1], [40, 'esqu', 1], [41, 'expect', 1], [42, 'fair', 1], [43, 'featur', 1], [44, 'februari', 1], [45, 'first', 1], [46, 'flip', 1], [47, 'flop', 2], [48, 'focu', 1], [49, 'focus', 1], [50, 'fortun', 1], [51, 'fox', 1], [52, 'frequent', 1], [53, 'fu

In [35]:
tfidf_corpus = model.fit_transform(corpus)

In [36]:
tfidf_corpus[0]

[(0, 0.1365146488735365),
 (1, 0.05925227824461458),
 (2, 0.12347593136674762),
 (3, 0.03559537413815109),
 (4, 0.08361827263312022),
 (5, 0.0559680142562467),
 (6, 0.07436035863150357),
 (7, 0.03346537521941306),
 (8, 0.03247806194318948),
 (9, 0.0446141867699824),
 (10, 0.03797285543784491),
 (11, 0.021397789635934184),
 (12, 0.048506979933901004),
 (13, 0.06211499202707896),
 (14, 0.06800752199467913),
 (15, 0.04843598673624611),
 (16, 0.02541476848802842),
 (17, 0.06316562153534018),
 (18, 0.07631257652929586),
 (19, 0.10927834686453071),
 (20, 0.10350809969362858),
 (21, 0.06211499202707896),
 (22, 0.029355080557836307),
 (23, 0.0270116379610387),
 (24, 0.09287215796092659),
 (25, 0.03176760512485286),
 (26, 0.04102551912646951),
 (27, 0.0270116379610387),
 (28, 0.050075484288646534),
 (29, 0.2715304830945383),
 (30, 0.06419336890780256),
 (31, 0.10097329232991398),
 (32, 0.04951427699787234),
 (33, 0.03645491779566848),
 (34, 0.10927834686453071),
 (35, 0.03041660226776549),
 (36

In [37]:
# construct an array of tf-idf vectors
from gensim.matutils import corpus2dense, corpus2csc

corpus_tfidf_dense = corpus2dense(tfidf_corpus, num_terms, num_docs)

In [38]:
corpus_tfidf_dense.shape

(10396, 415)

In [39]:
X = corpus_tfidf_dense[corpus_tfidf_dense.max(axis=1) > 0.1]
X.shape

(3074, 415)

## Clustering & Projection

In [40]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.cluster import KMeans


In [44]:
# result_pca = PCA(n_components=2).fit_transform(X.T)
result_tsne = TSNE(n_components=2, perplexity=10).fit_transform(X.T)

tsne_df = pd.DataFrame(data=result_tsne, columns=['x','y'])

alt.Chart(tsne_df).mark_circle().encode(
    x='x:Q',
    y='y:Q'
)

In [45]:
kmeans = KMeans(n_clusters=10, random_state=0).fit(tsne_df[['x','y']])
tsne_df['label'] = kmeans.labels_

alt.Chart(tsne_df).mark_circle(opacity = .7).encode(
    x='x:Q',
    y='y:Q',
    color='label:N'
)

In [46]:
words = []
for id in id2word.keys():
    words.append(id2word[id])

In [47]:
mat = pd.DataFrame(data=corpus_tfidf_dense, index=words)
mat = mat[mat.max(axis=1) > 0.1]

wordtfidf = pd.DataFrame(data=mat.values.T, columns=mat.index)
wordtfidf.head()

Unnamed: 0,address,adult,amateurish,bill,bipartisan,bobbi,budget,care,clinch,clinton,...,lester,martha,nightli,quijano,raddatz,slant,trustworthi,gilead,medit,sinist
0,0.136515,0.059252,0.123476,0.048507,0.062115,0.068008,0.048436,0.025415,0.109278,0.103508,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.014576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.24648,0.0,0.017611,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
mat.max(axis=1).shape

(3074,)

In [52]:
''' top 10 words for each cluster '''
group_key_words = []
for label in range(10):
    # get the sum tf-idf for each word, do sum() across rows for each column
    group_df = wordtfidf[kmeans.labels_ == label].mean(axis=0)
    # sort the tf-idf values
    to_sort = [{'freq': group_df[x], 'word': x} for x in group_df.index]
    sorted(to_sort, key=lambda d: d['freq'], reverse=True)
    # add the words to the list
    for i in range(10):
        group_key_words.append([label, to_sort[i]['word'], to_sort[i]['freq']])
        
keyword_df = pd.DataFrame(data=group_key_words, columns=['label', 'keyword', 'tfidf'])

In [87]:
''' try to plot bar chart for one cluster '''
alt.Chart(keyword_df[keyword_df['label']==2]).mark_bar().encode(
    x=alt.X('tfidf:Q',  scale=alt.Scale(domain=[0, 0.05]), title='label'),
    y=alt.Y('keyword:N', sort='-x')
).properties(
    width = 50
)

In [89]:
chart = alt.hconcat()
for label in range(10):
    chart |= alt.Chart(keyword_df[keyword_df['label']==label]).mark_bar().encode(
    x=alt.X('tfidf:Q', scale=alt.Scale(domain=[0,0.05])),
    y=alt.Y('keyword:N', sort='-x')
).properties(
    width = 50
)
    
chart

## Trend

In [65]:
''' distribution of authors'''
author_count = processed.groupby('author').size().reset_index(name='count')

alt.Chart(author_count).mark_bar().encode(
    x = alt.X('author:N',  sort='-y'),
    y = alt.Y('count:Q'),
)

In [67]:
author_count = author_count.sort_values(by='count', ascending=False)

In [68]:
wordtfidf['author'] = processed['author']

In [79]:
author_key_words = []

''' key words for top 10 authors'''
for author in author_count['author'][:10]:
    # get the mean tf-idf for each word, do sum() across rows for each column
    group_df = wordtfidf[wordtfidf['author'] == author].mean(axis=0)
    # sort the tf-idf values
    to_sort = [{'freq': group_df[x], 'word': x} for x in group_df.index]
    sorted(to_sort, key=lambda d: d['freq'], reverse=True)
    # add the words to the list
    for i in range(10):
        author_key_words.append([author, to_sort[i]['word'], to_sort[i]['freq']])
        
author_keyword_df = pd.DataFrame(data=author_key_words, columns=['author', 'keyword', 'tfidf'])

In [80]:
author_keyword_df

Unnamed: 0,author,keyword,tfidf
0,Andrew Prokop,address,0.005656
1,Andrew Prokop,adult,0.002295
2,Andrew Prokop,amateurish,0.000000
3,Andrew Prokop,bill,0.009444
4,Andrew Prokop,bipartisan,0.003900
...,...,...,...
95,Jonathan Allen,bobbi,0.000000
96,Jonathan Allen,budget,0.003615
97,Jonathan Allen,care,0.011858
98,Jonathan Allen,clinch,0.000000


In [88]:
chart = alt.hconcat()
for author in author_count['author'][:10]:
    chart |= alt.Chart(author_keyword_df[author_keyword_df['author']==author]).mark_bar().encode(
        x=alt.X('tfidf:Q', scale=alt.Scale(domain=[0,0.05]), title=author),
        y=alt.Y('keyword:N', sort='-x'),
    ).properties(
        width = 50
    )

chart
