# Jason Kessler - Using Scattertext and the Python NLP Ecosystem for Text Visualization


In [1]:
!pip install scattertext agefromname

Collecting scattertext
[?25l  Downloading https://files.pythonhosted.org/packages/75/f6/de9274f016d5e33e74f61c79f218f7982d93a6ccf926e0d0dc2e7ff90e73/scattertext-0.0.2.29-py3-none-any.whl (22.5MB)
[K    100% |████████████████████████████████| 22.5MB 770kB/s eta 0:00:01   13% |████▍                           | 3.1MB 19.7MB/s eta 0:00:01    28% |█████████▏                      | 6.4MB 17.7MB/s eta 0:00:01    37% |████████████                    | 8.4MB 31.6MB/s eta 0:00:01    49% |███████████████▊                | 11.1MB 11.0MB/s eta 0:00:02
[?25hCollecting agefromname
[?25l  Downloading https://files.pythonhosted.org/packages/98/50/69576f906bc57a91adff7eed26fbb2b84d8b99365c45b9f04cea2c4b017f/agefromname-0.0.7-py3-none-any.whl (8.7MB)
[K    100% |████████████████████████████████| 8.7MB 4.0MB/s eta 0:00:01
Collecting beautifulsoup4 (from agefromname)
[?25l  Downloading https://files.pythonhosted.org/packages/fe/62/720094d06cb5a92cd4b3aa3a7c678c0bb157526a95c4025d15316d594c4b/beautiful

In [2]:
%matplotlib inline
import scattertext as st
import re, io
from pprint import pprint
import pandas as pd
import numpy as np
from scipy.stats import rankdata, hmean, norm
import spacy
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer
display(HTML("<style>.container { width:98% !important; }</style>"))

In [3]:
nlp = spacy.load('en')
# If this doesn't work, please uncomment the following line and use a regex-based parser instead
#nlp = st.whitespace_nlp_with_sentences

In [4]:
convention_df = st.SampleCorpora.ConventionData2012.get_data()


In [8]:
convention_df.head()

Unnamed: 0,party,speaker,text,parsed
0,democrat,BARACK OBAMA,Thank you. Thank you. Thank you. Thank you so ...,"(Thank, you, ., Thank, you, ., Thank, you, ., ..."
1,democrat,MICHELLE OBAMA,"Thank you so much. Tonight, I am so thrilled a...","(Thank, you, so, much, ., Tonight, ,, I, am, s..."
2,democrat,RICHARD DURBIN,Thank you. It is a singular honor to be here t...,"(Thank, you, ., It, is, a, singular, honor, to..."
3,democrat,JOSEPH BIDEN,"Hey, Delaware. \nAnd my favorite Democrat, Jil...","(Hey, ,, Delaware, ., \n, And, my, favorite, D..."
4,democrat,JILL BIDEN,"Hello. \nThank you, Angie. I'm so proud of how...","(Hello, ., \n, Thank, you, ,, Angie, ., I, 'm,..."


In [6]:
print("Document Count")
print(convention_df.groupby('party')['text'].count())
print("Word Count")
convention_df.groupby('party').apply(lambda x: x.text.apply(lambda x: len(x.split())).sum())
convention_df['parsed'] = convention_df.text.apply(nlp)

Document Count
party
democrat      123
republican     66
Name: text, dtype: int64
Word Count


In [7]:
corpus = st.CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parsed').build()


In [9]:
term_freq_df = corpus.get_term_freq_df()
term_freq_df['dem_precision'] = term_freq_df['democrat freq'] * 1./(term_freq_df['democrat freq'] + term_freq_df['republican freq'])
term_freq_df['dem_freq_pct'] = term_freq_df['democrat freq'] * 1./term_freq_df['democrat freq'].sum()
term_freq_df['dem_hmean'] = term_freq_df.apply(lambda x: (hmean([x['dem_precision'], x['dem_freq_pct']])
                                                                   if x['dem_precision'] > 0 and x['dem_freq_pct'] > 0 
                                                                   else 0), axis=1)                                                        
term_freq_df.sort_values(by='dem_hmean', ascending=False).iloc[:10]

Unnamed: 0_level_0,democrat freq,republican freq,dem_precision,dem_freq_pct,dem_hmean
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
the,3402,2532,0.573306,0.022408,0.04313
and,2709,2233,0.548159,0.017843,0.034562
to,2340,1667,0.583978,0.015413,0.030033
a,1602,1346,0.543419,0.010552,0.020702
of,1569,1377,0.532587,0.010335,0.020276
that,1400,1051,0.571195,0.009221,0.01815
we,1318,1146,0.534903,0.008681,0.017085
in,1291,986,0.566974,0.008503,0.016756
i,1098,851,0.563366,0.007232,0.014281
's,1037,631,0.621703,0.00683,0.013512


In [10]:
def normcdf(x):
    return norm.cdf(x, x.mean(), x.std())
term_freq_df['dem_precision_normcdf'] = normcdf(term_freq_df['dem_precision'])
term_freq_df['dem_freq_pct_normcdf'] = normcdf(term_freq_df['dem_freq_pct'])
term_freq_df['dem_scaled_f_score'] = hmean([term_freq_df['dem_precision_normcdf'], term_freq_df['dem_freq_pct_normcdf']])
term_freq_df.sort_values(by='dem_scaled_f_score', ascending=False).iloc[:10]

Unnamed: 0_level_0,democrat freq,republican freq,dem_precision,dem_freq_pct,dem_hmean,dem_precision_normcdf,dem_freq_pct_normcdf,dem_scaled_f_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
middle class,148,18,0.891566,0.000975,0.001948,0.769898,1.0,0.869991
auto,37,0,1.0,0.000244,0.000487,0.836137,0.888261,0.861411
fair,45,3,0.9375,0.000296,0.000593,0.799618,0.933098,0.861217
insurance,54,6,0.9,0.000356,0.000711,0.775533,0.965349,0.860093
forward,105,16,0.867769,0.000692,0.001382,0.753579,0.999849,0.85942
president barack,47,4,0.921569,0.00031,0.000619,0.789581,0.941764,0.858984
class,161,25,0.865591,0.00106,0.002118,0.752056,1.0,0.858484
middle,164,27,0.858639,0.00108,0.002158,0.747158,1.0,0.855284
the middle,98,17,0.852174,0.000645,0.00129,0.742558,0.99962,0.852124
medicare,84,15,0.848485,0.000553,0.001106,0.739914,0.997972,0.849783


In [11]:
term_freq_df['dem_corner_score'] = corpus.get_corner_scores('democrat')
term_freq_df.sort_values(by='dem_corner_score', ascending=False).iloc[:10]

Unnamed: 0_level_0,democrat freq,republican freq,dem_precision,dem_freq_pct,dem_hmean,dem_precision_normcdf,dem_freq_pct_normcdf,dem_scaled_f_score,dem_corner_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
auto,37,0,1.0,0.000244,0.000487,0.836137,0.888261,0.861411,0.919547
america forward,28,0,1.0,0.000184,0.000369,0.836137,0.81598,0.825935,0.919515
insurance companies,24,0,1.0,0.000158,0.000316,0.836137,0.776128,0.805015,0.919492
auto industry,24,0,1.0,0.000158,0.000316,0.836137,0.776128,0.805015,0.919492
pell,23,0,1.0,0.000151,0.000303,0.836137,0.765448,0.799233,0.919483
last week,22,0,1.0,0.000145,0.00029,0.836137,0.754493,0.793219,0.919472
pell grants,21,0,1.0,0.000138,0.000277,0.836137,0.743268,0.786972,0.91946
women 's,20,0,1.0,0.000132,0.000263,0.836137,0.731782,0.780487,0.919447
platform,20,0,1.0,0.000132,0.000263,0.836137,0.731782,0.780487,0.919447
coverage,18,0,1.0,0.000119,0.000237,0.836137,0.708061,0.766787,0.919411


In [12]:
term_freq_df = corpus.get_term_freq_df()
term_freq_df['Republican Score'] = corpus.get_scaled_f_scores('republican')
term_freq_df['Democratic Score'] = corpus.get_scaled_f_scores('democrat')
print("Top 10 Democratic terms")
pprint(list(term_freq_df.sort_values(by='Democratic Score', ascending=False).index[:10]))
print("Top 10 Republican terms")
pprint(list(term_freq_df.sort_values(by='Republican Score', ascending=False).index[:10]))

Top 10 Democratic terms
['middle class',
 'forward',
 'class',
 'middle',
 'the middle',
 'pay',
 'medicare',
 'education',
 'health',
 'president obama']
Top 10 Republican terms
['government',
 'administration',
 'business',
 'can do',
 'success',
 'story',
 'unemployment',
 'freedom',
 'paul',
 'do better']



## Make and visualize chart, scale based on raw frequency.
- A word used 10 times by Republicans will be at position 10 on the on the x-axis
- This isn't very useful. Everything but the most frequent terms are squished the lower-left corner
- The corner-distance scores are largely stopwords
- By default, color words by Scaled F-Score

In [14]:
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    transform=st.Scalers.scale,
                                    metadata=convention_df['speaker'])
file_name = 'output/Conventions2012ScattertextScale.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)


Using log scales seems to help a bit, but blank space and stop words still dominate the graph¶
The chracteristic terms look much more informative

In [15]:
html = st.produce_scattertext_explorer(corpus,
                                       category='democrat',
                                       category_name='Democratic',
                                       not_category_name='Republican',
                                       minimum_term_frequency=5,
                                       width_in_pixels=1000,
                                       transform=st.Scalers.log_scale_standardize)
file_name = 'output/Conventions2012ScattertextLog.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

## Rank terms by frequency percentiles instead of raw frequenies.¶
A term at the middle of the x-axis will be mentioned by Republicans at the median frequency.
This nicely distributes terms throughout the space
But, terms occuring with the same frequencies in both classes are stacked atop each other.
Can't mouseover points not at top of stack

In [17]:
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    transform=st.Scalers.percentile,
                                    metadata=convention_df['speaker'])
file_name = 'output/Conventions2012ScattertextRankData.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)