In [91]:
import scattertext as st
import re, io
import spacy
from pprint import pprint
import pandas as pd
import numpy as np
from scipy.stats import rankdata, hmean, norm
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer, SampleCorpora
display(HTML("<style>.container {width:98% !important;} </style"))

In [108]:
df = pd.read_csv('test.csv', encoding = "ISO-8859-1")

In [109]:
df.head()

Unnamed: 0,text,country
0,"Since its founding a century and a half ago, C...",Can
1,We will set Grand Challenges to put the United...,UK


In [113]:
df.isnull().sum()

text       0
country    0
dtype: int64

In [114]:
nlp = spacy.load('en')
corpus = st.CorpusFromPandas(df,
                            category_col='country',
                            text_col='text',
                            nlp=nlp).build()

In [118]:
html = st.produce_scattertext_explorer(corpus, 
                                     category='Can',
                                     category_name='Canada',
                                       not_category_name='UK',
                                     width_in_pixels=1000)
open("PolicyVis.html", 'wb').write(html.encode('utf-8'))

302354

In [122]:
# preprocessing UK and Canada csv files
# remove missing data, punctuation, ensure that everything is in two columns

can_df = pd.read_table('Can.txt', error_bad_lines=False)

b'Skipping line 511: expected 3 fields, saw 13\nSkipping line 562: expected 3 fields, saw 6\nSkipping line 827: expected 3 fields, saw 4\nSkipping line 977: expected 3 fields, saw 8\nSkipping line 1123: expected 3 fields, saw 4\nSkipping line 1199: expected 3 fields, saw 6\nSkipping line 1239: expected 3 fields, saw 10\nSkipping line 1240: expected 3 fields, saw 10\nSkipping line 5826: expected 3 fields, saw 4\nSkipping line 5948: expected 3 fields, saw 6\nSkipping line 8194: expected 3 fields, saw 5\nSkipping line 9261: expected 3 fields, saw 7\nSkipping line 9578: expected 3 fields, saw 7\n'


In [123]:
can_df.head()

Unnamed: 0,Unnamed: 1.1,Unnamed: 1,Unnamed: 3
0,Tabled in the House of Commons,,
1,"by the Honourable William Francis Morneau, P.C...",,
2,"March 22, 2017",,
3,©Her Majesty the Queen in Right of Canada (201...,,
4,All requests for permission to reproduce this ...,,


In [127]:
can_df.notnull().sum()

              7900
Unnamed: 1     596
               307
dtype: int64

In [128]:
can_df.isnull().sum()

               303
Unnamed: 1    7607
              7896
dtype: int64

In [125]:
can_df.shape

(8203, 3)

In [130]:
can_df.columns

Index(['       ', 'Unnamed: 1', '   '], dtype='object')

In [132]:
can_df.columns = ['text', 'country', 'other']

In [133]:
can_df.columns

Index(['text', 'country', 'other'], dtype='object')

In [134]:
can_df2 = can_df

In [139]:
can_df2['country'].value_counts()
#303 of the 596 non-null values in country are bullets - can get rid of all these 

•                                      303
0                                       10
15                                       6
127                                      4
191                                      4
170                                      3
71                                       3
130                                      3
65                                       3
100                                      3
72                                       3
179                                      3
190                                      3
128                                      3
85                                       3
142                                      3
119                                      3
189                                      3
93                                       3
66                                       3
161                                      2
187                                      2
Projected                                2
59         

In [140]:
can_df2['other'].value_counts()
#this text seems to be worth keeping 

Double the number of high-growth companies in Canada, particularly in the digital, clean technology and health technology sectors, from 14,000 to 28,000 by 2025.                                                                                                                                                                                                                                                                                                                                                                                                                               2
Expanding e-prescribing and virtual care initiatives, supporting the continued adoption and use of electronic medical records, helping patients to access their own health records electronically, and better linking electronic health record systems to improve access by all providers and institutions through an investment of $300 million over five years, starting in 2017–18, for Canada Health Infoway.                     

In [148]:
can_df2['combined_txt'] = can_df2['text'] + can_df2['other']

In [149]:
can_df2.notnull().sum()

text            7900
country          596
other            307
combined_txt       4
dtype: int64

In [150]:
can_df.notnull().sum()

text            7900
country          596
other            307
combined_txt       4
dtype: int64

In [147]:
can_df2.drop('country', axis=1)

Unnamed: 0,text,other,combined_txt
0,Tabled in the House of Commons,,
1,"by the Honourable William Francis Morneau, P.C...",,
2,"March 22, 2017",,
3,©Her Majesty the Queen in Right of Canada (201...,,
4,All requests for permission to reproduce this ...,,
5,the Department of Finance Canada.,,
6,"For more information, please contact Service C...",,
7,TTY: 1-800-926-9105,,
8,Cat. No.: F1-23/3E-PDF ISSN: 1719-7740,,
9,This document is available on the Internet at ...,,


In [156]:
can_df3 = can_df2.drop(['combined_txt', 'country', 'other'], axis=1)

In [157]:
can_df3.notnull().sum()

text    7900
dtype: int64

In [158]:
can_df3['country'] = 'canada'

In [159]:
can_df3.head()
# final dataset to be combined with UK data

Unnamed: 0,text,country
0,Tabled in the House of Commons,canada
1,"by the Honourable William Francis Morneau, P.C...",canada
2,"March 22, 2017",canada
3,©Her Majesty the Queen in Right of Canada (201...,canada
4,All requests for permission to reproduce this ...,canada


In [161]:
uk_df = pd.read_table('UK.txt', error_bad_lines=False)

b'Skipping line 76: expected 1 fields, saw 2\nSkipping line 77: expected 1 fields, saw 2\nSkipping line 78: expected 1 fields, saw 2\nSkipping line 79: expected 1 fields, saw 2\nSkipping line 80: expected 1 fields, saw 2\nSkipping line 81: expected 1 fields, saw 2\nSkipping line 82: expected 1 fields, saw 2\nSkipping line 83: expected 1 fields, saw 2\nSkipping line 84: expected 1 fields, saw 2\nSkipping line 85: expected 1 fields, saw 2\nSkipping line 659: expected 1 fields, saw 17\nSkipping line 745: expected 1 fields, saw 11\nSkipping line 804: expected 1 fields, saw 4\nSkipping line 805: expected 1 fields, saw 2\nSkipping line 806: expected 1 fields, saw 3\nSkipping line 1268: expected 1 fields, saw 2\nSkipping line 1283: expected 1 fields, saw 5\nSkipping line 1418: expected 1 fields, saw 7\nSkipping line 1693: expected 1 fields, saw 13\nSkipping line 1835: expected 1 fields, saw 2\nSkipping line 1836: expected 1 fields, saw 6\nSkipping line 2091: expected 1 fields, saw 3\nSkipping

In [162]:
uk_df.describe

<bound method NDFrame.describe of                                     Industrial Strategy
0                 Building a Britain fit for the future
1                       Industrial Strategy White Paper
2                                              Contents
3                                                     2
4                      Foreword from the Prime Minister
5     Over the last seven years, we have made huge p...
6     We should take enormous pride in these achieve...
7     For me it is not enough to see growth in the n...
8     local economy is shrinking. It is not ambitiou...
9     
That is why one of my first actions as Prime ...
10    Strategy that would help businesses to create ...
11    a vital step in delivering that vision. More t...
12                                                    4
13    That is exactly what this Industrial Strategy ...
14    a partnership between government and industry ...
15    and big data to clean energy and self-driving ...
16    Two cent

In [163]:
uk_df.columns = ['text']

In [164]:
uk_df['country'] = 'uk'

In [165]:
uk_df.head()
# final dataset

Unnamed: 0,text,country
0,Building a Britain fit for the future,uk
1,Industrial Strategy White Paper,uk
2,Contents,uk
3,2,uk
4,Foreword from the Prime Minister,uk


In [166]:
frames = [uk_df, can_df3]

In [167]:
combined = pd.concat(frames)

In [171]:
combined.describe()

Unnamed: 0,text,country
count,11111,11414
unique,6966,2
top,0,canada
freq,452,8203


In [176]:
combined.groupby('country').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
canada,7900,4233,0,447
uk,3211,2939,Industrial Strategy White Paper,51


In [177]:
nlp = spacy.load('en')
corpus = st.CorpusFromPandas(combined,
                            category_col='country',
                            text_col='text',
                            nlp=nlp).build()

In [178]:
html = st.produce_scattertext_explorer(corpus, 
                                     category='canada',
                                     category_name='Canada',
                                       not_category_name='UK',
                                     width_in_pixels=1000)
open("PolicyVis.html", 'wb').write(html.encode('utf-8'))

1946068

In [179]:
combined2 = combined

In [180]:
from gensim.parsing.preprocessing import preprocess_documents

In [182]:
preprocess_documents(combined2)

[['text'], ['countri']]

In [183]:
combined2.head()

Unnamed: 0,text,country
0,Building a Britain fit for the future,uk
1,Industrial Strategy White Paper,uk
2,Contents,uk
3,2,uk
4,Foreword from the Prime Minister,uk


In [194]:
#preprocessing text - remove page numbers, remove pluralised words, punctuation
combined2['text'] = combined2['text'].str.replace('[^\W\S]', '')

In [202]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

SyntaxError: invalid syntax (<ipython-input-202-0826e909d3a4>, line 9)

In [199]:
combined2.describe()

Unnamed: 0,text,country
count,11111,11414
unique,6966,2
top,0,canada
freq,452,8203


In [200]:
nlp = spacy.load('en')
corpus = st.CorpusFromPandas(combined2,
                            category_col='country',
                            text_col='text',
                            nlp=nlp).build()

html = st.produce_scattertext_explorer(corpus, 
                                     category='canada',
                                     category_name='Canada',
                                       not_category_name='UK',
                                     width_in_pixels=1000)
open("PolicyVis2.html", 'wb').write(html.encode('utf-8'))

1946068

In [204]:
feat_builder = st.FeatsFromOnlyEmpath()
empath_corpus = st.CorpusFromParsedDocuments(combined2,
                                             category_col='country',
                                             feats_from_spacy_doc=feat_builder,
                                             parsed_col='text').build()
html = st.produce_scattertext_explorer(empath_corpus,
                                        category='canada',
                                        category_name='Canada',
                                        not_category_name='UK',
                                        width_in_pixels=1000,
                                        use_non_text_features=True,
                                        use_full_doc=True,
                                        topic_model_term_lists=feat_builder.get_top_model_term_lists())
open("Convention-Visualization-Empath.html", 'wb').write(html.encode('utf-8'))

2123716

In [205]:
from scattertext import word_similarity_explorer
html = word_similarity_explorer(corpus,
                                 category='canada',
                                 category_name='Canada',
                                 not_category_name='UK',
                                 target_term='jobs',
                                 minimum_term_frequency=5,
                                 pmi_threshold_coefficient=4,
                                 width_in_pixels=1000,
                                 alpha=0.01,
                                 max_p_val=0.05,
                                 save_svg_button=True)
open("Convention-Visualization-Jobs.html", 'wb').write(html.encode('utf-8'))

  bigram_prob[bigram] / np.product([unigram_prob[word] for word in bigram.split(' ')])


2948771

In [206]:
from scattertext import word_similarity_explorer
html = word_similarity_explorer(corpus,
                                 category='canada',
                                 category_name='Canada',
                                 not_category_name='UK',
                                 target_term='innovation',
                                 minimum_term_frequency=5,
                                 pmi_threshold_coefficient=4,
                                 width_in_pixels=1000,
                                 alpha=0.01,
                                 max_p_val=0.05,
                                 save_svg_button=True)
open("Convention-Visualization-Innovation.html", 'wb').write(html.encode('utf-8'))

  bigram_prob[bigram] / np.product([unigram_prob[word] for word in bigram.split(' ')])


2910192