In [1]:
import spacy
from spacy.lang.en import English
import pandas as pd

### Text Comparison with spaCy and Scattertext

In [8]:
import scattertext as st
# Load the "en_core_web_sm" spaCy pipeline. Later cells add components to this
# shared `nlp` object and pass it to scattertext when building the corpus.
nlp = spacy.load("en_core_web_sm")

#### Analyzing text data from the party conventions during the 2012 US presidential election

In [13]:
import nltk
# Fetch the NLTK "wordnet" package (reported as up to date when already present).
# Presumably required by the spacy_wordnet component added in a later cell — confirm.
# This call is the cell's last expression, so its return value is displayed.
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dvellanki\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
## Customizing the pipeline
# Add the WordNet annotator to the shared `nlp` pipeline, directly after the tagger.
#
# The import is kept even though `WordnetAnnotator` is not referenced by name:
# per the spacy-wordnet README, importing the module is what makes the
# "spacy_wordnet" factory available to `nlp.add_pipe`.
from spacy_wordnet.wordnet_annotator import WordnetAnnotator

print("before", nlp.pipe_names)

# Guard on the name the component is actually registered under ("spacy_wordnet").
# The previous check looked for "WordnetAnnotator", which never appears in
# `nlp.pipe_names`, so re-running this cell tried to add the component a second
# time and failed. With the correct name the cell is safe to re-run.
if "spacy_wordnet" not in nlp.pipe_names:
    nlp.add_pipe("spacy_wordnet", after="tagger")
    print("after", nlp.pipe_names)

before ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
after ['tok2vec', 'tagger', 'spacy_wordnet', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [16]:
# Append spaCy's merge_entities and merge_noun_chunks components (in that order),
# skipping any that are already present so the cell can be re-run safely.
for component_name in ("merge_entities", "merge_noun_chunks"):
    if component_name not in nlp.pipe_names:
        nlp.add_pipe(component_name)

# Load scattertext's bundled 2012 convention sample data and build a corpus
# from it: the "party" column supplies the category, the "text" column the
# document text, and parsing is done with the customized `nlp` pipeline.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
corpus = st.CorpusFromPandas(
    convention_df,
    category_col="party",
    text_col="text",
    nlp=nlp,
).build()

#### Generate an interactive visualization in HTML

In [17]:
# Produce the scattertext explorer page as an HTML string, treating "democrat"
# as the focus category (labelled "Democratic") and everything else as
# "Republican". The speaker column is passed as the per-document metadata.
html = st.produce_scattertext_explorer(
    corpus,
    category="democrat",
    category_name="Democratic",
    not_category_name="Republican",
    width_in_pixels=1000,
    metadata=convention_df["speaker"],
)

In [18]:
## Save the generated page to disk and embed it in the notebook
from IPython.display import IFrame

file_name = "foo.html"

# Write the page as UTF-8 encoded bytes; the IFrame below refers to this file
# by path, so it has to exist before the frame is displayed.
with open(file_name, mode="wb") as html_file:
    html_file.write(html.encode("utf-8"))

# Last expression of the cell: the notebook displays the embedded frame.
IFrame(
    src=file_name,
    width=1200,
    height=700,
)