In [None]:
# Download the English spaCy model that a later cell loads under the name 'en'.
# NOTE(review): the 'en' shortcut is only understood by older spaCy releases — confirm the spaCy version this notebook targets.
!python -m spacy download en
# Install the wordcloud package used for the word-cloud figures; --no-cache-dir tells pip not to use its download cache.
!pip install --no-cache-dir wordcloud

In [None]:
import spacy
from datascience import *
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the whole poetry collection into a single string. The encoding is
# passed explicitly so the decoded text does not depend on the platform's
# default locale.
# NOTE(review): assumes the file is UTF-8 (plain ASCII also works) — confirm.
with open('data/islandpoetry1_22.txt', "r", encoding="utf-8") as f:
    raw = f.read()

In [None]:
# Echo the raw string; its repr makes the "\n" line breaks and the blank-line
# ("\n\n") separators that the later cells split on visible.
raw

In [None]:
# Build a word cloud from the full text using WordCloud's default settings.
wordcloud = WordCloud().generate(raw)

# Draw it on the current figure and hide the axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

# Rebuild the cloud with the largest font size capped at 40 and draw it on a
# second, separate figure so both versions are displayed.
wordcloud = WordCloud(max_font_size=40).generate(raw)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# Formal analysis: poem lengths and enjambment

In [None]:
# Number of chunks separated by a blank line; each chunk is treated as one poem.
len(raw.split("\n\n"))

In [None]:
# Length of every poem, measured in characters (line breaks included).
list(map(len, raw.split("\n\n")))

In [None]:
# Average poem length in characters across all blank-line-separated poems.
np.mean(list(map(len, raw.split("\n\n"))))

In [None]:
from string import punctuation

# Poems are separated by a blank line in the raw text.
poems = raw.split("\n\n")

# Per-poem share of lines that end without a punctuation mark, used here as a
# rough proxy for enjambment.
#   * Blank lines are not verse lines, so they are excluded from both the
#     count and the denominator (previously they were silently skipped by a
#     bare `except` but still inflated the denominator).
#   * Trailing whitespace is stripped so a line such as "word. " still counts
#     as punctuated.
#   * Chunks that contain no text at all are skipped, so this list can be
#     shorter than `poems`.
# Note: string.punctuation only covers ASCII punctuation marks.
all_poems_enjambment = []
for poem in poems:
    verse_lines = [line.rstrip() for line in poem.split("\n") if line.strip()]
    if not verse_lines:
        continue
    unpunctuated = sum(1 for line in verse_lines if line[-1] not in punctuation)
    all_poems_enjambment.append(unpunctuated / len(verse_lines))

print(np.mean(all_poems_enjambment))

# NLP text analysis

In [None]:
# Load spaCy's English pipeline under the shortcut name 'en'.
# NOTE(review): `parser=False` looks like the spaCy 1.x way of skipping the
# dependency parser, yet tablefy() further down reads `word.dep_` — confirm
# against the installed spaCy version whether the parser is actually disabled
# and whether that is intended.
nlp = spacy.load('en', parser=False)

In [None]:
# Turn every line break into a space so the collection is parsed as one running text.
parsed_text = nlp(" ".join(raw.split("\n")))

In [None]:
# Start the token table: one row per token, holding the token's text.
toks_tab = Table().with_column("Word", [token.text for token in parsed_text])
toks_tab.show()

In [None]:
# Add each token's part-of-speech tag (`pos_`) next to its text; this updates toks_tab in place.
toks_tab["POS"] = [token.pos_ for token in parsed_text]
toks_tab.show()

In [None]:
# Most frequent adjectives: keep tokens tagged ADJ, count each distinct word,
# and list the counts from highest to lowest.
(
    toks_tab
    .where("POS", are.equal_to("ADJ"))
    .group("Word")
    .sort("count", descending=True)
)

In [None]:
def tablefy(parsed_text):
    """Build a Table with one row per token of a parsed document.

    Args:
        parsed_text: An iterable of tokens (in this notebook, the result of
            calling ``nlp`` on the text). Each token must expose the
            attributes ``text``, ``pos_``, ``lemma_``, ``is_stop``,
            ``is_punct``, ``is_space``, ``like_num``, ``is_oov`` and ``dep_``.

    Returns:
        A new Table whose columns are, in order: "Word", "POS", "Lemma",
        "Stop Word", "Punctuation", "Space", "Number", "OOV", "Dependency".
    """
    # (column label, token attribute) pairs, in the order the columns appear.
    column_spec = [
        ("Word", "text"),
        ("POS", "pos_"),
        ("Lemma", "lemma_"),
        ("Stop Word", "is_stop"),
        ("Punctuation", "is_punct"),
        ("Space", "is_space"),
        ("Number", "like_num"),
        ("OOV", "is_oov"),
        ("Dependency", "dep_"),
    ]

    toks_tab = Table()
    for column_label, attribute in column_spec:
        column_values = [getattr(token, attribute) for token in parsed_text]
        toks_tab.append_column(label=column_label, values=column_values)
    return toks_tab

In [None]:
# Build the full per-token table for the parsed poems and display it.
tablefy(parsed_text).show()

In [None]:
# Word frequencies after dropping stop words and punctuation tokens,
# listed from most to least frequent.
(
    tablefy(parsed_text)
    .where("Stop Word", are.equal_to(False))
    .where("Punctuation", are.equal_to(False))
    .group("Word")
    .sort("count", descending=True)
)

In [None]:
# Lemma frequencies after dropping stop words, punctuation and whitespace
# tokens, listed from most to least frequent.
(
    tablefy(parsed_text)
    .where("Stop Word", are.equal_to(False))
    .where("Punctuation", are.equal_to(False))
    .where("Space", are.equal_to(False))
    .group("Lemma")
    .sort("count", descending=True)
)

# Named entity recognition (NER)

In [None]:
# One row per named entity found in the parsed text: its label and its text.
ner_tab = Table().with_columns(
    "NER Label", [entity.label_ for entity in parsed_text.ents],
    "NER Text", [entity.text for entity in parsed_text.ents],
)
ner_tab.show()

In [None]:
# Show only the entities labelled GPE.
(
    ner_tab
    .where("NER Label", are.equal_to("GPE"))
    .show()
)

In [None]:
# Horizontal bar chart of how often each GPE entity text occurs.
(
    ner_tab
    .where("NER Label", are.equal_to("GPE"))
    .to_df()['NER Text']
    .value_counts()
    .plot.barh()
)

In [None]:
# Horizontal bar chart of how often each PERSON entity text occurs.
(
    ner_tab
    .where("NER Label", are.equal_to("PERSON"))
    .to_df()['NER Text']
    .value_counts()
    .plot.barh()
)

In [None]:
# Horizontal bar chart of how often each ORG entity text occurs.
(
    ner_tab
    .where("NER Label", are.equal_to("ORG"))
    .to_df()['NER Text']
    .value_counts()
    .plot.barh()
)

In [None]:
# Horizontal bar chart of how often each DATE entity text occurs.
(
    ner_tab
    .where("NER Label", are.equal_to("DATE"))
    .to_df()['NER Text']
    .value_counts()
    .plot.barh()
)

In [None]:
# Horizontal bar chart of how often each TIME entity text occurs.
(
    ner_tab
    .where("NER Label", are.equal_to("TIME"))
    .to_df()['NER Text']
    .value_counts()
    .plot.barh()
)