# Code

## Collecting the text in a DataFrame and Tokenizing

In [None]:
# imports
import requests
import pandas as pd
import spacy

In [2]:
# Set up the spaCy NLP pipeline.
nlp = spacy.load("en_core_web_sm")
# Turn off the 'ner' and 'parser' components; the lemma helpers in this
# notebook only read token.pos_ and token.lemma_.
nlp.disable_pipes('ner', 'parser')

['ner', 'parser']

In [3]:
def get_text(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the resource to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.HTTPError: If the server replies with a 4xx/5xx
            status code.
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of returning the error page,
    # which would otherwise be processed downstream as if it were the text.
    response.raise_for_status()
    return response.text

In [4]:
def divide_paras(text, start, end, para_break):
    """Slice ``text`` to ``[start:end]`` and split that slice on ``para_break``.

    Returns the list of paragraph strings produced by the split.
    """
    return text[start:end].split(para_break)

In [5]:
# Plain-text sources for the tragedies; every entry carries the same genre
# label plus a short title and the URL the text is downloaded from.
tragedies = [
    {'genre': 'tragedy', 'title': title, 'url': url}
    for title, url in (
        ('hamlet', 'https://www.gutenberg.org/cache/epub/1524/pg1524.txt'),
        ('lear', 'https://www.gutenberg.org/cache/epub/1532/pg1532.txt'),
        ('romeo_juliet', 'https://www.gutenberg.org/cache/epub/1513/pg1513.txt'),
        ('macbeth', 'https://www.gutenberg.org/cache/epub/1533/pg1533.txt'),
        ('othello', 'https://www.gutenberg.org/cache/epub/1531/pg1531.txt'),
        ('coriolanus', 'https://www.gutenberg.org/cache/epub/1535/pg1535.txt'),
    )
]

In [6]:
# Marker strings that delimit the work inside each downloaded file; only the
# text between them is kept, split into paragraphs on blank lines.
start = '*** START OF THE PROJECT GUTENBERG EBOOK'
end = '*** END OF THE PROJECT GUTENBERG EBOOK'
para_break = '\r\n\r\n'
data = {'genre': [], 'title': [], 'text': []}
for item in tragedies:
    genre = item['genre']
    title = item['title']
    text = get_text(item['url'])
    start_index = text.find(start)
    end_index = text.find(end)
    # str.find returns -1 on a miss; used as a slice bound that would silently
    # select the wrong span, so stop with a clear error instead.
    if start_index == -1 or end_index == -1:
        raise ValueError(
            'start/end marker not found in text for {!r}'.format(title)
        )
    # NOTE: the slice begins at the start marker, so the marker line itself
    # ends up in the first paragraph of each work.
    paras = divide_paras(text=text, start=start_index, end=end_index, para_break=para_break)
    # One row per paragraph, labelled with the work it came from.
    data['genre'].extend([genre] * len(paras))
    data['title'].extend([title] * len(paras))
    data['text'].extend(paras)
# Build the frame once, after every work has been collected (previously it was
# rebuilt on each loop iteration and left undefined for an empty list).
tragedies_df = pd.DataFrame.from_dict(data)

In [7]:
# Preview 10 randomly chosen rows (no random_state is set, so the rows differ between runs).
tragedies_df.sample(10)

Unnamed: 0,genre,title,text
2592,tragedy,lear,GONERIL.\r\nCombine together ’gainst the enemy...
6962,tragedy,coriolanus,MENENIUS.\r\nHis nature is too noble for the w...
2024,tragedy,lear,CORNWALL.\r\nLet us withdraw; ’twill be a storm.
6145,tragedy,othello,"OTHELLO.\r\nNay, stare not, masters, it is tru..."
6362,tragedy,coriolanus,"Enter Volumnia and Virgilia, mother and wife t..."
6429,tragedy,coriolanus,MARTIUS.\r\nThey fear us not but issue forth t...
3623,tragedy,romeo_juliet,"Sirrah, go hire me twenty cunning cooks."
3592,tragedy,romeo_juliet,JULIET.\r\nWhat must be shall be.
313,tragedy,hamlet,"HORATIO.\r\nO day and night, but this is wondr..."
491,tragedy,hamlet,"GUILDENSTERN.\r\nO, there has been much throwi..."


In [9]:
def get_noun_lemmas(text):
    """Return the lemmas of every NOUN-tagged token in ``text``, space-separated.

    ``text`` is run through the module-level ``nlp`` pipeline. The result is an
    empty string when no token is tagged as a noun.
    """
    return ' '.join(tok.lemma_ for tok in nlp(text) if tok.pos_ == 'NOUN')

In [10]:
def get_adj_lemmas(text):
    """Return the lemmas of every ADJ-tagged token in ``text``, space-separated.

    ``text`` is run through the module-level ``nlp`` pipeline. The result is an
    empty string when no token is tagged as an adjective.
    """
    return ' '.join(tok.lemma_ for tok in nlp(text) if tok.pos_ == 'ADJ')

In [11]:
def get_verb_lemmas(text):
    """Return the lemmas of every VERB-tagged token in ``text``, space-separated.

    ``text`` is run through the module-level ``nlp`` pipeline. The result is an
    empty string when no token is tagged as a verb.
    """
    return ' '.join(tok.lemma_ for tok in nlp(text) if tok.pos_ == 'VERB')

In [12]:
# Add a 'nouns' column: the space-joined noun lemmas of each row's text.
tragedies_df['nouns'] = tragedies_df['text'].apply(get_noun_lemmas)

In [13]:
# Add an 'adjectives' column: the space-joined adjective lemmas of each row's text.
tragedies_df['adjectives'] = tragedies_df['text'].apply(get_adj_lemmas)

In [14]:
# Add a 'verbs' column: the space-joined verb lemmas of each row's text.
tragedies_df['verbs'] = tragedies_df['text'].apply(get_verb_lemmas)

In [15]:
# Keep only rows that have at least one noun, one adjective and one verb
# lemma, i.e. drop rows where any of the three columns is empty or
# whitespace-only.
tragedies_df = tragedies_df[
    tragedies_df['nouns'].str.strip().astype(bool)
    & tragedies_df['adjectives'].str.strip().astype(bool)
    & tragedies_df['verbs'].str.strip().astype(bool)
]

In [16]:
# Preview 10 randomly chosen rows of the filtered frame (no random_state, so the rows differ between runs).
tragedies_df.sample(10)

Unnamed: 0,genre,title,text,nouns,adjectives,verbs
461,tragedy,hamlet,"GUILDENSTERN.\r\nWhich dreams, indeed, are amb...",ambition substance shadow dream,very ambitious,dream
48,tragedy,hamlet,"BARNARDO.\r\nSit down awhile,\r\nAnd let us on...",BARNARDO ear story night,fortified,sit let assail see
1973,tragedy,lear,REGAN.\r\nI am glad to see your highness.,highness,glad,see
204,tragedy,hamlet,OPHELIA.\r\nAnd hath given countenance to his ...,OPHELIA hath countenance speech vow,holy,give
3795,tragedy,romeo_juliet,[_Breaking open the door of the monument._],door monument,open,break
3812,tragedy,romeo_juliet,"Enter, at the other end of the Churchyard, Fr...",end lantern crow,other,spade
5427,tragedy,othello,OTHELLO.\r\nAvaunt! be gone! Thou hast set me ...,hast rack abus’d,thou much little,go set swear tis know’t
815,tragedy,hamlet,QUEEN.\r\nWhat wilt thou do? Thou wilt not mur...,wilt thou wilt,thou,queen murder help help
7173,tragedy,coriolanus,VOLUMNIA.\r\nTake my prayers with you.\r\nI wo...,prayer god curse day heart to’t,heavy,take confirm meet unclog lie
938,tragedy,hamlet,"And England, if my love thou hold’st at aught,...",love thou aught power sense cicatrice sword ho...,great thee raw red danish free sovereign full ...,give look pay set conjure rage cure know do ho...


In [17]:
# Write the frame to tragedies.csv in the working directory, leaving out the index column.
tragedies_df.to_csv('tragedies.csv', index=False)

In [18]:
# Plain-text sources for the comedies; every entry carries the same genre
# label plus a short title and the URL the text is downloaded from.
comedies = [
    {'genre': 'comedy', 'title': title, 'url': url}
    for title, url in (
        ('midsummer', 'https://www.gutenberg.org/cache/epub/1514/pg1514.txt'),
        ('shrew', 'https://www.gutenberg.org/cache/epub/1508/pg1508.txt'),
        ('twelfth', 'https://www.gutenberg.org/cache/epub/1526/pg1526.txt'),
        ('winters', 'https://www.gutenberg.org/cache/epub/1539/pg1539.txt'),
        ('much_ado', 'https://www.gutenberg.org/cache/epub/1519/pg1519.txt'),
        ('tempest', 'https://www.gutenberg.org/cache/epub/1540/pg1540.txt'),
    )
]

In [19]:
# Marker strings that delimit the work inside each downloaded file; only the
# text between them is kept, split into paragraphs on blank lines.
start = '*** START OF THE PROJECT GUTENBERG EBOOK'
end = '*** END OF THE PROJECT GUTENBERG EBOOK'
para_break = '\r\n\r\n'
data = {'genre': [], 'title': [], 'text': []}
for item in comedies:
    genre = item['genre']
    title = item['title']
    text = get_text(item['url'])
    start_index = text.find(start)
    end_index = text.find(end)
    # str.find returns -1 on a miss; used as a slice bound that would silently
    # select the wrong span, so stop with a clear error instead.
    if start_index == -1 or end_index == -1:
        raise ValueError(
            'start/end marker not found in text for {!r}'.format(title)
        )
    # NOTE: the slice begins at the start marker, so the marker line itself
    # ends up in the first paragraph of each work.
    paras = divide_paras(text=text, start=start_index, end=end_index, para_break=para_break)
    # One row per paragraph, labelled with the work it came from.
    data['genre'].extend([genre] * len(paras))
    data['title'].extend([title] * len(paras))
    data['text'].extend(paras)
# Build the frame once, after every work has been collected (previously it was
# rebuilt on each loop iteration and left undefined for an empty list).
comedies_df = pd.DataFrame.from_dict(data)

In [None]:
# Preview 10 randomly chosen rows (no random_state is set, so the rows differ between runs).
comedies_df.sample(10)

Unnamed: 0,genre,title,text
437,comedy,midsummer,"PUCK.\r\nHo, ho, ho! Coward, why com’st thou not?"
5179,comedy,tempest,"ARIEL.\r\nMy lord, it shall be done."
3792,comedy,winters,Enter Shepherd and Clown.
4814,comedy,much_ado,"BENEDICK.\r\nSir, I shall meet your wit in the..."
2557,comedy,twelfth,"MALVOLIO.\r\nMy prayers, minx?"
3362,comedy,winters,
2759,comedy,twelfth,"CLOWN.\r\nAlas, sir, how fell you besides your..."
2590,comedy,twelfth,FABIAN.\r\nHere he comes with your niece; give...
68,comedy,midsummer,"HERMIA.\r\nI give him curses, yet he gives me ..."
3154,comedy,winters,LEONTES.\r\nCease; no more.\r\nYou smell this ...


In [None]:
# Add a 'nouns' column: the space-joined noun lemmas of each row's text.
comedies_df['nouns'] = comedies_df['text'].apply(get_noun_lemmas)

In [None]:
# Add an 'adjectives' column: the space-joined adjective lemmas of each row's text.
comedies_df['adjectives'] = comedies_df['text'].apply(get_adj_lemmas)

In [None]:
# Add a 'verbs' column: the space-joined verb lemmas of each row's text.
comedies_df['verbs'] = comedies_df['text'].apply(get_verb_lemmas)

In [None]:
# Keep only rows that have at least one noun, one adjective and one verb
# lemma, i.e. drop rows where any of the three columns is empty or
# whitespace-only.
comedies_df = comedies_df[
    comedies_df['nouns'].str.strip().astype(bool)
    & comedies_df['adjectives'].str.strip().astype(bool)
    & comedies_df['verbs'].str.strip().astype(bool)
]

In [None]:
# Write the frame to comedies.csv in the working directory, leaving out the index column.
comedies_df.to_csv('comedies.csv', index=False)