# Spacy

an nlp library that comes with a pretrained model that recognizes parts of speech, named entities and other linguistic features

In [14]:
import spacy
import pandas as pd
import re

In [5]:
nlp = spacy.load("en_core_web_sm")

In [6]:
emma_ja = "Emma woodhouse is a fictional character created by Jane Austen. She is the protagonist of Austen's novel 'Emma,' which was published in 1815. Emma is a young, wealthy, and intelligent woman living in the fictional village of Highbury, England. She is known for her beauty, charm, and strong-willed personality. Throughout the novel, Emma engages in matchmaking and meddling in the lives of those around her, often leading to humorous and sometimes complicated situations. Despite her flaws, Emma is a well-meaning character who ultimately learns valuable lessons about love, friendship, and self-awareness. She is a beloved figure in English literature and has been portrayed in various film and television adaptations. She represents themes of social class, personal growth, and the complexities of human relationships. She is a beloved figure in English literature and has been portrayed in various film and television adaptations. She represents themes of social class, personal growth, and the complexities of human relationships. "

In [12]:
emma = emma_ja.lower()
emma

"emma woodhouse is a fictional character created by jane austen. she is the protagonist of austen's novel 'emma,' which was published in 1815. emma is a young, wealthy, and intelligent woman living in the fictional village of highbury, england. she is known for her beauty, charm, and strong-willed personality. throughout the novel, emma engages in matchmaking and meddling in the lives of those around her, often leading to humorous and sometimes complicated situations. despite her flaws, emma is a well-meaning character who ultimately learns valuable lessons about love, friendship, and self-awareness. she is a beloved figure in english literature and has been portrayed in various film and television adaptations. she represents themes of social class, personal growth, and the complexities of human relationships. she is a beloved figure in english literature and has been portrayed in various film and television adaptations. she represents themes of social class, personal growth, and the c

In [18]:
#removing punctuations

emma = re.sub(r"[^\w\s]", "", emma)
emma

'emma woodhouse is a fictional character created by jane austen she is the protagonist of austens novel emma which was published in 1815 emma is a young wealthy and intelligent woman living in the fictional village of highbury england she is known for her beauty charm and strongwilled personality throughout the novel emma engages in matchmaking and meddling in the lives of those around her often leading to humorous and sometimes complicated situations despite her flaws emma is a wellmeaning character who ultimately learns valuable lessons about love friendship and selfawareness she is a beloved figure in english literature and has been portrayed in various film and television adaptations she represents themes of social class personal growth and the complexities of human relationships she is a beloved figure in english literature and has been portrayed in various film and television adaptations she represents themes of social class personal growth and the complexities of human relations

In [None]:
spacy_doc = nlp(emma) #process the text with spaCy and create a Doc object where each token has linguistic annotations

In [20]:
pos_df = pd.DataFrame(columns=["token", "pos_tag"])

In [None]:
for token in spacy_doc:
    pos_df = pd.concat([pos_df,
                        pd.DataFrame.from_records([{"token": token.text, "pos_tag": token.pos_}])], ignore_index=True)
    
# what this cell is doing is that it is iterating over each token in the spacy_doc object and for each token, it creates a new DataFrame with the token text and its corresponding part-of-speech tag. This new DataFrame is then concatenated to the existing pos_df DataFrame, effectively building up a complete DataFrame of tokens and their POS tags.

In [26]:
pos_df.head(15)

Unnamed: 0,token,pos_tag
0,emma,PROPN
1,woodhouse,PROPN
2,is,AUX
3,a,DET
4,fictional,ADJ
5,character,NOUN
6,created,VERB
7,by,ADP
8,jane,PROPN
9,austen,PROPN


In [27]:
# finding the most common tokens and their POS tags

pos_df_counts = pos_df.groupby(['token', 'pos_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)

#what is happening in this cell is that it is grouping the pos_df DataFrame by both the 'token' and 'pos_tag' columns, counting the occurrences of each unique combination of token and POS tag. The result is then reset to a new DataFrame with a 'counts' column that contains the frequency of each token-POS tag pair. Finally, the DataFrame is sorted in descending order based on the 'counts' column, allowing us to see the most common tokens along with their corresponding POS tags at the top of the DataFrame.

In [28]:
pos_df_counts.head(15)

Unnamed: 0,token,pos_tag,counts
4,and,CCONJ,22
36,in,ADP,16
51,of,ADP,14
38,is,AUX,14
67,the,DET,12
61,she,PRON,12
1,a,DET,10
19,emma,NOUN,8
32,her,PRON,6
31,has,AUX,4


In [30]:
pos_df_poscounts = pos_df_counts.groupby(['pos_tag'])['token'].count().sort_values(ascending=False)
pos_df_poscounts.head(15)

pos_tag
NOUN     28
ADJ      13
VERB     12
ADP       8
PROPN     6
PRON      5
AUX       4
ADV       3
DET       2
CCONJ     1
NUM       1
SCONJ     1
Name: token, dtype: int64

In [32]:
nouns = pos_df_counts[pos_df_counts['pos_tag'] == 'NOUN'][:10]
nouns.head(15)

Unnamed: 0,token,pos_tag,counts
19,emma,NOUN,8
66,television,NOUN,4
68,themes,NOUN,4
25,figure,NOUN,4
30,growth,NOUN,4
26,film,NOUN,4
58,relationships,NOUN,4
3,adaptations,NOUN,4
44,literature,NOUN,4
15,complexities,NOUN,4
