# Lost in spaCy

## Dependencies

In [4]:
import pandas as pd # pandas provides the data frame that holds the loaded CSV rows
import spacy # spaCy is a text analysis (natural language processing) library
nlp = spacy.load("en_core_web_lg") # load the "en_core_web_lg" pipeline by name; nlp(text) is called in the cells below to analyse each text value

## Load Data
> 📌 I am using a sample data set for this: a CSV with two columns called id and text. The file is very simple by design, so that the only thing going in is what is needed to come out, i.e. the id of the text value and the text value itself. You can then join this up any way you like in Python or Power BI, but the idea is to keep things as simple as possible. The text I am using is a dataset of tweets for demonstration purposes.

In [32]:
# Widen the column display first so the full text of each row is shown
pd.set_option('display.max_colwidth', None)
# Load just the first 2 rows of the sample file to demonstrate the code
df = pd.read_csv('./data/text_small_example.csv', nrows=2)
df


Unnamed: 0,id,text
0,1,"@elephantbird Hey dear, Happy Friday to You Already had your rice's bowl for lunch ?"
1,2,Ughhh layin downnnn Waiting for zeina to cook breakfast


## Part of Speech

In [30]:
# Pair each row's id with its text and walk through the dataframe row by row
for row_id, text in zip(df["id"], df["text"]):
    # Only string values are analysed; anything that is not a string is skipped
    if isinstance(text, str):
        # Run the nlp pipeline on the text and go through the resulting tokens
        for token in nlp(text):
            # Print the row id, the token text and the token's part-of-speech code
            print(row_id, token.text, token.pos_)

1 @elephantbird NOUN
1 Hey INTJ
1 dear INTJ
1 , PUNCT
1 Happy PROPN
1 Friday PROPN
1 to ADP
1 You PRON
1   SPACE
1 Already ADV
1 had VERB
1 your PRON
1 rice NOUN
1 's PART
1 bowl NOUN
1 for ADP
1 lunch NOUN
1 ? PUNCT
2 Ughhh INTJ
2 layin PROPN
2 downnnn ADV
2     SPACE
2 Waiting VERB
2 for ADP
2 zeina PROPN
2 to PART
2 cook VERB
2 breakfast NOUN


## Named Entity Recognition
> 📌 This is almost identical to the POS code and works exactly the same way, except that instead of using the POS outputs of nlp it uses the entity outputs.

In [31]:
# Pair each row's id with its text and walk through the dataframe row by row
for row_id, text in zip(df["id"], df["text"]):
    # Only string values are analysed; anything that is not a string is skipped
    if isinstance(text, str):
        # Run the nlp pipeline on the text and go through the entities it found
        for entity in nlp(text).ents:
            # Print the row id, the entity text and the entity label
            print(row_id, entity.text, entity.label_)

1 @elephantbird ORG
1 Friday DATE
2 Ughhh PERSON
2 zeina PERSON
