# Lost in spaCy

## Dependencies

In [4]:
import pandas as pd # pandas provides the DataFrame used to hold the input data
import spacy # spaCy is a natural language processing (NLP) library
nlp = spacy.load("en_core_web_lg") # load the pretrained pipeline package named "en_core_web_lg" (it must be installed in this environment); calling nlp(text) returns the processed document

## Load Data
> 📌 I am using a sample dataset for this: a CSV file with two columns, `id` and `text`. The file is very simple by design, so that the only thing going in is what is needed to come out, i.e. the id of the text value and the text value itself. You can then join the results back up any way you like in Python or Power BI, but the idea is to keep things as simple as possible. The text I am using is a dataset of tweets, for demonstration purposes.

In [10]:
# Read the sample CSV (path is relative to the notebook's working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer='./data/text_small_example.csv')
# A bare expression on the last line of a cell is displayed as the cell's output
df

Unnamed: 0,id,text
0,1,"@elephantbird Hey dear, Happy Friday to You A..."
1,2,Ughhh layin downnnn Waiting for zeina to co...
2,3,"@greeniebach I reckon he'll play, even if he's..."
3,4,@vaLewee I know! Saw it on the news!
4,5,very sad that http://www.fabchannel.com/ has c...
...,...,...
95,96,i don't feel like art-ing now
96,97,pro-cras-ti-nating. (verb) to put of till anot...
97,98,oh and i cant forget my COW who doesnt even kn...
98,99,I really need to work on my sleep schedule.


## Part of Speech

In [6]:
# Walk the dataframe one row at a time
for idx, row in df.iterrows():
    # Only string values are processed; anything else (e.g. a missing value) is skipped
    if isinstance(row['text'], str):
        # Run the spaCy pipeline over the row's text
        doc = nlp(row['text'])
        # Emit one line per token: row id, token text, part-of-speech tag
        for token in doc:
            print(row["id"], token.text, token.pos_)

1 @elephantbird NOUN
1 Hey INTJ
1 dear INTJ
1 , PUNCT
1 Happy PROPN
1 Friday PROPN
1 to ADP
1 You PRON
1   SPACE
1 Already ADV
1 had VERB
1 your PRON
1 rice NOUN
1 's PART
1 bowl NOUN
1 for ADP
1 lunch NOUN
1 ? PUNCT
2 Ughhh INTJ
2 layin PROPN
2 downnnn ADV
2     SPACE
2 Waiting VERB
2 for ADP
2 zeina PROPN
2 to PART
2 cook VERB
2 breakfast NOUN
3 @greeniebach INTJ
3 I PRON
3 reckon VERB
3 he PRON
3 'll AUX
3 play VERB
3 , PUNCT
3 even ADV
3 if SCONJ
3 he PRON
3 's VERB
3 not PART
3 100% NUM
3 ... PUNCT
3 but CCONJ
3 i PRON
3 know VERB
3 nothing PRON
3 ! PUNCT
3 ! PUNCT
3 ;) PUNCT
3 It PRON
3 wo AUX
3 n't PART
3 be VERB
3 the DET
3 same ADJ
3 without ADP
3 him PRON
3 . PUNCT
4 @vaLewee PROPN
4 I PRON
4 know VERB
4 ! PUNCT
4   SPACE
4 Saw VERB
4 it PRON
4 on ADP
4 the DET
4 news NOUN
4 ! PUNCT
5 very ADV
5 sad ADJ
5 that SCONJ
5 http://www.fabchannel.com/ NOUN
5 has AUX
5 closed VERB
5 down ADV
5 . PUNCT
5 One NUM
5 of ADP
5 the DET
5 few ADJ
5 web NOUN
5 services NOUN
5 that DET
5 I PR

## Named Entity Recognition

In [7]:
# Walk the dataframe one row at a time
for idx, row in df.iterrows():
    # Only string values are processed; anything else (e.g. a missing value) is skipped
    if isinstance(row['text'], str):
        # Run the spaCy pipeline over the row's text
        doc = nlp(row['text'])
        # Emit one line per named entity: row id, entity text, entity label
        for e in doc.ents:
            print(row["id"], e.text, e.label_)

1 @elephantbird ORG
1 Friday DATE
2 Ughhh PERSON
2 zeina PERSON
3 @greeniebach ORG
5 One CARDINAL
5 over 5 years DATE
6 Radio 1 ORG
9 Expression Engine ORG
9 CMS ORG
11 VH1 ORG
13 tonight TIME
14 4 CARDINAL
15 Ashley   PERSON
15 the Hush Hush WORK_OF_ART
15 Hush Hush PERSON
16 summer DATE
17 2 CARDINAL
17 2 CARDINAL
18 morning TIME
20 &quot;Good ORG
20 Morning&quot NORP
20 the next month DATE
21 @katyperry ORG
22 quarter past midnight TIME
22 6 hours TIME
23 @yelyahwilliams ORG
24 tomorrow DATE
24 6 CARDINAL
24 wednesday DATE
25 YOU!x PERSON
26 3 yr old DATE
29 london GPE
29 American NORP
31 1.5 CARDINAL
31 3.5 CARDINAL
36 Longbranch NORP
37 St Louis GPE
41 Katy Perry's PERSON
45 Missy PERSON
45 thirteen DATE
46 Zexion PERSON
46 Feria DATE
47 Dis ORG
49 1 CARDINAL
49 15o PERSON
52 @MrPeterAndre PERSON
57 Jon PERSON
57 Kate PERSON
58 London GPE
58 today DATE
58 a lovely weekend DATE
59 TY ORG
59 Cruiser PRODUCT
59 20 yrs DATE
59 Vikki PERSON
60 two CARDINAL
60 7s CARDINAL
62 today DATE
