# Tokenization

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
#help(nlp)

In [4]:
text = "Apple is looking for buying. a U.K. startup for $1 billion"

In [5]:
doc = nlp(text)

In [6]:
for token in doc:
    print(token.text) #print(token) is also okey.

Apple
is
looking
for
buying
.
a
U.K.
startup
for
$
1
billion


# Part of Speech (POS)

In [7]:
doc

Apple is looking for buying. a U.K. startup for $1 billion

In [8]:
for token in doc:
    print(token.text,token.pos)

Apple 96
is 87
looking 100
for 85
buying 100
. 97
a 90
U.K. 96
startup 92
for 85
$ 99
1 93
billion 93


In [9]:
for token in doc:
    print(token.text,token.pos_)

Apple PROPN
is AUX
looking VERB
for ADP
buying VERB
. PUNCT
a DET
U.K. PROPN
startup NOUN
for ADP
$ SYM
1 NUM
billion NUM


# Visualization

In [10]:
from spacy import displacy

In [11]:
doc

Apple is looking for buying. a U.K. startup for $1 billion

In [12]:
displacy.render(doc,style = 'dep',options = {'distance':100, 'compact':True})

# Name Entity Recognition

In [13]:
for ent in doc.ents:
    print(ent.text,ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [14]:
displacy.render(doc,style='ent')

# Sentence Segmentation

In [15]:
text = 'Apple is looking for U.K. company to invest. But no permission given by goverment'

In [16]:
doc = nlp(text)

In [17]:
doc

Apple is looking for U.K. company to invest. But no permission given by goverment

In [18]:
for sent in doc.sents:
    print(sent)

Apple is looking for U.K. company to invest.
But no permission given by goverment


# Matcher

In [19]:
from spacy.matcher import Matcher
from spacy.tokens import Span

In [20]:
text = 'Hello, world! hello world'

In [21]:
doc = nlp(text)  

In [22]:
# sentiment value of docs
doc.sentiment

0.0

In [23]:
for token in doc:
    print(token)

Hello
,
world
!
hello
world


In [40]:
pattern = [{'LOWER':'hello'},{'IS_PUNCT':True,'OP':'?'},{'LOWER':'world'}]

In [41]:
#pattern = [[{'LOWER':'hello'},{'IS_PUNCT':True,'OP':'?'},{'LOWER':'world'}], [{'LOWER':'hey'}] ]

In [43]:
matcher = Matcher(nlp.vocab, validate = True)
matcher.add('hw',None,pattern)

In [44]:
matches = matcher(doc)

In [45]:
matches

[(17790654416186116455, 0, 3), (17790654416186116455, 4, 6)]

In [46]:
for match_id,start,end in matches:
    string_id = nlp.vocab.strings[match_id]
    matched_string = doc[start:end]
    print(match_id,string_id,start,end,matched_string)

17790654416186116455 hw 0 3 Hello, world
17790654416186116455 hw 4 6 hello world


# Regex

In [156]:
textregex = 'get the number 123. Extended version 1234567890.' 

In [157]:
import re

In [158]:
re.search(r'\d{3,10}',textregex)

<re.Match object; span=(15, 18), match='123'>

In [159]:
re.findall(r'\d{3,10}',textregex)

['123', '1234567890']

In [160]:
re.findall(r'\w+',textregex)

['get', 'the', 'number', '123', 'Extended', 'version', '1234567890']

# Process

In [161]:
import spacy

In [162]:
texts = ['net income was $9.4 million compared','revenue exceeds twelve billion dollars']

In [163]:
nlp = spacy.load('en_core_web_sm')

In [164]:
docs = nlp.pipe(texts,disable=['tagger','parser'])

for doc in docs:
    for ent in doc.ents:
        print(ent.text,ent.label_)
        
    print()

$9.4 million MONEY

twelve billion dollars MONEY



## Hashtags and Emoji Detection

In [165]:
pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]  # Positive emoji
neg_emoji = ["😞", "😠", "😩", "😢", "😭", "😒"]  # Negative emoji

In [166]:
pos = [[{'ORTH': emoji}] for emoji in pos_emoji]
neg = [[{'ORTH': emoji}] for emoji in neg_emoji]

In [81]:
neg

[[{'ORTH': '😞'}],
 [{'ORTH': '😠'}],
 [{'ORTH': '😩'}],
 [{'ORTH': '😢'}],
 [{'ORTH': '😭'}],
 [{'ORTH': '😒'}]]

In [82]:
matcher = Matcher(nlp.vocab)

In [83]:
# run when on_match
def label_sentiment(matcher, doc, i , matches):
    match_id, start, end = matches[i]
    if doc.vocab.strings[match_id]== 'happy':
        doc.sentiment += 0.1
    elif doc.vocab.strings[match_id] == 'sad':
        doc.sentiment -= 0.1

In [86]:
# 3 Different Rules
matcher.add("happy", label_sentiment, *pos)
matcher.add("sad", label_sentiment, *neg)
matcher.add('HASHTAG', None, [{'TEXT': '#'}, {'IS_ASCII': True}])


In [87]:
doc = nlp('Hello guys 😀😂 #kgptalkie')

In [88]:
matches = matcher(doc)
matches

[(244022080605231780, 2, 3),
 (244022080605231780, 3, 4),
 (16536914698459818706, 4, 6)]

In [94]:
# Reading matches
for match_id, starts, end in matches:
    string_id = doc.vocab.strings[match_id]
    span = doc[start:end]
    print(string_id, span)

happy 
happy 
HASHTAG #kgptalkie


In [95]:
doc.sentiment

0.20000000298023224