## Day 1

In [None]:
import spacy

In [None]:
# Load the small English pipeline; `nlp` turns raw text into a Doc in the cells below.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Process one sample sentence; the resulting Doc is iterated token by token below.
doc = nlp("Tea is healthy and calming, don't you think?") 

In [None]:
# Show how the sentence was tokenized: one token per output line.
for token in doc:
    print(token.text)

Tea
is
healthy
and
calming
,
do
n't
you
think
?


In [None]:
# Print a tab-separated table with each token's text, lemma, and stop-word flag.
# The header is a plain string: the previous f-string had no placeholders and the
# chained .format(...) call received arguments it never used.
print("Token \t\tLemma \t\tStopword")
print("----" * 10)
for token in doc:
    print(f"{token.text}\t\t{token.lemma_}\t\t{token.is_stop}")

Token 		Lemma 		Stopword
----------------------------------------
Tea		tea		False
is		be		True
healthy		healthy		False
and		and		True
calming		calm		False
,		,		False
do		do		True
n't		not		True
you		-PRON-		True
think		think		False
?		?		False


## Day 2

In [None]:
from spacy.matcher import PhraseMatcher

In [None]:
# Phrase matcher sharing the pipeline's vocab; attr='LOWER' asks it to compare the
# LOWER token attribute rather than the exact token text.
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')

In [None]:
# Phrases that are registered under match id "A" further down.
terms1 = ['Galaxy Note','iPhone 11']

In [None]:
# Phrases that are registered under match id "B" further down.
terms2 = ['iPhone XS','Google Pixel']

In [None]:
# Matching on attr='LOWER' only needs tokenized Docs, so build the patterns with
# nlp.make_doc (tokenizer only) instead of running the tagger/parser/NER on each phrase.
patterns1 = [nlp.make_doc(text) for text in terms1]

In [None]:
# Matching on attr='LOWER' only needs tokenized Docs, so build the patterns with
# nlp.make_doc (tokenizer only) instead of running the tagger/parser/NER on each phrase.
patterns2 = [nlp.make_doc(text) for text in terms2]

In [None]:
matcher.add("A",patterns1)  # register the first pattern list under match id "A"

In [None]:
matcher.add("B",patterns2)  # register the second pattern list under match id "B"

In [None]:
# Sample review text to search; it mentions phrases from both pattern lists.
text_doc = nlp("Glowing review overall, and some really interesting side-by-side "
                "photography tests pitting the iPhone 11 Pro against the "
                "Galaxy Note 10 Plus and last year’s iPhone XS and Google Pixel 3 .")

In [None]:
# List the tokens so the start/end indices reported by the matcher can be read off.
for token in text_doc:
    print(token.text)

Glowing
review
overall
,
and
some
really
interesting
side
-
by
-
side
photography
tests
pitting
the
iPhone
11
Pro
against
the
Galaxy
Note
10
Plus
and
last
year
’s
iPhone
XS
and
Google
Pixel
3
.


In [None]:
matches = matcher(text_doc)  # matches for every registered id (both "A" and "B"), not only "A"

In [None]:
# Each entry is a (match_id, start, end) tuple; it is unpacked that way below.
print(matches)

[(14862748245026736845, 17, 19), (14862748245026736845, 22, 24), (14230521632333904559, 30, 32), (14230521632333904559, 33, 35)]


In [None]:
# Take the third match (index 2) and split it into its id and token offsets.
match_id, start, end = matches[2]

In [None]:
# Numeric id of the match; it is resolved to its string label via nlp.vocab.strings below.
match_id

14230521632333904559

In [None]:
# Lower bound used to slice text_doc for the matched span.
start

30

In [None]:
# Upper (exclusive) bound used to slice text_doc for the matched span.
end 

32

In [None]:
# Slice the Doc with the match offsets to recover the matched text.
print(text_doc[start:end])

iPhone XS


In [None]:
# Look the numeric match id up in the vocab's string store to get the label it was added under.
print(nlp.vocab.strings[match_id])

B


In [None]:
ls

[0m[01;34msample_data[0m/  spam3.csv


## Day 3
## Text classification

In [None]:
import pandas as pd

In [None]:
# Read the SMS spam dataset. With encoding='cp437' the prices in the messages came
# out as "σú1.50": cp437 maps byte 0xA3 to 'ú', whereas latin-1 maps it to '£'.
data = pd.read_csv('spam3.csv', encoding='latin-1')

In [None]:
# Peek at the raw frame: v1 holds the ham/spam label, v2 the message text.
data.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [None]:
# Keep only the label (v1) and message text (v2) columns; the other columns are dropped.
data = data.loc[:, ['v1', 'v2']]

In [None]:
# Check the frame after the column selection: only v1 and v2 remain.
data.head(10)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [None]:
import spacy

In [None]:
# Start from a blank English pipeline; the text classifier is added to it below.
# Note: this rebinds `nlp`, replacing the en_core_web_sm pipeline loaded earlier.
nlp = spacy.blank("en")

In [None]:
# Build a text-categorizer component configured for mutually exclusive classes
# and the "bow" architecture.
textcat = nlp.create_pipe(
    "textcat",
    config={"exclusive_classes": True, "architecture": "bow"},
)

In [None]:
# Attach the text categorizer to the (otherwise empty) pipeline.
nlp.add_pipe(textcat)

In [None]:
# Register the two classes; their names match the values found in data['v1'].
textcat.add_label("ham")
textcat.add_label("spam")

1

In [None]:
# Message texts (column v2) as an array, in the same row order as the labels built below.
train_texts = data['v2'].values

In [None]:
# One annotation dict per message: under 'cats', the true class maps to True
# and the other class to False.
train_labels = [
    {'cats': {'ham': kind == 'ham', 'spam': kind == 'spam'}}
    for kind in data['v1']
]

In [None]:
# Pair each text with its annotation dict; a list (not a lazy zip) so it can be
# shuffled in place and iterated once per epoch.
train_data = list(zip(train_texts, train_labels))

In [None]:
# Preview a few (text, annotations) pairs; displaying the whole list dumps every
# message in the dataset into the cell output.
train_data[:5]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  {'cats': {'ham': True, 'spam': False}}),
 ('Ok lar... Joking wif u oni...', {'cats': {'ham': True, 'spam': False}}),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  {'cats': {'ham': False, 'spam': True}}),
 ('U dun say so early hor... U c already then say...',
  {'cats': {'ham': True, 'spam': False}}),
 ("Nah I don't think he goes to usf, he lives around here though",
  {'cats': {'ham': True, 'spam': False}}),
 ("FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, σú1.50 to rcv",
  {'cats': {'ham': False, 'spam': True}}),
 ('Even my brother is not like to speak with me. They treat me like aids patent.',
  {'cats': {'ham': True, 'spam': False}}),
 ("As per your request 'Melle

In [None]:
from spacy.util import minibatch

In [None]:
# Fix spaCy's random seed before training so the run is repeatable.
spacy.util.fix_random_seed(1)

In [None]:
# Prepare the pipeline for training; the returned optimizer is passed to nlp.update below.
optimizer = nlp.begin_training()

In [None]:
# Split the training examples into batches of 8. The multi-epoch loop further down
# rebuilds this object at the start of every epoch.
batches = minibatch(train_data, size=8)

In [None]:
# Single pass over the data: update the model one minibatch at a time.
# nlp.update accumulates the loss into `losses1`, so the printed value is the
# total over all batches of this pass.
losses1 = {}
for batch in batches:
  # Each batch is a list of (text, annotations) pairs; split it into two parallel tuples.
  texts, labels = zip(*batch)
  nlp.update(texts, labels, sgd=optimizer,losses=losses1)
print(losses1)

{'textcat': 1.3199655965818238}


Training for multiple epochs: reshuffle the training data and repeat the minibatch loop once per epoch.

In [None]:
import random

In [None]:
# Seed both Python's `random` (used by random.shuffle in the epoch loop) and spaCy.
random.seed(1)
spacy.util.fix_random_seed(1)

In [None]:
# Get a fresh optimizer for the multi-epoch run.
# NOTE(review): the pipeline was already updated by the single-pass loop above;
# confirm whether begin_training() resets the weights or training continues from there.
optimizer = nlp.begin_training()

In [None]:
# Train for 10 epochs, reshuffling the examples before each one.
# nlp.update adds to whatever is already in the losses dict, so the dict is reset at
# the start of every epoch. Previously it was created once, which made each printed
# value the running total over all epochs so far (the numbers only ever went up).
losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    batches = minibatch(train_data, size=8)
    losses = {}  # per-epoch loss, not cumulative
    for batch in batches:
        # Each batch is a list of (text, annotations) pairs.
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    print(losses)

{'textcat': 0.43034495428997843}
{'textcat': 0.6449593603104802}
{'textcat': 0.7809537190790161}
{'textcat': 0.8668792340289992}
{'textcat': 0.922076835382523}
{'textcat': 0.9587877583270735}
{'textcat': 0.9865144553330739}
{'textcat': 1.0048029720447071}
{'textcat': 1.019113017788869}
{'textcat': 1.0290381479702098}


In [None]:
# Two unseen messages to classify with the trained pipeline.
texts = ["Are you ready for the tea party?","URGENT Reply to this message for Guranteed money"]

In [None]:
# Tokenize each message; only the tokenizer is applied here, and the resulting
# Docs are handed to textcat.predict below.
docs = list(map(nlp.tokenizer, texts))

In [None]:
# Fetch the text categorizer component from the pipeline by name.
textcat = nlp.get_pipe('textcat')

In [None]:
# predict returns a pair; only the first element (the score array) is kept.
scores, _ = textcat.predict(docs)

In [None]:
# One row per input text, one column per label.
scores

array([[0.99467224, 0.00532773],
       [0.24507679, 0.75492316]], dtype=float32)

In [None]:
# Column index of the highest score in each row; used to index textcat.labels below.
predicted_labels = scores.argmax(axis=1)

In [None]:
# Translate each winning column index into its label name.
print([textcat.labels[class_idx] for class_idx in predicted_labels])

['ham', 'spam']


## Day 4

In [None]:
# Download the large English model (the recorded run fetched an ~828 MB archive);
# it is loaded with spacy.load('en_core_web_lg') below.
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 1.2MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp37-none-any.whl size=829180945 sha256=dda786b50006a52571f9e66ba6ef6ddbc552ca7f91017577e614cbde0baad975
  Stored in directory: /tmp/pip-ephem-wheel-cache-dgqjwi_m/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
import numpy as np
import spacy

In [None]:
# Load the large English model. This rebinds `nlp`, so the spam classifier
# pipeline trained earlier is no longer reachable through this name.
nlp = spacy.load('en_core_web_lg')

In [None]:
# Sample sentence whose per-token vectors are inspected below.
text = "These vectors can be used as features for machine learning models."

In [None]:
# Stack the vector of every token in the sentence into a 2-D array (one row per token).
vectors = np.array([token.vector for token in nlp(text)])

In [None]:
# (number of tokens, vector length) -- (12, 300) in the recorded run.
vectors.shape

(12, 300)

In [None]:
# numpy abbreviates the printout of the full token-by-dimension matrix.
print(vectors)

[[-0.1965    -0.13995   -0.52495   ... -0.097467   0.34578   -0.14233  ]
 [-0.25205   -0.16047   -0.6089    ...  0.19218   -0.40028    0.51894  ]
 [-0.23857    0.35457   -0.30219   ... -0.35283    0.41888    0.13168  ]
 ...
 [ 0.047511   0.1404    -0.11736   ...  0.03169   -0.14208    0.42548  ]
 [ 0.0065037  0.2064     0.0089077 ...  0.033444  -0.030121  -0.12998  ]
 [ 0.012001   0.20751   -0.12578   ...  0.13871   -0.36049   -0.035    ]]


In [None]:
# Full vector of the first token in the sentence.
print(vectors[0])

[-1.9650e-01 -1.3995e-01 -5.2495e-01 -2.4756e-01 -1.9766e-01  1.1652e-01
 -3.5867e-01  2.2617e-01  1.9777e-01  2.4174e+00 -1.4429e-01 -2.2122e-01
  3.6110e-01 -1.9718e-01  8.2148e-02 -8.2776e-02 -8.8343e-02  1.4324e+00
 -2.5540e-01 -1.5997e-01  2.6879e-02 -2.1031e-01 -6.6274e-02 -1.2712e-01
 -3.4628e-01  1.7385e-01 -1.1765e-01 -1.1228e-01  7.3088e-02  1.0363e-01
  4.4076e-02  4.9083e-01  1.4805e-02 -1.3679e-01  9.2625e-02 -9.1950e-02
  5.9530e-02  8.7379e-02 -7.0591e-02  2.2661e-01 -4.9048e-01 -9.7290e-02
  6.9595e-02  3.2322e-04  1.0677e-01 -6.9300e-02  5.2730e-02  3.2494e-01
  8.4942e-02 -5.4211e-02 -4.3229e-01  1.9370e-01 -3.4058e-01 -9.9482e-02
  1.4862e-02  9.6617e-02 -1.3798e-01  9.1556e-02 -1.3270e-01  2.2825e-01
  3.6487e-01 -3.3364e-01 -2.4517e-01  4.2809e-01  1.0876e-01  1.6533e-01
  2.0959e-01  3.9454e-01  1.8574e-01 -1.8073e-01  5.1004e-01 -2.0578e-01
  7.0082e-01  1.0200e-01  2.0585e-01 -4.6654e-02 -7.0956e-02  1.2990e-01
 -4.5945e-02  5.6986e-02  2.9182e-02 -2.0468e-02 -4

In [None]:
import pandas as pd

In [None]:
# One document vector per message in data.v2.
# Stream the messages through nlp.pipe (batched) instead of calling nlp() once per
# message, and skip the tagger/parser/NER components: with this model the document
# vector is derived from the token vectors, so their output is never used here.
doc_vectors = np.array([
    message_doc.vector
    for message_doc in nlp.pipe(data.v2, disable=["tagger", "parser", "ner"])
])

In [None]:
# (number of messages, vector length) -- (5572, 300) in the recorded run.
doc_vectors.shape

(5572, 300)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features are the document vectors, targets the ham/spam labels in data.v1.
# Hold out 10% for testing (test_size=0.1); random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(doc_vectors, data.v1, test_size=0.1, random_state=1)

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# Linear SVM classifier with a fixed random_state and a raised iteration cap.
svc = LinearSVC(random_state=1, dual=False, max_iter=10000)

In [None]:
# Fit the classifier on the training split only.
svc.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=1, tol=0.0001,
          verbose=0)

In [None]:
# Mean accuracy on the held-out split (about 0.975 in the recorded run).
svc.score(X_test, y_test)

0.974910394265233

## Day 5
## Cosine similarity

In [None]:
# Same download command as in Day 4; presumably repeated so that Day 5 can be run
# in a fresh session -- TODO confirm, otherwise this cell is redundant.
!python -m spacy download en_core_web_lg

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
import numpy as np
import spacy

In [None]:
# Load the large English model; its .vector attribute is used for the phrases below.
nlp = spacy.load('en_core_web_lg')

In [None]:
def cosine_similarity(a, b):
  """Return the cosine of the angle between two 1-D vectors.

  Args:
    a: 1-D numpy array.
    b: 1-D numpy array of the same length as ``a``.

  Returns:
    ``a . b / (|a| * |b|)``. If either vector has zero length the cosine is
    undefined; 0.0 is returned in that case instead of dividing by zero
    (which would yield nan and a RuntimeWarning).
  """
  denominator = np.sqrt(a.dot(a) * b.dot(b))
  if denominator == 0:
    # At least one all-zero vector: no direction to compare.
    return 0.0
  return a.dot(b) / denominator

In [None]:
# Document vector for the first phrase.
a = nlp("A boy").vector

In [None]:
# Document vector for the second phrase.
b = nlp("A girl").vector

In [None]:
# Compare the two phrase vectors (about 0.918 in the recorded run).
cosine_similarity(a,b)

0.9178547