# Setup



## Install necessary libraries & download models here

In [1]:
!pip install spacy
!python -m spacy download en_core_web_md

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/6d/0d/4379e9aa35a444b6440ffe1af4c612533460e0d5ac5c7dca1f96ff6f2e23/spacy-3.0.6.tar.gz (7.1MB)
[K     |████████████████████████████████| 7.1MB 4.9MB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Collecting pathy>=0.3.5 (from spacy)
  Using cached https://files.pythonhosted.org/packages/13/87/5991d87be8ed60beb172b4062dbafef18b32fa559635a8e2b633c2974f85/pathy-0.5.2-py3-none-any.whl
Collecting typer<0.4.0,>=0.3.0 (from spacy)
  Using cached https://files.pythonhosted.org/packages/90/34/d138832f6945432c638f32137e6c79a3b682f06a63c488dcfaca6b166c64/typer-0.3.2-py3-none-any.whl
Collecting catalogue<2.1.0,>=2.0.3 (from spacy)
  Using cached https://files.pythonhosted.org/packages/9c/10/dbc1203a4b1367c7b02fddf08cb2981d9aa3e688d398f587cea

# Bag of Words

#### Define some training utterances

In [2]:
class Category:
  BOOKS = "BOOKS"
  CLOTHING = "CLOTHING"

train_x = ["i love the book", "this is a great book", "the fit is great", "i love the shoes"]
train_y = [Category.BOOKS, Category.BOOKS, Category.CLOTHING, Category.CLOTHING]

#### Fit vectorizer to transform text to bag-of-words vectors

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True)
train_x_vectors = vectorizer.fit_transform(train_x)

print(vectorizer.get_feature_names())
print(train_x_vectors.toarray())

['book', 'fit', 'great', 'is', 'love', 'shoes', 'the', 'this']
[[1 0 0 0 1 0 1 0]
 [1 0 1 1 0 0 0 1]
 [0 1 1 1 0 0 1 0]
 [0 0 0 0 1 1 1 0]]


#### Train SVM Model

In [4]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

SVC(kernel='linear')

#### Test new utterances on trained model

In [5]:
test_x = vectorizer.transform(['i love the books'])

clf_svm.predict(test_x)

array(['CLOTHING'], dtype='<U8')

# Word Vectors

In [6]:
import spacy

nlp = spacy.load("en_core_web_md")

In [7]:
print(train_x)

['i love the book', 'this is a great book', 'the fit is great', 'i love the shoes']


In [8]:
docs = [nlp(text) for text in train_x]
train_x_word_vectors = [x.vector for x in docs]

In [9]:
from sklearn import svm

clf_svm_wv = svm.SVC(kernel='linear')
clf_svm_wv.fit(train_x_word_vectors, train_y)

SVC(kernel='linear')

In [10]:
test_x = ["I went to the bank and wrote a check", "let me check that out"]
test_docs = [nlp(text) for text in test_x]
test_x_word_vectors =  [x.vector for x in test_docs]

clf_svm_wv.predict(test_x_word_vectors)

array(['BOOKS', 'BOOKS'], dtype='<U8')

# Regexes

In [11]:
import re

regexp = re.compile(r"\bread\b|\bstory\b|book")

phrases = ["I liked that story.", "the car treaded up the hill", "this hat is nice"]

matches = []
for phrase in phrases:
  if re.search(regexp, phrase):
    matches.append(phrase)

print(matches)




['I liked that story.']


# Stemming/Lemmatization

### Setup

In [12]:
import nltk

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Stemming

In [13]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

phrase = "reading the books"
words = word_tokenize(phrase)

stemmed_words = []
for word in words:
  stemmed_words.append(stemmer.stem(word))

" ".join(stemmed_words)

'read the book'

### Lemmatizing

In [14]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

phrase = "reading the books"
words = word_tokenize(phrase)

lemmatized_words = []
for word in words:
  lemmatized_words.append(lemmatizer.lemmatize(word, pos='v'))

" ".join(lemmatized_words)



'read the book'

# Stopwords
### Tokenize, then remove Stopwords

In [15]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

phrase = "Here is an example sentence demonstrating the removal of stopwords"

words = word_tokenize(phrase)

stripped_phrase = []
for word in words:
  if word not in stop_words:
    stripped_phrase.append(word)

" ".join(stripped_phrase)


'Here example sentence demonstrating removal stopwords'

# Various other techniques (spell correction, sentiment, & pos tagging)

In [16]:
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


In [17]:
from textblob import TextBlob

phrase = "the book was horrible"

tb_phrase = TextBlob(phrase)

tb_phrase.correct()

tb_phrase.tags

tb_phrase.sentiment

Sentiment(polarity=-1.0, subjectivity=1.0)

## Transformer Architecture

### Setup

In [18]:
!pip install spacy-transformers
# !python -m spacy download en_trf_bertbaseuncased_lg
!python -m spacy download en_core_web_trf

Collecting spacy-transformers
  Downloading https://files.pythonhosted.org/packages/f3/58/e470e8217c1c93db41c50ef210e02f7302fbf252a56b66708f8ecb579aa3/spacy_transformers-1.0.3-py2.py3-none-any.whl
Collecting transformers<4.7.0,>=3.4.0 (from spacy-transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 130kB/s eta 0:00:01
Collecting torch>=1.5.0 (from spacy-transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/d2/a9/b3cea4a97ffabd6639e71608814dbd08081e202e8ac9580250273c0541ff/torch-1.9.0-cp37-cp37m-manylinux1_x86_64.whl (831.4MB)
[K     |████████████████████████████████| 831.4MB 3.5kB/s eta 0:00:016    |█▋                              | 41.7MB 7.0MB/s eta 0:01:53     |██                              | 50.1MB 9.2MB/s eta 0:01:25     |██                              | 52.5MB 9.2MB/s eta 0:01:2

### Using Spacy to utilize BERT Model

In [23]:
## RESTART KERNEL BEFORE RUNNING THIS CELL

import spacy
import torch

# nlp = spacy.load("en_trf_bertbaseuncased_lg")
# nlp = spacy.load('/opt/conda/lib/python3.6/site-packages/en_trf_bertbaseuncased_lg/en_trf_bertbaseuncased_lg-2.2.0')
nlp = spacy.load("en_core_web_trf")
doc = nlp("Here is some text to encode.")

In [2]:
class Category:
  BOOKS = "BOOKS"
  BANK = "BANK"

train_x = ["good characters and plot progression", "check out the book", "good story. would recommend", "novel recommendation", "need to make a deposit to the bank", "balance inquiry savings", "save money"]
train_y = [Category.BOOKS, Category.BOOKS, Category.BOOKS, Category.BOOKS, Category.BANK, Category.BANK, Category.BANK]

In [21]:
from sklearn import svm

docs = [nlp(text) for text in train_x]
print(docs)
train_x_vectors = [doc.vector for doc in docs]
# train_x_vectors = [doc._.trf_data.tensors[-1] for doc in nlp.pipe(train_x)]
print(train_x_vectors)

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

test_x = ["check this story out"]
docs = [nlp(text) for text in test_x]
test_x_vectors = [doc.vector for doc in docs]

clf_svm.predict(test_x_vectors)

[good characters and plot progression, check out the book, good story. would recommend, novel recommendation, need to make a deposit to the bank, balance inquiry savings, save money]
[array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32)]


ValueError: Found array with 0 feature(s) (shape=(14, 0)) while a minimum of 1 is required.