## 1. Bag of words

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

### Define some training utterances

In [2]:
class Category:
  BOOKS = "BOOKS"
  CLOTHING = "CLOTHING"

train_x = ["i love the book", "this is a great book", "the fit is great", "i love the shoes"]
train_y = [Category.BOOKS, Category.BOOKS, Category.CLOTHING, Category.CLOTHING]
     

In [3]:
train_x = ["I love the book", "This is a great book", "This fit is great", "I love the shoes"]

### Fit vectorizer to transform text to bag-of-words vectors


In [4]:
vectorizer = CountVectorizer(binary = True, ngram_range=(1, 2)) #only 0 or 1. does not give count of words.bigram.
train_x_vectors = vectorizer.fit_transform(train_x)
print(vectorizer.get_feature_names())
print(train_x_vectors.toarray())
print(train_x_vectors[1])

['book', 'fit', 'fit is', 'great', 'great book', 'is', 'is great', 'love', 'love the', 'shoes', 'the', 'the book', 'the shoes', 'this', 'this fit', 'this is']
[[1 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0]
 [1 0 0 1 1 1 1 0 0 0 0 0 0 1 0 1]
 [0 1 1 1 0 1 1 0 0 0 0 0 0 1 1 0]
 [0 0 0 0 0 0 0 1 1 1 1 0 1 0 0 0]]
  (0, 0)	1
  (0, 13)	1
  (0, 5)	1
  (0, 3)	1
  (0, 15)	1
  (0, 6)	1
  (0, 4)	1




In [5]:
# Sequence is not maintained

### Train SVM Model

In [6]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

### Test new utterances on trained model

In [7]:
test_x = vectorizer.transform(['i like the book'])

clf_svm.predict(test_x)

array(['BOOKS'], dtype='<U8')

In [8]:
# it will work with 'book' but not 'books'. It fails badly if does not know about a word.

## 2. Tfidf vectorization

In [9]:
#TFIDF works by proportionally increasing the number of times a word appears in the document but is counterbalanced by the number of documents in which it is present. 
#Hence, words like ‘this’, ’are’ etc., that are commonly present in all the documents are not given a very high rank. 
#However, a word that is present too many times in a few of the documents will be given a higher rank as it might be indicative of the context of the document.

In [10]:
#The limitation of TFIDF is again 
#that this vectorization doesn’t help in bringing in the contextual meaning of the words as it is just based on the frequency.

### formula
### tf-idf(t, d) = tf(t, d) * idf(t)

### where,
### tf(t,d) = count of term t in document d / total number of words in d

### idf(t) = log(number of documents in the corpus / number of documents in the corpus containing that term)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
train_x_vectors_tfidf = vectorizer.fit_transform(train_x)
print(vectorizer.get_feature_names())
print(train_x_vectors_tfidf.toarray())
print(train_x_vectors_tfidf[1])

['book', 'fit', 'great', 'is', 'love', 'shoes', 'the', 'this']
[[0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.        ]
 [0.5        0.         0.5        0.5        0.         0.
  0.         0.5       ]
 [0.         0.59081908 0.46580855 0.46580855 0.         0.
  0.         0.46580855]
 [0.         0.         0.         0.         0.52640543 0.66767854
  0.52640543 0.        ]]
  (0, 2)	0.5
  (0, 3)	0.5
  (0, 7)	0.5
  (0, 0)	0.5




In [12]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors_tfidf, train_y)

In [13]:
test_x = vectorizer.transform(['i like the book'])

clf_svm.predict(test_x)

array(['BOOKS'], dtype='<U8')

## 3.Word vectors using spacy

In [14]:
# converting words into numbers and find the association or context.
#Ref: https://spacy.pythonhumanities.com/01_03_word_vectors.html

In [15]:
#!pip3 install spacy



In [16]:
#!pip install --upgrade click



In [17]:
#!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [18]:
import spacy

nlp = spacy.load("en_core_web_md")

In [19]:
print(train_x)

['I love the book', 'This is a great book', 'This fit is great', 'I love the shoes']


In [20]:
docs = [nlp(text) for text in train_x]
train_x_word_vectors = [x.vector for x in docs]

In [21]:
print(docs)

[I love the book, This is a great book, This fit is great, I love the shoes]


In [22]:
print(docs[0].vector)

[-1.3978975  -0.94314    -1.1927751  -4.3043246  -1.69008    -2.18875
  2.8568673   4.1011753  -3.6249747   4.17611     7.12575     2.2885249
 -6.495055   -0.703155    3.46178    -0.9427      4.1357403  -3.3983903
 -0.04899997  1.3973      1.5476775   1.4060001  -0.07001507 -4.591998
 -1.038875   -1.8461976  -3.6312752   0.4407499  -1.7652375   3.388475
 -0.4016     -1.471375   -0.39702505  0.24449998 -0.04592732 -1.4172026
 -1.167125    0.5882125   2.6957998  -0.5626705  -1.7447001   3.973075
 -0.671685   -1.0611899   4.576425    2.9842675  -2.49175    -2.6355624
  0.5972425   0.59040004 -0.792125   -0.590725    0.33869502 -3.42171
 -3.4163604  -0.1711675  -0.786485    1.4665233   3.89455     1.9638373
  5.5787754  -1.3022224  -0.651945    0.43172497 -2.4435027   0.596875
 -3.6072845  -5.0790253   3.3520503   3.8547673  -0.87257504  2.2705574
 -0.5900501  -2.054635    3.19281     3.36905    -2.8925076   1.6652
 -2.5049374  -2.7379746  -2.37408     0.8923325   5.0625834  -1.2852752
  1

In [23]:
print(train_x_word_vectors)

[array([-1.3978975 , -0.94314   , -1.1927751 , -4.3043246 , -1.69008   ,
       -2.18875   ,  2.8568673 ,  4.1011753 , -3.6249747 ,  4.17611   ,
        7.12575   ,  2.2885249 , -6.495055  , -0.703155  ,  3.46178   ,
       -0.9427    ,  4.1357403 , -3.3983903 , -0.04899997,  1.3973    ,
        1.5476775 ,  1.4060001 , -0.07001507, -4.591998  , -1.038875  ,
       -1.8461976 , -3.6312752 ,  0.4407499 , -1.7652375 ,  3.388475  ,
       -0.4016    , -1.471375  , -0.39702505,  0.24449998, -0.04592732,
       -1.4172026 , -1.167125  ,  0.5882125 ,  2.6957998 , -0.5626705 ,
       -1.7447001 ,  3.973075  , -0.671685  , -1.0611899 ,  4.576425  ,
        2.9842675 , -2.49175   , -2.6355624 ,  0.5972425 ,  0.59040004,
       -0.792125  , -0.590725  ,  0.33869502, -3.42171   , -3.4163604 ,
       -0.1711675 , -0.786485  ,  1.4665233 ,  3.89455   ,  1.9638373 ,
        5.5787754 , -1.3022224 , -0.651945  ,  0.43172497, -2.4435027 ,
        0.596875  , -3.6072845 , -5.0790253 ,  3.3520503 ,  3.8

In [24]:
clf_svm_wv = svm.SVC(kernel='linear')
clf_svm_wv.fit(train_x_word_vectors, train_y)

In [25]:
#test_x = ["I went to the bank and wrote a check", "let me check that out"]

test_x = ["I love the stories", "let me check the dress out"]
test_docs = [nlp(text) for text in test_x]
test_x_word_vectors =  [x.vector for x in test_docs]

clf_svm_wv.predict(test_x_word_vectors)

array(['BOOKS', 'CLOTHING'], dtype='<U8')

## 4. Regex

In [34]:
import re

regexp = re.compile("^ab[^\s]*cd")

phrases = ["abcd", "xxx", "abxxxcd", "ab cd", "aaa abxxxcd zzz"]

matches = []
for i in phrases:
    if re.match(regexp,i):
        matches.append(i)

print(matches)
    

['abcd', 'abxxxcd']


In [35]:
import re

regexp = re.compile("^ab[^\s]*cd")

phrases2 = ["abcd", "xxx", "abxxxcd", "ab cd", "aaa abxxxcd ccc"]

matches2 = []
for i in phrases2:
    if re.search(regexp,i):
        matches2.append(i)

print(matches2)

['abcd', 'abxxxcd']


In [36]:
import re

regexp = re.compile(r"\bread\b|\bstory\b|book")

phrases = ["I liked that story.", "the car treaded up the hill", "this hat is nice"]

matches = []
for phrase in phrases:
  if re.search(regexp, phrase):
    matches.append(phrase)

print(matches)


['I liked that story.']


## 5.Stemming/Lemmatization
### Normalizing text

In [37]:
import nltk

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Amrita\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Amrita\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Amrita\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

### Stemming

In [40]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

phrase = "reading the stories"
words = word_tokenize(phrase)

stemmed_words = []
for word in words:
  stemmed_words.append(stemmer.stem(word))

print(stemmed_words)

['read', 'the', 'stori']


In [41]:
" ".join(stemmed_words)

'read the stori'

### Lemmatizing

In [43]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

phrase = "reading the stories"
words = word_tokenize(phrase)

lemmatized_words = []
for word in words:
  lemmatized_words.append(lemmatizer.lemmatize(word, pos='v')) #specifying perts of speech

print(lemmatized_words)

['read', 'the', 'stories']


In [44]:
" ".join(lemmatized_words)

'read the stories'

## 6. Stopwords

In [45]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

phrase = "Here is an example sentence demonstrating the removal of stopwords"

words = word_tokenize(phrase)

stripped_phrase = []
for word in words:
  if word not in stop_words:
    stripped_phrase.append(word)

In [46]:
print(stripped_phrase)

['Here', 'example', 'sentence', 'demonstrating', 'removal', 'stopwords']


In [47]:
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [48]:
print(len(stop_words))

179


## 7. Spell correction

In [50]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
Installing collected packages: textblob
Successfully installed textblob-0.17.1


In [53]:
!python -m textblob.download_corpora

Finished.

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Amrita\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Amrita\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Amrita\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Amrita\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\Amrita\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\conll2000.zip.
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Amrita\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.





In [57]:
from textblob import TextBlob

phrase = "this is an very good boek"

tb_phrase = TextBlob(phrase)

tb_phrase.correct()

TextBlob("this is an very good book")

## 8. POS tagging

In [58]:
tb_phrase.tags

[('this', 'DT'),
 ('is', 'VBZ'),
 ('an', 'DT'),
 ('very', 'RB'),
 ('good', 'JJ'),
 ('boek', 'NN')]

## 9. Sentiment Analysis

In [59]:
tb_phrase.sentiment

Sentiment(polarity=0.9099999999999999, subjectivity=0.7800000000000001)

## 10. BERT model (need GPU/CUDA to run)

In [60]:
!pip install spacy-transformers
#!python -m spacy download en_trf_bertbaseuncased_lg

Collecting spacy-transformers
  Downloading spacy_transformers-1.2.5-cp38-cp38-win_amd64.whl (304 kB)
Collecting torch>=1.8.0
  Downloading torch-2.0.1-cp38-cp38-win_amd64.whl (172.4 MB)
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.9.0-cp38-cp38-win_amd64.whl (186 kB)
Collecting transformers<4.31.0,>=3.4.0
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp38-cp38-win_amd64.whl (3.5 MB)
Collecting safetensors>=0.3.1
  Downloading safetensors-0.3.1-cp38-cp38-win_amd64.whl (263 kB)
Collecting huggingface-hub<1.0,>=0.14.1
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
Installing collected packages: tokenizers, safetensors, huggingface-hub, transformers, torch, spacy-alignments, spacy-transformers
Successfully installed huggingface-hub-0.15.1 safetensors-0.3.1 spacy-alignments-0.9.0 spacy-transformers-1.2.5 tokenizers-0.13.3 torch-2.0.1 transformers-4.30.2



In [62]:
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl (460.3 MB)
Installing collected packages: en-core-web-trf
Successfully installed en-core-web-trf-3.5.0
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


In [64]:
!pip install spacy-transformers



In [1]:
import spacy
import torch

nlp = spacy.load("en_trf_bertbaseuncased_lg")
doc = nlp("Here is some text to encode.")

OSError: [E050] Can't find model 'en_trf_bertbaseuncased_lg'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
class Category:
  BOOKS = "BOOKS"
  BANK = "BANK"

train_x = ["good characters and plot progression", "check out the book", "good story. would recommend", "novel recommendation", "need to make a deposit to the bank", "balance inquiry savings", "save money"]
train_y = [Category.BOOKS, Category.BOOKS, Category.BOOKS, Category.BOOKS, Category.BANK, Category.BANK, Category.BANK]

In [None]:
from sklearn import svm

docs = [nlp(text) for text in train_x]
train_x_vectors = [doc.vector for doc in docs]
clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

test_x = ["check this story out"]
docs = [nlp(text) for text in test_x]
test_x_vectors = [doc.vector for doc in docs]

clf_svm.predict(test_x_vectors)