In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
class Category:
  BOOKS = "BOOKS"
  CLOTHING = "CLOTHING"

train_x = ['i love the book', 'this is a great book','the fit is great','i love the shoes']
train_y = [Category.BOOKS, Category.BOOKS, Category.CLOTHING, Category.CLOTHING]

In [3]:
vectorizer = CountVectorizer(binary=True)

In [4]:
vectors = vectorizer.fit_transform(train_x)


print(vectorizer.get_feature_names())
print(vectors.toarray())

['book', 'fit', 'great', 'is', 'love', 'shoes', 'the', 'this']
[[1 0 0 0 1 0 1 0]
 [1 0 1 1 0 0 0 1]
 [0 1 1 1 0 0 1 0]
 [0 0 0 0 1 1 1 0]]




In [5]:
from sklearn import svm

In [6]:
clf_svm = svm.SVC(kernel='linear')

In [7]:
clf_svm.fit(vectors, train_y)

SVC(kernel='linear')

In [8]:
test_x = vectorizer.transform(['shoes are weird'])

In [9]:
clf_svm.predict(test_x)

array(['CLOTHING'], dtype='<U8')

In [10]:
## Word Vectors

In [11]:
import spacy

In [12]:
nlp = spacy.load("en_core_web_sm")

In [13]:
docs = [nlp(text) for text in train_x]

In [14]:
train_x_word_vector = [x.vector for x in docs]

In [15]:
from sklearn import svm

In [16]:
clf_wv = svm.SVC(kernel='linear')

In [17]:
clf_wv.fit(train_x_word_vector, train_y)

SVC(kernel='linear')

In [18]:
test_x =['i love the books']
test_doc = [nlp(text) for text in test_x]
test_vectors = [x.vector for x in test_doc]


In [19]:
 clf_wv.predict(test_vectors)

array(['CLOTHING'], dtype='<U8')

#**REGEX**

In [20]:
import re

In [21]:
regexp = re.compile(r"story|book|bread")

In [22]:
phrases = ['I liked that story', "the car treaded up the hill book", "this hat is nice"]

In [23]:
matches = []

for phrase in phrases:
  if re.search(regexp,phrase):
    matches.append(phrase)

print(matches)

['I liked that story', 'the car treaded up the hill book']


# **Stemming and Lemmatization**

In [24]:
import nltk

In [25]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [26]:
from nltk.tokenize import word_tokenize  
from nltk.stem import PorterStemmer

In [27]:
stemmer = PorterStemmer()

In [28]:
phrase = "Reading the books and playing"


In [29]:
word = word_tokenize(phrase)

In [30]:
stemmed_words = []
for words in word:
  stemmed_words.append(stemmer.stem(words))
  

" ".join(stemmed_words)

'read the book and play'

**Lemmatization**

In [35]:
from nltk.stem import WordNetLemmatizer

In [36]:
 lematizer = WordNetLemmatizer()

In [37]:
phrase = "reading and reads and reader leaving and leaves the books and playing"
word = word_tokenize(phrase)

In [46]:
# lemmatized_words =[]

# for words in word:
#   lemmatized_words.append(lematizer.lemmatize(words,pos='v'))

# " ".join(lemmatized_words)

In [None]:
def lem_words(words):
  [lematizer.lemmatize(words) for words in words]
  print(" ".join(lemmatized_words))

In [47]:
# lem_words(words)

**Stopwords**

In [40]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [41]:
stop_words = stopwords.words('english')

In [42]:
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [43]:
phrase = 'This is an example sentence showing removal of the would was a then stopwords'

In [44]:
words = word_tokenize(phrase)

stripped_phase = []

for word in words:
  if word not in stop_words:
    stripped_phase.append(word)

" ".join(stripped_phase)

'This example sentence showing removal would stopwords'