## Text Classification using K Nearest Neighbors

In [3]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import genesis
nltk.download('genesis')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
genesis_ic = wn.ic(genesis, False, 0.0)

import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import roc_auc_score

[nltk_data] Downloading package genesis to
[nltk_data]     /Users/giorgiomondauto/nltk_data...
[nltk_data]   Package genesis is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/giorgiomondauto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/giorgiomondauto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/giorgiomondauto/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [19]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [20]:
from sklearn.datasets import fetch_20newsgroups


In [101]:
categories = ['rec.motorcycles', 'sci.electronics',
              'comp.graphics', 'sci.med']

# sklearn provides us with subset data for training and testing
train_data = fetch_20newsgroups(subset='train',
                                categories=categories, shuffle=True, random_state=42)

print(train_data.target_names)

print("\n".join(train_data.data[0].split("\n")[:3]))
print(train_data.target_names[train_data.target[0]])

# Let's look at categories of our first ten training data
for t in train_data.target[:10]:
    print(train_data.target_names[t])

['comp.graphics', 'rec.motorcycles', 'sci.electronics', 'sci.med']
From: kreyling@lds.loral.com (Ed Kreyling 6966)
Subject: Sun-os and 8bit ASCII graphics
Organization: Loral Data Systems
comp.graphics
comp.graphics
comp.graphics
rec.motorcycles
comp.graphics
sci.med
sci.electronics
sci.electronics
comp.graphics
rec.motorcycles
sci.electronics


In [102]:
print(train_data.data[5])
train_data.target_names[5]

From: ragee@vdoe386.vak12ed.edu (Randy Agee)
Subject: Radar detector DETECTORS?
Organization: Virginia's Public Education Network (Richmond)
Lines: 27

Several years back one of the radar detectors manufacturers, in
defiance to Virginia's law against radar detectors, passed out
thousands of fake cardboard radar detectors at truck stops near
the Virginia State lines.  At that time there were no radar
detector Detectors!  I am not sure of the impact but I would
imagine that enforcement of the law by visually sighting a
radar detector became difficult - if not impossible!

As I said earlier, efforts to throw out or eliminate the VA law
against radar detectors has been in vain.  In fact, effective
Jan. 1, 1993, the fine for possession of a radar detector
accessable to the driver of a vehicle in VA is now $250.00.  

I have noted an interesting anomality with my Alinco DR-100 2
meter ham transceiver.... It will make a *cheap* radar detector
scream!  I am not sure of the range, but it is obv

IndexError: list index out of range

In [22]:
# Builds a dictionary of features and transforms documents to feature vectors and convert our text documents to a
# matrix of token counts (CountVectorizer)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_data.data)

# transform a count matrix to a normalized tf-idf representation (tf-idf transformer)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [23]:
knn = KNeighborsClassifier(n_neighbors=7)

# training our classifier ; train_data.target will be having numbers assigned for each category in train data
clf = knn.fit(X_train_tfidf, train_data.target)

# Input Data to predict their classes of the given categories
docs_new = ['I have a Harley Davidson and Yamaha.', 'I have a GTX 1050 GPU']
# building up feature vector of our input
X_new_counts = count_vect.transform(docs_new)
# We call transform instead of fit_transform because it's already been fit
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [24]:
predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, train_data.target_names[category]))


'I have a Harley Davidson and Yamaha.' => rec.motorcycles
'I have a GTX 1050 GPU' => sci.med


In [26]:
# We can use Pipeline to add vectorizer -> transformer -> classifier all in a one compound classifier
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', knn),
])
# Fitting our train data to the pipeline
text_clf.fit(train_data.data, train_data.target)

# Test data 
test_data = fetch_20newsgroups(subset='test',
                               categories=categories, shuffle=True, random_state=42)
docs_test = test_data.data
# Predicting our test data
predicted = text_clf.predict(docs_test)
print('We got an accuracy of',np.mean(predicted == test_data.target)*100, '% over the test data.')

We got an accuracy of 82.67766497461929 % over the test data.


In [71]:
train_data  = train_data.data[:2]
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_data)

In [75]:
X_train_counts.data

array([1, 2, 1, 2, 1, 1, 1, 1, 4, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       4, 1, 1, 1, 1, 1, 2, 1, 1, 5, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 3, 1, 3, 1, 2, 1, 3, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3,
       1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [76]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [79]:
X_train_tfidf.data

array([0.08195596, 0.05831234, 0.08195596, 0.08195596, 0.08195596,
       0.16391192, 0.08195596, 0.08195596, 0.08195596, 0.23324936,
       0.08195596, 0.2915617 , 0.08195596, 0.08195596, 0.32782383,
       0.05831234, 0.08195596, 0.08195596, 0.08195596, 0.08195596,
       0.05831234, 0.08195596, 0.05831234, 0.24586788, 0.24586788,
       0.05831234, 0.08195596, 0.16391192, 0.05831234, 0.08195596,
       0.08195596, 0.16391192, 0.05831234, 0.05831234, 0.05831234,
       0.08195596, 0.05831234, 0.08195596, 0.08195596, 0.16391192,
       0.08195596, 0.05831234, 0.08195596, 0.08195596, 0.08195596,
       0.05831234, 0.08195596, 0.08195596, 0.08195596, 0.08195596,
       0.05831234, 0.24586788, 0.08195596, 0.24586788, 0.08195596,
       0.08195596, 0.05831234, 0.24586788, 0.08195596, 0.05831234,
       0.08195596, 0.11662468, 0.05831234, 0.08195596, 0.08195596,
       0.08195596, 0.08195596, 0.08195596, 0.08195596, 0.08195596,
       0.09698017, 0.06900219, 0.09698017, 0.09698017, 0.09698

In [84]:
train_data.target_names[:2]

['comp.graphics', 'rec.motorcycles']

In [92]:
knn = KNeighborsClassifier(n_neighbors=2)

# training our classifier ; train_data.target will be having numbers assigned for each category in train data
clf = knn.fit(X_train_tfidf, train_data.target[:2])

# Input Data to predict their classes of the given categories
docs_new = ['I have a Harley Davidson and Yamaha.', 'I have a GTX 1050 GPU']
# building up feature vector of our input
X_new_counts = count_vect.transform(docs_new)
# We call transform instead of fit_transform because it's already been fit
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [93]:
predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, train_data.target_names[category]))

'I have a Harley Davidson and Yamaha.' => comp.graphics
'I have a GTX 1050 GPU' => comp.graphics
