In [1]:
# Fetch messages from two newsgroups - these are toy datasets provided by sklearn for understanding/practice.
# This dataset provides separate subset of training and testing data.
from sklearn.datasets import fetch_20newsgroups

categories = ['talk.religion.misc', 'comp.graphics']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'))

In [2]:
newsgroups_train.data

['Hi! Everyone,\n\nSince some people quickly solved the problem of determining a sphere from\n4 points, I suddenly recalled a problem which is how to find the ellipse\nfrom its offset. For example, given 5 points on the offset, can you find\nthe original ellipse analytically?\n\nI spent two months solving this problem by using analytical method last year,\nbut I failed. Under the pressure, I had to use other method - nonlinear\nprogramming technique to deal with this problem approximately.\n\nAny ideas will be greatly appreciated. Please post here, let the others\nshare our interests.',
 "\n\nYou know, everybody scoffed at that guy they hung up on a cross too.\nHe claimed also to be the son of God; and it took almost two thousand \nyears to forget what he preached.\n\n\tLove thy neighbor as thyself.\n\n\nAnybody else wonder if those two guys setting the fires were 'agent \nprovacateurs.'\n\n",
 "\nNot entirely.  Its not a premise, its a conclusion.  Second, that scientists\n(for the mo

In [3]:
# Vectorize (convert text to vectors) the training data with CountVectorizer by calling 'fit_transform'.
# Input is list of texts -> Output is a sparse matrix where output[i, j] is count of jth word in ith input text.

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(newsgroups_train.data)
X_train_counts.shape

(961, 16646)

In [4]:
# Count vectorizer builds a vocabulary when fit or fit_transform is called. Every word gets mapped to a unique index.
count_vect.vocabulary_

{'hi': 7425,
 'everyone': 5886,
 'since': 13700,
 'some': 13903,
 'people': 11220,
 'quickly': 12188,
 'solved': 13900,
 'the': 14889,
 'problem': 11868,
 'of': 10670,
 'determining': 4846,
 'sphere': 14033,
 'from': 6621,
 'points': 11549,
 'suddenly': 14427,
 'recalled': 12399,
 'which': 16227,
 'is': 8357,
 'how': 7605,
 'to': 15080,
 'find': 6320,
 'ellipse': 5544,
 'its': 8410,
 'offset': 10693,
 'for': 6472,
 'example': 5913,
 'given': 6866,
 'on': 10726,
 'can': 3081,
 'you': 16583,
 'original': 10853,
 'analytically': 1789,
 'spent': 14031,
 'two': 15408,
 'months': 10020,
 'solving': 13901,
 'this': 14968,
 'by': 3009,
 'using': 15717,
 'analytical': 1788,
 'method': 9738,
 'last': 8852,
 'year': 16561,
 'but': 2995,
 'failed': 6133,
 'under': 15504,
 'pressure': 11788,
 'had': 7173,
 'use': 15703,
 'other': 10878,
 'nonlinear': 10462,
 'programming': 11919,
 'technique': 14764,
 'deal': 4561,
 'with': 16325,
 'approximately': 1977,
 'any': 1886,
 'ideas': 7742,
 'will': 16274

In [5]:
# X_train_counts is training data in vectorized form. It is a sparse matrix i.e. most of the elements are zero.
X_train_counts

<961x16646 sparse matrix of type '<class 'numpy.int64'>'
	with 87417 stored elements in Compressed Sparse Row format>

In [6]:
#  Let us check counts of word 'ellipse' in the 0th document.
X_train_counts[0, count_vect.vocabulary_['ellipse']].item()


2

In [7]:
# Vectorize the test data exactly like training data by calling 'transform' on vectorizer.
X_test_counts = count_vect.transform(newsgroups_test.data)
y_train = newsgroups_train.target
y_test = newsgroups_test.target

In [8]:
X_test_counts

<640x16646 sparse matrix of type '<class 'numpy.int64'>'
	with 62030 stored elements in Compressed Sparse Row format>

In [9]:
# Build a knn classifier, run prediction and print classification metrics.

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
knn = KNeighborsClassifier(n_neighbors=25)
knn.fit(X_train_counts, y_train) 

predicted = knn.predict(X_test_counts)

print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.80      0.68      0.74       389
           1       0.60      0.73      0.66       251

    accuracy                           0.70       640
   macro avg       0.70      0.71      0.70       640
weighted avg       0.72      0.70      0.70       640



In [10]:
# Check predicted values. ith test data gets output as predicted[i]

predicted

array([1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,

In [11]:
# Now let us try the same steps by using tf-idf. We can run the TfidfTransformer over the output of CountVectorizer
# or we can directly use TfidfVectorizer on the input text as in the next cell.

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(961, 16646)

In [12]:
# Compute tf-idf directly on input text.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(newsgroups_train.data)
X_train_tfidf.shape

(961, 16646)

In [13]:
# Transform the test data.
X_test_tfidf = tfidf_vect.transform(newsgroups_test.data)

In [14]:
X_train, y_train = X_train_tfidf, newsgroups_train.target
X_test, y_test = X_test_tfidf, newsgroups_test.target

In [15]:
# Train and predict knn classifier on the tf-idfs.

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
knn = KNeighborsClassifier(n_neighbors=35)
knn.fit(X_train, y_train) 

predicted = knn.predict(X_test)

print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.94      0.83      0.88       389
           1       0.77      0.92      0.84       251

    accuracy                           0.86       640
   macro avg       0.86      0.87      0.86       640
weighted avg       0.88      0.86      0.87       640



In [16]:
# Next let us try LSA on the text. LSA is done by running TruncatedSVD on tf-idf matrix or count-matrix.
# We choose to keep the top 900 dimensions.

from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=900, algorithm='arpack')
new_x_train = svd.fit_transform(X_train)

# Sum of variances explained by the 900 dimensions.
svd.explained_variance_ratio_.sum()

0.9961382581340129

In [17]:
# Check the cumulative sum till ith dimension.
svd.explained_variance_ratio_.cumsum()

array([0.02002933, 0.03095081, 0.03928479, 0.04495268, 0.05037292,
       0.05543451, 0.06024143, 0.06482484, 0.06927155, 0.07348506,
       0.07756543, 0.08150194, 0.08530923, 0.08902841, 0.09269733,
       0.0963392 , 0.09989072, 0.10330668, 0.1066529 , 0.10999162,
       0.11326955, 0.11644643, 0.11959151, 0.12271887, 0.12582919,
       0.12887255, 0.13186924, 0.13481475, 0.13773564, 0.14062761,
       0.14348296, 0.14630351, 0.14910132, 0.15189668, 0.15463524,
       0.15734407, 0.16004008, 0.16271428, 0.16537522, 0.1680107 ,
       0.17063408, 0.17323406, 0.17581582, 0.17839234, 0.18094353,
       0.18346282, 0.18597375, 0.18846598, 0.19093979, 0.19340077,
       0.19585035, 0.1982642 , 0.20065577, 0.20304367, 0.20542185,
       0.2077606 , 0.21009067, 0.21241012, 0.21471986, 0.21701081,
       0.21928375, 0.22152929, 0.22375826, 0.22598619, 0.22819966,
       0.2303974 , 0.23259742, 0.23478977, 0.23696289, 0.23912495,
       0.24127659, 0.24341734, 0.24554632, 0.24767211, 0.24977

In [18]:
new_x_train.shape

(961, 900)

In [19]:
# Transform the test data similarly to the 900 dimensions.
new_x_test = svd.transform(X_test)


In [20]:
new_x_test.shape

(640, 900)

In [21]:
# Build and run knn on these 900 dimensions.

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
knn2 = KNeighborsClassifier(n_neighbors=35)
knn2.fit(new_x_train, y_train) 

predicted2 = knn2.predict(new_x_test)

print(metrics.classification_report(y_test, predicted2))

              precision    recall  f1-score   support

           0       0.94      0.90      0.92       389
           1       0.85      0.91      0.88       251

    accuracy                           0.90       640
   macro avg       0.89      0.90      0.90       640
weighted avg       0.90      0.90      0.90       640



In [22]:
# Check the predicted class of the 3rd (index 2) test document.
knn2.predict([new_x_test[2]])[0]

1