In [1]:
# implementing a text classification model using sklearn.naive_bayes.MultinomialNB estimator
# data used is the 20 newsgroups dataset from sklearn
# reference : https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [25]:
# importing the required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

In [2]:
# fetching the 20 newsgroups data
# the returned value is a sklearn.utils.Bunch, a dictionary-like holder object
# subset='train' is the default of fetch_20newsgroups; it is spelled out here for clarity
data = fetch_20newsgroups(subset='train')

In [3]:
# listing every category available in the dataset
# only a few of them are selected further down (for faster execution)
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
# subset of newsgroups used for training and evaluation
categories = [
    'alt.atheism',
    'comp.graphics',
    'sci.med',
    'soc.religion.christian',
    'talk.religion.misc',
]

In [5]:
# fetching the train and test subsets restricted to the selected categories
# both calls share the same shuffle seed so the document order is reproducible
train_data, test_data = (
    fetch_20newsgroups(subset=subset, categories=categories, shuffle=True, random_state=42)
    for subset in ('train', 'test')
)

In [9]:
# to train a machine learning model, the text data has to be converted to some numerical representation.
# the conversion is done with the bag-of-words representation: construct a 2d matrix with each document as a row,
# each word of the corpus vocabulary as a column, and the number of occurrences of that word in the document stored in matrix[row, col]

In [10]:
# if there are 10000 samples(documents) and 10000 features(words), a dense 2d matrix needs 10000 * 10000 * 4 bytes = 400 MB (for numpy float32; float64 would need 8 bytes per entry)
# also, as each document contains only a few of the unique words present in the entire corpus, most entries will be 0.
# hence bag of words is a high-dimensional sparse dataset.

In [11]:
# tokenizing text
# text preprocessing, tokenizing and filtering of stopwords can be performed by using CountVectorizer class.

In [12]:
# learn the vocabulary from the training documents and, in the same pass,
# build the document-term count matrix (one row per document, one column per token)
count_vectorizer = CountVectorizer()
X_train = count_vectorizer.fit_transform(train_data.data)

In [13]:
# (number of training documents, vocabulary size)
X_train.shape

(2634, 39355)

In [14]:
# the entries are raw occurrence counts, so the matrix holds integers
X_train.dtype

dtype('int64')

In [15]:
# slicing the first five documents returns another sparse matrix, not a dense array
X_train[0:5]

<5x39355 sparse matrix of type '<class 'numpy.int64'>'
	with 907 stored elements in Compressed Sparse Row format>

In [16]:
# number of distinct tokens learned from the training documents;
# this equals the number of columns of X_train
len(count_vectorizer.vocabulary_)

39355

In [17]:
# look up the column index assigned to a single token
count_vectorizer.vocabulary_['boebert']

7673

In [18]:
# after performing fit_transform on the sample text, the count_vectorizer builds a dictionary (vocabulary_) with words
# as keys and the column index of each word in the document-term matrix as value (not the count of the word)

In [19]:
# next step is to convert from occurrences to frequencies.
# there are two issues with using raw occurrence counts
# 1 > long documents will have higher word counts than short documents even though the two documents
#     talk about the same topic
#   > to resolve this issue we divide the number of occurrences of each word in the document by the total number of
#     words in the document. this is called term frequency (TF)
# 2 > some common words (of, and, the) occur in many documents and are thus not informative.
#   > so the weights of these words can be reduced. this is called term frequency-inverse document frequency (Tf-idf)


In [20]:
# use_idf=False switches off the inverse-document-frequency re-weighting,
# so only the normalised term frequencies are produced
tf_transformer = TfidfTransformer(use_idf=False)
# fit and transform the training counts in a single call
X_train_tf = tf_transformer.fit_transform(X_train)

In [21]:
# the transformed features are still stored as a scipy sparse matrix
type(X_train_tf)

scipy.sparse.csr.csr_matrix

In [22]:
# train a multinomial naive Bayes classifier on the term-frequency features
# and the integer category labels of the training documents
clf = MultinomialNB().fit(X_train_tf, train_data.target)

In [23]:
# trying the model on two hand-written documents
new_docs = ['god is love', 'opengl on GPU is fast']

# reuse the already-fitted vectorizer and transformer: transform only, never re-fit on new data
X_new_counts = count_vectorizer.transform(new_docs)
X_new_tf = tf_transformer.transform(X_new_counts)

predict = clf.predict(X_new_tf)

# predictions are integer labels; map each one back to its newsgroup name
for doc, category in zip(new_docs, predict):
    label = train_data.target_names[category]
    print(f'{doc} -> {label}')
    

god is love -> soc.religion.christian
opengl on GPU is fast -> comp.graphics


In [26]:
# the vectorizer, tf-idf transformer and classifier can be chained into one pipeline estimator
# note: TfidfTransformer() keeps its default use_idf=True here, whereas the manual
#       steps earlier fitted the transformer with use_idf=False
classifier = Pipeline(steps=[
    ('vect', CountVectorizer()),    # raw text -> token counts
    ('tfidf', TfidfTransformer()),  # token counts -> tf-idf weights
    ('clf', MultinomialNB()),       # tf-idf weights -> predicted category
])

In [28]:
# fit the whole pipeline on the raw training documents and their labels
classifier.fit(train_data.data, train_data.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [30]:
# the pipeline takes raw strings directly; vectorizing and tf-idf weighting happen inside predict
predictions = classifier.predict(new_docs)

In [33]:
# show each document next to the name of its predicted newsgroup
for doc, category in zip(new_docs, predictions):
    predicted_group = train_data.target_names[category]
    print(f' {doc} -> {predicted_group}')

 god is love -> soc.religion.christian
 opengl on GPU is fast -> comp.graphics


In [34]:
# evaluating the model on the test subset

predictions = classifier.predict(test_data.data)

In [35]:
# accuracy: fraction of test documents whose predicted label equals the true label
(predictions == test_data.target).mean()

0.7393040501996577