# Text Classification Using Topic Modelling

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

np.random.seed(42)

## Load the dataset

We'll use a dataset of news articles grouped into 20 news categories - but just use 7 for this example

In [2]:
categories = [
    'comp.windows.x',
    'rec.autos',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns'
]

In [3]:
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

In [4]:
for newsgroup in newsgroups_train.target_names:
    print(newsgroup)

comp.windows.x
rec.autos
rec.sport.baseball
rec.sport.hockey
sci.space
soc.religion.christian
talk.politics.guns


There are some distinct themes in the news categories like sports, religion, science, technology, politics etc.

In [5]:
newsgroups_train.data[:2]

['From: rlennip4@mach1.wlu.ca (robert lennips 9209 U)\nSubject: Re: PLANETS STILL: IMAGES ORBIT BY ETHER TWIST\nX-Newsreader: TIN [version 1.1 PL6]\nOrganization: Wilfrid Laurier University\nLines: 2\n\nPlease get a REAL life.\n\n',
 "From: rdetweil@boi.hp.com (Richard Detweiler)\nSubject: Cards Mailing List?\nDistribution: usa\nOrganization: Hewlett Packard\nLines: 9\n\nCount me interested in a Cardinal's mailing list.  If anyone\nfinds one or starts one, please let me know.\n\nThanks,\n\nDick Detweiler\n\nrdetweil@hpdmd48.boi.hp.com\n\n"]

In [6]:
print(newsgroups_train.filenames.shape, newsgroups_train.target.shape)

(4122,) (4122,)


## Data Preprocessing

We will perform the following steps:

* **Tokenization**: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
* Words that have fewer than 3 characters are removed.
* All **stopwords** are removed.
* Words are **lemmatized** - words in third person are changed to first person and verbs in past and future tenses are changed into present.
* Words are **stemmed** - words are reduced to their root form.


In [7]:
# import nltk
# nltk.download('wordnet')

In [8]:
stemmer = SnowballStemmer('english')  # Porter2 stemmer

def lemmatize_stemming(text):
    lemmatized = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemmatized)

def preprocess(text):
    """
    Tokenise and lemmatize text
    """
    result=[]
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [9]:
doc_sample = 'This disk has failed many times. I would like to get it replaced.'

print("Original document: ")
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original document: 
['This', 'disk', 'has', 'failed', 'many', 'times.', 'I', 'would', 'like', 'to', 'get', 'it', 'replaced.']


Tokenized and lemmatized document: 
['disk', 'fail', 'time', 'like', 'replac']


Preprocess all the messages we have (in parallel)

In [10]:
import multiprocessing
pool = multiprocessing.Pool()
processed_docs = list(pool.map(preprocess, newsgroups_train.data))

In [11]:
print(processed_docs[:2])

[['rlennip', 'mach', 'robert', 'lennip', 'subject', 'planet', 'imag', 'orbit', 'ether', 'twist', 'newsread', 'version', 'organ', 'wilfrid', 'laurier', 'univers', 'line', 'real', 'life'], ['rdetweil', 'richard', 'detweil', 'subject', 'card', 'mail', 'list', 'distribut', 'organ', 'hewlett', 'packard', 'line', 'count', 'interest', 'cardin', 'mail', 'list', 'find', 'start', 'know', 'thank', 'dick', 'detweil', 'rdetweil', 'hpdmd']]


## Create Bag of words

A dictionary is the number of times a word appears in the training set.
A mapping between words and their integer ids.

In [12]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [13]:
for k, v in dictionary.iteritems():
    print(k, v)
    if k > 5:
        break

0 ether
1 imag
2 laurier
3 lennip
4 life
5 line
6 mach


Filter out tokens that appear in
* less than 15 documents or
* more than 10% of documents
* after (1) and (2), keep only the first 100k most frequent tokens

In [14]:
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)

Convert document (a list of words) into the bag-of-words format.  
A list of (token_id, token_count) tuples

In [15]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [16]:
bow_doc_x = bow_corpus[10]
bow_word_x = 3

print('{} - {}'.format(
    bow_doc_x[5],
    dictionary[bow_doc_x[bow_word_x][0]]
))

(277, 1) - devic


## Build the LDA Model
(Latent Dirichlet Allocation)  
If observations are words collected into documents, it posits that each document is a mixture of a small number of topics and that each word's presence is attributable to one of the document's topics

* **alpha** and **eta** are hyperparameters that affect sparsity of the document-topic (theta) and topic-word (lambda) distributions. We will let these be the default values for now(default value is `1/num_topics`)
    - Alpha is the per document topic distribution.
        * High alpha: Every document has a mixture of all topics(documents appear similar to each other).
        * Low alpha: Every document has a mixture of very few topics

    - Eta is the per topic word distribution.
        * High eta: Each topic has a mixture of most words(topics appear similar to each other).
        * Low eta: Each topic has a mixture of few words.


In [17]:
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=7,
    id2word=dictionary,                                    
    passes=10,
    workers=4)

## Evaluate the model

In [18]:
lda_model.show_topics()

[(0,
  '0.019*"christian" + 0.008*"exist" + 0.007*"truth" + 0.005*"live" + 0.005*"life" + 0.005*"claim" + 0.005*"religion" + 0.005*"belief" + 0.004*"true" + 0.004*"absolut"'),
 (1,
  '0.010*"player" + 0.007*"season" + 0.006*"hockey" + 0.006*"score" + 0.004*"leagu" + 0.004*"goal" + 0.004*"basebal" + 0.004*"playoff" + 0.004*"defens" + 0.004*"second"'),
 (2,
  '0.014*"jesus" + 0.012*"church" + 0.007*"christ" + 0.006*"bibl" + 0.006*"christian" + 0.006*"hell" + 0.006*"faith" + 0.005*"cathol" + 0.005*"paul" + 0.005*"father"'),
 (3,
  '0.023*"window" + 0.011*"server" + 0.011*"widget" + 0.010*"file" + 0.010*"program" + 0.009*"motif" + 0.008*"applic" + 0.008*"display" + 0.008*"avail" + 0.007*"version"'),
 (4,
  '0.013*"file" + 0.009*"entri" + 0.009*"weapon" + 0.008*"gun" + 0.008*"firearm" + 0.006*"control" + 0.005*"crime" + 0.005*"govern" + 0.005*"output" + 0.005*"program"'),
 (5,
  '0.024*"space" + 0.014*"nasa" + 0.010*"orbit" + 0.009*"launch" + 0.006*"satellit" + 0.005*"mission" + 0.005*"eart

In [19]:
categories_map = {
    3: 'comp.windows.x',
    6: 'rec.autos',
    -1: 'rec.sport.baseball',
    1: 'rec.sport.hockey',
    5: 'sci.space',
    0: 'soc.religion.christian',
    2: 'soc.religion.christian',
    4: 'talk.politics.guns'
}

Testing model on unseen document

In [20]:
num = 2
unseen_document = newsgroups_test.data[num]
print(unseen_document)
print(newsgroups_test.target[num])
print(newsgroups_test.target_names[newsgroups_test.target[num]])

From: eggertj@moses.ll.mit.edu (Jim Eggert x6127 g41)
Subject: Re: Robin Lane Fox's _The Unauthorized Version_?
Reply-To: eggertj@ll.mit.edu
Organization: MIT Lincoln Lab - Group 41
Lines: 19

In article <May.7.01.09.39.1993.14550@athos.rutgers.edu> iscleekk@nuscc.nus.sg (LEE KOK KIONG JAMES) writes:
|   mpaul@unl.edu (marxhausen paul) writes:
|   > My mom passed along a lengthy review she clipped regarding Robin Lane
|   > Fox's book _The Unauthorized Version: Truth and Fiction in the Bible_,
|...
|   I've read the book. Some parts were quite typical regarding its
|   criticism of the bible as an inaccurate historical document,
|   alt.altheism, etc carries typical responses, but not as vociferous as
|   a.a. It does give an insight into how these historian (is he one... I 
|   don't have any biodata on him) work. I've not been able to understand/
|   appreciate some of the arguments, something like, it mentions certain 
|   events, so it has to be after that event, and so on. 

Robin

In [21]:
# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
pred = sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1])
print(pred)

[(0, 0.6471687), (3, 0.19757356), (1, 0.14305289)]


In [22]:
print('predicts {} with a probability of {:.2f}%'.format(categories_map[pred[0][0]], pred[0][1]*100))

predicts soc.religion.christian with a probability of 64.72%


The model correctly classifies the unseen document with 'x'% probability to the X category.

### Check Accuracy

In [23]:
import multiprocessing
pool = multiprocessing.Pool()
test_processed_docs = list(pool.map(preprocess, newsgroups_test.data))

In [24]:
test_bow_corpus = [dictionary.doc2bow(doc) for doc in test_processed_docs]

In [25]:
y_true = newsgroups_test.target

In [26]:
newsgroups_test.target_names

['comp.windows.x',
 'rec.autos',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns']

In [27]:
y_pred = []
for i, doc in enumerate(test_bow_corpus):
    pred_all = sorted(lda_model[doc], key=lambda tup: -1*tup[1])
    pred_cat = categories_map[pred_all[0][0]]
    y_pred.append(newsgroups_test.target_names.index(pred_cat))

Accuracy is the proportion of correct predictions of the model

In [28]:
from sklearn.metrics import accuracy_score
accuracy_score(y_true, y_pred)

0.6985052861830113

In [29]:
# creating a confusion matrix 
from sklearn.metrics import confusion_matrix 
cm = confusion_matrix(y_true, y_pred)

Y_pred ->, y_true \/

In [30]:
cm

array([[362,  11,   0,   8,  12,   2,   0],
       [  5, 317,   0,  33,  11,  27,   3],
       [  6,   8,   0, 356,   5,  18,   4],
       [  0,   7,   0, 382,   2,   6,   2],
       [  5,  13,   0,   3, 325,  25,  23],
       [  3,   3,   0,   3,   5, 381,   3],
       [  2, 174,   0,   5,   2,  32, 149]])

In [31]:
# import pyLDAvis.gensim
# pyLDAvis.enable_notebook()
# prepared = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
# pyLDAvis.show(prepared)