# Topic Detection with LDA

## [0] Pre-requisites

[0.1] __Download Stopwords__


In [1]:
import nltk; nltk.download('stopwords');

[nltk_data] Downloading package stopwords to /home/oso/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[0.2] __Imports__

In [12]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
from gensim.models.ldamodel import LdaModel
"""library for topic modelling, document indexing and similarity
retrieval with large corpora.
"""
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy
"""a library for advanced Natural Language Processing in Python
and Cython
"""

# Plotting tools
import pyLDAvis
"""library for interactive topic model visualization.
"""
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

# Logging for gensim
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])



## <u> [1] Data Pre-proccessing:</u>


[1.1]  __Importing Data and Peeking at what we have__

In [3]:
# load Dataset
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')

print(df.target_names.unique())
print("\nData Overview is :{}".format(df.head()))
print("\nData Shape is :{}".format(df.shape))

['rec.autos' 'comp.sys.mac.hardware' 'rec.motorcycles' 'misc.forsale'
 'comp.os.ms-windows.misc' 'alt.atheism' 'comp.graphics'
 'rec.sport.baseball' 'rec.sport.hockey' 'sci.electronics' 'sci.space'
 'talk.politics.misc' 'sci.med' 'talk.politics.mideast'
 'soc.religion.christian' 'comp.windows.x' 'comp.sys.ibm.pc.hardware'
 'talk.politics.guns' 'talk.religion.misc' 'sci.crypt']

Data Overview is :                                                content  target  \
0     From: lerxst@wam.umd.edu (where's my thing)\nS...       7   
1     From: guykuo@carson.u.washington.edu (Guy Kuo)...       4   
10    From: irwin@cmptrc.lonestar.org (Irwin Arnstei...       8   
100   From: tchen@magnus.acs.ohio-state.edu (Tsung-K...       6   
1000  From: dabl2@nlm.nih.gov (Don A.B. Lindbergh)\n...       2   

                 target_names  
0                   rec.autos  
1       comp.sys.mac.hardware  
10            rec.motorcycles  
100              misc.forsale  
1000  comp.os.ms-windows.misc  

Data 

### Pre-Processing

[1.2] __Removing unwanted text and converting to list__

In [4]:
# convert to list
data = df.content.values.tolist()

# remove emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# remove newline characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# remove distracting slingle quotes
data = [re.sub("\''", "", sent) for sent in data]

pprint(data[:1])


["From: (where's my thing) Subject: WHAT car is this!? Nntp-Posting-Host: "
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ']


[1.3] __Tokenize Sentence into a list of words__

In [5]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(simple_preprocess(str(sentence), deacc=True)) # deacc=True to remove punctuation 

data_words = list(sent_to_words(data))

print(data_words[:1])

[['from', 'where', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


[1.4] __Bigrams and Trigrams Models__

- Bigrams: *two words that frequently occur together in a document*
- Trigrams: *Three words that frequently occur together in a document*

In [6]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser

bigram = Phrases(data_words, min_count=5, threshold=100) # higher threshold with fewer phrases
trigram = Phrases(bigram[data_words], threshold=100)

# get a sentence clubbed as bigram/trigram
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

# preview trigram 
print(trigram_mod[bigram_mod[data_words[0]]])

['from', 'where', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


[1.5] __Removing Stopwords, making Bigrams and Lemmatize__ functions

In [7]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
            if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """annotations from spacy"""
    texts_out = []
    for sent in texts:
        doc =nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

[1.6] __Initialize Spacy model__

- `pip install -U spacy`
- `python -m spacy download en`

****After installation you need to download a language model. ****

In [8]:
# remove stop words
data_words_nostops =remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initializing spacy 'en' model for efficiency
nlp = spacy.load('en', disable=['parser', 'ner'])

# Lemmatization keeping only noun, adj, vb, adv

data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADJ'])


print(data_lemmatized[:1])

[['thing', 'car', 'nntp_poste', 'host', 'umd', 'organization', 'university', 'maryland_college', 'park', 'line', 'wonder', 'anyone', 'could', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'call', 'bricklin', 'door', 'small', 'addition', 'front_bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'specs', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood']]


[1.7] __Creating a dictionary and Corpus needed for Topic Modelling__

**NB:** 
<u>Main inputs to the LDA are:</u>
<ol>
    <li>Dictionary <em>(id2word)</em></li>
    <li>Corpus</li>
</ol>

Gensim creates id for each word in a document, a produced corpus is a mapping of the [word_id, word_frequency]

In [9]:
# Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# corpus
texts = data_lemmatized

# Term Document frequency
corpus = [id2word.doc2bow(text) for text in texts]

print(corpus[:1])

[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 5), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1)]]


In [11]:
# to see what word an id corresponds to
print("The word: {}, corresponds to the ID:13\n".format(id2word[13]))

# human readbale representation
print("Human readable representation:\n", [[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]])

The word: funky, corresponds to the ID:13

Human readable representation:
 [[('addition', 1), ('anyone', 2), ('body', 1), ('bricklin', 1), ('bring', 1), ('call', 1), ('car', 5), ('could', 1), ('day', 1), ('door', 2), ('engine', 1), ('enlighten', 1), ('front_bumper', 1), ('funky', 1), ('history', 1), ('host', 1), ('info', 1), ('know', 1), ('line', 1), ('look', 2), ('mail', 1), ('make', 1), ('maryland_college', 1), ('model', 1), ('name', 1), ('neighborhood', 1), ('nntp_poste', 1), ('organization', 1), ('park', 1), ('production', 1), ('rest', 1), ('see', 1), ('separate', 1), ('small', 1), ('specs', 1), ('sport', 1), ('tellme', 1), ('thank', 1), ('thing', 1), ('umd', 1), ('university', 1), ('wonder', 1), ('year', 1)]]



## <u> [2] Topic Modelling:</u>


[2.1] __Building lda topic model using gensim models__
__Model Details:__
- built with 20 different topics
- each topic is a compination of keywords
- each keymword contributes a certain weightage to the topic

In [13]:
# LDA Model
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=20, random_state=100, update_every=1,
                    chunksize=100, passes=10, alpha='auto', per_word_topics=True)

[2.1] __View the topics detected by LDA model__

In [15]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.061*"game" + 0.059*"team" + 0.039*"win" + 0.039*"year" + 0.039*"play" + '
  '0.033*"player" + 0.022*"season" + 0.020*"bike" + 0.015*"score" + '
  '0.014*"division"'),
 (1,
  '0.071*"command" + 0.035*"quadra" + 0.028*"instruction" + 0.026*"centris" + '
  '0.020*"berkeley" + 0.013*"rom" + 0.010*"rewrite" + 0.010*"web" + '
  '0.009*"dale" + 0.008*"california_berkeley"'),
 (2,
  '0.089*"israel" + 0.062*"israeli" + 0.052*"rise" + 0.051*"peter" + '
  '0.042*"arab" + 0.031*"satellite" + 0.031*"center" + 0.023*"spacecraft" + '
  '0.020*"wide" + 0.017*"april"'),
 (3,
  '0.782*"ax" + 0.056*"max" + 0.013*"hockey" + 0.008*"goal" + 0.007*"fan" + '
  '0.006*"oil" + 0.005*"canadian" + 0.005*"sport" + 0.004*"count" + '
  '0.004*"playoff"'),
 (4,
  '0.105*"sale" + 0.098*"price" + 0.073*"sell" + 0.036*"air" + 0.027*"hall" + '
  '0.024*"quality" + 0.024*"printer" + 0.018*"cd" + 0.017*"yesterday" + '
  '0.013*"production"'),
 (5,
  '0.041*"gay" + 0.039*"mhz" + 0.038*"illinoi" + 0.038*"marriage" 

__Notes:__
- Topics are indexed, hence the `(13, ..)`
- each topic include top keywords that contribute to the topic
- by looking at the keywords, one can make make assumptions on what topics are being discussed i.e ***Topic 20 might be talking about religion***