<a href="https://colab.research.google.com/github/graviraja/100-Days-of-NLP/blob/architectures/architectures/Topic%20Modelling%20using%20LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports

In [18]:
import nltk
# Fetch the NLTK corpora this notebook needs: the stop-word lists (used to
# build `stop_words`) and WordNet (used by WordNetLemmatizer).
nltk.download('stopwords')
# Last expression in the cell, so its return value is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [31]:
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns

In [22]:
# NLTK's English stop words, plus a few extra tokens to drop from this corpus.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

# Normalizers shared by the preprocessing step.
stemmer = PorterStemmer()
lemma = WordNetLemmatizer()

### Newsgroup dataset

In [5]:
# Source of the newsgroup posts (JSON with content / target / target_names columns).
NEWSGROUPS_URL = 'https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json'

df = pd.read_json(NEWSGROUPS_URL)
df.head()

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [6]:
# Distinct newsgroup labels present in the dataset.
df['target_names'].unique()

array(['rec.autos', 'comp.sys.mac.hardware', 'comp.graphics', 'sci.space',
       'talk.politics.guns', 'sci.med', 'comp.sys.ibm.pc.hardware',
       'comp.os.ms-windows.misc', 'rec.motorcycles', 'talk.religion.misc',
       'misc.forsale', 'alt.atheism', 'sci.electronics', 'comp.windows.x',
       'rec.sport.hockey', 'rec.sport.baseball', 'soc.religion.christian',
       'talk.politics.mideast', 'talk.politics.misc', 'sci.crypt'],
      dtype=object)

In [7]:
# Number of posts (rows) in the dataset.
df.shape[0]

11314

In [8]:
# Raw post texts as a plain Python list, one string per document.
data = df['content'].tolist()

### Processing the data

In [24]:
def preprocess(sent):
    """Turn one raw post into a list of normalized tokens.

    Steps, in order: strip e-mail addresses, collapse whitespace, drop
    apostrophes, tokenize with gensim's ``simple_preprocess``, filter stop
    words, lemmatize, then stem.

    Relies on the notebook-level ``stop_words``, ``lemma`` and ``stemmer``.

    Args:
        sent (str): raw document text.

    Returns:
        list[str]: the stemmed tokens, in document order (duplicates kept).
    """
    # Patterns are raw strings: '\S' and '\s' inside a plain literal are
    # invalid string escapes and trigger Deprecation/SyntaxWarning.
    # remove emails
    sent = re.sub(r'\S*@\S*\s?', '', sent)
    # collapse newlines and other whitespace runs into a single space
    sent = re.sub(r'\s+', ' ', sent)
    # remove single quotes (a literal character, so no regex is needed)
    sent = sent.replace("'", "")
    # converts to lower case tokens and removes tokens that are
    # too small & too long. Also remove accent characters & punct
    tokens = simple_preprocess(sent, deacc=True)
    # remove stopwords; a set gives constant-time membership tests
    stop_set = set(stop_words)
    stop_free = [token for token in tokens if token not in stop_set]
    # NOTE(review): stop words are filtered before lemmatization/stemming, so
    # a stem that coincides with a stop word (presumably 'use' from
    # 'used'/'using') still reaches the output - confirm whether a second
    # filter pass after stemming is wanted.
    # lemmatization
    lemmatized = [lemma.lemmatize(word) for word in stop_free]
    # stemming
    normalized = [stemmer.stem(word) for word in lemmatized]
    return normalized

In [25]:
# Run every raw post through the preprocessing pipeline, then peek at the first result.
cleaned_data = list(map(preprocess, data))
cleaned_data[0]

['where',
 'thing',
 'car',
 'nntp',
 'post',
 'host',
 'rac',
 'wam',
 'umd',
 'organ',
 'univers',
 'maryland',
 'colleg',
 'park',
 'line',
 'wonder',
 'anyon',
 'could',
 'enlighten',
 'car',
 'saw',
 'day',
 'door',
 'sport',
 'car',
 'look',
 'late',
 'earli',
 'call',
 'bricklin',
 'door',
 'realli',
 'small',
 'addit',
 'front',
 'bumper',
 'separ',
 'rest',
 'bodi',
 'know',
 'anyon',
 'tellm',
 'model',
 'name',
 'engin',
 'spec',
 'year',
 'product',
 'car',
 'made',
 'histori',
 'whatev',
 'info',
 'funki',
 'look',
 'car',
 'pleas',
 'mail',
 'thank',
 'il',
 'brought',
 'neighborhood',
 'lerxst']

### Corpus creation

In [26]:
# Vocabulary built from the tokenized documents (token <-> integer id).
id2word = corpora.Dictionary(cleaned_data)

# Each document as a bag of words: a list of (token_id, count) pairs.
corpus = list(map(id2word.doc2bow, cleaned_data))

In [27]:
# Readable view of the first document's bag of words: ids mapped back to tokens.
[
    [(id2word[token_id], count) for token_id, count in bow]
    for bow in corpus[:1]
]

[[('addit', 1),
  ('anyon', 2),
  ('bodi', 1),
  ('bricklin', 1),
  ('brought', 1),
  ('bumper', 1),
  ('call', 1),
  ('car', 5),
  ('colleg', 1),
  ('could', 1),
  ('day', 1),
  ('door', 2),
  ('earli', 1),
  ('engin', 1),
  ('enlighten', 1),
  ('front', 1),
  ('funki', 1),
  ('histori', 1),
  ('host', 1),
  ('il', 1),
  ('info', 1),
  ('know', 1),
  ('late', 1),
  ('lerxst', 1),
  ('line', 1),
  ('look', 2),
  ('made', 1),
  ('mail', 1),
  ('maryland', 1),
  ('model', 1),
  ('name', 1),
  ('neighborhood', 1),
  ('nntp', 1),
  ('organ', 1),
  ('park', 1),
  ('pleas', 1),
  ('post', 1),
  ('product', 1),
  ('rac', 1),
  ('realli', 1),
  ('rest', 1),
  ('saw', 1),
  ('separ', 1),
  ('small', 1),
  ('spec', 1),
  ('sport', 1),
  ('tellm', 1),
  ('thank', 1),
  ('thing', 1),
  ('umd', 1),
  ('univers', 1),
  ('wam', 1),
  ('whatev', 1),
  ('where', 1),
  ('wonder', 1),
  ('year', 1)]]

### LDA Model

In [28]:
# Fit LDA with 20 topics (the dataset lists 20 newsgroup labels), making
# 30 passes over the corpus with a fixed seed for repeatable results.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    num_topics=20,
    id2word=id2word,
    passes=30,
    random_state=42,
)

In [32]:
# Show each topic as (topic_id, weighted keyword string).
topic_keywords = lda_model.print_topics()
pprint(topic_keywords)

[(0,
  '0.007*"new" + 0.007*"state" + 0.006*"inform" + 0.006*"nation" + '
  '0.005*"govern" + 0.005*"research" + 0.005*"public" + 0.004*"report" + '
  '0.004*"author" + 0.004*"american"'),
 (1,
  '0.011*"would" + 0.009*"gun" + 0.008*"think" + 0.008*"dont" + 0.008*"go" + '
  '0.007*"get" + 0.007*"like" + 0.007*"peopl" + 0.007*"one" + 0.006*"make"'),
 (2,
  '0.716*"ax" + 0.052*"max" + 0.007*"pl" + 0.006*"di" + 0.005*"wm" + '
  '0.005*"ei" + 0.005*"tm" + 0.004*"bhj" + 0.004*"giz" + 0.003*"ql"'),
 (3,
  '0.015*"write" + 0.014*"articl" + 0.013*"right" + 0.011*"organ" + '
  '0.010*"govern" + 0.009*"law" + 0.009*"peopl" + 0.009*"line" + 0.007*"state" '
  '+ 0.006*"post"'),
 (4,
  '0.035*"line" + 0.033*"organ" + 0.031*"post" + 0.024*"host" + 0.023*"nntp" + '
  '0.021*"univers" + 0.011*"articl" + 0.011*"distribut" + 0.010*"thank" + '
  '0.009*"repli"'),
 (5,
  '0.036*"file" + 0.019*"window" + 0.010*"line" + 0.010*"program" + '
  '0.009*"imag" + 0.009*"organ" + 0.008*"gif" + 0.008*"font" + 0.007

### Inferring topics from keywords



Let's see whether we can infer what a topic is about from its keywords, for a few of the topics:

(18,
  '0.032*"key" + 0.016*"encrypt" + 0.014*"chip" + 0.011*"secur" + '
  '0.011*"clipper" + 0.008*"use" + 0.007*"one" + 0.007*"system" + 0.006*"bit" '
  '+ 0.006*"would"') - **crypt**

- - -
(17,
  '0.022*"game" + 0.021*"team" + 0.016*"play" + 0.014*"hockey" + '
  '0.012*"player" + 0.009*"nhl" + 0.008*"win" + 0.007*"pt" + 0.007*"season" + '
  '0.007*"la"') - **hockey**
- - -
(10,
  '0.012*"game" + 0.011*"year" + 0.009*"run" + 0.008*"line" + 0.008*"organ" + '
  '0.008*"team" + 0.007*"articl" + 0.007*"last" + 0.006*"write" + '
  '0.006*"basebal"') - **baseball**


