In [37]:
from warnings import filterwarnings
filterwarnings("ignore")

In [38]:
import os
import re

import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models
import spacy
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel, Phrases
from gensim.models.phrases import Phraser
from gensim.utils import simple_preprocess

from machine_learning.utils.utils_io import read_lines_from_text_file
from machine_learning.utils.utils_nlp import lemmatize, remove_email, remove_newline_char, remove_single_quote, remove_stopwords

In [None]:
# Directory (relative path) that holds the NLP data files loaded below:
# the stop-word list and the newsgroups dataset.
data_nlp_dirpath = "../data/nlp"

In [3]:
# Load the English stop-word list from the NLP data directory.
stopwords_filename = "stopwords_english.txt"
stopwords_filepath = os.path.join(data_nlp_dirpath, stopwords_filename)
stopwords = read_lines_from_text_file(stopwords_filepath)
# Display how many stop words were loaded as a quick sanity check.
len(stopwords)

184

### data

In [4]:
# Get newsgroups.json from
# https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json
# It is not included in this package to save 22.2MB of space.

newgroups_filename = "newsgroups.json"
newgroups_filepath = os.path.join(data_nlp_dirpath, newgroups_filename)

# The dataset is not shipped with the package, so a missing file is the
# expected state on a fresh checkout: fail early with download instructions
# instead of letting the JSON reader raise a less helpful error.
if not os.path.isfile(newgroups_filepath):
    raise FileNotFoundError(
        f"{newgroups_filepath} not found; download it from "
        "https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json"
    )

df = pd.read_json(newgroups_filepath)
# Preview the first rows.
df.head()

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [5]:
# List the distinct newsgroup labels present in the dataset.
df["target_names"].unique()

array(['rec.autos', 'comp.sys.mac.hardware', 'comp.graphics', 'sci.space',
       'talk.politics.guns', 'sci.med', 'comp.sys.ibm.pc.hardware',
       'comp.os.ms-windows.misc', 'rec.motorcycles', 'talk.religion.misc',
       'misc.forsale', 'alt.atheism', 'sci.electronics', 'comp.windows.x',
       'rec.sport.hockey', 'rec.sport.baseball', 'soc.religion.christian',
       'talk.politics.mideast', 'talk.politics.misc', 'sci.crypt'],
      dtype=object)

In [6]:
# Pull the raw message texts out of the frame as a plain Python list.
docs = df["content"].tolist()

In [7]:
# Show the first raw document (headers, newlines and e-mail address still present).
docs[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

### preprocessing

In [8]:
# Strip e-mail addresses from every document.
docs = list(map(remove_email, docs))

In [9]:
# Remove newline characters from every document.
docs = list(map(remove_newline_char, docs))

In [10]:
# Drop single-quote characters from every document.
docs = list(map(remove_single_quote, docs))

In [11]:
# Show the first document again to inspect the effect of the cleaning steps.
docs[0]

'From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: 15 I was wondering if anyone out there could enlighten me on this car I saw the other day. It was a 2-door sports car, looked to be from the late 60s/ early 70s. It was called a Bricklin. The doors were really small. In addition, the front bumper was separate from the rest of the body. This is all I know. If anyone can tellme a model name, engine specs, years of production, where this car is made, history, or whatever info you have on this funky looking car, please e-mail. Thanks, - IL ---- brought to you by your neighborhood Lerxst ---- '

In [12]:
# Tokenize each cleaned text with gensim; deacc=True also strips accent marks.
tokenized_docs = [simple_preprocess(text, deacc=True) for text in docs]

In [13]:
# Filter the loaded stop words out of every token list.
tokenized_docs = [remove_stopwords(tokens, stopwords) for tokens in tokenized_docs]

### dictionary and corpus

In [14]:
# Learn two-word phrases from the token lists; a higher threshold yields fewer phrases.
bigram = Phrases(sentences=tokenized_docs, min_count=5, threshold=100)
# Phraser wrapper: a faster way to merge detected bigrams in a sentence.
bigram_model = Phraser(bigram)

# Learn phrases on top of the bigram-merged token lists (i.e. trigrams).
# NOTE(review): no cell visible in this notebook applies trigram_model, so this
# second training pass may be unnecessary - confirm before removing.
trigram = Phrases(sentences=bigram[tokenized_docs], threshold=100)
# Phraser wrapper: a faster way to merge detected trigrams in a sentence.
trigram_model = Phraser(trigram)

In [15]:
# Merge detected bigrams into single tokens in every document.
bigrammed_docs = [bigram_model[tokens] for tokens in tokenized_docs]

In [16]:
# Load the small English spaCy pipeline with the parser and NER components
# switched off (presumably because only lemmatization is needed below).
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [17]:
# Lemmatize every bigram-merged document with the spaCy pipeline.
lemmatized_docs = [lemmatize(tokens, nlp) for tokens in bigrammed_docs]

In [18]:
# Build the token <-> integer id mapping from the lemmatized documents.
dictionary = Dictionary(documents=lemmatized_docs)
# Encode each document as a bag-of-words list of (token id, count) pairs.
corpus = list(map(dictionary.doc2bow, lemmatized_docs))

In [19]:
# Look up the token stored under id 0.
dictionary[0]

'addition'

In [20]:
# Bag-of-words encoding of the first document: (token id, count) pairs.
corpus[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 5),
 (5, 1),
 (6, 2),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 2),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 1)]

In [21]:
# Same bag-of-words for the first document, with ids translated back to tokens.
[(dictionary[token_id], count) for token_id, count in corpus[0]]

[('addition', 1),
 ('body', 1),
 ('bring', 1),
 ('call', 1),
 ('car', 5),
 ('day', 1),
 ('door', 2),
 ('early', 1),
 ('engine', 1),
 ('enlighten', 1),
 ('front_bumper', 1),
 ('funky', 1),
 ('history', 1),
 ('host', 1),
 ('info', 1),
 ('know', 1),
 ('late', 1),
 ('lerxst', 1),
 ('line', 1),
 ('look', 2),
 ('mail', 1),
 ('make', 1),
 ('model', 1),
 ('name', 1),
 ('neighborhood', 1),
 ('nntp_poste', 1),
 ('park', 1),
 ('production', 1),
 ('rac_wam', 1),
 ('really', 1),
 ('rest', 1),
 ('see', 1),
 ('separate', 1),
 ('small', 1),
 ('spec', 1),
 ('sport', 1),
 ('tellme', 1),
 ('thank', 1),
 ('thing', 1),
 ('wonder', 1),
 ('year', 1)]

### model

In [22]:
# Train a 20-topic LDA model on the bag-of-words corpus.
# random_state is fixed for reproducibility; per_word_topics=True makes the
# model also return word-level topic information when documents are transformed.
model = LdaModel(
    corpus=corpus,
    num_topics=20,
    id2word=dictionary,
    chunksize=100,
    passes=10,
    alpha="auto",
    random_state=0,
    per_word_topics=True,
)

In [23]:
# Show the learned topics as (topic id, weighted top-word string) pairs.
model.print_topics()

[(0,
  '0.057*"field" + 0.050*"notice" + 0.050*"community" + 0.043*"suggest" + 0.042*"external" + 0.039*"average" + 0.038*"weight" + 0.035*"significant" + 0.029*"impact" + 0.028*"primarily"'),
 (1,
  '0.039*"evidence" + 0.022*"man" + 0.021*"law" + 0.019*"child" + 0.018*"faith" + 0.018*"state" + 0.018*"claim" + 0.017*"reason" + 0.017*"sense" + 0.016*"exist"'),
 (2,
  '0.714*"ax" + 0.017*"score" + 0.012*"baseball" + 0.010*"club" + 0.009*"ice" + 0.009*"cap" + 0.008*"pitch" + 0.008*"max" + 0.007*"tie" + 0.007*"stat"'),
 (3,
  '0.043*"soldier" + 0.035*"format" + 0.034*"war" + 0.031*"armenian" + 0.031*"attack" + 0.030*"village" + 0.025*"kill" + 0.024*"convert" + 0.023*"turkish" + 0.022*"civilian"'),
 (4,
  '0.029*"issue" + 0.017*"accept" + 0.017*"public" + 0.013*"encryption" + 0.013*"cover" + 0.012*"national" + 0.012*"control" + 0.012*"study" + 0.012*"body" + 0.011*"risk"'),
 (5,
  '0.123*"pin" + 0.085*"dept" + 0.051*"processor" + 0.038*"compile" + 0.030*"cub" + 0.022*"enable" + 0.021*"menu"

### evaluating model

In [24]:
# Evaluate the model on the corpus it was trained on.
# NOTE(review): per the gensim docs this is a per-word likelihood bound, not the
# perplexity itself - confirm before reporting it as perplexity.
model.log_perplexity(corpus)

-13.59976498601444

In [25]:
# Score topic coherence against the lemmatized texts, using the
# CoherenceModel's default coherence measure.
coherence_model = CoherenceModel(
    model=model,
    texts=lemmatized_docs,
    dictionary=dictionary,
)
coherence = coherence_model.get_coherence()
coherence

0.5063445704868277

In [26]:
# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()
# Prepare the interactive topic visualisation from the trained gensim model.
vis = pyLDAvis.gensim_models.prepare(
    topic_model=model,
    corpus=corpus,
    dictionary=dictionary,
)
vis

### words, topics, probs

#### per doc

In [27]:
# Apply the trained model to the whole corpus; indexing `rows` gives the
# result for one document.
rows = model[corpus]

In [28]:
# Result for the first document. Because the model was built with
# per_word_topics=True this is a 3-tuple:
#   [0] (topic id, probability) pairs for the document,
#   [1] for each word id, a list of topic ids,
#   [2] for each word id, (topic id, value) pairs.
row = rows[0]
row

([(1, 0.014455765),
  (4, 0.04294472),
  (6, 0.27582434),
  (8, 0.18695824),
  (9, 0.10469744),
  (10, 0.09411308),
  (11, 0.029494785),
  (13, 0.16451526),
  (14, 0.041336328),
  (18, 0.022573866)],
 [(0, [18, 8, 6, 1]),
  (1, [4]),
  (2, [6, 8]),
  (3, [10, 8, 6]),
  (4, [6]),
  (5, [6, 8]),
  (6, [9]),
  (7, [6, 8, 10]),
  (8, [9]),
  (9, [9]),
  (10, []),
  (11, []),
  (12, [8, 1]),
  (13, [13]),
  (14, [14, 13, 10]),
  (15, [8, 13]),
  (16, [6, 10]),
  (17, []),
  (18, [13]),
  (19, [13, 6, 10, 8]),
  (20, [14]),
  (21, [8, 6, 10]),
  (22, [10]),
  (23, [8, 10, 14]),
  (24, [9]),
  (25, [13]),
  (26, [6, 13]),
  (27, [9]),
  (28, []),
  (29, [8, 6, 13]),
  (30, [6, 8]),
  (31, [8, 6]),
  (32, [4, 18]),
  (33, [6, 10]),
  (34, [11]),
  (35, [6]),
  (36, []),
  (37, [13]),
  (38, [8, 6]),
  (39, [13, 8]),
  (40, [6])],
 [(0, [(1, 0.026019026), (6, 0.15314448), (8, 0.3962276), (18, 0.42388827)]),
  (1, [(4, 0.9997103)]),
  (2, [(6, 0.54237443), (8, 0.4574931)]),
  (3, [(6, 0.03324916

In [29]:
# Document-level topic mixture: (topic id, probability) pairs.
topics_probs = row[0]
topics_probs

[(1, 0.014455765),
 (4, 0.04294472),
 (6, 0.27582434),
 (8, 0.18695824),
 (9, 0.10469744),
 (10, 0.09411308),
 (11, 0.029494785),
 (13, 0.16451526),
 (14, 0.041336328),
 (18, 0.022573866)]

In [30]:
# Word-level topic values: for each word id, a list of (topic id, value) pairs.
# NOTE(review): the values are not bounded by 1 (word id 4 occurs 5 times in
# this document and shows a value of ~5.0), so they look like count-scaled
# weights rather than probabilities - confirm against the gensim docs.
words_topics_probs = row[2]
words_topics_probs

[(0, [(1, 0.026019026), (6, 0.15314448), (8, 0.3962276), (18, 0.42388827)]),
 (1, [(4, 0.9997103)]),
 (2, [(6, 0.54237443), (8, 0.4574931)]),
 (3, [(6, 0.033249166), (8, 0.41076913), (10, 0.55590636)]),
 (4, [(6, 4.9998183)]),
 (5, [(6, 0.79833776), (8, 0.2016224)]),
 (6, [(9, 1.9999104)]),
 (7, [(6, 0.7249683), (8, 0.21156576), (10, 0.06332822)]),
 (8, [(9, 0.9999742)]),
 (9, [(9, 0.9992062)]),
 (10, []),
 (11, []),
 (12, [(1, 0.08278239), (8, 0.9168026)]),
 (13, [(13, 0.9999839)]),
 (14, [(10, 0.08518557), (13, 0.3229914), (14, 0.59171194)]),
 (15, [(8, 0.5574355), (13, 0.43968734)]),
 (16, [(6, 0.8202432), (10, 0.17958148)]),
 (17, []),
 (18, [(13, 0.9909794)]),
 (19, [(6, 0.79651845), (8, 0.08626624), (10, 0.1751189), (13, 0.9420429)]),
 (20, [(14, 0.99994135)]),
 (21, [(6, 0.2861923), (8, 0.6276793), (10, 0.08610653)]),
 (22, [(10, 0.9997107)]),
 (23, [(8, 0.4696081), (10, 0.2739301), (14, 0.256353)]),
 (24, [(9, 0.9997231)]),
 (25, [(13, 0.99998593)]),
 (26, [(6, 0.82496226), (13

#### per topic

In [31]:
# Inspect a single topic: its top words with their probabilities.
topic = 0
words_probs = model.show_topic(topicid=topic)
words_probs

[('field', 0.057476845),
 ('notice', 0.049948048),
 ('community', 0.04962867),
 ('suggest', 0.043319833),
 ('external', 0.04158011),
 ('average', 0.038643923),
 ('weight', 0.03821546),
 ('significant', 0.034962088),
 ('impact', 0.028737571),
 ('primarily', 0.027721057)]

### prediction

In [32]:
# Pick the first and last lemmatized documents as prediction samples ...
doc_a, doc_z = lemmatized_docs[0], lemmatized_docs[-1]
# ... and build a third sample by concatenating the two.
doc_a_z = doc_a + doc_z

In [33]:
# Infer topics for the first document: encode it as bag-of-words with the
# training dictionary, then index the model with it.
sample_a = dictionary.doc2bow(doc_a)
row_a = model[sample_a]
# Element 0 holds the (topic id, probability) pairs for the document.
row_a[0]

[(1, 0.014455527),
 (4, 0.04294037),
 (6, 0.27582324),
 (8, 0.18695359),
 (9, 0.104697436),
 (10, 0.094113834),
 (11, 0.029494785),
 (13, 0.16451567),
 (14, 0.0413363),
 (18, 0.022583045)]

In [34]:
# Infer topics for the last document: encode it as bag-of-words with the
# training dictionary, then index the model with it.
sample_z = dictionary.doc2bow(doc_z)
row_z = model[sample_z]
# Element 0 holds the (topic id, probability) pairs for the document.
row_z[0]

[(1, 0.018587569),
 (4, 0.04483253),
 (6, 0.14925666),
 (8, 0.14437138),
 (9, 0.054617684),
 (10, 0.19890724),
 (11, 0.018616397),
 (13, 0.28071913),
 (14, 0.015154537),
 (18, 0.04092703)]

In [35]:
# Infer topics for the concatenation of the first and last documents.
sample_a_z = dictionary.doc2bow(doc_a_z)
row_a_z = model[sample_a_z]
# Element 0 holds the (topic id, probability) pairs for the combined document.
row_a_z[0]

[(1, 0.010190097),
 (4, 0.04166739),
 (6, 0.2430974),
 (8, 0.14198059),
 (9, 0.099733174),
 (10, 0.13977821),
 (11, 0.021252474),
 (13, 0.22644262),
 (14, 0.026960706),
 (18, 0.032275222)]