## Toxic: LDA

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from gensim import corpora, models, similarities, matutils
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
df = pd.read_pickle('../data/toxictrain.pkl')
print(df.shape)

(159571, 17)


In [5]:
df = df[-100:]
df.shape

(100, 17)

In [6]:
# Create a CountVectorizer for parsing/counting words
count_vectorizer = CountVectorizer(ngram_range=(1, 2),  
                                   stop_words='english', token_pattern="\\b[a-z][a-z]+\\b")
count_vectorizer.fit(df.comment_text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [7]:
# Create the term-document matrix
# Transpose it so the terms are the rows
counts = count_vectorizer.transform(df.comment_text).transpose()

In [8]:
counts.shape

(5058, 100)

In [9]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(counts)

In [10]:
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())

In [11]:
len(id2word)

5058

In [12]:
# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus=corpus, num_topics=3, id2word=id2word, passes=10)

2018-02-28 08:46:35,622 : INFO : using symmetric alpha at 0.3333333333333333
2018-02-28 08:46:35,626 : INFO : using symmetric eta at 0.3333333333333333
2018-02-28 08:46:35,631 : INFO : using serial LDA version on this node
2018-02-28 08:46:35,767 : INFO : running online (multi-pass) LDA training, 3 topics, 10 passes over the supplied corpus of 100 documents, updating model once every 100 documents, evaluating perplexity every 100 documents, iterating 50x with a convergence threshold of 0.001000
2018-02-28 08:46:36,596 : INFO : -9.926 per-word bound, 972.7 perplexity estimate based on a held-out corpus of 100 documents with 7004 words
2018-02-28 08:46:36,598 : INFO : PROGRESS: pass 0, at document #100/100
2018-02-28 08:46:37,334 : INFO : topic #0 (0.333): 0.004*"wikipedia" + 0.004*"article" + 0.003*"talk" + 0.003*"page" + 0.003*"male" + 0.002*"like" + 0.002*"female" + 0.002*"salute" + 0.002*"articles" + 0.002*"don"
2018-02-28 08:46:37,335 : INFO : topic #1 (0.333): 0.005*"article" + 0.0

2018-02-28 08:46:44,141 : INFO : topic #0 (0.333): 0.004*"page" + 0.003*"talk" + 0.003*"article" + 0.003*"like" + 0.002*"way" + 0.002*"don" + 0.002*"really" + 0.002*"think" + 0.002*"people" + 0.002*"metaphysical"
2018-02-28 08:46:44,143 : INFO : topic #1 (0.333): 0.007*"wikipedia" + 0.007*"male" + 0.006*"article" + 0.005*"female" + 0.005*"salute" + 0.003*"te" + 0.003*"male female" + 0.003*"subject" + 0.003*"like" + 0.003*"used"
2018-02-28 08:46:44,150 : INFO : topic #2 (0.333): 0.004*"article" + 0.003*"copyright" + 0.003*"wikipedia" + 0.003*"image" + 0.003*"mexican" + 0.002*"page" + 0.002*"source" + 0.002*"articles" + 0.002*"thanks" + 0.002*"deletion"
2018-02-28 08:46:44,153 : INFO : topic diff=0.012342, rho=0.316228
2018-02-28 08:46:45,028 : INFO : -8.655 per-word bound, 403.0 perplexity estimate based on a held-out corpus of 100 documents with 7004 words
2018-02-28 08:46:45,033 : INFO : PROGRESS: pass 9, at document #100/100
2018-02-28 08:46:45,454 : INFO : topic #0 (0.333): 0.004*"p

In [13]:
lda.print_topics()

2018-02-28 08:46:58,771 : INFO : topic #0 (0.333): 0.004*"page" + 0.003*"talk" + 0.003*"article" + 0.003*"like" + 0.002*"way" + 0.002*"don" + 0.002*"really" + 0.002*"think" + 0.002*"people" + 0.002*"metaphysical"
2018-02-28 08:46:58,778 : INFO : topic #1 (0.333): 0.007*"wikipedia" + 0.007*"male" + 0.006*"article" + 0.005*"female" + 0.005*"salute" + 0.003*"te" + 0.003*"male female" + 0.003*"subject" + 0.003*"like" + 0.003*"used"
2018-02-28 08:46:58,781 : INFO : topic #2 (0.333): 0.004*"article" + 0.003*"copyright" + 0.003*"wikipedia" + 0.003*"image" + 0.003*"mexican" + 0.002*"page" + 0.002*"source" + 0.002*"articles" + 0.002*"thanks" + 0.002*"use"


[(0,
  '0.004*"page" + 0.003*"talk" + 0.003*"article" + 0.003*"like" + 0.002*"way" + 0.002*"don" + 0.002*"really" + 0.002*"think" + 0.002*"people" + 0.002*"metaphysical"'),
 (1,
  '0.007*"wikipedia" + 0.007*"male" + 0.006*"article" + 0.005*"female" + 0.005*"salute" + 0.003*"te" + 0.003*"male female" + 0.003*"subject" + 0.003*"like" + 0.003*"used"'),
 (2,
  '0.004*"article" + 0.003*"copyright" + 0.003*"wikipedia" + 0.003*"image" + 0.003*"mexican" + 0.002*"page" + 0.002*"source" + 0.002*"articles" + 0.002*"thanks" + 0.002*"use"')]

In [14]:
# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus = lda[corpus]
lda_corpus

<gensim.interfaces.TransformedCorpus at 0x1a479f6128>

In [15]:
# Store the documents' topic vectors in a list so we can take a peak
lda_docs = [doc for doc in lda_corpus]

In [16]:
# Check out the document vectors in the topic space for the first 5 documents
lda_docs[0:15]

[[(0, 0.99753726)],
 [(0, 0.042118639), (1, 0.042645715), (2, 0.91523564)],
 [(1, 0.98526078)],
 [(0, 0.056080092), (1, 0.88772446), (2, 0.056195457)],
 [(2, 0.98377347)],
 [(0, 0.06069915), (1, 0.057171375), (2, 0.88212943)],
 [(0, 0.94292706), (1, 0.028429797), (2, 0.028643137)],
 [(0, 0.021256423), (1, 0.02168618), (2, 0.95705742)],
 [(2, 0.98217463)],
 [(2, 0.99028659)],
 [(0, 0.028442981), (1, 0.028236082), (2, 0.94332093)],
 [(0, 0.011543071), (1, 0.011115024), (2, 0.97734195)],
 [(1, 0.99157745)],
 [(1, 0.98850095)],
 [(0, 0.9308973), (1, 0.035490468), (2, 0.033612248)]]

In [17]:
df.comment_text[5]

'"== August 2009 == \nSurv1v4l1st (Talk|Contribs) "'

In [18]:
lda.log_perplexity

<bound method LdaModel.log_perplexity of <gensim.models.ldamodel.LdaModel object at 0x1a4e291588>>