In [2]:
#!/usr/bin/env python
# coding: utf-8
from __future__ import print_function

__source__ = "lda_transcripts.ipynb"
__author__ = "Frank J. Greco"
__copyright__ = "Copyright 2015-2018, Frank J. Greco"
__credits__ = []
__license__ = "Apache"
__version__ = "1.0.1"
__email__ = ""
__status__ = "Development"

# Create and manage document term matrix; generate lda model
# Marshall full set of transcript documents for LDA analysis

In [3]:
%load_ext autoreload
%autoreload 2
#%reload_ext autoreload
import sys
sys.path.append('/Users/fjgreco/Dev-Atlas')

In [4]:

import numpy as np
import textmining
import lda.datasets
import os,re,json

#from document_term_matrix import DocumentTermMatrix

class DocumentTermMatrix():
    
    # Initialize class to create term-document matrix

    def __init__(self):
        self.tdm = textmining.TermDocumentMatrix()
        self.docs = []
        self.titles = []

    def reset(self):
        self.docs = []
        self.titles = []

    def add_doc(self,title,doc):
        self.titles.append(title)
        self.docs.append(doc)

    # create a  variable with doc-term info
    def create(self):
        for doc in self.docs:
            self.tdm.add_doc(doc)

        self.dtm = list(self.tdm.rows(cutoff=1))
        self.vocab = tuple(self.dtm[0])
        self.X = np.array(self.dtm[1:])
        return self.dtm

    # get document-term matrix from remaining rows
    def get_dtm(self):
        return self.dtm

    def get_X(self):
        return self.X

    def get_vocabulary(self):
        return self.vocab

    def print_input(self):
        print("\nDocuments:")
        for n, doc in enumerate(self.docs):
            print("document {}: {}".format(n + 1, doc))

        print("\nTitles:")
        for n, title in enumerate(self.titles):
            print("title {}: {}".format(n + 1, title))

        print("\nCombined Titles and Documents:")
        for n, item in enumerate(list(zip(self.titles, self.docs))):
            print("Item {}: {}".format(n + 1, item))

    def print_DTM(self):
        # Document-term matrix
        print("\nDocument-term Matrix:\n")
        print("\ntype(X): {}".format(type(self.X)))
        print("\nshape: {}".format(self.X.shape))
        print("X:", self.X, sep="\n" )
        print("\nNumber of rows = number of documents, number of columns = number of word in the vocabulary")

        # Vocabulary
        print("\nVocabulary:")
        print("\ntype(vocab): {}".format(type(self.vocab)))
        print("\nlen(vocab): {}".format(len(self.vocab)))
        print("\nWords:", self.vocab, sep="\n")


    def lda(self, n_topics=2, random_state=0, n_iter=100):

        self.model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=random_state)

        self.model.fit(self.X)  # model.fit_transform(X) is also available

        print('Model Fit: ', self.model.fit(self.X))
        print
        print('Model components: ', self.model.components_)
        print
        print('Log likelihood: ', self.model.loglikelihood())
        print


    def topic(self, mod='', v=''):
        topic_word = self.model.topic_word_  # model.components_ also works
        n_top_words = 8
        for i, topic_dist in enumerate(topic_word):
            topic_words = np.array(self.vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
            print("Topic {}: {}".format(i, ' '.join(topic_words)))



In [5]:
from preprocess import create_title_list

dtm=DocumentTermMatrix()

titles, docs = create_title_list('../CC_TRANSCRIPTS3/')
print (len(titles),len(docs))

#for title, doc in zip(titles, docs)[0:2]:
#    print(title, doc)

for title, doc in zip(titles, docs):
    dtm.add_doc(title,doc)

243 243


In [6]:
dtm.print_input()


Documents:
document 1: Spanish, and you would know if, I mean, you would know what they're saying because you learn English, and you talk, you learn English and you know how to talk Spanish.
 I agree with you, because like, because like if they when you said that they had to read, write, and speak English, like they put something that, they put something that's English on your paper, you'd probably understand it more, so that's why I agree with you.
 But when they fill out the application, what if the application didn't 
Well,
 The application is about like where you're from, why are you coming here for, like they want to know a lot of information about you.
 I think we should, I think we should keep it at one, because if you fill out the application, you'll know more about people, and their birthday and stuff, like and then they usually bring them a present, 
Okay, so it's important to know about somebody. Now, the citizenship board might not go to their birthday party, but it is imp

UnicodeEncodeError: 'ascii' codec can't encode character u'\u266a' in position 313: ordinal not in range(128)

In [8]:
dtm.create()

[[u'raining',
  u'panchito',
  u'yellow',
  u'four',
  u'sunburning',
  u'woods',
  u'travaux',
  u'spiders',
  u'railing',
  u'centimeter',
  u'gabrielle',
  u'otro',
  u'najeem',
  u'augustine',
  u'electricity',
  u'rupture',
  u'similarity',
  u'otra',
  u'fingernails',
  u'alphabetic',
  u'lord',
  u'aiden',
  u'shaving',
  u'sinking',
  u'digit',
  u'hormone',
  u'callie',
  u'pigment',
  u'sonja',
  u'hibernating',
  u'classifications',
  u'stabbed',
  u'screaming',
  u'identity',
  u'differentiated',
  u'basics',
  u'internally',
  u'scholar',
  u'wednesday',
  u'piling',
  u'oooo',
  u'persisted',
  u'kits',
  u'oooh',
  u'stereotypical',
  u'reliable',
  u'andra',
  u'fuera',
  u'expanded',
  u'esos',
  u'tired',
  u'miller',
  u'hanging',
  u'bacon',
  u'frederick',
  u'pulse',
  u'elections',
  u'tires',
  u'elegant',
  u'second',
  u'crisply',
  u'flask',
  u'clarinet',
  u'errors',
  u'mls',
  u'cooking',
  u'contributed',
  u'fingers',
  u'clorox',
  u'fossil',
  u'desig

In [19]:
dtm.print_DTM()


Document-term Matrix:


type(X): <type 'numpy.ndarray'>

shape: (226, 5)
X:
[[0 0 1 0 0]
 [4 0 0 0 0]
 [5 0 0 0 0]
 ...
 [7 1 0 0 0]
 [7 1 0 0 0]
 [4 1 0 0 0]]

Number of rows = number of documents, number of columns = number of word in the vocabulary

Vocabulary:

type(vocab): <type 'tuple'>

len(vocab): 5

Words:
('tw', 's', 'nan', 'c', 'm')


In [8]:
dtm.lda(n_topics=10)

INFO:lda:n_documents: 243
INFO:lda:vocab_size: 12319
INFO:lda:n_words: 476425
INFO:lda:n_topics: 10
INFO:lda:n_iter: 100
INFO:lda:<0> log likelihood: -4220118
INFO:lda:<10> log likelihood: -3726453
INFO:lda:<20> log likelihood: -3539531
INFO:lda:<30> log likelihood: -3465647
INFO:lda:<40> log likelihood: -3433384
INFO:lda:<50> log likelihood: -3413838
INFO:lda:<60> log likelihood: -3397554
INFO:lda:<70> log likelihood: -3385472
INFO:lda:<80> log likelihood: -3375506
INFO:lda:<90> log likelihood: -3370322
INFO:lda:<99> log likelihood: -3365231
INFO:lda:n_documents: 243
INFO:lda:vocab_size: 12319
INFO:lda:n_words: 476425
INFO:lda:n_topics: 10
INFO:lda:n_iter: 100
INFO:lda:<0> log likelihood: -4220118
INFO:lda:<10> log likelihood: -3726453
INFO:lda:<20> log likelihood: -3539531
INFO:lda:<30> log likelihood: -3465647
INFO:lda:<40> log likelihood: -3433384
INFO:lda:<50> log likelihood: -3413838
INFO:lda:<60> log likelihood: -3397554
INFO:lda:<70> log likelihood: -3385472
INFO:lda:<80> log l

Model Fit:  <lda.lda.LDA instance at 0x109950638>
Model components:  [[1.11780315e-07 1.11780315e-07 1.11780315e-07 ... 1.11780315e-07
  1.11780315e-07 1.11780315e-07]
 [1.81332902e-07 1.81332902e-07 1.81332902e-07 ... 1.81332902e-07
  1.81332902e-07 1.81332902e-07]
 [1.32224079e-07 1.32224079e-07 5.42250948e-04 ... 1.32224079e-07
  1.32224079e-07 1.32224079e-07]
 ...
 [2.45693062e-07 2.45693062e-07 2.21369449e-04 ... 2.45693062e-07
  2.45693062e-07 2.45693062e-07]
 [9.57756254e-04 1.91414624e-03 1.36627140e-06 ... 1.36627140e-06
  2.74620552e-04 1.36627140e-06]
 [1.27492169e-07 1.27492169e-07 2.93359481e-04 ... 1.27492169e-07
  1.27492169e-07 1.27492169e-07]]
Log likelihood:  -3365230.73171


In [9]:
dtm.topic()

Topic 0: you that to i it the and like
Topic 1: they the it that and like s what
Topic 2: you your to i do okay and a
Topic 3: the of and you a that it so
Topic 4: it the s to you okay and so
Topic 5: que est de a s c noel bien
Topic 6: the and to of we a on that
Topic 7: the he s and to i was that
Topic 8: i a no sun s note hot me
Topic 9: you the it s what one that so
