## Study lsa with simple example

In [1]:
%qtconsole

In [2]:
from gensim import corpora
from nltk.corpus import stopwords

lsi.show_topics()
Out[3]: 
[(0,
  u'
  0.703*"trees" 
  + 0.538*"graph" 
  + 0.402*"minors" 
  + 0.187*"survey" 
  + 0.061*"system" 
  + 0.060*"time" 
  + 0.060*"response" 
  + 0.058*"user" 
  + 0.049*"computer" 
  + 0.035*"interface"'),

In [12]:
documents = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",              
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]

Remove common words and tokenize and remove words that appear only once

In [13]:
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]

# 
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]


Print the text

In [None]:
from pprint import pprint  # pretty-printer
pprint(texts)

Save files

In [6]:
dictionary = corpora.Dictionary(texts)
dictionary.save('simple-deerwester.dict')  # store the dictionary, for future reference

corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('simple-deerwester.mm', corpus)  # store to disk, for later use

corpus = corpora.MmCorpus('simple-deerwester.mm')
dictionary = corpora.Dictionary.load('simple-deerwester.dict')


Make transforms

In [7]:
from gensim import corpora, models, similarities

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=5) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi


In [8]:
lsi.save('simple-model.lsi') # same for tfidf, lda, ...
lsi = models.LsiModel.load('simple-model.lsi')

Test 

In [9]:
doc = "Human computer interaction"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow] #

## LDA Simple

[Introduction to Latent Dirichlet Allocation](http://blog.echen.me/2011/08/22/introduction-to-latent-dirichlet-allocation/)

Paper [Latent Dirichlet Allocation](https://www.cs.princeton.edu/~blei/papers/BleiNgJordan2003.pdf)

In [8]:
documents_lda = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",              
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]

In [9]:
#stoplist = set('for a of the and to in'.split())
stoplist = stopwords.words('english')
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents_lda]

# 
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 0] for text in texts]


LookupError: 
**********************************************************************
  Resource u'corpora/stopwords' not found.  Please use the NLTK
  Downloader to obtain the resource:  >>> nltk.download()
  Searched in:
    - 'C:\\Users\\t148770/nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - 'd:\\Users\\t148770\\AppData\\Local\\Continuum\\Anaconda2\\nltk_data'
    - 'd:\\Users\\t148770\\AppData\\Local\\Continuum\\Anaconda2\\lib\\nltk_data'
    - 'D:\\t148770\\AppData\\Roaming\\nltk_data'
**********************************************************************

Check text

In [32]:
from pprint import pprint  # pretty-printer
pprint(texts)

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]


In [33]:
dictionary = corpora.Dictionary(texts)
dictionary.save('simple-deerwester-lda.dict')  # store the dictionary, for future reference

corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('simple-deerwester-lda.mm', corpus)  # store to disk, for later use

Load corpus and dictinary

In [34]:
corpus = corpora.MmCorpus('simple-deerwester-lda.mm')
dictionary = corpora.Dictionary.load('simple-deerwester-lda.dict')

Make transformations

In [35]:
from gensim import corpora, models, similarities

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=3, update_every=1, passes=10)

lda.print_topics()


[(0,
  u'0.073*survey + 0.053*management + 0.053*minors + 0.051*user + 0.051*system + 0.048*opinion + 0.044*graph + 0.043*eps + 0.043*interface + 0.039*time'),
 (1,
  u'0.053*paths + 0.053*intersection + 0.049*trees + 0.045*random + 0.045*generation + 0.045*binary + 0.045*unordered + 0.042*measurement + 0.042*relation + 0.042*perceived'),
 (2,
  u'0.051*human + 0.044*engineering + 0.044*testing + 0.043*system + 0.040*machine + 0.040*lab + 0.040*abc + 0.040*applications + 0.039*iv + 0.039*widths')]

In [36]:
lda.save('simple-model.lda') # same for tfidf, lda, ...
lda = models.LdaModel.load('simple-model.lda')

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]

Match each doc to each topic 

In [38]:
for text in corpus:
    print(lda[text])

[(0, 0.045493802268803958), (1, 0.042539947638558159), (2, 0.91196625009263788)]
[(0, 0.91266682819470168), (1, 0.043818493568679735), (2, 0.043514678236618573)]
[(0, 0.88325496654911717), (1, 0.056774695875308152), (2, 0.059970337575574545)]
[(0, 0.054881895706703709), (1, 0.048429797071328759), (2, 0.89668830722196746)]
[(0, 0.050343689198366237), (1, 0.90705857075137308), (2, 0.042597740050260693)]
[(0, 0.056424991504380705), (1, 0.88679209718662177), (2, 0.056782911308997507)]
[(0, 0.071849158227446044), (1, 0.85861628292037684), (2, 0.069534558852177186)]
[(0, 0.043010229431555413), (1, 0.042232978385991861), (2, 0.91475679218245276)]
[(0, 0.82615193272210363), (1, 0.086774100887493599), (2, 0.087073966390402838)]


## LDA using Sci-kit Learn

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [4]:
documents = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",              
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]

In [5]:
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]

# 
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]

In [6]:
print("Loading dataset...")
# scode: data_samples = <FILL IN>   # Usar join sobre corpus_clean.
data_samples = [" ".join(doc) for doc in texts]
data_samples = map(lambda x: " ".join(x), texts)

Loading dataset...


In [21]:
print(data_samples)

['human interface computer', 'survey user computer system response time', 'eps user interface system', 'system human system eps', 'user response time', 'trees', 'graph trees', 'graph minors trees', 'graph minors survey']


In [7]:
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
n_features = 10
#max_features only consider the top max_features ordered by term frequency.
#min_df ignore terms that have a document frequency strictly lower than the given threshold.
#max_df ignore terms that have a document frequency strictly higher than the given threshold 
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=1,
                                max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(data_samples)
print tf[0][0][0]

Extracting tf features for LDA...
  (0, 3)	1
  (0, 4)	1
  (0, 0)	1
