# NLP Practice 1: computing text similarities using gensim and nltk

In [74]:
import nltk
import gensim
import sys, logging
%matplotlib inline
import matplotlib.pyplot as plt
# all output goes to logs 
sys.path.append('~/logs')
import cjzpy_load_logging
#cjzpy_load_logging.load_logging_json(default_level=logging.INFO)
cjzpy_load_logging.load_logging_json(default_level=logging.DEBUG)
logger = logging.getLogger(sys._getframe().f_code.co_name)

25-03-2016 19:15:21 - cjzpy_load_logging - load_logging_json - INFO - Loading my universal log service: cjzpy_logging.json


Successfully imported ColorizingStreamHandler from logutils
Failed importing ColorizingStreamHandler from logutils, using logging.StreamHandler()


In [76]:
logger.debug('hello gensim')
logger.info('hello NLP practice 1')

25-03-2016 19:16:05 - <ipython-input-76-954f0d7d5eb2> - <module> - DEBUG - hello gensim
25-03-2016 19:16:05 - <ipython-input-76-954f0d7d5eb2> - <module> - INFO - hello NLP practice 1


In [77]:
def gensim4texts(logger, texts, query, nTopic):
    logger.info('This is %s', sys._getframe().f_code.co_name)
    logger.debug('texts: %s', texts)
    # In BoW representation, each document is represented by one vector where each vector element represents a question-answer pair, in the style of: How many times does the word system appear in the document? The mapping between the questions and ids is called a dictionary
    dictionary = gensim.corpora.Dictionary(texts)
    logger.debug('dictionary.token2id: %s', dictionary.token2id)
    # To actually convert tokenized documents to vectors
    corpus = [dictionary.doc2bow(text) for text in texts]
    # gensim.corpora.MmCorpus.serialize('corpus_tmp.mm', corpus) # store to disk, for later use
    logger.debug('corpus: %s', corpus)
    tfidf = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    for doc in corpus_tfidf:
        logger.debug('corpus_tfidf.doc: %s', doc)
    logger.debug('tfidf.dfs: %s', tfidf.dfs)
    logger.debug('tfidf.idfs: %s', tfidf.idfs)
    lsi = gensim.models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=nTopic)
    lsi.print_topics(2)
    corpus_lsi = lsi[corpus_tfidf]
    for doc in corpus_lsi:
        logger.debug('copus_lsi.doc: %s', doc)

    lda = gensim.models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=nTopic)
    lda.print_topics(2)
    index = gensim.similarities.MatrixSimilarity(lsi[corpus])
    #query_bow = dictionary.doc2bow(query.lower().split())
    query_bow = dictionary.doc2bow(query)
    logger.debug('query_bow: %s', query_bow)
    query_lsi = lsi[query_bow]
    logger.debug('query_lsi: %s', query_lsi)
    sims = index[query_lsi]
    logger.debug('sims: %s', list(enumerate(sims)))

    sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
    logger.info('sort_sims: %s', sort_sims)
    return sort_sims

In [78]:
documents = ["Shipment of gold damaged in a fire", "Delivery of silver arrived in a silver truck", 
 "Shipment of gold arrived in a truck"]
texts = [[word for word in document.lower().split()] for document in documents]
query = 'gold silver truck'
gensim4texts(logger, texts, query.lower().split(), 2)


25-03-2016 19:16:14 - <ipython-input-77-a79ba582be37> - <module> - INFO - This is gensim4texts
25-03-2016 19:16:14 - <ipython-input-77-a79ba582be37> - <module> - DEBUG - texts: [['shipment', 'of', 'gold', 'damaged', 'in', 'a', 'fire'], ['delivery', 'of', 'silver', 'arrived', 'in', 'a', 'silver', 'truck'], ['shipment', 'of', 'gold', 'arrived', 'in', 'a', 'truck']]
25-03-2016 19:16:14 - dictionary - gensim.corpora.dictionary - INFO - adding document #0 to Dictionary(0 unique tokens: [])
25-03-2016 19:16:14 - dictionary - gensim.corpora.dictionary - INFO - built Dictionary(11 unique tokens: ['delivery', 'a', 'silver', 'shipment', 'damaged']...) from 3 documents (total 22 corpus positions)
25-03-2016 19:16:14 - <ipython-input-77-a79ba582be37> - <module> - DEBUG - dictionary.token2id: {'delivery': 7, 'a': 3, 'silver': 8, 'shipment': 1, 'damaged': 2, 'in': 4, 'of': 0, 'fire': 6, 'gold': 5, 'arrived': 9, 'truck': 10}
25-03-2016 19:16:14 - <ipython-input-77-a79ba582be37> - <module> - DEBUG - c

[(1, 0.93163693), (2, 0.83416492), (0, 0.40757114)]

In [79]:
def nltkPreProcess4texts(logger, texts, stemmer=None):
# logger: logging object
# texts: list of texts for preprocessing with NLTK
# stemmer: default is LancasterStemmer; PorterStemmer
    logger.info('This is %s', sys._getframe().f_code.co_name)
    logger.info('texts=%s', texts[0])
# lowering case
    texts_lower = [[word for word in document.lower().split()] for document in texts]
    logger.debug('texts_lower[0]=%s', texts_lower[0])
# tockenizing
    texts_tokenized = [[word.lower() for word in nltk.word_tokenize(document)] for document in texts]
    logger.debug('texts_tokenized[0]=%s', texts_tokenized[0])
# filtering stopwords and punctuations
    english_stopwords = nltk.corpus.stopwords.words('english')
    english_stopwords.extend([',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%'])
    logger.debug('english_stopwords=%s, length=%d', english_stopwords,len(english_stopwords))
    texts_filtered_stopwords = [[word for word in document if not word in english_stopwords] for document in texts_tokenized]
    logger.debug('texts_filtered_stopwords[0]=%s', texts_filtered_stopwords[0])
# stemmering with LancasterStemmer
    if stemmer is 'porter':
        st = nltk.PorterStemmer()
#        st = nltk.stem.porter.PorterStemmer()
    else:
        st = nltk.LancasterStemmer()
#        st = nltk.stem.lancaster.LancasterStemmer()
    texts_stemmed = [[st.stem(word) for word in docment] for docment in texts_filtered_stopwords]
    logger.debug('texts_stemmed[0]=%s', texts_stemmed[0])
# eliminating words with only one occurence
    all_stems = sum(texts_stemmed, [])
    stems_once = set(stem for stem in set(all_stems) if all_stems.count(stem) == 1)
    texts = [[stem for stem in text if stem not in stems_once] for text in texts_stemmed]
    logger.debug('texts_WithMoreThanOneCounts[0]=%s', texts[0])
    return texts

In [80]:
documents = ["你 认识 那个 和 主席 握手 的 Python 的哥 吗", "他 开 一辆 黑色 C++", "黑色 的士", "我 爱 Python 和 C++"]
texts = [[word for word in document.split()] for document in documents]
print(texts)
query = u'红色 的 Python'
gensim4texts(logger, texts, query.lower().split(), 2)

25-03-2016 22:02:56 - <ipython-input-77-a79ba582be37> - <module> - INFO - This is gensim4texts
25-03-2016 22:02:56 - <ipython-input-77-a79ba582be37> - <module> - DEBUG - texts: [['你', '认识', '那个', '和', '主席', '握手', '的', 'Python', '的哥', '吗'], ['他', '开', '一辆', '黑色', 'C++'], ['黑色', '的士'], ['我', '爱', 'Python', '和', 'C++']]
25-03-2016 22:02:56 - dictionary - gensim.corpora.dictionary - INFO - adding document #0 to Dictionary(0 unique tokens: [])
25-03-2016 22:02:56 - dictionary - gensim.corpora.dictionary - INFO - built Dictionary(18 unique tokens: ['的', '的哥', '的士', 'C++', '和']...) from 4 documents (total 22 corpus positions)
25-03-2016 22:02:56 - <ipython-input-77-a79ba582be37> - <module> - DEBUG - dictionary.token2id: {'的': 0, '的哥': 3, '的士': 15, 'C++': 10, '和': 6, '一辆': 11, '他': 13, '黑色': 12, 'Python': 1, '你': 2, '开': 14, '我': 17, '那个': 4, '认识': 7, '吗': 9, '爱': 16, '主席': 5, '握手': 8}
25-03-2016 22:02:56 - <ipython-input-77-a79ba582be37> - <module> - DEBUG - corpus: [[(0, 1), (1, 1), (2, 1), 

[['你', '认识', '那个', '和', '主席', '握手', '的', 'Python', '的哥', '吗'], ['他', '开', '一辆', '黑色', 'C++'], ['黑色', '的士'], ['我', '爱', 'Python', '和', 'C++']]


[(0, 0.99848497), (3, 0.92187905), (1, 0.018948048), (2, -0.31816125)]

In [81]:
documents = ["运 送 黄金 的 车 在 火 中 受损", "一辆 银色 的 运 银 卡车 到达", "运送 黄金 的 卡车 抵达"]
texts = [[word for word in document.split()] for document in documents]
print(texts)
query = u'黄金 银 卡车'
gensim4texts(logger, texts, query.split(), 2)

25-03-2016 22:03:02 - <ipython-input-77-a79ba582be37> - <module> - INFO - This is gensim4texts
25-03-2016 22:03:02 - <ipython-input-77-a79ba582be37> - <module> - DEBUG - texts: [['运', '送', '黄金', '的', '车', '在', '火', '中', '受损'], ['一辆', '银色', '的', '运', '银', '卡车', '到达'], ['运送', '黄金', '的', '卡车', '抵达']]
25-03-2016 22:03:02 - dictionary - gensim.corpora.dictionary - INFO - adding document #0 to Dictionary(0 unique tokens: [])
25-03-2016 22:03:02 - dictionary - gensim.corpora.dictionary - INFO - built Dictionary(16 unique tokens: ['银', '的', '一辆', '火', '运送']...) from 3 documents (total 21 corpus positions)
25-03-2016 22:03:02 - <ipython-input-77-a79ba582be37> - <module> - DEBUG - dictionary.token2id: {'银': 9, '的': 1, '一辆': 10, '火': 6, '运送': 14, '车': 5, '受损': 8, '银色': 11, '卡车': 12, '抵达': 15, '运': 0, '在': 4, '黄金': 2, '送': 3, '到达': 13, '中': 7}
25-03-2016 22:03:02 - <ipython-input-77-a79ba582be37> - <module> - DEBUG - corpus: [[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]

[['运', '送', '黄金', '的', '车', '在', '火', '中', '受损'], ['一辆', '银色', '的', '运', '银', '卡车', '到达'], ['运送', '黄金', '的', '卡车', '抵达']]


[(2, 0.97292513), (1, 0.9648906), (0, 0.23582897)]

In [None]:
fn = '../cjzpyml/cjzpynlp/data/coursera_corpus'
courses = [line.strip() for line in open(fn)]
courses_name = [course.split('\t')[0] for course in courses]
logger.debug('courses_name[0:10]=%s', courses_name[0:10])

texts = nltkPreProcess4texts(logger, courses) #default stemmer is Lancaster
sims = gensim4texts(logger, texts, texts[210], 10)
for i, j in enumerate(sims[0:10]):
    logger.info('(#,similar_course_name,sims)=(%d,%s,%f)', i, courses_name[j[0]], j[1])

texts = nltkPreProcess4texts(logger, courses, stemmer='porter')
sims = gensim4texts(logger, texts, texts[210], 10)
for i, j in enumerate(sims[0:10]):
    logger.info('(#,similar_course_name,sims)=(%d,%s,%f)', i, courses_name[j[0]], j[1])

In [None]:
fn = '../cjzpyml/cjzpynlp/data/coursera_corpus_c'
courses = [line.strip() for line in open(fn)]
courses_name = [course.split('\t')[0] for course in courses]
logger.debug('courses_name[0:10]=%s', courses_name[0:10])

texts = nltkPreProcess4texts(logger, courses) #default stemmer is Lancaster
sims = gensim4texts(logger, texts, texts[210], 10)
for i, j in enumerate(sims[0:10]):
    logger.info('(#,similar_course_name,sims)=(%d,%s,%f)', i, courses_name[j[0]], j[1])

texts = nltkPreProcess4texts(logger, courses, stemmer='porter')
sims = gensim4texts(logger, texts, texts[210], 10)
for i, j in enumerate(sims[0:10]):
    logger.info('(#,similar_course_name,sims)=(%d,%s,%f)', i, courses_name[j[0]], j[1])

In [83]:
class MyCorpus(object):
    def __init__(self, logger, fn):
        self.fn = fn
        self.logger = logger
        self.stoplist = set('for a of the and to in'.split())
        self.logger.info('I am at class %s', self)
    
    def getDictionary1(self):
        documents = ["Human machine interface for lab abc computer applications",
                    "A survey of user opinion of computer system response time",
                    "The EPS user interface management system",
                    "System and human system engineering testing of EPS",
                    "Relation of user perceived response time to error measurement",
                    "The generation of random binary unordered trees",
                    "The intersection graph of paths in trees",
                    "Graph minors IV Widths of trees and well quasi ordering",
                    "Graph minors A survey"]
# remove common words and tokenize
        texts = [[word for word in document.lower().split() if word not in self.stoplist] for document in documents]
# remove words that appear only once
        all_tokens = sum(texts, [])
        tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
        texts = [[word for word in text if word not in tokens_once] for text in texts]
        self.dictionary1 = gensim.corpora.Dictionary(texts)
#        self.dictionary1.save('dict_tmp.dict') # store the dictionary, for future reference

    def getDictionary(self):
# collect statistics about all tokens
        self.dictionary = gensim.corpora.Dictionary(line.lower().strip().split() for line in open(self.fn))
# remove stop words and words that appear only once
        stop_ids = [self.dictionary.token2id[stopword] for stopword in self.stoplist if stopword in self.dictionary.token2id]
        once_ids = [tokenid for tokenid, docfreq in self.dictionary.dfs.items() if docfreq == 1]
        self.dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once
        self.dictionary.compactify() # remove gaps in id sequence after words that were removed

    def __iter__(self):
        self.getDictionary1()
        self.getDictionary()
        for line in open(self.fn):
# assume there's one document per line, tokens separated by whitespace
            yield self.dictionary.doc2bow(line.lower().strip().split())

In [84]:
fn = '../cjzpyml/cjzpynlp/data/mycorpus.txt'
corpus_memory_friendly = MyCorpus(logger, fn) # doesn't load the corpus into memory!
for vector in corpus_memory_friendly: # load one vector into memory at a time
    logger.info('vector=%s', vector)

25-03-2016 22:03:37 - <ipython-input-83-1f1d0e5492ae> - <module> - INFO - I am at class <__main__.MyCorpus object at 0x7f98f89d2048>
25-03-2016 22:03:37 - dictionary - gensim.corpora.dictionary - INFO - adding document #0 to Dictionary(0 unique tokens: [])
25-03-2016 22:03:37 - dictionary - gensim.corpora.dictionary - INFO - built Dictionary(12 unique tokens: ['computer', 'trees', 'eps', 'user', 'graph']...) from 9 documents (total 29 corpus positions)
25-03-2016 22:03:37 - dictionary - gensim.corpora.dictionary - INFO - adding document #0 to Dictionary(0 unique tokens: [])
25-03-2016 22:03:37 - dictionary - gensim.corpora.dictionary - INFO - built Dictionary(58 unique tokens: ['city', 'rock', 'a', 'gas', 'highway,']...) from 4 documents (total 74 corpus positions)
25-03-2016 22:03:37 - dictionary - gensim.corpora.dictionary - DEBUG - rebuilding dictionary, shrinking gaps
25-03-2016 22:03:37 - dictionary - gensim.corpora.dictionary - DEBUG - rebuilding dictionary, shrinking gaps
25-03-

In [85]:
fn = '../cjzpyml/cjzpynlp/data/mycorpus1.txt'
corpus_memory_friendly = MyCorpus(logger, fn) # doesn't load the corpus into memory!
for vector in corpus_memory_friendly: # load one vector into memory at a time
    logger.info('vector=%s', vector)

25-03-2016 22:03:38 - <ipython-input-83-1f1d0e5492ae> - <module> - INFO - I am at class <__main__.MyCorpus object at 0x7f98f92cee48>
25-03-2016 22:03:38 - dictionary - gensim.corpora.dictionary - INFO - adding document #0 to Dictionary(0 unique tokens: [])
25-03-2016 22:03:38 - dictionary - gensim.corpora.dictionary - INFO - built Dictionary(12 unique tokens: ['computer', 'trees', 'eps', 'user', 'graph']...) from 9 documents (total 29 corpus positions)
25-03-2016 22:03:38 - dictionary - gensim.corpora.dictionary - INFO - adding document #0 to Dictionary(0 unique tokens: [])
25-03-2016 22:03:38 - dictionary - gensim.corpora.dictionary - INFO - built Dictionary(50 unique tokens: ['a', 'time', 'trees', 'interface', 'widths']...) from 13 documents (total 94 corpus positions)
25-03-2016 22:03:38 - dictionary - gensim.corpora.dictionary - DEBUG - rebuilding dictionary, shrinking gaps
25-03-2016 22:03:38 - dictionary - gensim.corpora.dictionary - DEBUG - rebuilding dictionary, shrinking gaps


In [None]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
