In [2]:
%matplotlib inline

In [3]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import cPickle as pkl

import numpy as np
import pylab as plt
import matplotlib.image as mpimg


## load captions

In [8]:
tokenizer = RegexpTokenizer(r'\w+')
p_stemmer = PorterStemmer()

f = open('ImageCaption/data/captions/kar_train_captions.txt')
c = 0
id_c = 0
cap_graph_list = []
for line in f:
    if c == 0 :
        cap_graph = line
        c+=1
    else :
        cap_graph += line
        c += 1
        if c%5 == 0 :
            c = 0
            id_c += 1
            tokens = tokenizer.tokenize(cap_graph)
            stemmed_tokens = [p_stemmer.stem(i) for i in tokens]
            cap_graph_list.append(' '.join(stemmed_tokens))
            #if id_c > 2 :
            #    break
            
print(id_c, len(cap_graph_list))
f.close()

113287 113287


In [9]:
tokenizer = RegexpTokenizer(r'\w+')
p_stemmer = PorterStemmer()

f = open('ImageCaption/captions/kar_val_captions.txt')
c = 0
id_c = 0
cap_graph_list2 = []
for line in f:
    if c == 0 :
        cap_graph = line
        c+=1
    else :
        cap_graph += line
        c += 1
        if c%5 == 0 :
            c = 0
            id_c += 1
            tokens = tokenizer.tokenize(cap_graph)
            stemmed_tokens = [p_stemmer.stem(i) for i in tokens]
            cap_graph_list2.append(' '.join(stemmed_tokens))
            #if id_c > 2 :
            #    break
            
print(id_c, len(cap_graph_list2))
f.close()

5000 5000


In [10]:
tokenizer = RegexpTokenizer(r'\w+')
p_stemmer = PorterStemmer()

f = open('ImageCaption/captions/kar_test_captions.txt')
c = 0
id_c = 0
cap_graph_list3 = []
for line in f:
    if c == 0 :
        cap_graph = line
        c+=1
    else :
        cap_graph += line
        c += 1
        if c%5 == 0 :
            c = 0
            id_c += 1
            tokens = tokenizer.tokenize(cap_graph)
            stemmed_tokens = [p_stemmer.stem(i) for i in tokens]
            cap_graph_list3.append(' '.join(stemmed_tokens))
            #if id_c > 2 :
            #    break
            
print(id_c, len(cap_graph_list3))
f.close()

5000 5000


## load and process captions

In [16]:
print('tokenizing captions line by line...')
tokenizer = RegexpTokenizer(r'\w+')
p_stemmer = PorterStemmer()

tokenized_captions = {}
for split in ['test', 'val', 'train']:
    print(split)
    f = open('ImageCaption/data/captions/kar_%s_captions.txt'%(split))
    id_c = 0
    cap_line_list = []
    for line in f:
        tokens = tokenizer.tokenize(line)
        stemmed_tokens = [p_stemmer.stem(i) for i in tokens]
        cap_line_list.append(' '.join(stemmed_tokens))
        id_c += 1
    tokenized_captions[split] = cap_line_list
    print(id_c, len(cap_line_list))
    f.close()

print('concatenate tokenized captions into paragraph...')
caption_per_image = 5
tokenized_paragraphs = {}
for split, captions in tokenized_captions.items():
    print(split)
    paragraph_list = []
    cap_ind = 0
    image_captions = []
    for caption in captions:
        image_captions.append(caption)
        cap_ind += 1
        if cap_ind % 5 == 0:
            paragraph_list.append(' '.join(image_captions))
            image_captions = []
    print(cap_ind, '-->', len(paragraph_list))
    tokenized_paragraphs[split] = paragraph_list

concatenate tokenized captions into paragraph...
test
25000 5000
train
566435 113287
val
25000 5000


## train topic model

In [6]:
n_samples = 113287
n_features = 5000
n_topics = 100
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

cap_graph_list = tokend_paragraphs['train']
data_samples = cap_graph_list[:n_samples]


# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english',
                                   token_pattern=u'(?u)[A-z][A-z]+')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))


# Fit the NMF model
print("Fitting the NMF model with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_topics, random_state=100,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

'''

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english',
                                token_pattern=u'(?u)[A-z][A-z]+')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=100,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

'''

Extracting tf-idf features for NMF...
done in 2.792s.
Fitting the NMF model with tf-idf features, n_samples=113287 and n_features=5000...
done in 251.584s.

Topics in NMF model:
Topic #0:
zucchini flatscreen flap flash flashlight flask flat flatb flatbread flatten flank flavor flea fleet fli flickr flier flight flannel flamingo
Topic #1:
tenni court racket ball player racquet hit play swing serv match femal readi return male prepar shirt short net dure
Topic #2:
sit seat ground outsid coupl counter near besid middl microwav old floor step lap atop insid togeth site children teddi
Topic #3:
blue sky yellow white shirt color bright clear paint stripe jet jacket toothbrush purpl ha short blanket wear tile eye
Topic #4:
giraff zoo field grass enclosur tall grassi head feed neck area babi rock look dirt coupl group togeth leav wild
Topic #5:
train track station travel passeng platform railroad yellow pull engin bridg pass rail long come subway near steam commut locomot
Topic #6:
bat basebal

'\n\n# Use tf (raw term count) features for LDA.\nprint("Extracting tf features for LDA...")\ntf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,\n                                max_features=n_features,\n                                stop_words=\'english\',\n                                token_pattern=u\'(?u)[A-z][A-z]+\')\nt0 = time()\ntf = tf_vectorizer.fit_transform(data_samples)\nprint("done in %0.3fs." % (time() - t0))\n\nprint("Fitting LDA models with tf features, "\n      "n_samples=%d and n_features=%d..."\n      % (n_samples, n_features))\nlda = LatentDirichletAllocation(n_topics=n_topics, max_iter=100,\n                                learning_method=\'online\',\n                                learning_offset=50.,\n                                random_state=0)\nt0 = time()\nlda.fit(tf)\nprint("done in %0.3fs." % (time() - t0))\n\nprint("\nTopics in LDA model:")\ntf_feature_names = tf_vectorizer.get_feature_names()\nprint_top_words(lda, tf_feature_names, n_top_words

## save topic model

In [7]:
with open('topic_model/tfidf-nmf_t100_.pkl', 'w') as f :
    pkl.dump([tfidf_vectorizer, nmf], f)

print(nmf.reconstruction_err_)

232.54410058870891


In [14]:
caption_splits = tokenized_captions # tokenized_captions or tokenized_paragraphs
for topic_num in [100, 200, 300, 400]:
    print('topic_num:', topic_num)
    with open('topic_model/tfidf-nmf_t%d.pkl'%(topic_num)) as f :
        tfidf_vectorizer, nmf = pkl.load(f)
    
    for split, cap_graph_x in caption_splits.iteritems():
        print(len(cap_graph_x))
        tfidf = tfidf_vectorizer.transform(cap_graph_x)
        print(tfidf.shape)
        nmf_result = nmf.transform(tfidf)
        np.save('topic_model/coco/doc-topic_%s_line_t%d.npy'%(split, topic_num), nmf_result.astype('float32'))
        

topic_num: 100
25000
(25000, 5000)
566435
(566435, 5000)
25000
(25000, 5000)
topic_num: 200
25000
(25000, 5000)
566435
(566435, 5000)
25000
(25000, 5000)
topic_num: 300
25000
(25000, 5000)
566435
(566435, 5000)
25000
(25000, 5000)
topic_num: 400
25000
(25000, 5000)
566435
(566435, 5000)
25000
(25000, 5000)


## print topic representations

In [13]:
###print pretrained topic models

#with open('nmf/tfidf-fname_model-components.pkl') as f:
#    a, b = pkl.load(f)
#feature_names = a
#components_x = b

with open('topic_model/tfidf-nmf_t200.pkl') as f:
    tfidf_vectorizer, nmf = pkl.load(f)

with open('topic_model/coco/doc-topic_val_t200.npy') as f :
    nmf_result_x = np.load(f)
    
feature_names = tfidf_vectorizer.get_feature_names()
components_x = nmf.components_

def print_top_words(components_, feature_names, n_top_words):
    for topic_idx, topic in enumerate(components_):
        #print("Topic #%d:" % topic_idx)
        print("Topic #%d: "%(topic_idx) + " ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()
    
print_top_words(components_x, feature_names, 5)
#x2 = a.transform(cap_graph_list2)
#y2 = b.transform(x2)
#print(b.reconstruction_err_)

Topic #0: zucchini flatscreen flap flash flashlight
Topic #1: tenni court racket player racquet
Topic #2: cat gray grey kitten paw
Topic #3: train track travel railroad engin
Topic #4: giraff zoo enclosur tall feed
Topic #5: train station platform passeng subway
Topic #6: basebal player pitch game throw
Topic #7: sink bathroom cabinet faucet soap
Topic #8: ski slope snow skier countri
Topic #9: bathroom tile towel clean vaniti
Topic #10: kite fli flown string day
Topic #11: skateboard trick skate perform rail
Topic #12: dog brown leash mouth puppi
Topic #13: sleep blanket asleep curl nap
Topic #14: eleph trunk herd tusk enclosur
Topic #15: clock mount time face ornat
Topic #16: hors brown graze jockey rider
Topic #17: bu drive passeng transit travel
Topic #18: woman ladi dress femal young
Topic #19: laptop comput use lap work
Topic #20: room hotel dine corner dark
Topic #21: man shirt young guy anoth
Topic #22: zebra graze herd zoo togeth
Topic #23: teddi bear stuf brown dress
Topic #2