In [46]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
from stop_words import get_stop_words
#from scipy import sparse
from scipy import spatial
import gensim
import utils
import sys
import string
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import scipy.io as scio
from mpl_toolkits import mplot3d
from mpl_toolkits.mplot3d import Axes3D
from utils import random_idx
from utils import utils
from utils import lang_vectors_utils as lvu
from sklearn import manifold, datasets
#import lang_vectors_utils


%matplotlib inline
tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

num_topics = 50
passes = 20
#topn = 10
k = 5000
N = 10000
# cluster_sizes is mapping to n-gram size
# cluster_sz in random_idx referring to specific element (int) in cluster_sizes, array
cluster_sizes = [1, 2, 3, 4, 5, 6, 7, 8]
ordered = 1
#assuming this is the alphabet bc of precedent in generate_text.py
#alph = 'abc' 
alphabet = string.lowercase + ' '
RI_letters = random_idx.generate_letter_id_vectors(N, k, alphabet)
# number of stacked syntax vectors per meaning matrix
meaning_granularities = [5,10,100,200]


In [6]:
def create_doc_set(path, files):
    doc_set = []
    for filename in files:
        f = open(path + filename, "r")
        doc_set.append(f.read())
        f.close()
    return doc_set


def tokenize(doc_set):
    # list for tokenized documents in loop
    texts = []
    # loop through document list
    for i in doc_set:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        # add tokens to list
        texts.append(stemmed_tokens)
    return texts


In [7]:
def meaning_matrix(ldamodel, topicid, topn, dictionary):
    " NO "
    # token 2 id dictionary
    # print dictionary.token2id
    matrix = np.zeros((N,N))
    id2token = dictionary.id2token
    topic_terms = []

    for tup in ldamodel.get_topic_terms(topicid, topn):
        topic_terms.append(str(id2token[tup[0]]))

    for i in range(0,topn):
        term_vector = random_idx.id_vector(N, topic_terms[i], alphabet, RI_letters, ordered)
        matrix[i] = term_vector
    return matrix


def meaning_matrices(ldamodel, num_topics, topn, dictionary):
    " NO "
    matrices = np.zeros((num_topics,N,N))
    for topicid in range(0,num_topics):
        matrices[topicid] = create_meaning_matrix(ldamodel, topicid, topn, dictionary)
    return matrices




In [12]:
def vectorize_dictionary(dictionary):
    vectors = {}
    for token_id, token in dictionary.items():
            vectors[token_id] = random_idx.id_vector(N, token, alphabet, RI_letters)
    return vectors

def meaning_space(ldamodel, num_topics, dictionary, vectorized_dictionary):
    """
    number of granularities meaning space x number of topics x granularity x N
    index in numpy array on axis "number of tokens" equivalent to token id
    """
    # for token ids
    num_tokens = len(dictionary.keys())
    matrices = []
    for gran_i in range(len(meaning_granularities)):
        matrix = np.zeros((num_topics, meaning_granularities[gran_i], N))
        for topicid in range(num_topics):
            terms = ldamodel.get_topic_terms(topicid, meaning_granularities[gran_i])
            for i in range(len(terms)):
                token_id = terms[i][0]
                matrix[topicid][i] = vectorized_dictionary[token_id]
        matrices.append(matrix)
    
    return matrices

In [33]:
def validate_meaning(test_files, tokenized_test_documents, dictionary, meaninged_space, meaning_granularities):
    # you can't test a meaning matrix...try graphing first
    """
    meaninged_space dimensions: number of granularities meaning space x number of topics x granularity x N
    prediction_vector = argmax(np.dot(test_token_vector, meaning matrix i))
    graph prediction vector in matplotlib
    N dimensions lmao. how to reduce dimensionality
    """
    # each vector maps to the test_token. dots graphs but mappings can printed
    token_to_id = dictionary.token2id
    # num test_files x number of granularities x number of topics
    # so each topic gets a list of test_token_vectors
    topic_sets = []
    for i in range(len(test_files)):
        topic_sets.append([])
        for j in range(len(meaning_granularities)):
            topic_sets[i].append([])
            for k in range(num_topics):
                topic_sets[i][j].append([])

    print len(topic_sets)
    print len(topic_sets[0])
    print len(topic_sets[0][0])
    
    # dictionary of lists of tokens specific to that topic
    topic_to_tokens = {}
    for topicid in range(num_topics):
        topic_to_tokens[topicid] = []
        
    for i in range(len(test_files)):
        test_tokens = tokenized_test_documents[i]
        for gran_i in range(len(meaning_granularities)):
            for test_token in test_tokens:
                vectorized_test_token = random_idx.id_vector(N, test_token, alphabet, RI_letters)
                # test_token vector will dot product with every matrix 
                # and will match up to a vector in that matrix that would make a high dot product
                # shapes (50,5,10000) and (1,10000) not aligned: 10000 (dim 2) != 1 (dim 0)
                prediction_matrix = np.dot(meaninged_space[gran_i], vectorized_test_token[0])
                topic_similarity = np.zeros(prediction_matrix.shape[0])
                for topicid in range(prediction_matrix.shape[0]):
                    topic_similarity[topicid] = sum(prediction_matrix[topicid])
                matching_topicid = np.argmax(topic_similarity)
                topic_sets[i][gran_i][matching_topicid].append(vectorized_test_token)
                topic_to_tokens[matching_topicid].append(test_token)
    
    return topic_sets, topic_to_tokens

In [None]:
def graph_3d():
    # each vector maps to the test_token. dots graphs but mappings can printed
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    xs =[1,2,3,4,5,6,7,8,9,10]
    ys =[5,6,2,3,13,4,1,2,4,8]
    zs =[2,3,3,3,5,7,9,11,9,10]

    xt =[-1,-2,-3,-4,-5,-6,-7,8,-9,-10]
    yt =[-5,-6,-2,-3,-13,-4,-1,2,-4,-8]
    zt =[-2,-3,-3,-3,-5,-7,9,-11,-9,-10]

    ax.scatter(xs, ys, zs, c='r', marker='o')
    ax.scatter(xt, yt, zt, c='b', marker='^')

    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    ax.set_zlabel('Z Label')

    plt.show()

In [55]:
"""
topic sets is: len(test_files) x len(granularities) x num_topics where each element is a list of vectors belonging to that topic
these are the points. 
graph these points along with the training set. 
meaninged_space = number of granularities x number of topics x granularity x N
where the granularity dimension contains the rows of points.
and if you wanna see some of the words from the training set in each topic:
terms = ldamodel.get_topic_terms(topicid, meaning_granularities[gran_i])
"""
def graph(test_files, meaning_granularities, num_topics, topic_sets, topic_to_tokens, meaninged_space):
    n_components = 2 
    n_neighbors = 10
    method = 'hessian'
    # actual meaninged_space dimensions: (2, 50, 5, 10000)
    # dimensions wrong. need to fix. other than that, graphing is good.
    #a = '../preprocessed_texts/english/with_spaces/alice_in_wonderland.txt'

    #one_hot_encoding = random_idx.generate_letter_id_vectors(N, k)
    for i in range(len(test_files)):
        for j in range(len(meaning_granularities)):
            for k in range(num_topics):
                print meaninged_space[i][j][k].shape
                lst = meaninged_space[i][j][k].tolist()
                print len(lst)
                tup = tuple(lst)

                big_matrix = np.vstack(lst)
                big_matrix = big_matrix[0:1000]
                print big_matrix.shape

                print "compressing data"

                fig = plt.figure(figsize=(15, 8))

                tsne = manifold.TSNE(n_components=n_components, init='pca', random_state=0)
                Y = tsne.fit_transform(big_matrix)

                plt.scatter(Y[:, 0], Y[:, 1], cmap=plt.cm.Spectral)
                plt.show()
                se = manifold.SpectralEmbedding(n_components=n_components,n_neighbors=n_neighbors)
                Y = se.fit_transform(big_matrix)
                plt.scatter(Y[:, 0], Y[:, 1], cmap=plt.cm.Spectral)
                plt.show()




In [9]:
# create sample documents
raw_path = "raw_texts/texts_english/"
preprocessed_path = "preprocessed_texts/english/"
training_preprocessed_path = "preprocessed_texts/english/with_spaces/"

training_files = ["a_christmas_carol.txt", "alice_in_wonderland.txt"]
# this is for testing accuracy against the 
# actual stream that will be the test input
test_files = ["hamlet_english.txt", "percy_addleshaw.txt"]

In [10]:
training_doc_set = create_doc_set(training_preprocessed_path, training_files)
test_doc_set = create_doc_set(preprocessed_path, test_files)

tokenized_training_documents = tokenize(training_doc_set)
tokenized_test_documents = tokenize(test_doc_set)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(tokenized_training_documents)
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in tokenized_training_documents]
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes)

In [13]:
vectorized_dictionary = vectorize_dictionary(dictionary)
fwrite = open("meaning_vectorized_dictionary", "w")
pickle.dump(vectorized_dictionary, fwrite)
fwrite.close()

In [14]:
# meaning matrix
fread = open("meaning_vectorized_dictionary", "r")
vectorized_dictionary = pickle.load(fread)
fread.close()
meaninged_space = meaning_space(ldamodel, num_topics, dictionary, vectorized_dictionary)
#print meaninged_space
fwrite = open("meaninged_space", "w")
pickle.dump(meaninged_space, fwrite)
fwrite.close()

In [41]:
# test results
fread = open("meaning_vectorized_dictionary", "r")
fread1 = open("meaninged_space", "r")
vectorized_dictionary = pickle.load(fread)
meaninged_space = pickle.load(fread1)
fread.close()
fread1.close()

topic_sets, topic_to_tokens = validate_meaning(test_files, tokenized_test_documents, dictionary, meaninged_space, meaning_granularities)
fwrite = open("topic_sets", "w")
fwrite1 = open("topic_to_tokens", "w")
pickle.dump(topic_sets, fwrite)
pickle.dump(topic_to_tokens, fwrite1)
fwrite.close()
fwrite1.close()

2
4
50


In [56]:
fread = open("meaning_vectorized_dictionary", "r")
fread1 = open("meaninged_space", "r")
fread2 = open("topic_sets", "r")
fread3 = open("topic_to_tokens", "r")
vectorized_dictionary = pickle.load(fread)
meaninged_space = pickle.load(fread1)
topic_sets = pickle.load(fread2)
topic_to_tokens = pickle.load(fread3)
fread.close()
fread1.close()
fread2.close()
fread3.close()
graph(test_files, meaning_granularities, num_topics, topic_sets, topic_to_tokens, meaninged_space)

(50, 5, 10000)
(10000,)
10000
(1000, 1)
compressing data


ValueError: total size of new array must be unchanged

<matplotlib.figure.Figure at 0x103f05f50>