In [233]:
import numpy as np
import nltk
import lda
import sklearn
import string
from nltk.corpus import stopwords
from nltk.stem.porter import *
from collections import Counter, defaultdict
from sklearn.preprocessing import normalize

## Preprocessing

In [207]:
EMOTION_MAP = {0:'joy', 1:'anger', 2:'disgust', 3:'fear', 4:'guilt', 5:'sadness', 6:'shame'}
docs = open('corpus.txt').readlines()
labels = np.array(np.loadtxt('classes.txt'), dtype=np.int32)
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def preprocess(doc):
    doc = doc.lower()
    no_punctuation = doc.translate(None, string.punctuation)
    tokens = nltk.word_tokenize(no_punctuation)
    filtered = [w for w in tokens if not w in stopwords.words('english')]
    stemmed = stem_tokens(filtered, stemmer)
    return stemmed

processed = np.array(map(preprocess, docs))

## Build vocabulary dictionary

In [208]:
vocab = np.unique(np.hstack(processed.flat))
vocab_dict = {}
for idx, w in enumerate(vocab):
    vocab_dict[w] = idx

##Map Docs to Vocab

In [209]:
docs_as_nums = map(lambda doc: [vocab_dict[w] for w in doc], processed)

In [260]:
idf = {}
tfidf = {}

def freq_map(doc):
    out = np.zeros(vocab.size, dtype=np.int32)
    for w in doc:
        out[w] += 1
    return out

#def tf(word, doc):
#    return 
    # return 0.5 + 0.5*f[t,d] / max(

def n_containing(word, docs):
    return sum([1 for doc in docs for w in doc if w == word])

def tf(freq, max_freq):
    return (0.5 + (0.5*freq/max_freq))

def inv_df(word, docs):
    if word not in idf:
        idf[word]=np.log(len(docs) / (1. + n_containing(word, docs)))
    return idf[word]

def tfidf(doc):
    freq = Counter(doc)
    max_freq = np.max(freq.values())
    out = np.zeros(vocab.size, dtype=np.float32)
    for w in freq.keys():
        out[w] = tf(freq[w], max_freq) * inv_df(w, docs_as_nums) 
    return out
            
X1 = np.array(map(freq_map, docs_as_nums), dtype=np.int32)
X2 = np.array(map(tfidf, docs_as_nums), dtype=np.float32)



In [261]:
model = lda.LDA(n_topics=7, n_iter=500, random_state=1)
model.fit(X2)

TypeError: Cannot cast array data from dtype('float32') to dtype('int64') according to the rule 'safe'

In [212]:
topic_word = model.topic_word_
n_top_words = 100
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: friend felt girl time love boyfriend told would girlfriend like didnt one feel want anoth go relationship good person someon made sad mine boy much first realli get got know talk thought day someth long met never thing life start letter said guilti disgust call left found realiz joy best end see month happen make ago knew decid ask asham man think close certain guy also happi discov separ turn around write meet year fact live give could date angri well tri reason went alway quit marri although use shame break refus way help later heard relat togeth care
Topic 1: car saw walk man bu night peopl watch alon one way street go film road stop home came drive dark back sit start got tv hous went suddenli accid afraid drunk anoth driver ran see disgust front woman someon tri come dog run could near fear us water hit look fell door room nearli lost outsid two almost young seat light cross get along cut head stand scene travel away behind left began movi follow open turn side full ticke

In [219]:
topic_map = {0: 'shame', 1: 'anger', 2: 'fear', 3: 'joy', 4: 'sadness', 5: 'disgust', 6: 'guilt'}

topic_dict = np.zeros(7)
emotion_dict = np.zeros(7)
p_map = np.zeros((7,7))

doc_topic = model.doc_topic_

for i in range(X.shape[0]):
    predicted = doc_topic[i].argmax()
    actual = labels[i]
    topic_dict[predicted] += 1; emotion_dict[actual] += 1
    p_map[predicted, actual] += 1

print np.argmax(p_map, axis=1)

sorted(topic_dict / np.sum(topic_dict)), sorted(emotion_dict / np.sum(emotion_dict))

[5 3 0 0 5 1 4]


([0.11322499669705377,
  0.12419077817413132,
  0.14083762716342979,
  0.14532963403355795,
  0.1499537587528075,
  0.16025895098427798,
  0.16620425419474172],
 [0.14149821640903687,
  0.14189456995640112,
  0.1426872770511296,
  0.14295151274937243,
  0.14334786629673668,
  0.14374421984410093,
  0.14387633769322236])

##supervised lda

In [262]:
from optparse import OptionParser
import sys, re, numpy

def load_corpus(filename):
    corpus = []
    labels = []
    labelmap = dict()
    f = open(filename, 'r')
    for line in f:
        mt = re.match(r'\[(.+?)\](.+)', line)
        if mt:
            label = mt.group(1).split(',')
            for x in label: labelmap[x] = 1
            line = mt.group(2)
        else:
            label = None
        doc = re.findall(r'\w+(?:\'\w+)?',line.lower())
        if len(doc)>0:
            corpus.append(doc)
            labels.append(label)
    f.close()
    return labelmap.keys(), corpus, labels

class LLDA:
    def __init__(self, K, alpha, beta):
        #self.K = K
        self.alpha = alpha
        self.beta = beta

    def term_to_id(self, term):
        if term not in self.vocas_id:
            voca_id = len(self.vocas)
            self.vocas_id[term] = voca_id
            self.vocas.append(term)
        else:
            voca_id = self.vocas_id[term]
        return voca_id

    def complement_label(self, label):
        if not label: return numpy.ones(len(self.labelmap))
        vec = numpy.zeros(len(self.labelmap))
        vec[0] = 1.0
        for x in label: vec[self.labelmap[x]] = 1.0
        return vec

    def set_corpus(self, labelset, corpus, labels):
        labelset.insert(0, "common")
        self.labelmap = dict(zip(labelset, range(len(labelset))))
        self.K = len(self.labelmap)

        self.vocas = []
        self.vocas_id = dict()
        self.labels = numpy.array([self.complement_label(label) for label in labels])
        self.docs = [[self.term_to_id(term) for term in doc] for doc in corpus]

        M = len(corpus)
        V = len(self.vocas)

        self.z_m_n = []
        self.n_m_z = numpy.zeros((M, self.K), dtype=int)
        self.n_z_t = numpy.zeros((self.K, V), dtype=int)
        self.n_z = numpy.zeros(self.K, dtype=int)

        for m, doc, label in zip(range(M), self.docs, self.labels):
            N_m = len(doc)
            #z_n = [label[x] for x in numpy.random.randint(len(label), size=N_m)]
            z_n = [numpy.random.multinomial(1, label / label.sum()).argmax() for x in range(N_m)]
            self.z_m_n.append(z_n)
            for t, z in zip(doc, z_n):
                self.n_m_z[m, z] += 1
                self.n_z_t[z, t] += 1
                self.n_z[z] += 1

    def inference(self):
        V = len(self.vocas)
        for m, doc, label in zip(range(len(self.docs)), self.docs, self.labels):
            for n in range(len(doc)):
                t = doc[n]
                z = self.z_m_n[m][n]
                self.n_m_z[m, z] -= 1
                self.n_z_t[z, t] -= 1
                self.n_z[z] -= 1

                denom_a = self.n_m_z[m].sum() + self.K * self.alpha
                denom_b = self.n_z_t.sum(axis=1) + V * self.beta
                p_z = label * (self.n_z_t[:, t] + self.beta) / denom_b * (self.n_m_z[m] + self.alpha) / denom_a
                new_z = numpy.random.multinomial(1, p_z / p_z.sum()).argmax()

                self.z_m_n[m][n] = new_z
                self.n_m_z[m, new_z] += 1
                self.n_z_t[new_z, t] += 1
                self.n_z[new_z] += 1

    def phi(self):
        V = len(self.vocas)
        return (self.n_z_t + self.beta) / (self.n_z[:, numpy.newaxis] + V * self.beta)

    def theta(self):
        """document-topic distribution"""
        n_alpha = self.n_m_z + self.labels * self.alpha
        return n_alpha / n_alpha.sum(axis=1)[:, numpy.newaxis]

    def perplexity(self, docs=None):
        if docs == None: docs = self.docs
        phi = self.phi()
        thetas = self.theta()

        log_per = N = 0
        for doc, theta in zip(docs, thetas):
            for w in doc:
                log_per -= numpy.log(numpy.inner(phi[:,w], theta))
            N += len(doc)
        return numpy.exp(log_per / N)

def main():
    parser = OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.001)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.001)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    (options, args) = parser.parse_args()
    if not options.filename: parser.error("need corpus filename(-f)")

    labelset, corpus, labels = load_corpus(options.filename)

    llda = LLDA(options.K, options.alpha, options.beta)
    llda.set_corpus(labelset, corpus, labels)

    for i in range(options.iteration):
        sys.stderr.write("-- %d " % (i + 1))
        llda.inference()
    #print llda.z_m_n

    phi = llda.phi()
    for v, voca in enumerate(llda.vocas):
        #print ','.join([voca]+[str(x) for x in llda.n_z_t[:,v]])
        print ','.join([voca]+[str(x) for x in phi[:,v]])