In [None]:
import numpy as np
import math
from scipy.stats import gamma
from sklearn.decomposition import IncrementalPCA

import tensorly as tl
from tensorly.cp_tensor import cp_mode_dot
import tensorly.tenalg as tnl
from tensorly.tenalg.core_tenalg import tensor_dot, batched_tensor_dot, outer, inner

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

from pca import PCA

# Import TensorLy
import tensorly as tl
from tensorly.tenalg import kronecker
from tensorly import norm
from tensorly.decomposition import symmetric_parafac_power_iteration as sym_parafac
from tensorly.tenalg.core_tenalg.tensor_product import batched_tensor_dot
from tensorly.testing import assert_array_equal, assert_array_almost_equal

from tensorly.contrib.sparse.cp_tensor import cp_to_tensor

from tlda_final import TLDA
import cumulant_gradient
import tensor_lda_util as tl_util
## Break down into steps, then re-engineer.

import nltk
# Fetch every NLTK resource this notebook reads. nltk.download() reports and
# skips packages that are already installed and up to date.
nltk.download('punkt')                       # tokenizer models used by word_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger used by nltk.pos_tag
nltk.download('stopwords')                   # corpus read by stopwords.words('english')
nltk.download('wordnet')                     # corpus read by WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
# Module-level Porter stemmer instance, created once when this cell runs.
porter = PorterStemmer()

def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to lemmatize() for `word`.

    The word is tagged on its own with nltk.pos_tag; the upper-cased first
    letter of the tag selects adjective, noun, verb or adverb. Any other tag
    falls back to noun.
    """
    first_letter = nltk.pos_tag([word])[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag map to noun.
    return wordnet.NOUN

class LemmaTokenizer(object):
    """Callable tokenizer: tokenize, lemmatize with a POS hint, then Porter-stem.

    The instance owns both its lemmatizer and its stemmer (as StemTokenizer
    does), so it does not depend on any module-level state.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.porter = PorterStemmer()

    def __call__(self, articles):
        """Return the list of stemmed lemmas for the text `articles`."""
        return [self.porter.stem(self.wnl.lemmatize(t, get_wordnet_pos(t)))
                for t in word_tokenize(articles)]
    
class StemTokenizer(object):
    """Callable tokenizer: split text with word_tokenize and Porter-stem each token."""

    def __init__(self):
        self.porter = PorterStemmer()

    def __call__(self, articles):
        """Return the stemmed tokens of `articles`, in token order."""
        stemmed = []
        for token in word_tokenize(articles):
            stemmed.append(self.porter.stem(token))
        return stemmed
        
import gc
from datetime import datetime

import scipy

In [None]:
# Load the tweet corpus and the token-level sentiment table.
n_samples = 300000  # caps the tweets vectorised later and scales min_df
df, df_p = (
    pd.read_csv(csv_path)
    for csv_path in ("../Data/TwitterSpeech.csv", "../Data/paradigm.csv")
)
print(df.head())

# Start from NLTK's English stop-word list.
stop_words = (stopwords.words('english'))
# Extra corpus-specific stop words. Several entries are repeated (e.g. "work",
# "service", "amendment").
# NOTE(review): the vectorizer configured later lower-cases and stems tokens, so
# capitalised entries such as "Speaker" presumably never match — confirm.
added_words = ["amendment","family","get","adam","hear","feder","de","la","los","democrat","republican",
               'el', 'para', 'en', 'que',"lo",
               "amend","back","protect","commun","service","work","around","alway","november","august","january",
               "happen","ive","hall","nation","work","service","this","discuss","community","learn","congressional","amendment","speaker","say",
               "said","talk","congrats","pelosi","gop","congratulations","are","as","i", "me", "my", "myself", "we", "our", "ours", "ourselves", 
               "you", "your", "yours","he","her","him","she","hers","that","be","with","their","they're","is","was","been","not","they","it","have",
               "will","has","by","for","madam","Speaker","Mister","Gentleman","Gentlewoman","lady","voinovich","kayla","111th","115th","114th","rodgers",      
               "clerk" ,    "honor" ,   "address"   ,     
               "house" , "start"   ,"amend","bipartisan","bill",   "114th"    ,   "congress"  ,     
               "one",   "thing"    ,"bring","put", "north","give","keep","pa","even","texa","year","join","well",
               "call",  "learned"    ,   "legislator","things" ,"things","can't","can","cant","will","go","going","let",
               "lets","let's","say","says","know","talk","talked","talks","lady","honorable","dont","think","said","something",
               "something","wont","people","make","want","went","goes","congressmen","people","person","like","come","from",
               "need","us"]

# Merge both lists: np.append returns one flat array, list() turns it back into a list.
stop_words= list(np.append(stop_words,added_words))

In [None]:
## Pre-process Data
# min_df below is an absolute document count equal to 0.2% of n_samples.
print(int(0.002*n_samples))
countvec = CountVectorizer(tokenizer=StemTokenizer(),
                           strip_accents='unicode',
                           stop_words=stop_words,
                           lowercase=True,
                           ngram_range=(1, 2),            # unigrams and bigrams
                           max_df=0.4,
                           min_df=int(0.002*n_samples))

# Vectorise at most n_samples tweets from 2019 onwards.
dtm = countvec.fit_transform(df.tweet[df.year >= 2019][:n_samples])

# The sparse matrix exposes .shape directly; calling .toarray() only to read
# the shape would allocate the full dense matrix for nothing.
print(dtm.shape)

In [None]:
    # Find the top sentiment words: rank the vocabulary by total count, then
    # keep the frequent terms that also appear in df_p["Token"].
    sum_words = dtm.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in countvec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    print(words_freq[:100])

    # Build the token lookup once; the previous version recomputed
    # df_p["Token"].unique() for each of the 1000 candidate words.
    lexicon_tokens = set(df_p["Token"])
    top_sents = [word for word, _count in words_freq[:1000] if word in lexicon_tokens]
    print(top_sents)

In [None]:
# Three independent dense float64 copies of the document-term matrix, one per
# sentiment. np.float_ was removed in NumPy 2.0; np.float64 is the same type.
dtm_pos = dtm.astype(np.float64).toarray()
dtm_neu = dtm.astype(np.float64).toarray()
dtm_neg = dtm.astype(np.float64).toarray()

# Lookups hoisted out of the loop: the previous version rebuilt
# df_p["Token"].unique() and re-sliced top_sents for every vocabulary entry.
lexicon_tokens = set(df_p["Token"])
weighted_tokens = set(top_sents[:100])

for i, v in countvec.vocabulary_.items():
    if i in lexicon_tokens and i in weighted_tokens:
        # Scale column v of each copy by the token's sentiment weights.
        is_token = df_p.Token == i
        print(i)
        print(df_p.Positive[is_token])
        dtm_pos[:, v] *= df_p.Positive[is_token].unique()
        dtm_neg[:, v] *= df_p.Negative[is_token].unique()
        dtm_neu[:, v] *= df_p.Neutral[is_token].unique()
    else:
        # Every other term is split evenly across the three sentiments.
        dtm_pos[:, v] *= 1/3
        dtm_neg[:, v] *= 1/3
        dtm_neu[:, v] *= 1/3

In [None]:
# Place the positive, negative and neutral blocks side by side (column-wise)
# and store the result as a CSR matrix.
dtm_sent = scipy.sparse.csr_matrix(
    np.concatenate((dtm_pos, dtm_neg, dtm_neu), axis=1)
)

In [None]:
# Release the dense per-sentiment matrices and ask the collector to reclaim them.
del dtm_pos
del dtm_neg
del dtm_neu
gc.collect()

In [None]:
# Densify the sentiment-weighted matrix into a float16 tensor.
# NOTE(review): float16 is presumably chosen to save memory; confirm the reduced precision is acceptable.
a       = tl.tensor(dtm_sent.toarray(),dtype=np.float16)



In [None]:
# dtm_sent has been densified into `a`; drop it and run the garbage collector.
del dtm_sent
gc.collect()


In [None]:

# First moment: mean of `a` taken along axis 0.
M1      = tl.mean(a, axis=0)


In [None]:
# Center the data using the first moment M1.
# float16 is not among the dtypes scipy.sparse supports (recent SciPy releases
# reject it with a ValueError), so the centred matrix is stored as float32.
x_cent = scipy.sparse.csr_matrix(a - M1, dtype=np.float32)

In [None]:

# Force a garbage-collection pass before the whitening step.
gc.collect()


In [None]:

        # Time the whitening step.
        start = datetime.now()
        print("now =", start)

        # np.int was removed in NumPy 1.24; the builtin int() truncates the same way.
        batch_size    = int(n_samples/20)
        verbose       = True
        n_topic       = 20

        beta_0        = 0.003

        pca = PCA(n_topic, beta_0, 30000)
        pca.fit(x_cent) # fits PCA to data, gives W
        x_whit = pca.transform(x_cent) # produces whitened word counts <W,x> for centered data x
        now = datetime.now()
        print("now =", now)
        pca_time = now - start

In [None]:

# Collect garbage, then report how long the PCA fit and transform took.
gc.collect()
print(pca_time)

In [None]:
# Reload tlda_final and rebind TLDA so that edits to the module on disk are
# picked up without restarting the kernel.
from importlib import reload  
import tlda_final
reload(tlda_final)
from tlda_final import TLDA


# Build the TLDA model; a timestamp is printed before and after construction.
now = datetime.now()
print("now =", now)
learning_rate = 0.01
batch_size = 15000
t = TLDA(
    n_topic,
    n_senti=3,
    alpha_0=beta_0,
    n_iter_train=1000,
    n_iter_test=150,
    batch_size=batch_size,
    learning_rate=learning_rate,
)
now = datetime.now()
print("now =", now)

In [None]:
# Fit TLDA on the whitened data, printing a timestamp before and after.
now = datetime.now()
print("now =", now)

t.fit(x_whit,verbose=True) # fit whitened wordcounts to get decomposition of M3 through SGD

now = datetime.now()
print("now =", now)

In [None]:
# Un-whiten the fitted factors, derive alpha from them and store it on the model.
# NOTE(review): t.factors_ is overwritten in place below, so re-running this cell
# applies reverse_transform a second time.
now = datetime.now()
print("now =", now)


t.factors_ = pca.reverse_transform(t.factors_)  # unwhiten the eigenvectors to get unscaled word-level factors

''' 
Recover alpha_hat from the eigenvalues of M3
'''  

# One value per entry of t.factors_: np.linalg.norm with ord=3
# (assumes each entry is a 1-D vector — TODO confirm).
eig_vals = [np.linalg.norm(k,3) for k in t.factors_ ]
# alpha = eig_vals ** -2, rescaled below so that its entries sum to beta_0
alpha      = np.power(eig_vals, -2)
print(alpha.shape)
alpha_norm = (alpha / alpha.sum()) * beta_0
t.alpha_   = alpha_norm
        
print(alpha_norm)

# Return value is discarded; presumably called for its effect on t — verify in tlda_final.
t.predict(x_whit,w_mat=True,doc_predict=False)  # normalize the factors 


now = datetime.now()
print("now =", now)

In [None]:

# Cut the columns of t.factors_ into three consecutive slices and stack the
# slices on top of each other along axis 0.
factors = t.factors_
print((factors.shape))
factors_reshape = np.concatenate(
    [
        factors[:, (part * factors.shape[1]) // 3:((part + 1) * factors.shape[1]) // 3]
        for part in range(3)
    ],
    axis=0,
)
factors_reshape.shape

In [None]:
# Replace the model's factors with the row-stacked version built above.
t.factors_ = factors_reshape
# Alternative kept for reference: restore the un-stacked factors.
#t.factors_ = factors
now = datetime.now()
 
print("now =", now)

In [None]:

n_top_words=20

# One row of top words per row of t.factors_. Rows are gathered in a list and
# stacked once at the end instead of growing an array with np.vstack on every
# iteration, and every row (including row 0) is printed.
top_word_rows = []
for k in range(n_topic*3):
    # Indices of the n_top_words largest entries of row k, largest first.
    t_n_indices = t.factors_[k,:].argsort()[:-n_top_words - 1:-1]
    top_index_set = set(t_n_indices.tolist())  # O(1) membership per vocabulary entry
    # Words are emitted in vocabulary_ iteration order, not ranked by weight.
    row_words = [i for i,v in countvec.vocabulary_.items() if v in top_index_set]
    print(row_words)
    top_word_rows.append(row_words)

top_words_JST = np.vstack(top_word_rows)


In [None]:

# Run prediction on the raw (unweighted) document-term counts, with timestamps.
now = datetime.now()



print("now =", now)
print(t.factors_.shape)
# Dense float32 tensor of the original document-term matrix.
a_word       = tl.tensor(dtm.toarray(),dtype=tl.float32)

doc_topic_dist, topic_word_dist = t.predict(a_word,w_mat=False,doc_predict=True)
now = datetime.now()
 
print("now =", now)
# `end` is subtracted from `start` in a later cell to report the elapsed time.
end = datetime.now()
print(end)

In [None]:
 from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# CSC copy of the document-term matrix, converted sparse-to-sparse so the
# dense matrix is never materialised. float32 is used because float16 is not
# among the dtypes scipy.sparse supports.
dtm_sparse = scipy.sparse.csc_matrix(dtm, dtype=np.float32)
dtm_sparse.shape

In [None]:
# Baseline: fit scikit-learn's online LDA on the document-term matrix and
# record a timestamp after fitting and after transforming.
start_gensim = datetime.now()
lda = LatentDirichletAllocation(
    n_components=84,
    n_jobs=-1,
    learning_method="online",
    verbose=1,
    max_iter=1000,
    evaluate_every=10,
    batch_size=15000,
    max_doc_update_iter=150,
    perp_tol=1e-3,
)
lda.fit(dtm)
end_1 = datetime.now()

doc_topic_LDA = lda.transform(dtm)
end_2 = datetime.now()

In [None]:
# Elapsed time from the start of PCA whitening to the end of the TLDA prediction cell.
print(end-start)
# Elapsed time for the LDA baseline (fit plus transform).
print(end_2-start_gensim)

In [None]:
n_top_words=20

# One row of top words per LDA component. The previous version stacked each new
# row onto top_words_JST (copy-paste slip), so top_words_LDA ended up as the
# JST table plus a single LDA row.
# NOTE(review): lda is built with n_components=84 but only the first n_topic*3
# components are listed here — confirm this is intended.
lda_word_rows = []
for k in range(n_topic*3):
    # Indices of the n_top_words largest weights of component k, largest first.
    t_n_indices = lda.components_[k,:].argsort()[:-n_top_words - 1:-1]
    top_index_set = set(t_n_indices.tolist())  # O(1) membership per vocabulary entry
    # Words are emitted in vocabulary_ iteration order, not ranked by weight.
    lda_word_rows.append([i for i,v in countvec.vocabulary_.items() if v in top_index_set])

top_words_LDA = np.vstack(lda_word_rows)

In [None]:
# Show the table of top words collected for the LDA baseline.
print(top_words_LDA)

In [None]:
# Save the output of lda.transform (doc_topic_LDA) as CSV.
pd.DataFrame(doc_topic_LDA).to_csv("../Data/theta_LDA.csv")

In [None]:
# Save doc_topic_dist, the first value returned by t.predict, as CSV.
pd.DataFrame(doc_topic_dist).to_csv("../Data/theta_JST_Tensor.csv")

In [None]:
# Save the LDA top-words table as CSV.
pd.DataFrame(top_words_LDA).to_csv("../Data/theta_LDA_TopWords.csv")

In [None]:
# Save the TLDA (JST) top-words table as CSV.
pd.DataFrame(top_words_JST).to_csv("../Data/theta_JST_TopWords.csv")