In [None]:
import pandas as pd

In [None]:
df_full = pd.read_csv('./full.csv')

In [None]:
df_full.shape

In [None]:
df_full.sample(3)

In [None]:
from tqdm.notebook import tqdm_notebook

In [None]:
import spacy
from spacy.language import Language
from spacy.lang.en import STOP_WORDS

# Load the large English pipeline (the en_core_web_lg model must be installed).
nlp = spacy.load("en_core_web_lg")
# merge_entities collapses every named-entity span into a single token;
# process_tokens later matches tokens to entities by their lowercased text.
nlp.add_pipe("merge_entities")
# nlp.add_pipe("merge_noun_chunks")
# Display the resulting component order as the cell output.
nlp.pipe_names

In [None]:
# PERSON:      People, including fictional.
# NORP:        Nationalities or religious or political groups.
# FAC:         Buildings, airports, highways, bridges, etc.
# ORG:         Companies, agencies, institutions, etc.
# GPE:         Countries, cities, states.
# LOC:         Non-GPE locations, mountain ranges, bodies of water.
# PRODUCT:     Objects, vehicles, foods, etc. (Not services.)
# EVENT:       Named hurricanes, battles, wars, sports events, etc.
# WORK_OF_ART: Titles of books, songs, etc.
# LAW:         Named documents made into laws.
# LANGUAGE:    Any named language.
# DATE:        Absolute or relative dates or periods.
# TIME:        Times smaller than a day.
# PERCENT:     Percentage, including "%".
# MONEY:       Monetary values, including unit.
# QUANTITY:    Measurements, as of weight or distance.
# ORDINAL:     “first”, “second”, etc.
# CARDINAL:    Numerals that do not fall under another type.

In [None]:
from rake_nltk import Rake

# Code point every quote-like character is normalised to (ASCII apostrophe).
single_quote_unicode = ord("'")

# Backtick plus the curly single/double quotes; each one becomes an apostrophe.
_QUOTE_LIKE = '`‘’“”'
_APOSTROPHES = chr(single_quote_unicode) * len(_QUOTE_LIKE)

# Article bodies: normalise quotes and delete hyphens.
translation_table_text = str.maketrans(_QUOTE_LIKE, _APOSTROPHES, '-')

# Entity tokens: delete apostrophes, double quotes and periods.
translation_table_token = str.maketrans('', '', '\'".')

# Titles: normalise quotes and delete commas, hyphens and periods.
translation_table_title = str.maketrans(_QUOTE_LIKE, _APOSTROPHES, ',-.')


def titles2tokens(titles_text):
    """Extract RAKE key phrases from every title.

    Args:
        titles_text: Iterable of title strings.

    Returns:
        A list with one entry per title; each entry is the list of ranked
        phrases returned by ``title2tokens`` for that title.
    """
    # One extractor serves the whole batch instead of constructing a new Rake
    # (and repeating its set-up) for every single title.
    # NOTE(review): assumes Rake rebuilds its ranking state on each
    # extract_keywords_from_text call — confirm for the installed rake_nltk.
    rake = Rake()
    return [title2tokens(title_text, rake) for title_text in tqdm_notebook(titles_text)]

def title2tokens(title_text, rake=None):
    """Return the RAKE phrases of one title, best ranked first.

    Args:
        title_text: The title to analyse.
        rake: Optional ``Rake`` instance to reuse. When omitted a fresh one is
            created, which is the original behaviour.

    Returns:
        The list produced by ``Rake.get_ranked_phrases()``.
    """
    if rake is None:
        rake = Rake()
    rake.extract_keywords_from_text(title_text)
    return rake.get_ranked_phrases()

def corpus2tokens(corpus_text, *args, **kwargs):
    """Parse a corpus with the spaCy pipeline and tokenise every document.

    Args:
        corpus_text: Iterable of raw document strings.
        *args, **kwargs: Forwarded unchanged to ``nlp.pipe``.

    Returns:
        A list holding the ``doc2tokens`` result of each parsed document.
    """
    texts_with_progress = tqdm_notebook(corpus_text)
    parsed_docs = nlp.pipe(texts_with_progress, *args, **kwargs)
    return list(map(doc2tokens, parsed_docs))

def doc2tokens(doc):
    """Turn one parsed document into its processed token strings.

    Punctuation and whitespace tokens are discarded; the remaining tokens are
    handed to ``process_tokens`` together with the document's entities.
    """
    content_tokens = []
    for tok in doc:
        if tok.is_punct or tok.is_space:
            continue
        content_tokens.append(tok)
    return process_tokens(content_tokens, doc.ents)

def show_ents(ents):
    """Print one line per entity: text, start/end character offsets, label and
    the label's ``spacy.explain`` description, separated by ' - '."""
    for ent in ents:
        fields = (
            ent.text,
            str(ent.start_char),
            str(ent.end_char),
            ent.label_,
            str(spacy.explain(ent.label_)),
        )
        print(' - '.join(fields))
        
def process_tokens(tokens, ents):
    """Normalise a document's tokens into the strings used for topic modelling.

    A token counts as an entity when its lowercased text equals the lowercased
    text of any entity in ``ents`` (the first such entity supplies the label).

    * Entities labelled DATE, CARDINAL, PERCENT, TIME, MONEY or ORDINAL are
      replaced by the label name itself.
    * Other entities have apostrophes, double quotes and periods removed
      (``translation_table_token``).
    * Every token that was not replaced by a label is kept only if it is not
      in ``STOP_WORDS``.

    Args:
        tokens: Tokens of one document (objects exposing ``.text``).
        ents: Entities of the same document (objects exposing ``.text`` and
            ``.label_``).

    Returns:
        List of processed token strings in document order.

    Side effects:
        Prints a diagnostic when some entity text never shows up as a token.
    """
    ent_labels_to_sub = {
        "DATE",      # Absolute or relative dates or periods
        "CARDINAL",  # Numerals that do not fall under another type
        "PERCENT",   # Percentage, including "%"
        "TIME",      # Times smaller than a day
        "MONEY",     # Monetary values, including unit
        "ORDINAL",   # "first", "second", etc.
    }

    # Lowercased entity text -> label of the FIRST entity with that text.
    # A dict lookup replaces the per-token list scan + .index() call.
    label_by_ent_text = {}
    for ent in ents:
        label_by_ent_text.setdefault(ent.text.lower(), ent.label_)

    tokens_processed = []
    matched_ent_texts = set()
    for token in tokens:
        stringed_token = token.text.lower()
        if stringed_token in label_by_ent_text:
            matched_ent_texts.add(stringed_token)
            ent_label = label_by_ent_text[stringed_token]
            if ent_label in ent_labels_to_sub:
                tokens_processed.append(ent_label)
                continue
            stringed_token = stringed_token.translate(translation_table_token)
        if stringed_token not in STOP_WORDS:
            tokens_processed.append(stringed_token)

    # Matched texts are always a subset of the entity texts, so the only
    # possible mismatch is an entity that never appeared as a token.
    unmatched = set(label_by_ent_text) - matched_ent_texts
    if unmatched:
        print(f'WARNING: Somehow the number of unique tokens which are ents ({len(matched_ent_texts)}) does not match the total number of unique ents ({len(label_by_ent_text)})')
        print(list(unmatched), "exist in ents but not in tokens")
        print("tokens: ", "\n", tokens, "\n\n")
        print("ents: ", "\n", ents, "\n\n")
    return tokens_processed

In [None]:
%%time

# Normalise quote characters and strip hyphens from every article body, then
# run the bodies through the spaCy-based tokeniser.
corpus_text_full = [body.translate(translation_table_text) for body in df_full.text.to_list()]
corpus_text_tokens_full = corpus2tokens(corpus_text_full)

In [None]:
%%time
# Normalise quotes and strip commas, hyphens and periods from every title, then
# extract RAKE phrases; these phrases become the document labels further down.
# NOTE(review): .translate assumes every title is a str — a missing (NaN) title
# would raise here; confirm the column has no nulls.
corpus_title_full = [title.translate(translation_table_title) for title in df_full.title.to_list()]
corpus_title_tokens_full = titles2tokens(corpus_title_full)

In [None]:
import sys, re, numpy, random, string, json, pyLDAvis, operator
from functools import reduce
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [None]:
# Number of Gibbs sampling sweeps run by the training loop further down.
iteration=50
# NOTE(review): samplesize is not referenced anywhere in the visible notebook —
# confirm it is unused before removing it.
samplesize = 200

In [None]:
# Smoothing hyperparameters passed to LLDA(alpha, beta): alpha is added to the
# document-topic counts, beta to the topic-term counts.
alpha=0.001
beta=0.001

In [None]:
class LLDA:
    """Labeled LDA trained with collapsed Gibbs sampling.

    Topic 0 is a shared "common" topic that every document may use. Each other
    topic corresponds to one label; a document's tokens can only be assigned
    to topic 0 or to that document's own labels. A document without labels may
    use every topic.
    """

    def __init__(self, alpha, beta):
        """Store the smoothing hyperparameters.

        Args:
            alpha: Added to the document-topic counts of a document's
                permitted topics.
            beta: Added to every topic-term count.
        """
        self.alpha = alpha
        self.beta = beta

    def set_corpus(self, labelset, corpus, labels):
        """Register the corpus and draw a random initial topic per token.

        Args:
            labelset: List of all label names. It is MODIFIED IN PLACE so that
                "common" sits at index 0 and no name occurs twice; afterwards
                position ``i`` of the list is topic ``i``.
            corpus: List of documents, each a list of terms.
            labels: Per-document list of label names (parallel to ``corpus``).

        Raises:
            KeyError: If a document carries a label missing from ``labelset``.
        """
        # Blindly inserting "common" breaks when the list already contains it
        # (e.g. the cell is re-run with the same list): the label->index dict
        # would then be shorter than the largest index it hands out.
        labelset[:] = dict.fromkeys(["common", *labelset])
        self.labelmap = {name: index for index, name in enumerate(labelset)}
        self.K = len(self.labelmap)
        self.vocas = []
        self.vocas_id = dict()
        self.labels = numpy.array([self.complement_label(label) for label in labels])

        self.docs = [[self.term_to_id(term) for term in doc] for doc in corpus]
        M = len(corpus)
        V = len(self.vocas)

        self.z_m_n = []                                    # topic of every token
        self.n_m_z = numpy.zeros((M, self.K), dtype=int)   # doc x topic counts
        self.n_z_t = numpy.zeros((self.K, V), dtype=int)   # topic x term counts
        self.n_z = numpy.zeros(self.K, dtype=int)          # tokens per topic
        for m, doc, label in zip(range(M), self.docs, self.labels):
            # Uniform over the topics this document is allowed to use.
            prior = label / label.sum()
            z_n = [numpy.random.multinomial(1, prior).argmax() for _ in doc]
            self.z_m_n.append(z_n)
            for t, z in zip(doc, z_n):
                self.n_m_z[m, z] += 1
                self.n_z_t[z, t] += 1
                self.n_z[z] += 1

    def complement_label(self, label):
        """Return the 0/1 topic mask for one document's label list.

        An empty label list permits every topic; otherwise topic 0 ("common")
        and the listed labels are permitted.
        """
        if not label:
            return numpy.ones(len(self.labelmap))
        vec = numpy.zeros(len(self.labelmap))
        vec[0] = 1.0
        for x in label:
            vec[self.labelmap[x]] = 1.0
        return vec

    def term_to_id(self, term):
        """Return the vocabulary id of ``term``, registering it on first sight."""
        voca_id = self.vocas_id.get(term)
        if voca_id is None:
            voca_id = len(self.vocas)
            self.vocas_id[term] = voca_id
            self.vocas.append(term)
        return voca_id

    def perplexity(self, docs=None):
        """Perplexity of ``docs`` under the current counts.

        Args:
            docs: Documents as sequences of term ids, paired positionally with
                the training documents' topic distributions. Defaults to the
                training documents.

        Raises:
            ZeroDivisionError: If ``docs`` contains no tokens at all.
        """
        # `is None` rather than `== None`: the latter is evaluated elementwise
        # (and then fails in `if`) when docs is a numpy array.
        if docs is None:
            docs = self.docs
        phi = self.phi()
        thetas = self.theta()

        log_per = 0.0
        N = 0
        for doc, theta in zip(docs, thetas):
            # theta @ phi[:, doc] gives p(w | doc) for every token of the
            # document at once instead of one inner product per token.
            log_per -= float(numpy.log(theta @ phi[:, doc]).sum())
            N += len(doc)
        return numpy.exp(log_per / N)

    def phi(self):
        """topic-term distribution (K x V, each row sums to 1)"""
        V = len(self.vocas)
        return (self.n_z_t + self.beta) / (self.n_z[:, numpy.newaxis] + V * self.beta)

    def theta(self):
        """document-topic distribution"""
        n_alpha = self.n_m_z + self.labels * self.alpha
        return n_alpha / n_alpha.sum(axis=1)[:, numpy.newaxis]

    def inference(self):
        """Run one Gibbs sweep, resampling the topic of every token."""
        V = len(self.vocas)
        for m, doc, label in zip(range(len(self.docs)), self.docs, self.labels):
            for n in range(len(doc)):
                t = doc[n]
                z = self.z_m_n[m][n]
                # Take the token out of the counts before resampling it.
                self.n_m_z[m, z] -= 1
                self.n_z_t[z, t] -= 1
                self.n_z[z] -= 1

                denom_a = self.n_m_z[m].sum() + self.K * self.alpha
                # n_z is kept equal to n_z_t.sum(axis=1) by every update, so
                # the K x V reduction per token is unnecessary.
                denom_b = self.n_z + V * self.beta
                # `label` masks out topics the document may not use.
                p_z = label * (self.n_z_t[:, t] + self.beta) / denom_b * (self.n_m_z[m] + self.alpha) / denom_a
                new_z = numpy.random.multinomial(1, p_z / p_z.sum()).argmax()

                self.z_m_n[m][n] = new_z
                self.n_m_z[m, new_z] += 1
                self.n_z_t[new_z, t] += 1
                self.n_z[new_z] += 1

In [None]:
llda = LLDA(alpha, beta)

In [None]:
# labelset = ["a", "b", "c", "d", "e"]
# corpus = [
#     ["category", "Ay", "and", "Bee"],
#     ["Not", "Ay", "Cee", "Dee", "Cee"],
#     ["Give", "Me", "an", "Eee"]

# ]
# labels = [
#     ["a", "b"],
#     ["a", "c", "d"],
#     ["e"]
# ]

# llda.set_corpus(labelset, corpus, labels)
# Every distinct title phrase becomes a label. Sorting makes the label ids the
# same on every run (plain set order can change between kernels), and the
# comprehension avoids the quadratic list concatenation of reduce(list.__add__)
# as well as its failure on an empty corpus.
# Note: set_corpus puts "common" at index 0 of labelset IN PLACE; later cells
# rely on that when they index phi by position in labelset.
labelset = sorted({label for title_labels in corpus_title_tokens_full for label in title_labels})
llda.set_corpus(labelset, corpus_text_tokens_full, corpus_title_tokens_full)

In [None]:
# labelset already carries "common" at index 0 (inserted by set_corpus), so this
# mirrors llda.labelmap.
labelmap = dict(zip(labelset, range(len(labelset))))
K = len(labelmap)
# `corpus` only exists in the commented-out toy example; the documents that were
# actually loaded into the model are corpus_text_tokens_full.
print("M=%d, V=%d, L=%d, K=%d" % (len(corpus_text_tokens_full), len(llda.vocas), len(labelset), K))
print("len_corpus=%d, len_vocab=%d, len_labelset=%d, len_labelmap=%d" % (len(corpus_text_tokens_full), len(llda.vocas), len(labelset), K))

In [None]:
# Trace perplexity across Gibbs sweeps: x collects the sweep index and y the
# perplexity measured *before* that sweep runs.
x = []
y = []
for i in range(iteration):
    perplexity = llda.perplexity()
    sys.stderr.write("-- %d : %.4f\n" % (i, perplexity))
    x.append(i)
    y.append(perplexity)
    llda.inference()
# Perplexity after the final sweep (this value is not appended to x / y).
print("perplexity : %.4f" % llda.perplexity())

In [None]:
plt.plot(x, y)

In [None]:
# Ten highest-probability terms per label, formatted as "term:probability".
phi = llda.phi()
result = {}
# Take the row index from the model's own label map rather than from the
# position in `labelset`, which only lines up with phi because set_corpus
# mutated that list in place.
for label, k in llda.labelmap.items():
    top_terms = numpy.argsort(-phi[k])[:10]
    result[label] = [str(llda.vocas[w]) + ":" + str(phi[k, w]) for w in top_terms]

In [None]:
import numpy as np
import matplotlib.pylab as plt 

# Heat map of the topic-term matrix (rows: topics, columns: vocabulary ids).
#dat = np.random.randn(10,10)
dat = phi
#dat = np.array(dat)
#dat = np.minimum(100, dat*100)
#dat = dat.astype(np.int32)


# interpolation='none' draws each matrix cell without smoothing.
plt.imshow(dat, interpolation='none')

clb = plt.colorbar()
# NOTE(review): placeholder colour-bar title — replace with a description of the
# plotted quantity.
clb.ax.set_title('This is a title')

#plt.show()

In [None]:
dat

In [None]:
pandaResult = pd.DataFrame(result).T

In [None]:
pandaResult.sample(30)

In [None]:
# Token count of every document, in corpus order (input for pyLDAvis).
doc_lengths = [len(doc_term_ids) for doc_term_ids in llda.docs]

In [None]:
# Corpus-wide occurrence count of every vocabulary id.
tf = {}
for doc in llda.docs:
    for w in doc:
        tf[w] = tf.get(w, 0) + 1

# The same counts keyed by the term string.
tf2 = {term: tf[term_id] for term, term_id in llda.vocas_id.items()}

# Keyword arguments for pyLDAvis.prepare.
data = {'topic_term_dists': llda.phi(),
        'doc_topic_dists': llda.theta(),
        'vocab': llda.vocas,
        'doc_lengths': doc_lengths,
        # pyLDAvis documents term_frequency as array-like aligned with `vocab`;
        # pass a list in vocabulary order instead of a dict keyed by term.
        'term_frequency': [tf2[term] for term in llda.vocas],
       }

In [None]:
movies_model_data = data

In [None]:
print('Topic-Term shape: %s' % str(np.array(movies_model_data['topic_term_dists']).shape))
print('Doc-Topic shape: %s' % str(np.array(movies_model_data['doc_topic_dists']).shape))

In [None]:
lda_vis = pyLDAvis.prepare(**movies_model_data)

In [None]:
pyLDAvis.display(lda_vis)

In [None]:
sorted_x = sorted(llda.labelmap.items(), key=operator.itemgetter(1))
pd.Series(sorted_x)

In [None]:
sorted_x = sorted(llda.labelmap.items(), key=operator.itemgetter(1))

In [None]:
sorted_x

In [None]:
llda.K

In [None]:
# `corpus` is only defined in the commented-out toy example; report the size of
# the corpus that was actually fed to the model.
len(corpus_text_tokens_full)

In [None]:
# set_corpus(self, labelset, corpus, labels):
#         labelset.insert(0, "common")
#         self.labelmap = dict(zip(labelset, range(len(labelset))))        
#         self.K = len(self.labelmap)
#         self.vocas = []
#         self.vocas_id = dict()
        

        
#         self.docs = [[self.term_to_id(term) for term in doc] for doc in corpus]
#         print(self.docs)
#         M = len(corpus)
#         V = len(self.vocas)

#         self.z_m_n = []
#         self.n_m_z = numpy.zeros((M, self.K), dtype=int)
#         self.n_z_t = numpy.zeros((self.K, V), dtype=int)
#         self.n_z = numpy.zeros(self.K, dtype=int)
#         print(self.n_m_z)
#         print(self.n_z_t)
#         print(self.n_z)
#         for m, doc, label in zip(range(M), self.docs, self.labels):
#             N_m = len(doc)
#             #z_n = [label[x] for x in numpy.random.randint(len(label), size=N_m)]
#             z_n = [numpy.random.multinomial(1, label / label.sum()).argmax() for x in range(N_m)]
#             self.z_m_n.append(z_n)
#             for t, z in zip(doc, z_n):
#                 self.n_m_z[m, z] += 1
#                 self.n_z_t[z, t] += 1
#                 self.n_z[z] += 1        
#     def perplexity(self, docs=None):
#         if docs == None: docs = self.docs
#         phi = self.phi()
#         thetas = self.theta()

#         log_per = N = 0
#         for doc, theta in zip(docs, thetas):
#             for w in doc:
#                 log_per -= numpy.log(numpy.inner(phi[:,w], theta))
#             N += len(doc)
#         return numpy.exp(log_per / N)
#     def phi(self):
        
#         return (self.n_z_t + self.beta) / (self.n_z[:, numpy.newaxis] + V * self.beta)

#     def inference(self):
        
#         for m, doc, label in zip(range(len(self.docs)), self.docs, self.labels):
#             for n in range(len(doc)):
#                 t = doc[n]
#                 z = self.z_m_n[m][n]
#                 self.n_m_z[m, z] -= 1
#                 self.n_z_t[z, t] -= 1
#                 self.n_z[z] -= 1

#                 denom_a = self.n_m_z[m].sum() + self.K * self.alpha
#                 denom_b = self.n_z_t.sum(axis=1) + V * self.beta
#                 p_z = label * (self.n_z_t[:, t] + self.beta) / denom_b * (self.n_m_z[m] + self.alpha) / denom_a
#                 new_z = numpy.random.multinomial(1, p_z / p_z.sum()).argmax()

In [None]:
for k,v in (llda.vocas_id).items():
    if v == 2:
        print(k)

In [None]:
len(llda.vocas_id)

In [None]:
len(llda.docs)

In [None]:
tf

In [None]:
llda.vocas_id

In [None]:
# Stem plot of the topic distribution of a few documents.
doc_ids = [0, 1, 2]
f, ax= plt.subplots(len(doc_ids), 1, figsize=(8, 12), sharex=True)
K = len(labelmap)
# Compute the document-topic matrix once instead of once per subplot.
doc_topic = llda.theta()
for i, k in enumerate(doc_ids):
    ax[i].stem(doc_topic[k], linefmt='r-',
               markerfmt='ro', basefmt='w-')
    ax[i].set_xlim(-1, K+1)
    ax[i].set_ylim(0, 1)
    ax[i].set_ylabel("Prob")
    # `corpus` is undefined in this notebook; the documents loaded into the
    # model are corpus_text_tokens_full.
    # NOTE(review): the title contains every token of the document and can get
    # very long for full articles.
    ax[i].set_title(f"Document {k}:\n{' '.join(corpus_text_tokens_full[k])}")

    ax[i].set_xlabel("Topic")
plt.tight_layout()

In [None]:
print(llda.theta()[1][1])

In [None]:
print(sorted_x[1])

In [None]:
print(sorted_x[5])