* https://github.com/derekgreene/topic-model-tutorial 
* https://github.com/derekgreene/topic-model-tutorial/blob/master/topic-modelling-with-scikitlearn.pdf

In [None]:
#######################
#basic standard modules
#######################
import sys, os
import time
import collections, itertools, copy, operator

#######################
#custom config modules
#######################
from dotenv import load_dotenv
load_dotenv(dotenv_path='../.env')
load_dotenv()

#######################
#file manipulation modules
#######################
import pickle, json

#######################
#string manipulation modules
#######################
import re, string
import nltk
print("in the other",os.getenv("TEST_MULTILINE_VAR"))
nltk.data.path.append(os.getenv("NLTKDATADIR"))
from bs4 import BeautifulSoup

#######################
#general data manipulation and data analysis modules
#######################
import pandas, gensim, sklearn, scipy, numpy, math

#######################
#custom modules
#######################
import processingData

In [2]:
############################
## extract_candidate_chunks: candidate phrases based on http://bdewilde.github.io/blog/2014/09/23/intro-to-automatic-keyphrase-extraction/
############################
def extract_candidate_chunks(lemmpostxt, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}', STOPWORDS = nltk.corpus.stopwords.words('english')):
    #def redotaggedtext(lemmpostxt, lemmpossts = [], lemmposst = []):
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    lemmpostxt_redo = []
    for lemmposst in lemmpostxt:
        lemmposst_redo = []
        for w,val,l,pos in lemmposst:
            if w not in STOPWORDS:
                #or not all(char in punct for char in w):
                lemmposst_redo.append((l,pos))
        lemmpostxt_redo.append(lemmposst_redo)
            
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(lemmposst_redo)) for lemmposst_redo in lemmpostxt_redo))

    #print(all_chunks)
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group)
                  for key, group in itertools.groupby(all_chunks, lambda w_TUPLE: w_TUPLE[2] != 'O') if key]

    return [cand for cand in candidates]

In [3]:
############################
## allrecordsPreparation 3: revision of 2 to extend it for keyphrase candidate analysis
## some articles:
## -- https://stackoverflow.com/questions/34714162/preventing-splitting-at-apostrophies-when-tokenizing-words-using-nltk
############################

def allrecordsPreparation3(allrecords, STOPWORDS=nltk.corpus.stopwords.words('english'), punct = set(string.punctuation)):
    '''
    description: tokenization and POS tagging
    input: dict of allrecords texts and data from different sources
    treatment: separating only those with posts in the forum and tokenizing the posts
    output:
    1) list of lists, each with:
    -- id
    -- username
    -- link of the post
    -- tokenized text
    -- POS tagged text
    2) list of post ids
    '''
    print('in allrecordsPreparation (len(allrecords))::',len(allrecords))
 
       
    def HTMLtoText(u):
        #block arguments
        record = u["data"]
        forumpost = record['forum']['foundjob_msg']['text']
        if forumpost == '':
            return False
        forumpostID = record['forum']['foundjob_msg']['id']
        forumpostLINK = record['forum']['foundjob_msg']['link']
        soup_forumpost = BeautifulSoup(forumpost)
        soup_forumpostTEXT = soup_forumpost.find('body').get_text().replace('’',"'")
        tksoup_forumpostTEXT = [
                                #nltk.stem.WordNetLemmatizer().lemmatize(token.lower()) 
                                token.lower()
                                for token in nltk.word_tokenize(soup_forumpostTEXT)
                                #if token.lower() not in STOPWORDS 
                                #and not re.match(r'\d+?', token)
                                #and not all(char in set(string.punctuation) for char in token)
                               ]
        return forumpostID, forumpostLINK, tksoup_forumpostTEXT
    
    def approxsts(text):
        modtext = []
        for w in tksoup_forumpostTEXT:
            w = w.lower()
            rws = []
            if len(w) > 1 and len({'.','-',':'}.intersection(w)) >= 1:
                #print(w)
                for punc in {'.','-',':'}.intersection(w):
                    rws = w.replace(punc, ' '+punc+' ').split()
                #print(rws)
            if len(rws) == 0:
                modtext.append(w)
            else:
                for w in rws:
                    modtext.append(w)
        return modtext
    
    def lemmatizationofpos(postxt, lemmws):
        lemmposws = []
        counterrors = 0
        countKerrors = 0
        countIerrors = 0
        countNCerrors = 0
        for posw in postxt:
            w = posw[0]
            pos = posw[1]
            if nltk.corpus.wordnet.synsets(w):
                try:
                    n = ''
                    if nltk.corpus.wordnet.synsets(w,pos[0].lower())[0].root_hypernyms()[0].name().split('.')[0] == 'entity':
                        n = w
                    else:
                        n = nltk.corpus.wordnet.synsets(w,pos[0].lower())[0].root_hypernyms()[0].name().split('.')[0]
                    if pos[0] == 'V':
                        n = nltk.stem.wordnet.WordNetLemmatizer().lemmatize(w,'v')
                    if pos == 'NNS' or pos == 'NN$':
                        n = nltk.stem.wordnet.WordNetLemmatizer().lemmatize(w)
                    #print(w, nltk.corpus.wordnet.synsets(w,pos[0].lower())[0].root_hypernyms(), n)
                    lemmposws.append((w,True,n,pos))
                    lemmws.append(n)
                except KeyError: #in some cases the POS tag is not recognised by wordnet synset
                    print("pos KeyErrors", w,pos)
                    lemmposws.append((w,False,w,pos))
                    lemmws.append(w)
                    counterrors += 1
                    countKerrors += 1
                except IndexError:
                    print("IndexErrors (POS not found)", w,pos) #in some cases (w,pos) pair was not found at some point of the synsets root hyernyms
                    #print(nltk.corpus.wordnet.synsets(w,pos[0].lower()))
                    lemmposws.append((w,False,w,pos))
                    lemmws.append(w)
                    counterrors += 1
                    countIerrors += 1
            else:
                #print(w, [])
                print("Error (not in corpus)", w,pos)
                lemmposws.append((w,False,w,pos))
                lemmws.append(w)
                counterrors += 1
                countNCerrors += 1
        #print("\n\ntotal number of errors : ", counterrors)
        #print("total number of noPOSerrors : ", countKerrors)
        #print("total number of indexerrors : ", countIerrors)
        #print("total number of nonincorpuserrors : ", countNCerrors)
        return lemmposws, counterrors
      
    
    lemmws = []
    lemmposrecs = []
    count = 0
    lemerrors = 0
    for u in allrecords:
        ## Getting the data as a text from HTML format (raw dataset)
        userdata = HTMLtoText(u)
        if userdata != False:
            forumpostID, forumpostLINK, tksoup_forumpostTEXT = userdata
            
            ## Sentence identification, tokenization and POS
            txt2possts = []
            st = []
            for token in tksoup_forumpostTEXT:
                st.append(token)
                if re.match(r'^[.!?]+|\n$', token):
                    postst = nltk.pos_tag(st)
                    txt2possts.append(postst)
                    st = []
            if not re.match(r'^[.!?]+|\n$', token):
                posst = nltk.pos_tag(st)
                txt2possts.append(posst)

           
            ## Lemm text
            lemmpostxt = []
            for posst in txt2possts:
                lemst, err = lemmatizationofpos(posst, lemmws)
                lemmpostxt.append(lemst)
                lemerrors += err

            
            ## Keyphrases candidates; complete lemmws with candidates that are not still there
            candidates = extract_candidate_chunks(lemmpostxt)
            for cand in candidates:
                if cand not in lemmws:
                    lemmws.append(cand)
            
            ## Adding data to the new created dataset
            lemmposrecs.append((
                            'f_'+forumpostID,
                            u["user"],
                            forumpostLINK,
                            lemmpostxt,
                            candidates
                            ))
            count += 1

    print("number of treated posts (len(count)) ::", count)
    print("lemm errs:", lemerrors)
    #return all_posedsts, forum_ids
    return lemmposrecs, nltk.FreqDist(lemmws)

In [4]:
############################
## wordimportance_var4: wordimportance_var3 modified to fit keyphrases; changes in the "opacity" formula
############################
def wordimportance_var4(lemmposrecs, lemmws_fd):
    '''
    description:
    
    This metric tries to solve some of the issues that appeared in `wordimportance_var1` metric, in particular the values of zero.
    
    This is done by setting lower bounds when required.
    
    --- In the case of opacity, a non-zero lower bound is set by changing the equation to the following:
        ```
        if 1-math.log(v)/maxdiv == 0: 1-math.log(maxdiv-1)/maxdiv # 1-math.log(v)/maxdiv == 0 if v == maxdiv
        ```
    --- In the case of sizing, a redefinition of the metric force a non-zero lower bound as well as rebumpimg rare terms in documents:
        ```
        (sum(vector)-max(vector))/sum(vector)
        ```
    
    input:
        1) tokenized list of texts
        2) freqDist of lemmatized words
    
    output: wordimportance 
    '''
    unigrams = dict([(grams, count) 
                     for grams, count in lemmws_fd.items() 
                     if len(grams.split()) == 1 or (len(grams.split()) == 2 and '' in grams.split())
                    ])
    maxdiv = math.log(sorted(unigrams.items(), key=lambda x: x[1], reverse=True)[0][1])
    opacity = collections.defaultdict(float)
    for grams, counts in lemmws_fd.items():
        opval = []
        for gram in grams.split():
            if gram == '':
                continue
            if gram in unigrams:
                opval.append(math.log(unigrams[gram]))
            else:
                opval.append(0)
        averopval = sum(opval)/len(opval)
        if 1 - averopval/maxdiv != 0.0:
            opacity[grams] = 1 - averopval/maxdiv
        else:
            opacity[grams] = 1-math.log(maxdiv-1)/maxdiv
        
    sizing_matrix = dict([(w, [0]*len(lemmposrecs)) for w in list(lemmws_fd.keys())])

    ## Count lemmatized words/characters per text  
    for i,lemmpos_t in enumerate(lemmposrecs):
        for k, lemmpos_TUPLE in enumerate(lemmpos_t[3]):
            ## Use lemmatized word
            lemmw = lemmpos_TUPLE[2]
            sizing_matrix[lemmw][i] = sizing_matrix[lemmw][i] + 1
        for cand in lemmpos_t[4]: #<--------------------------------- problem!! it counted candidates only once!!!
            if cand not in sizing_matrix:
                sizing_matrix[cand][i] = sizing_matrix[cand][i] + 1

       
    ## Normalization
    normalization = dict([(k, (sum(vector)-max(vector))/sum(vector)) if sum(vector) != 0 else (k,0.0) for k, vector in sizing_matrix.items()])
    #normalization = collections.defaultdict(float)
    #for k, vector in sizing_matrix.items():
    #    if sum(vector) != 0:
    #        normalization[k] = (sum(vector)-max(vector))/sum(vector)
    #    else:
    #        
        
    
    wordimportance = dict([(k, valnorm*opacity[k]) for k, valnorm in normalization.items()])

    return wordimportance

In [5]:
def cleaningtext(st, STOPWORDS = nltk.corpus.stopwords.words('english')):
    treated_st = []
    countwds = len(st)
    for w in st:
        if 'freecodecamp' in w:
            w = w.replace('freecodecamp','fcc')
        treated_st.append(w)
    return treated_st, countwds

In [6]:
############################
## gensim_models2 re-evaluated: modification of gensim_model2's cleanedsts_from_lemmpostxts to fit keyphrases
############################

def gensim_models2(lemmposrecs, NUM_TOPICS = 15, lemmws_fd = {}, wordimportance = {}, nltk = nltk, gensim = gensim):
    
    def cleanedsts_from_lemmpostxts2(lemmposrecs, STOPWORDS = nltk.corpus.stopwords.words('english')):
        redo_corpus_by_sts = []
        for lemmpos_r in lemmposrecs:
            candidates = lemmpos_r[4]
            treated_lemmcands = cleaningtext(candidates)[0] #passing candidates only
            redo_corpus_by_sts.append(treated_lemmcands) 
        
        #print(len(redo_corpus_by_sts), redo_corpus_by_sts[-1])
        return redo_corpus_by_sts
        
    def basedonBOW(redo_corpus_by_sts):
        dictionary = gensim.corpora.Dictionary(redo_corpus_by_sts) #[token for st in redo_corpus_by_sts for token in st]
        corpus = [dictionary.doc2bow(text) for text in redo_corpus_by_sts]
        return corpus, dictionary
    
    def basedonTFIDF(corpus):
        return gensim.models.TfidfModel(corpus)
    
    def basedonOTHER(redo_corpus_by_sts, dictionary, wordimportance):
        
        def metriccalc(w):
            if w in wordimportance:
                return 1.0+2.0**float(wordimportance[w])
            else:
                return 0.0

        corpus = []
        for sts in redo_corpus_by_sts:
            st = []
            for w in sts:
                st.append((dictionary.token2id[w], metriccalc(w)))
            corpus.append(st)
        return corpus
        
    
    redo_corpus_by_sts  = cleanedsts_from_lemmpostxts2(lemmposrecs)
    
    corpus, dictionary = basedonBOW(redo_corpus_by_sts)
    if wordimportance == {'tfidf':True}:
        tfidf = basedonTFIDF(corpus)
        corpus = tfidf[corpus]
    if wordimportance != {} and wordimportance != {'tfidf':True}:
        corpus = basedonOTHER(redo_corpus_by_sts, dictionary, wordimportance)
    
    
    lda_model = gensim.models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, minimum_probability=0.005, per_word_topics = True, minimum_phi_value = 0.001, id2word=dictionary)
    lsi_model = gensim.models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

   
    return lda_model, lsi_model, redo_corpus_by_sts, corpus, dictionary

In [7]:
with open('../data/jobproject_forum.json','r') as message:
    otp = json.load(message)
print(len(otp))
allrecords = [{ "user": k, "data": otp[k] }  for k in otp]
print(len(allrecords))

90
90


In [8]:
lemmposrecs, lemmws_fd = allrecordsPreparation3(allrecords)

in allrecordsPreparation (len(allrecords)):: 90




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


IndexErrors (POS not found) i VB
Error (not in corpus) the DT
Error (not in corpus) # #
Error (not in corpus) 100daysofcode CD
Error (not in corpus) . .
Error (not in corpus) my PRP$
Error (not in corpus) and CC
Error (not in corpus) : :
Error (not in corpus) : :
Error (not in corpus) freecodecamp NN
pos KeyErrors front JJ
pos KeyErrors 34 CD
pos KeyErrors 15 CD
Error (not in corpus) github NN
Error (not in corpus) codewars VBZ
pos KeyErrors 5 CD
pos KeyErrors 1 CD
IndexErrors (POS not found) i VBP
Error (not in corpus) : :
Error (not in corpus) , ,
Error (not in corpus) , ,
Error (not in corpus) es5 JJ
Error (not in corpus) , ,
Error (not in corpus) , ,
IndexErrors (POS not found) react NN
Error (not in corpus) , ,
Error (not in corpus) vue NN
Error (not in corpus) , ,
Error (not in corpus) d3 VB
Error (not in corpus) my PRP$
Error (not in corpus) : :
Error (not in corpus) 1-30 JJ
pos KeyErrors through IN
Error (not in corpus) , ,
pos KeyErrors 36 CD
IndexErrors (POS not found) fcc RB

In [10]:
lda_model, lsi_model, redo_corpus_by_sts, corpus, dictionary = gensim_models2(lemmposrecs, NUM_TOPICS=20, wordimportance = {'tfidf':True})

In [11]:
lda_model.print_topics(num_words=15)

[(0,
  '0.003*"time http" + 0.003*"couple month" + 0.002*"test" + 0.002*"jquery" + 0.002*"graphic design year" + 0.002*"sass" + 0.002*"story" + 0.002*"fcc" + 0.002*"interview process" + 0.001*"dream job future" + 0.001*"data science" + 0.001*"fcc backend project" + 0.001*"subject" + 0.001*"code base" + 0.001*"javascript jquery"'),
 (1,
  '0.002*"much time learning" + 0.002*"month graduation" + 0.002*"local code camp armenia" + 0.002*"political science" + 0.002*"profession" + 0.002*"bright day" + 0.002*"codacademy" + 0.002*"web development offer" + 0.002*"good kind resource" + 0.002*"good thing" + 0.002*"prototype" + 0.002*"august" + 0.002*"web development" + 0.002*"group" + 0.002*"someone"'),
 (2,
  '0.000*"lawyer" + 0.000*"day" + 0.000*"portfolio employer" + 0.000*"interview call" + 0.000*"larson" + 0.000*"css javascript" + 0.000*"developer post yesterday" + 0.000*"thank" + 0.000*"fcc" + 0.000*"thanks" + 0.000*"hey fellow camper" + 0.000*"self taught fcc" + 0.000*"developer" + 0.000*"

In [25]:
[' '.join([l for wr in rec[3] for w,_,l,pos in wr]) for rec in lemmposrecs]

["today i finish the # 100daysofcode challenge . here be my thought and accomplishment : accomplishment : freecodecamp front end certificate 34 blog post 15 github star codewars rank 5 1 job offer technology i encounter : html , css , es5 scss , gulp , react , vue , d3 my cod timeline : day 1-30 work through tutorial , read book day 36 finish fcc front end certicate day 45 finish react markdown app day 48 finish react+redux recipe+box app day 50-60 play with d3 day 60-75 html/css template conversion , gulp , web optimization day 83 finish my netflix clone day 85-95 prepared for cod interview , company interview day 97 get my 1st offer thought : coding everyday be ! important because it build momentum . my initial plan be merely to at least try for 100 day . the end result be much better than i image . now i 'm confident that i can learn any technology + language . more about me : here 's my github here 's my fcc front end certificate here 's my blog here 's my 100 day cod log",
 "i sta

In [30]:
[[l for wr in rec[3] for w,_,l,pos in wr] for rec in lemmposrecs]

[['today',
  'i',
  'finish',
  'the',
  '#',
  '100daysofcode',
  'challenge',
  '.',
  'here',
  'be',
  'my',
  'thought',
  'and',
  'accomplishment',
  ':',
  'accomplishment',
  ':',
  'freecodecamp',
  'front',
  'end',
  'certificate',
  '34',
  'blog',
  'post',
  '15',
  'github',
  'star',
  'codewars',
  'rank',
  '5',
  '1',
  'job',
  'offer',
  'technology',
  'i',
  'encounter',
  ':',
  'html',
  ',',
  'css',
  ',',
  'es5',
  'scss',
  ',',
  'gulp',
  ',',
  'react',
  ',',
  'vue',
  ',',
  'd3',
  'my',
  'cod',
  'timeline',
  ':',
  'day',
  '1-30',
  'work',
  'through',
  'tutorial',
  ',',
  'read',
  'book',
  'day',
  '36',
  'finish',
  'fcc',
  'front',
  'end',
  'certicate',
  'day',
  '45',
  'finish',
  'react',
  'markdown',
  'app',
  'day',
  '48',
  'finish',
  'react+redux',
  'recipe+box',
  'app',
  'day',
  '50-60',
  'play',
  'with',
  'd3',
  'day',
  '60-75',
  'html/css',
  'template',
  'conversion',
  ',',
  'gulp',
  ',',
  'web',
  'o

In [51]:
[' '.join([cand for cand in rec[4]]) for rec in lemmposrecs]

['today challenge thought accomplishment accomplishment freecodecamp front end certificate blog post github star job offer technology html css es5 scss gulp react vue timeline day tutorial book day front end certicate day react markdown app day react+redux recipe+box app day d3 day 60-75 html/css template conversion gulp web optimization day netflix clone day interview company interview day offer thought coding everyday momentum initial plan try day end result image technology + language github fcc front end certificate blog day log',
 'last week full-stack web developer quick outline path liberal art degree various non-programming office job lot year career path intro class intro class local community college couple year interest free code camp summer coding couple month period thing job december local meetups like node school interview internship december learning experience california parent job luck fcc certificate march fcc ‘you javascript email prospective employer phone intervie

In [52]:
#https://stackoverflow.com/questions/46282473/error-while-identify-the-coherence-value-from-lda-model
texts = [[dictionary[word_id] for word_id, freq in doc] for doc in corpus]
texts

[['today',
  'thought accomplishment',
  'es5 scss',
  'offer thought',
  'coding everyday',
  'job offer technology',
  'html',
  'tutorial',
  'company interview day',
  'web optimization day',
  'react+redux recipe+box app day',
  'front end certicate day',
  'blog',
  'technology + language',
  'netflix clone day',
  'timeline',
  'github star',
  'accomplishment',
  'd3 day 60-75 html/css template conversion',
  'gulp',
  'challenge',
  'end result',
  'book day',
  'try',
  'interview',
  'blog post',
  'fcc front end certificate',
  'vue',
  'day',
  'react markdown app day',
  'react',
  'image',
  'initial plan',
  'momentum',
  'log',
  'css',
  'github'],
 ['interview',
  'github',
  'learning experience',
  'quick outline path',
  'mind',
  'in-person interview',
  'couple month',
  'december',
  'interview process',
  'potential',
  'question',
  'coding',
  'various non-programming office job lot year',
  'interest',
  'javascript',
  'test github project enough evidence'

In [53]:
coherence_model_lda = gensim.models.coherencemodel.CoherenceModel(model=lda_model, 
                                                                  texts=texts,
                                                                  #corpus=corpus,
                                                                  window_size=20,
                                                                  dictionary=dictionary, 
                                                                  coherence='c_uci')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  -3.12684713831


In [54]:
coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, 
                                                   texts=texts, 
                                                   dictionary=dictionary,
                                                   coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.715785719407


In [48]:
# Compute Coherence Score using UMass
coherence_model_lda = gensim.models.coherencemodel.CoherenceModel(model=lda_model, 
                                     #texts=[[l for wr in rec[3] for w,_,l,pos in wr] for rec in lemmposrecs],
                                     corpus = corpus,
                                     dictionary=dictionary, 
                                     coherence="u_mass")
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  -5.87869359215


In [43]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus))


Perplexity:  -16.060070682


In [44]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x7fcef7007da0>

In [45]:
corpus

<gensim.interfaces.TransformedCorpus at 0x7fceca2090f0>

In [56]:
TOPICS = range(5,61, 5)

for numtopics in TOPICS:
    print('\n\nFor NUM_TOPICS:', numtopics)
    lda_model, lsi_model, redo_corpus_by_sts, corpus, dictionary = gensim_models2(lemmposrecs, NUM_TOPICS=numtopics, wordimportance = {'tfidf':True})
    umass_coherence_model_lda = gensim.models.coherencemodel.CoherenceModel(model=lda_model, 
                                     #texts=[[l for wr in rec[3] for w,_,l,pos in wr] for rec in lemmposrecs],
                                     corpus = corpus,
                                     dictionary=dictionary, 
                                     coherence="u_mass")
    umass_coherence_lda = umass_coherence_model_lda.get_coherence()
    print('\nU-Mass Coherence Score: ', umass_coherence_lda)
    texts = [[dictionary[word_id] for word_id, freq in doc] for doc in corpus]
    cv_coherence_model_lda = gensim.models.coherencemodel.CoherenceModel(model=lda_model, 
                                     texts=texts,
                                     dictionary=dictionary, 
                                     coherence="c_v")
    cv_coherence_lda = cv_coherence_model_lda.get_coherence()
    print('\nC_V Coherence Score: ', cv_coherence_lda)
    print('\nPerplexity: ', lda_model.log_perplexity(corpus))



For NUM_TOPICS: 0


ZeroDivisionError: float division by zero

### TODO
k-means to approximate the number of topics before trying a more elaborate form
Check and improve previous work:
* https://github.com/evaristoc/fccgitterDataScience/blob/master/Identifying%20Relevant%20Topics%20in%20a%20Chatroom.ipynb

Also check:
* https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
* https://radimrehurek.com/gensim/models/hdpmodel.html
* https://datascience.stackexchange.com/questions/128/latent-dirichlet-allocation-vs-hierarchical-dirichlet-process
* https://nlpforhackers.io/recipe-text-clustering/
* https://www.quora.com/Latent-Dirichlet-Allocation-LDA-What-is-the-best-way-to-determine-k-number-of-topics-in-topic-modeling
* http://nbviewer.jupyter.org/github/bmabey/hacker_news_topic_modelling/blob/master/HN%20Topic%20Model%20Talk.ipynb#topic=3&lambda=1&term=
* https://stackoverflow.com/questions/50106516/k-means-for-topic-modelling-elbow-method
* https://shuaiw.github.io/2016/12/22/topic-modeling-and-tsne-visualzation.html
* http://wdsinet.org/Annual_Meetings/2016_Proceedings/papers/Paper45.pdf
* http://ramet.elte.hu/~podani/Methods.htm
* https://hk.saowen.com/a/edc29232eae094158f66e8ff3f08d6f35b8a2a45d628fce8917d2dce6f94282e
* https://rare-technologies.com/validating-gensims-topic-coherence-pipeline/

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans

NUM_CLUSTERS = 8

from scipy.spatial.distance import cdist, pdist
K = list(range(1, n_components+1))
KM = [KMeans(n_clusters=k).fit(X) for k in K]
centroids = [k.cluster_centers_ for k in KM]

D_k = [cdist(X, cent, 'euclidean') for cent in centroids]
cIdx = [numpy.argmin(D,axis=1) for D in D_k]
dist = [numpy.min(D,axis=1) for D in D_k]
avgWithinSS = [sum(d)/X.shape[0] for d in dist]


kIdx = 8-1

# elbow curve
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(K, avgWithinSS, 'b*-')
ax.plot(K[kIdx], avgWithinSS[kIdx], marker='o', markersize=12, 
markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Average within-cluster sum of squares')
plt.title('Elbow for KMeans clustering')


###############################################################################
# Do the actual clustering

true_k = 8


km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)

print("Clustering sparse data with %s" % km)
newX = numpy.array(pandas.concat([pandas.DataFrame(X),datadf_foran['timestamp_norm'].reset_index()['timestamp_norm']],axis=1))
km.fit(newX)

print()

labels = [x for x in range(datadf_foran.shape[0])]
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(newX, km.labels_, sample_size=1000))

print()


fig_clusters = plt.figure()
fig_clusters.suptitle('Clusters over first 2 Components')
ax = fig_clusters.add_subplot(111)
ax.set_xlabel('Component I')
ax.set_ylabel('Component II')
plt.scatter(newX[:,0],newX[:,1], c=km.fit_predict(newX))
plt.show()

### TODO

In [None]:
############################
## raw_lda_frankjupyter2: modified to fit allrecordsPreparation2
############################
def raw_lda_frankjupyter2(lemmposrecs, wordimportance, metmodel=2, STOPWORDS=nltk.corpus.stopwords.words('english')):
    '''
    description: modified model based on https://www.frankcleary.com/svd/ for a more raw construction of a lda
    '''
    
   
    def metriccalc(st, normalizer, wordimportance):
        '''
        description:
        text normalization based on ALL characters in the sentence; why? Example: if two writers wrote 20 words, 2 of them very important, but one of them wrote half of characters stopwords, those 2 words wouldnt be penalized accordingly for this writer: the other wrote more important content
        '''       
        metfuncs = [
            lambda w: math.pow(0.1+float(wordimportance[w]),textbow[w]/normalizer) if w in list(wordimportance.keys()) else 0.0, #a sort of idf-normalization based on number of words in the text: the more the words in a text, the more important
            lambda w: float(wordimportance[w])*textbow[w] if w in list(wordimportance.keys()) else 0.0, #good but ignore those words with worimportance too low or 0 but that are frequent in text
            lambda w: 1.0+2.0**float(wordimportance[w]) if w in list(wordimportance.keys()) else 0.0, #<- probably the best one; because it is not normilized this indicator would simply say that if it has the word at least once is already on topic
            lambda w: float(wordimportance[w]) + textbow[w]/normalizer if w in list(wordimportance.keys()) else 0.0,
            lambda w: (1.0+textbow[w]/normalizer)*float(wordimportance[w]) if w in list(wordimportance.keys()) else 0.0, #<- apparently the second best with the adhoc word ranking
            lambda w: normalizer/textbow[w]*wordimportance[w]/sum(list(wordimportance.values())), #tfidf-ish
            lambda w: float(wordimportance[w]) if w in list(wordimportance.keys()) else 0.0
        ]
        likedict = collections.defaultdict(float)
        textbow = collections.Counter(st)
        for w in st:
            likedict[w] = metfuncs[metmodel](w)
        return likedict

    #redo_corpus_by_sts = []
    words_df = pandas.DataFrame()
    textreference = {}
    
    
    for textindex, lemmpos_r in enumerate(lemmposrecs):
        lemmpos_t = lemmpos_r[3]
        #print('lemmpos_t', len(lemmpos_t))
        lemm_sts = ''
        for lemmpos_TUPLE in lemmpos_t:
            w = lemmpos_TUPLE[2]
            lemm_sts = lemm_sts + w + ' '
        sts = lemm_sts.split('.')
        for stindex, lemmpos_st in enumerate(sts):
            treated_st, lensts = cleaningtext(lemmpos_st.split(), STOPWORDS=STOPWORDS)        
            #print('treated_st', lensts)
            if len(treated_st) > 3:
                likedict = metriccalc(treated_st, lensts, wordimportance)
                st_df = pandas.DataFrame.from_dict(likedict, orient='index')
                textindexing = str(textindex)+'_'+str(stindex)
                st_df.columns = [textindexing]
                textreference[textindexing] = {}
                textreference[textindexing]['treated_st'] = treated_st
                #st_df.columns = [str(count)]
                words_df = words_df.join(st_df, how='outer', )
    
    words_df = words_df.fillna(0)
    print("Number of unique words: %s" % len(words_df))
    print(words_df.head(10))
    #print(words_df.sort(columns=words_df.columns[0], ascending=False).head(10))
    
    return words_df, textreference