In [7]:
import spacy
import gensim
from gensim.models import TfidfModel

In [81]:
# Toy corpus used to train and query the topic model; one short document per entry.
data = [
    "i love my mum",
    "i love my mum",
    "i love my mum",
    "my dad in better shape",
    "i love my dad and my dad",
    "i love my mum and dad",
]

In [155]:
class LDAModel2:
    """Train a gensim LDA topic model on raw texts and query it with new texts.

    The constructor runs the whole pipeline: spaCy lemmatisation filtered by
    part-of-speech tag, gensim ``simple_preprocess`` tokenisation, bigram or
    trigram phrase merging, removal of low TF-IDF terms from every
    bag-of-words, and finally LDA training.
    """

    # Settings shared by every gensim ``Phrases`` model trained by this class.
    __PHRASE_MIN_COUNT = 5
    __PHRASE_THRESHOLD = 100

    def __init__(self, texts, num_topics=100, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"], ngram="trigram", low_value=0.03, save=False):
        """Preprocess ``texts`` and fit the LDA model.

        Args:
            texts: Iterable of raw document strings.
            num_topics: Number of topics requested from ``LdaModel``.
            allowed_postags: spaCy ``pos_`` tags whose tokens are kept.
            ngram: ``"bigram"`` or ``"trigram"``; selects the phrase models applied.
            low_value: Terms whose TF-IDF weight is below this are dropped from
                each document's bag-of-words.
            save: Accepted for backward compatibility; currently unused.

        Raises:
            ValueError: If ``ngram`` is neither ``"bigram"`` nor ``"trigram"``.
        """
        # Validate first so a bad argument fails before any expensive work.
        if ngram not in ("bigram", "trigram"):
            raise ValueError("ngram must be either bigram or trigram")

        self.__ngram = ngram
        self.__low_value = low_value
        self.__texts = texts
        # Copy so that the shared default list (or the caller's list) cannot be
        # changed behind this instance's back.
        self.__allowed_postags = list(allowed_postags)
        # Load the spaCy pipeline once; it is reused for training and for queries.
        self.__nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

        self.__lem_tok()

        if ngram == "bigram":
            self.__set_bigram()
        else:
            self.__set_trigram()

        self.__tfidfRemoval()

        self.__lda_model = gensim.models.ldamodel.LdaModel(corpus=self.__corpus,
                                                           id2word=self.__id2word,
                                                           num_topics=num_topics,
                                                           random_state=100,
                                                           update_every=1,
                                                           chunksize=100,
                                                           passes=10,
                                                           alpha="auto")

    def __doc2tokens(self, doc):
        """Turn a parsed spaCy document into a list of lemma tokens.

        Keeps only tokens whose ``pos_`` is in the allowed tags, joins their
        lemmas and tokenises the result with ``simple_preprocess(deacc=True)``.
        """
        lemmas = " ".join(token.lemma_ for token in doc if token.pos_ in self.__allowed_postags)
        return gensim.utils.simple_preprocess(lemmas, deacc=True)

    def __lem_tok_1(self, text):
        """Lemmatise and tokenise one raw text (used when querying the model)."""
        return self.__doc2tokens(self.__nlp(text))

    def __lem_tok(self):
        """Lemmatise and tokenise every training text into ``self.__data_words``."""
        self.__data_words = [self.__doc2tokens(doc) for doc in self.__nlp.pipe(self.__texts)]

    def __fit_phraser(self, sentences):
        """Train a ``Phrases`` model on ``sentences`` and return its ``Phraser``."""
        phrases = gensim.models.Phrases(sentences,
                                        min_count=self.__PHRASE_MIN_COUNT,
                                        threshold=self.__PHRASE_THRESHOLD)
        return gensim.models.phrases.Phraser(phrases)

    def __set_bigram(self):
        """Fit the bigram model and store the bigram-merged training documents."""
        self.__bigram_mod = self.__fit_phraser(self.__data_words)
        self.__data_words_ngram = [self.__bigram_mod[doc] for doc in self.__data_words]

    def __set_trigram(self):
        """Fit bigram and trigram models and store the merged training documents."""
        self.__bigram_mod = self.__fit_phraser(self.__data_words)
        bigram_docs = [self.__bigram_mod[doc] for doc in self.__data_words]
        # The trigram model must be trained on bigram-merged sentences; training
        # it on the raw tokens would only produce a second bigram model.
        self.__trigram_mod = self.__fit_phraser(bigram_docs)
        self.__data_words_ngram = [self.__trigram_mod[doc] for doc in bigram_docs]

    def __tfidfRemoval(self):
        """Build the dictionary and corpus, then drop low-information terms.

        A term is removed from a document's bag-of-words when its TF-IDF weight
        is below ``low_value`` or when it is absent from the document's TF-IDF
        vector (terms with a TF-IDF score of 0 are missing from it).
        """
        self.__id2word = gensim.corpora.Dictionary(self.__data_words_ngram)
        raw_corpus = [self.__id2word.doc2bow(doc) for doc in self.__data_words_ngram]
        tfidf = TfidfModel(raw_corpus, id2word=self.__id2word)

        filtered_corpus = []
        for bow in raw_corpus:
            weights = dict(tfidf[bow])
            filtered_corpus.append([
                entry for entry in bow
                if entry[0] in weights and not weights[entry[0]] < self.__low_value
            ])
        self.__corpus = filtered_corpus

    def __text2bow(self, text):
        """Convert one raw text into a bag-of-words using the trained models."""
        tokens = self.__lem_tok_1(text)
        # Phrase models take the whole token list, not a single token.
        phrased = self.__bigram_mod[tokens]
        if self.__ngram != "bigram":
            phrased = self.__trigram_mod[phrased]
        return self.__id2word.doc2bow(phrased)

    def print_topics(self, num_words=10):
        """Return the LDA model's ``print_topics(num_words)`` result."""
        return self.__lda_model.print_topics(num_words)

    def get_document_topics(self, text):
        """Return the LDA model's topic distribution for one raw ``text``."""
        bow = self.__text2bow(text)
        return self.__lda_model.get_document_topics(bow)

    def get_document_topic(self, text):
        """Return ``show_topic`` output for the most probable topic of ``text``.

        Returns an empty list when the model reports no topic for the text.
        """
        topics = self.get_document_topics(text)
        if not topics:
            return []
        best_topic_id = max(topics, key=lambda pair: pair[1])[0]
        return self.__lda_model.show_topic(best_topic_id)

In [156]:
# Fit the topic model on the toy corpus, asking for 10 topics.
lm2 = LDAModel2(texts=data, num_topics=10)

In [159]:
# Show the word weights of the most probable topic for the last document.
last_document = data[-1]
dominant_topic = lm2.get_document_topic(last_document)
print(dominant_topic)

[('love', 0.40297326), ('mum', 0.32395872), ('dad', 0.25726476), ('well', 0.0079016), ('shape', 0.007901599)]
