# Topic Modeling using Gensim

Build a topic model from the messages in `data/5.pulledTweet-deduplicated.csv`.

In [1]:
import pandas
# Load the deduplicated tweet CSV named in the introduction (relative path, one
# directory above the working directory).
# NOTE(review): the 'Unnamed: 0*' columns in the preview look like index columns
# written by earlier to_csv calls -- confirm; they are not used below.
data4=pandas.read_csv('../data/5.pulledTweet-deduplicated.csv')
# Preview the last rows to check the load and see the available columns.
data4.tail()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,index,created_at,id,text,language,shingleSet,signature
219,219,227,251.0,Tue Apr 27 23:58:10 +0000 2021,1.387194e+18,Sometimes you get a third chance Love s Thir...,en,"{1975646848, 3508350977, 2277555713, 187718236...","[18622085, 32776440, 21561465, 191860621, 9160..."
220,220,228,252.0,Tue Apr 27 23:58:10 +0000 2021,1.387194e+18,Check out this Amazon deal The Art of My Neig...,en,"{622155522, 3082850439, 1706091273, 1593029644...","[116119196, 134412392, 15045377, 922378, 36390..."
221,221,229,253.0,Tue Apr 27 23:58:08 +0000 2021,1.387194e+18,First Steps How Upright Walking Made Us Hum...,en,"{2352757504, 1318699267, 463207690, 4013102741...","[77244016, 247586965, 343106086, 653196371, 20..."
222,222,230,254.0,Tue Apr 27 23:58:04 +0000 2021,1.387194e+18,The First Day of Spring by Nancy Tucker B...,en,"{3712246153, 2391777162, 2684049684, 401310274...","[8379671, 18718807, 281343041, 177753045, 3669..."
223,223,231,255.0,Tue Apr 27 23:58:04 +0000 2021,1.387194e+18,ad off Galaxy Star Projector ...,en,"{2473481, 4013102741, 566037670, 2455870758, 1...","[144950540, 122747773, 41607353, 97773221, 829..."


In [2]:
import gensim
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words matrix with CountVectorizer:
#   * no token_pattern is passed, so the vectorizer's default tokenization is
#     used (tokens of two or more word characters -- there is no three-letter
#     minimum configured here),
#   * English stop words are removed,
#   * min_df=1 keeps every token that occurs in at least one document,
#   * max_df=0.4 drops tokens that appear in more than 40% of the documents.
vect = CountVectorizer(min_df=1, max_df=0.4, stop_words='english')

# Cast to str so that non-string cells (e.g. missing values) do not break the
# vectorizer.
X = vect.fit_transform(data4['text'].astype(str))

# Convert the sparse matrix to a gensim corpus. The matrix has one row per
# document, hence documents_columns=False.
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

# Mapping from word IDs to words (used as LdaModel's id2word parameter).
id_map = {word_id: token for token, word_id in vect.vocabulary_.items()}

# Estimate the LDA model parameters on the corpus and keep the fitted model in
# `ldamodel`.
#   * num_topics=100 is gensim's default, spelled out so the topic count is
#     visible to the reader.
#   * random_state pins the otherwise random initialisation so that topic ids
#     and weights shown in later cells are reproducible on a fresh
#     Restart & Run All.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=100,
    id2word=id_map,
    passes=5,
    random_state=42,
)

#### Print the most important topics: choose the number of topics and the number of words shown for each topic

In [3]:
# Display five of the model's topics, each written as a weighted sum of five words.
ldamodel.print_topics(num_topics=5, num_words=5)

[(42,
  '0.069*"moonlak" + 0.013*"promo" + 0.013*"save" + 0.013*"code" + 0.013*"rfdk"'),
 (48,
  '0.030*"switch" + 0.030*"st" + 0.030*"party" + 0.030*"sale" + 0.030*"gamestop"'),
 (45,
  '0.035*"kiss" + 0.035*"alien" + 0.035*"ot" + 0.035*"read" + 0.035*"space"'),
 (76,
  '0.077*"amzn" + 0.069*"growth" + 0.069*"change" + 0.069*"yoy" + 0.069*"facebook"'),
 (11,
  '0.094*"items" + 0.047*"summer" + 0.047*"left" + 0.047*"lowest" + 0.047*"colored"')]

#### Topic distribution
Given a document, this function returns a list of tuples, where each tuple is `(#topic, probability)`.

In [4]:
def topic_distribution(doc:str):
    """Return the LDA topic distribution of a single document.

    The text is vectorized with the notebook-level ``vect``, wrapped as a
    one-document gensim corpus and passed through the notebook-level
    ``ldamodel``.

    Parameters
    ----------
    doc : str
        Raw text of the document to score.

    Returns
    -------
    list
        The ``(topic id, probability)`` tuples that the model yields for the
        document, collected into one flat list.
    """
    # The vectorizer expects an iterable of documents, so wrap the single text.
    bow_matrix = vect.transform([doc])
    bow_corpus = gensim.matutils.Sparse2Corpus(bow_matrix, documents_columns=False)
    # The model yields one list of tuples per document; flatten them.
    return [pair for doc_topics in ldamodel[bow_corpus] for pair in doc_topics]

# Try the function on a sample message; the cell displays the returned
# list of (topic id, probability) tuples.
topic_distribution(" We understand your concern  Some items may get shipped separately  To clarify  have we missed the delivery  ")

[(34, 0.8899993)]

In [6]:
# Inspect the dominant topic of the example message. The topic id is looked up
# from the model rather than hard-coded: a number copied by hand from an
# earlier output goes stale whenever the model is retrained, because topic ids
# carry no meaning across training runs.
example_message = " We understand your concern  Some items may get shipped separately  To clarify  have we missed the delivery  "
dominant_topic, _ = max(topic_distribution(example_message), key=lambda pair: pair[1])
ldamodel.print_topic(topicno=dominant_topic, topn=7)

'0.061*"ps" + 0.061*"usa" + 0.059*"best" + 0.059*"today" + 0.030*"game" + 0.030*"items" + 0.030*"amazon"'