In [1]:
import pandas as pd
import numpy as np


In [15]:
# Load only the tweet id and the pre-cleaned tweet text from the preprocessing output.
df = pd.read_csv('data/text-preprocessing.csv', usecols=["tweet_id", "content_clean"])
# Rename by column name instead of assigning `df.columns` positionally:
# read_csv keeps the file's column order (the order of `usecols` is ignored),
# so a positional relabel would silently swap the labels if the CSV layout changed.
df = df.rename(columns={"tweet_id": "label"})
# Drop rows with a missing value in either column (e.g. tweets with no cleaned text).
df = df.dropna()
df

Unnamed: 0,label,content_clean
0,153619597713608704,lomba poster ilmiah energi baruterbarukan dead...
1,153857491925610496,elaahhh ini si bapak tiba ngomong soal energi ...
2,154361424154603520,masih banyak daerah terpencil membutuhkam list...
3,156397219346518017,gw kira kalo gak tertarik dgn materi energi ba...
4,156936339718279168,satu lagi yang aneh inget jatropa alias minyak...
...,...,...
109700,1542534421219360772,bkpm ungkap pengembangan dan investasi ebt di ...
109701,1542540419677859842,chemation heat exchangerada beberapa macam ene...
109702,1542557674704740353,kpn yo ebt dadi sumber energi listrik utama
109703,1542592685978320896,seminar nasional himatikro aktualisasi peran e...


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# the vectorizer object will be used to transform text to vector form
# max_df=0.9 drops terms present in more than 90% of documents; min_df=25 drops
# terms present in fewer than 25 documents.
# The pattern is a raw string so that \w, \$, \d and \S reach the regex engine
# untouched instead of being parsed as (invalid) string escape sequences.
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern=r'\w+|\$[\d\.]+|\S+')

# apply transformation
# Keep the document-term matrix sparse: LatentDirichletAllocation accepts sparse
# input directly, and a dense (n_documents x n_terms) array is far larger in memory.
tf = vectorizer.fit_transform(df['content_clean'])

# tf_feature_names tells us what word each column in the matrix represents
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and
# fall back only on older releases that predate get_feature_names_out().
if hasattr(vectorizer, "get_feature_names_out"):
    tf_feature_names = list(vectorizer.get_feature_names_out())
else:
    tf_feature_names = vectorizer.get_feature_names()



In [17]:
from sklearn.decomposition import LatentDirichletAllocation

# How many latent topics the LDA model is asked to extract.
number_of_topics = 10

# Unfitted LDA estimator; random_state is pinned so repeated runs give the same topics.
model = LatentDirichletAllocation(
    random_state=0,
    n_components=number_of_topics,
)

In [18]:
# Fit the LDA topic model on the document-term count matrix `tf`.
model.fit(tf)

In [19]:
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_``, an iterable of 1-D numeric
        arrays (one per topic) that support ``argsort()`` and integer indexing.
    feature_names : sequence of str
        Vocabulary; ``feature_names[i]`` is the word for column ``i`` of a topic.
    no_top_words : int
        Number of top words to list per topic.

    Returns
    -------
    pandas.DataFrame
        Two columns per topic, ``"Topic <k> words"`` and ``"Topic <k> weights"``,
        with rows ordered from highest to lowest weight. Weights are strings
        formatted to one decimal place.
    """
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        # Rank the vocabulary once per topic: argsort is ascending, so the reversed
        # slice yields the indices of the `no_top_words` largest weights, biggest first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        topic_dict["Topic %d words" % topic_idx] = [
            str(feature_names[i]) for i in top_indices
        ]
        topic_dict["Topic %d weights" % topic_idx] = [
            '{:.1f}'.format(topic[i]) for i in top_indices
        ]
    return pd.DataFrame(topic_dict)

In [22]:
# Number of top-ranked words to list for each topic.
no_top_words = 10
# Last expression of the cell, so the returned DataFrame is rendered as the cell output.
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,ebt,11948.1,ebt,7942.9,ebt,3000.9,energy,18222.3,ebt,2662.4,energi,26141.3,energi,18908.4,ebt,9445.8,pulau,5955.1,ebt,13339.8
1,listrik,9720.7,energi,3635.2,ke,1644.6,renewable,16673.6,di,1474.8,terbarukan,20277.0,terbarukan,10678.2,yg,5656.8,energi,3752.4,energi,3437.6
2,pembangkit,8269.7,baru,2859.7,proyek,1421.0,in,2583.1,ehal,1233.1,baru,20210.3,baru,10319.0,di,4834.4,baru,3724.3,di,3394.8
3,pln,6767.0,pln,2656.2,energi,1341.3,rt,2229.7,indonesia,1018.3,dan,7805.0,dan,9105.3,yang,4039.2,terbarukan,3654.8,pengembangan,3019.3
4,di,5561.9,terbarukan,2349.9,rp,1338.0,to,1396.0,plts,965.9,esdm,3302.9,untuk,5883.9,dan,3716.7,sumba,3518.1,esdm,2210.2
5,energi,3789.7,dan,2249.4,nasional,913.7,solar,1190.7,amp,920.9,indonesia,3191.1,yang,5329.3,ini,3714.4,oleh,3242.6,terbarukan,2195.4
6,bauran,3201.7,ruu,2055.7,presiden,834.1,of,1090.9,ya,841.0,pemerintah,2391.8,indonesia,4072.1,ada,3491.5,sebagai,3033.5,menteri,2117.7
7,target,3191.7,plnid,1996.0,akan,781.5,and,1085.5,dengan,836.0,di,2373.6,di,3937.3,bisa,3065.4,ntt,2925.1,baru,2082.2
8,dari,3136.3,pemerintah,1273.5,dan,765.5,for,988.8,jakarta,533.2,kementerian,2141.0,dalam,3921.8,kita,2902.0,sebuah,2815.1,pemerintah,2029.6
9,pada,2838.9,usaha,1016.1,pertamina,757.3,gw,916.5,power,482.9,potensi,1990.6,ini,2910.1,itu,2738.5,pendanaan,2789.1,dorong,1964.5
