# Tugas 5 (LSA Topic Modeling)

### Read Data

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

# Location of the comment dataset (datacomment.csv) that is read over HTTP.
DATA_URL = 'https://raw.githubusercontent.com/errjak/dataset/main/datacomment.csv'

# Load the CSV into a DataFrame and display it as the cell result.
df = pd.read_csv(DATA_URL)
df

Unnamed: 0,comment,comment (clean)
0,ID\nSkip navigation\nSign in\nIni Jagoan Pilpr...,id skip navigation sign ini ini jagoan pilpres...
1,Jika masarakat Indonesia ingin ketentraman dan...,jika masarakat indonesia ingin ketentraman dan...
2,Kita doakan agar datang pemimpin nasional yg b...,kita doakan agar datang pemimpin nasional yang...
3,"Semoga kita jgn salah pilih, karna menyangkut ...",semoga kita jangan salah pilih karena menyangk...
4,Ganjar menang di survei Anis menang di hati ma...,ganjar meg di survei anis meg di hati masyarakat
...,...,...
615,Kalo pilpres tidak pake sistim degital tetap k...,kalo pilpres tidak pakai sistim degital tetap ...
616,Sapa aja boleh Asal jgn ganjar dan puan,sapa saja boleh asal jangan ganjar dan puan
617,3 Mestro Pemimpin Milineal Bangsa Mengubah wuj...,mestro pemimpin milineal bangsa mengubah wuju...
618,"Kalo begitu, PDIp lh yg mau trs berkuasa",kalo begitu pdip boleh yang mau terus berkuasa


### Modelling

In [3]:
import nltk

# Fetch the NLTK 'stopwords' corpus without progress output; the call's
# return value is shown as the cell result.
stopwords_downloaded = nltk.download('stopwords', quiet=True)
stopwords_downloaded


True

In [4]:
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Import the corpus reader under an alias so that the name `stopwords`
# is only ever bound to the word list and never shadows the reader itself.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = nltk_stopwords.words('indonesian')

# Tunable settings for the decomposition.
N_TOPICS = 4        # number of latent topics (SVD components) to extract
RANDOM_STATE = 42   # TruncatedSVD's default solver is randomized; fix the
                    # seed so topic weights are reproducible between runs

# Build the document x term TF-IDF matrix from the cleaned comments.
# Tokens are runs of word characters; Indonesian stop words are removed.
tokenizer = RegexpTokenizer(r'\w+')
vectorizer = TfidfVectorizer(lowercase=True,
                             stop_words=stopwords,
                             tokenizer=tokenizer.tokenize)

tfidf_matrix = vectorizer.fit_transform(df['comment (clean)'])

# Decompose the TF-IDF matrix with truncated SVD (LSA); `lsa_matrix` holds
# one row per document and one column per topic.
svd_model = TruncatedSVD(n_components=N_TOPICS, random_state=RANDOM_STATE)
lsa_matrix = svd_model.fit_transform(tfidf_matrix)

### Bobot Kata

In [5]:
# Weight of each term within each topic: for every SVD component, print the
# two terms with the largest weights.
terms = vectorizer.get_feature_names_out()

for index, component in enumerate(svd_model.components_):
    # Pair every term with its weight in this component, highest weight first.
    ranked_terms = sorted(zip(terms, component),
                          key=lambda pair: pair[1],
                          reverse=True)
    top_terms_key = ranked_terms[:2]
    print("Topic "+str(index)+": ", top_terms_key)

Topic 0:  [('ganjar', 0.8015227951104641), ('pranowo', 0.36086427029376134)]
Topic 1:  [('anis', 0.4127634533301101), ('semoga', 0.3045565537768373)]
Topic 2:  [('anis', 0.6561329950834731), ('baswedan', 0.1639745610581629)]
Topic 3:  [('ridwan', 0.35821626868527273), ('kamil', 0.33859288840946716)]


### Bobot Setiap Topik

In [6]:
# Weight of each topic for each document.
# Column names are derived from the width of `lsa_matrix`, so this cell keeps
# working if the number of SVD components is changed.
topic_columns = [f"Topik {i}" for i in range(lsa_matrix.shape[1])]

df_lsa = pd.DataFrame(lsa_matrix, columns=topic_columns)
df_lsa = pd.concat([df["comment (clean)"], df_lsa], axis=1)

# Dominant topic per document: position of the largest topic weight in each
# row, computed in one vectorized call instead of a row-wise apply.
df_lsa['Topik'] = df_lsa[topic_columns].to_numpy().argmax(axis=1)

df_lsa

Unnamed: 0,comment (clean),Topik 0,Topik 1,Topik 2,Topik 3,Topik
0,id skip navigation sign ini ini jagoan pilpres...,0.144787,0.124085,-0.033238,-0.007034,0
1,jika masarakat indonesia ingin ketentraman dan...,0.035902,0.088992,-0.062081,-0.049254,1
2,kita doakan agar datang pemimpin nasional yang...,0.061436,0.139722,-0.004767,-0.079815,1
3,semoga kita jangan salah pilih karena menyangk...,0.102842,0.239149,-0.146182,-0.151532,1
4,ganjar meg di survei anis meg di hati masyarakat,0.205908,0.114505,0.150622,-0.039845,0
...,...,...,...,...,...,...
615,kalo pilpres tidak pakai sistim degital tetap ...,0.059383,0.039824,0.020276,0.022528,0
616,sapa saja boleh asal jangan ganjar dan puan,0.225357,-0.075593,0.016417,-0.006042,0
617,mestro pemimpin milineal bangsa mengubah wuju...,0.175793,0.174926,0.170143,0.102222,0
618,kalo begitu pdip boleh yang mau terus berkuasa,0.021243,0.024430,-0.018350,0.014611,1


In [7]:
# Number of documents assigned to each dominant topic.
topic_counts = df_lsa['Topik'].value_counts()
topic_counts

1    310
0    217
3     56
2     37
Name: Topik, dtype: int64