# Tugas 5 (LSA Topic Modeling)

### Read Data

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
# Load the comment dataset from the CSV file hosted on GitHub.
DATA_URL = 'https://raw.githubusercontent.com/errjak/dataset/main/datacomment1.csv'
df = pd.read_csv(DATA_URL)
df

Unnamed: 0,comment,comment (clean)
0,ID\nSkip navigation\nSign in\nIni Jagoan Pilpr...,id skip navigation sign ini ini jagoan pilpres...
1,Jika masarakat Indonesia ingin ketentraman dan...,jika masarakat indonesia ingin ketentraman dan...
2,Kita doakan agar datang pemimpin nasional yg b...,kita doakan agar datang pemimpin nasional yang...
3,"Semoga kita jgn salah pilih, karna menyangkut ...",semoga kita jangan salah pilih karena menyangk...
4,Ganjar menang di survei Anis menang di hati ma...,ganjar meg di survei anis meg di hati masyarakat
...,...,...
176,Semoga Anis Baswedan tidak salah pilih pasang...,semoga anis baswedan tidak salah pilih pasanga...
177,Semoga kang Emil ( Ridwan Kamil ) bisa maju se...,semoga kang emil ridwan kamil bisa maju sebaga...
178,Yg jelas yg bersih dari kasus korupsi.\ndan ke...,yang jelas yang bersih dari kasus korupsi dan ...
179,Pak Ganjar Pranowo selalu dihati\n Slam dari m...,pak ganjar pranowo selalu dihati salam dari ma...


### Modelling

In [3]:
# Fetch the NLTK 'stopwords' corpus, which the modelling cell reads via
# stopwords.words('indonesian'). quiet=True is passed to keep the download
# from printing its progress log; the call's return value is shown as the
# cell output.
import nltk
nltk.download('stopwords', quiet=True)


True

In [4]:
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Import the corpus reader under an alias so that the name `stopwords` is only
# ever bound to the word list (it used to be rebound from reader to list).
from nltk.corpus import stopwords as stopwords_corpus

# Number of latent topics to extract, and the seed that makes the randomized
# SVD solver return the same decomposition on every run.
N_TOPICS = 4
RANDOM_STATE = 42

# Indonesian stop-word list used to drop uninformative terms.
stopwords = stopwords_corpus.words('indonesian')

# Build the document x term TF-IDF matrix.
# A custom tokenizer is supplied, so token_pattern is explicitly disabled;
# leaving it at its default makes scikit-learn warn that it is unused.
tokenizer = RegexpTokenizer(r'\w+')
vectorizer = TfidfVectorizer(lowercase=True,
                             stop_words=stopwords,
                             tokenizer=tokenizer.tokenize,
                             token_pattern=None)

tfidf_matrix = vectorizer.fit_transform(df['comment (clean)'])

# Decompose the TF-IDF matrix with truncated SVD (LSA).
# lsa_matrix holds one row per document and one column per topic.
svd_model = TruncatedSVD(n_components=N_TOPICS, random_state=RANDOM_STATE)
lsa_matrix = svd_model.fit_transform(tfidf_matrix)

### Bobot Kata

In [5]:
# Weight of each term for every topic: print the two highest-weighted terms
# of each SVD component together with their weights.
terms = vectorizer.get_feature_names_out()

for topic_idx, weights in enumerate(svd_model.components_):
    ranked_terms = sorted(zip(terms, weights),
                          key=lambda pair: pair[1],
                          reverse=True)
    print(f"Topic {topic_idx}: ", ranked_terms[:2])

Topic 0:  [('ganjar', 0.5215732293298396), ('semoga', 0.28427772257252926)]
Topic 1:  [('ganjar', 0.5633875792073725), ('pranowo', 0.3146338252569963)]
Topic 2:  [('bendera', 0.47463952971581697), ('indonesia', 0.31852930135529517)]
Topic 3:  [('rakyat', 0.3906039055312414), ('dedi', 0.2889270042415189)]


### Bobot Setiap Topik

In [6]:
# Weight of every topic for each document.
# Column labels are derived from the width of lsa_matrix so this cell stays
# in sync with however many SVD components were fitted.
topic_columns = [f"Topik {i}" for i in range(lsa_matrix.shape[1])]

# Reuse df's index so the column-wise concat lines rows up with their source
# comments even when df does not carry a default 0..n-1 index.
df_topic_weights = pd.DataFrame(lsa_matrix, columns=topic_columns,
                                index=df.index)
df_lsa = pd.concat([df["comment (clean)"], df_topic_weights], axis=1)

# Dominant topic of a document = position of its largest topic weight.
df_lsa['Topik'] = lsa_matrix.argmax(axis=1)

df_lsa

Unnamed: 0,comment (clean),Topik 0,Topik 1,Topik 2,Topik 3,Topik
0,id skip navigation sign ini ini jagoan pilpres...,0.233830,-0.059861,0.024617,0.040750,0
1,jika masarakat indonesia ingin ketentraman dan...,0.088780,-0.084838,-0.006623,0.079625,0
2,kita doakan agar datang pemimpin nasional yang...,0.128969,-0.059578,-0.064357,0.077075,0
3,semoga kita jangan salah pilih karena menyangk...,0.234240,-0.274441,-0.085227,0.013523,0
4,ganjar meg di survei anis meg di hati masyarakat,0.192664,0.101263,-0.077935,-0.010195,0
...,...,...,...,...,...,...
176,semoga anis baswedan tidak salah pilih pasanga...,0.177322,-0.215866,-0.136662,-0.079601,0
177,semoga kang emil ridwan kamil bisa maju sebaga...,0.301168,-0.113017,0.203455,-0.211137,0
178,yang jelas yang bersih dari kasus korupsi dan ...,0.029285,-0.008196,-0.004308,-0.005884,0
179,pak ganjar pranowo selalu dihati salam dari ma...,0.358387,0.364321,-0.038973,0.051343,1


In [7]:
# Number of documents assigned to each dominant topic.
topic_counts = df_lsa['Topik'].value_counts()
topic_counts

0    123
3     30
1     15
2     13
Name: Topik, dtype: int64