# LSA Topic Modelling


In [None]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd

# Remote CSV of scraped YouTube comments; the displayed frame shows a raw
# `komentar` column and a cleaned `hasil komentar` column.
DATA_URL = 'https://raw.githubusercontent.com/chendytriwardani/Data/main/scrapping_youtube.csv'

df = pd.read_csv(DATA_URL)
df

Unnamed: 0,komentar,hasil komentar
0,ID\nSkip navigation\nSign in\nLIVE: Eksklusif ...,ini live eksklusif ganjar pranowo bicara c...
1,"Semoga dapat saling bersinergi,dan menerima ma...",semoga dapat saling bersinergi dan menerima ma...
2,Kami dari NTT siap mendukung Bapak Ganjar Pran...,kami dari ti siap mendukung bapak ganjar prano...
3,Mantappp dan berlanjut pak ganjar...walau kami...,mantap dan berlanjut pak ganjar walau kami sek...
4,Saya bangga jadi warga Semarang dan rumah saya...,saya bangga jadi warga semarang dan rumah saya...
...,...,...
1134,"Mantap gw jd lagi dah milih lu pak, kemaren pi...",mantap gue jadi lagi deh memilih lu pak kemari...
1135,Luar biasa,luar biasa
1136,"Saya sih pilih bak nana aja, bravo bak nana.. ...",saya sih pilih bak a saja bravo bak a najwasih...
1137,"Sejak FPI dibubarkan, Indonesia lebih kondusif...",sejak fpi dibubarkan indonesia lebih kondusif ...


## Modelling

In [None]:
import nltk

# Fetch the NLTK stop-word corpus (no progress output); the call's return
# value is the cell's displayed result.
STOPWORD_CORPUS = 'stopwords'
nltk.download(STOPWORD_CORPUS, quiet=True)

True

## Tokenize, TF-IDF, and SVD

In [None]:
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Import the corpus reader under an alias so that the `stopwords` list below
# does not shadow the object it is loaded from.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = nltk_stopwords.words('indonesian')

N_TOPICS = 4  # number of latent topics (SVD components) to extract
SEED = 42     # TruncatedSVD's default solver is randomized; fix the seed so
              # the topic weights are reproducible across kernel restarts

# Build the document x term TF-IDF matrix
tokenizer = RegexpTokenizer(r'\w+')
vectorizer = TfidfVectorizer(lowercase=True,
                             stop_words=stopwords,
                             tokenizer=tokenizer.tokenize)

# An empty cleaned comment is read back from CSV as NaN, which the vectorizer
# rejects; treat such rows as empty documents instead of failing.
documents = df['hasil komentar'].fillna('')
tfidf_matrix = vectorizer.fit_transform(documents)

# Decompose the TF-IDF matrix with truncated SVD (LSA)
svd_model = TruncatedSVD(n_components=N_TOPICS, random_state=SEED)
lsa_matrix = svd_model.fit_transform(tfidf_matrix)

## Memberikan bobot pada kata

In [None]:
# Weight of each term within each topic: print the two highest-weighted
# terms per SVD component.
terms = vectorizer.get_feature_names_out()

for topic_no, weights in enumerate(svd_model.components_):
    ranked = sorted(zip(terms, weights),
                    key=lambda pair: pair[1],
                    reverse=True)
    strongest = ranked[:2]
    print("Topic "+str(topic_no)+": ", strongest)

Topic 0:  [('ganjar', 0.5830505845596635), ('indonesia', 0.3328192894160908)]
Topic 1:  [('indonesia', 0.6245962234648971), ('bendera', 0.5273475755789233)]
Topic 2:  [('partai', 0.6460209720112603), ('petugas', 0.4134764274552875)]
Topic 3:  [('semoga', 0.4009609793949179), ('presen', 0.32628264427837234)]


## Memberikan bobot pada setiap topik

In [None]:
# Weight of every topic for every document.
# Column names are derived from the width of `lsa_matrix`, so this cell keeps
# working when the number of SVD components changes.
topic_columns = [f"Topik {i}" for i in range(lsa_matrix.shape[1])]

# Reuse df's index so the concat below lines rows up with their comments even
# if `df` does not carry a default 0..n-1 index.
topic_weights = pd.DataFrame(lsa_matrix, columns=topic_columns, index=df.index)
df_lsa = pd.concat([df["hasil komentar"], topic_weights], axis=1)

# Dominant topic = position of the largest weight in each row.
# NOTE: a row whose weights are all zero still gets topic 0, because argmax
# returns the first maximum.
df_lsa['Topik'] = topic_weights.to_numpy().argmax(axis=1)

df_lsa

Unnamed: 0,hasil komentar,Topik 0,Topik 1,Topik 2,Topik 3,Topik
0,ini live eksklusif ganjar pranowo bicara c...,0.193849,-0.025296,0.053368,-0.017935,0
1,semoga dapat saling bersinergi dan menerima ma...,0.202434,0.084346,0.056132,0.117129,0
2,kami dari ti siap mendukung bapak ganjar prano...,0.160734,-0.032889,-0.034213,0.037801,0
3,mantap dan berlanjut pak ganjar walau kami sek...,0.182660,0.064723,-0.034402,-0.010032,0
4,saya bangga jadi warga semarang dan rumah saya...,0.106619,-0.033735,0.019840,-0.020896,0
...,...,...,...,...,...,...
1134,mantap gue jadi lagi deh memilih lu pak kemari...,0.064722,-0.052072,0.030412,-0.025618,0
1135,luar biasa,0.000000,0.000000,0.000000,0.000000,0
1136,saya sih pilih bak a saja bravo bak a najwasih...,0.062390,-0.081754,-0.014059,-0.128078,0
1137,sejak fpi dibubarkan indonesia lebih kondusif ...,0.186008,0.094760,0.030697,0.086250,0


## Menghitung jumlah dokumen tiap topik


In [None]:
# Number of documents assigned to each dominant topic, most frequent first
topic_labels = df_lsa['Topik']
topic_labels.value_counts()

0    896
2    150
1     49
3     44
Name: Topik, dtype: int64