# LSA Topic Modelling


In [None]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd

# Raw CSV of the YouTube comments, hosted on GitHub.
DATA_URL = 'https://raw.githubusercontent.com/chendytriwardani/Data/main/Comments_Data_yt.csv'

# Read the whole file into a DataFrame and show it as the cell output.
df = pd.read_csv(DATA_URL)
df

Unnamed: 0,publishedAt,user,comment,likeCount,comment (clean)
0,2023-06-02T09:58:39Z,Kamal Yusuf,Sebaiknya Bacawapres Bapak Ganjar Pranowo bera...,0.0,sebaiknya bacawapres bapak ganjar pranowo bera...
1,2023-06-02T00:56:09Z,Oemar Husain,Knpa gue gk tertarik ma ni orang,0.0,kenapa gue enggak tertarik sama nih orang
2,2023-06-01T15:39:36Z,akhiri yanto,<b>PETUGAS PARTAI</b><br><br><b>Menurut paham ...,0.0,petugas partaimenurut paham negara demokrasi m...
3,2023-06-01T11:01:38Z,Kamal Yusuf,Apabila Bapak Ganjar Pranowo menjadi pemimpin ...,0.0,apabila bapak ganjar pranowo menjadi pemimpin ...
4,2023-06-01T07:55:09Z,M. Maulana Muhson,Aslinya pak ganjar masih menjadi gubernur jate...,0.0,aslinya pak ganjar masih menjadi gubernur jate...
...,...,...,...,...,...
10494,2023-04-23T06:08:09Z,Olive,"Halaaah. Timbang si Yaman, kampanye ke mana² b...",1.0,halaaah timbang sih yaman kampanye ke mana² be...
10495,2023-04-23T06:01:08Z,CAH NDESO NEWS,Gaaasssss 2024,3.0,gaaas
10496,2023-04-23T06:00:59Z,BUZZTRUCK,Masih sepi,1.0,masih sepi
10497,2023-04-23T06:11:39Z,Mulyadi,Soalnya lagi makan ketupat...<br>Katanya malas...,0.0,soalnya lagi makan ketupat katanya malas ah so...


## Modelling

In [None]:
import nltk

# Fetch the NLTK stop-word corpus without progress output; the call's return
# value is shown as the cell output.
nltk.download(info_or_id='stopwords', quiet=True)

True

## Tokenize, TF-IDF, dan dekomposisi SVD

In [None]:
# Replace missing cleaned comments with empty strings.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selected from the frame is a chained in-place update, which pandas
# deprecates and which leaves `df` unchanged when Copy-on-Write is enabled.
df['comment (clean)'] = df['comment (clean)'].fillna('')

In [None]:
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Import the corpus reader under an alias so that binding the word list to
# `stopwords` below does not overwrite the reader itself; without the alias,
# re-running this cell fails because a list has no `.words` attribute.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = nltk_stopwords.words('indonesian')

# Build the document x term TF-IDF matrix.
tokenizer = RegexpTokenizer(r'\w+')
vectorizer = TfidfVectorizer(lowercase=True,
                             stop_words=stopwords,
                             tokenizer=tokenizer.tokenize,
                             # a custom tokenizer is supplied, so the default
                             # token_pattern would be unused
                             token_pattern=None)

# Fill missing comments here as well so this cell does not depend on an
# earlier in-place fill having taken effect; NaN is not a valid document.
tfidf_matrix = vectorizer.fit_transform(df['comment (clean)'].fillna(''))

# Decompose the TF-IDF matrix with truncated SVD.
# random_state is fixed because the default solver is randomized; without it
# the topic weights can differ from run to run.
svd_model = TruncatedSVD(n_components=4, random_state=42)
lsa_matrix = svd_model.fit_transform(tfidf_matrix)

## Memberikan bobot pada kata

In [None]:
# Weight of each word within each topic
terms = vectorizer.get_feature_names_out()

for topic_idx, weights in enumerate(svd_model.components_):
    # Pair every term with its weight, order by weight (largest first),
    # and keep the two strongest terms for this topic.
    ranked = sorted(zip(terms, weights), key=lambda pair: pair[1], reverse=True)
    top_terms_key = ranked[:2]
    print(f"Topic {topic_idx}: ", top_terms_key)

Topic 0:  [('ganjar', 0.7753996873278318), ('presiden', 0.26051512981200636)]
Topic 1:  [('partai', 0.7115739909065594), ('petugas', 0.5832168208268506)]
Topic 2:  [('presiden', 0.5102938674843012), ('prabowo', 0.4719719056311544)]
Topic 3:  [('ri', 0.469172904993494), ('presiden', 0.4515331144360789)]


## Memberikan bobot pada setiap topik

In [None]:
# Weight of every topic for every document.
# Column labels are derived from the width of lsa_matrix so they stay in sync
# with the number of SVD components instead of being hard-coded.
topic_columns = [f"Topik {i}" for i in range(lsa_matrix.shape[1])]
# Reuse df's index so the concat below aligns row-for-row even when df does
# not carry a default RangeIndex.
df_lsa = pd.DataFrame(lsa_matrix, columns=topic_columns, index=df.index)
df_lsa = pd.concat([df["comment (clean)"], df_lsa], axis=1)
# Dominant topic = position of the largest weight in each row, computed in a
# single vectorized call rather than a Python-level row-wise apply.
df_lsa['Topik'] = lsa_matrix.argmax(axis=1)

df_lsa

Unnamed: 0,comment (clean),Topik 0,Topik 1,Topik 2,Topik 3,Topik
0,sebaiknya bacawapres bapak ganjar pranowo bera...,1.559275e-01,-4.724538e-02,-3.568017e-02,1.823356e-02,0
1,kenapa gue enggak tertarik sama nih orang,2.674609e-02,1.197432e-02,2.110648e-02,-2.144787e-02,0
2,petugas partaimenurut paham negara demokrasi m...,1.207237e-01,2.584727e-01,3.032209e-02,3.695698e-05,1
3,apabila bapak ganjar pranowo menjadi pemimpin ...,1.947422e-01,-2.328706e-02,3.644148e-02,3.957651e-02,0
4,aslinya pak ganjar masih menjadi gubernur jate...,8.571420e-02,6.205139e-03,5.003411e-02,2.695248e-02,0
...,...,...,...,...,...,...
10494,halaaah timbang sih yaman kampanye ke mana² be...,7.656587e-03,4.594410e-03,1.199556e-02,-1.336154e-02,2
10495,gaaas,-1.522081e-14,5.596667e-13,1.092808e-11,-1.088433e-11,2
10496,masih sepi,2.310806e-04,1.216991e-04,3.587381e-04,-3.524848e-04,2
10497,soalnya lagi makan ketupat katanya malas ah so...,2.730572e-03,3.563408e-03,3.270354e-03,-3.694595e-03,1


## Menghitung jumlah dokumen pada tiap topik


In [None]:
# Number of documents assigned to each dominant topic, most frequent first.
df_lsa.loc[:, 'Topik'].value_counts()

0    6409
2    2560
1    1394
3     136
Name: Topik, dtype: int64