# Tugas 5 (LSA Topic Modelling)

## Scraping YouTube Comment Data

In [None]:
!pip install emoji indoNLP --quiet

In [None]:
#import library
import os
import re, string
from getpass import getpass

import numpy as np
import pandas as pd
from googleapiclient.discovery import build

from indoNLP.preprocessing import pipeline, replace_word_elongation, replace_slang, remove_html, remove_url
import emoji

from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
stopwords = stopwords.words('indonesian')


# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
def video_comments(video_id, api_key, max_results=100):
    """Collect the top-level comments of a YouTube video together with their replies.

    Pages through the ``commentThreads.list`` endpoint until the response no
    longer carries a ``nextPageToken``.

    Args:
        video_id: ID of the YouTube video whose comment threads are fetched.
        api_key: YouTube Data API v3 developer key.
        max_results: Page size sent as ``maxResults`` (the API accepts 1-100).
            A larger page means fewer requests for the same set of comments.

    Returns:
        A list of ``[publishedAt, authorDisplayName, textDisplay, likeCount]``
        rows. Every top-level comment is immediately followed by the replies
        that were returned inside its thread.
    """
    def to_row(snippet):
        # Same four fields for a top-level comment and for a reply
        return [
            snippet['publishedAt'],
            snippet['authorDisplayName'],
            snippet['textDisplay'],
            snippet['likeCount'],
        ]

    rows = []

    # creating youtube resource object
    youtube = build('youtube', 'v3', developerKey=api_key)

    # Single place where the request is described; only the page token changes between pages
    params = {
        'part': 'snippet,replies',
        'videoId': video_id,
        'maxResults': max_results,
    }

    while True:
        response = youtube.commentThreads().list(**params).execute()

        for item in response.get('items', []):
            rows.append(to_row(item['snippet']['topLevelComment']['snippet']))

            # Do not index item['replies'] directly: the key is only present when the
            # API actually returns replies for the thread, regardless of totalReplyCount.
            # NOTE(review): the thread may embed only a subset of the replies;
            # comments.list would be needed to fetch all of them - confirm if that matters.
            for reply in item.get('replies', {}).get('comments', []):
                rows.append(to_row(reply['snippet']))

        next_token = response.get('nextPageToken')
        if not next_token:
            break
        params['pageToken'] = next_token

    return rows

In [None]:
# YouTube Data API key: taken from the YOUTUBE_API_KEY environment variable, with an
# interactive prompt as fallback, so the credential is never stored in the notebook.
# NOTE: a key was previously hard-coded in this cell; treat it as leaked and revoke it.
api_key = os.environ.get('YOUTUBE_API_KEY') or getpass('YouTube Data API key: ')
video_id = "rbZ3Wv_hLJA"

comments = video_comments(video_id, api_key)

In [None]:
# Tabulate the scraped rows: one row per comment or reply, four fields each
df = pd.DataFrame(
    data=comments,
    columns=['publishedAt', 'user', 'comment', 'likeCount'],
)
df.head()

Unnamed: 0,publishedAt,user,comment,likeCount
0,2023-05-12T19:33:49Z,Cekwanaceh8074gmai Wan,Surfe gak juga benar <br>Rakyat yg menentukan,0
1,2023-05-10T15:01:33Z,Anto Anto,"Lembaga survei GK ada yg independent skr,,semu...",0
2,2023-05-10T05:35:54Z,SITI MARHAMAH,"Ibu Khofifah dan Bpk Prabowo sangattt okeee,ay...",0
3,2023-05-09T22:16:57Z,M. Jen,Semoga Pak Anies selalu sehat wal afiat....,0
4,2023-05-08T14:19:52Z,Redmi 4x,Pokoknya hanguskan quick count dan survey lain...,0


### Preprocessing and Cleaning

In [None]:
# Text Cleaning
def cleaning(text):
    """Normalise one comment to lowercase words separated by single spaces.

    The input is coerced with ``str()`` and lowercased; HTML tags/entities,
    emoji, punctuation, digits and underscores are turned into separators;
    runs of separators collapse to one space and the result is stripped.

    Args:
        text: The comment to clean (any object; ``str(text)`` is used).

    Returns:
        The cleaned string. It is ``''`` when nothing but separators is left,
        so callers can detect empty comments reliably.
    """
    # Case folding
    text = str(text).lower()

    # HTML tags and entities become a space rather than '' so that the words on
    # either side of e.g. "<br>" are not glued together
    text = re.sub(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', ' ', text)

    # Emoji are removed before whitespace is normalised, also leaving a separator behind
    text = emoji.replace_emoji(text, replace=' ')

    # Every run of non-word characters (punctuation, symbols, whitespace),
    # digits and underscores collapses into a single space
    text = re.sub(r'[\W\d_]+', ' ', text)

    # Final trim: without it "halo 123" ends in a space and "123" becomes " "
    # instead of the empty string
    return text.strip()


# indoNLP preprocessing steps, applied to each raw comment in the order listed
pipe = pipeline([
    replace_word_elongation,
    replace_slang,
    remove_html,
    remove_url,
])

In [None]:
# Run every raw comment through the indoNLP pipeline and then through cleaning()
df['comment (clean)'] = df['comment'].apply(lambda raw: cleaning(pipe(raw)))
# Turn comments that ended up empty or whitespace-only into NaN so they can be dropped
df['comment (clean)'] = df['comment (clean)'].replace(r'^\s*$', np.nan, regex=True)


In [None]:
# Number of missing values in each column
print(df.isna().sum(axis=0))

publishedAt        0
user               0
comment            0
likeCount          0
comment (clean)    6
dtype: int64


In [None]:
# Drop the rows that contain NaN without mutating in place, and renumber the
# remaining rows so the index stays contiguous for positional alignment later on
df = df.dropna().reset_index(drop=True)

### Cleaned Data

In [None]:
# Preview the first five rows after cleaning
df.head(n=5)

Unnamed: 0,publishedAt,user,comment,likeCount,comment (clean)
0,2023-05-12T19:33:49Z,Cekwanaceh8074gmai Wan,Surfe gak juga benar <br>Rakyat yg menentukan,0,surfe enggak juga benar rakyat yang menentukan
1,2023-05-10T15:01:33Z,Anto Anto,"Lembaga survei GK ada yg independent skr,,semu...",0,lembaga survei enggak ada yang independent sek...
2,2023-05-10T05:35:54Z,SITI MARHAMAH,"Ibu Khofifah dan Bpk Prabowo sangattt okeee,ay...",0,ibu khofifah dan bpk prabowo sangat oke ayo du...
3,2023-05-09T22:16:57Z,M. Jen,Semoga Pak Anies selalu sehat wal afiat....,0,semoga pak anies selalu sehat wal afiat
4,2023-05-08T14:19:52Z,Redmi 4x,Pokoknya hanguskan quick count dan survey lain...,0,pokoknya hanguskan quick count dan survey lain...


Export

In [None]:
# Write the cleaned comments to CSV; the row index is not written
df.to_csv(path_or_buf='datacomment_cnn_fix.csv', index=False)

## LSA Topic Modelling

### Read Data

In [None]:
# Load the cleaned comments from the CSV file.
# A hosted copy appears to be available at
# https://raw.githubusercontent.com/HamedAyani114/dataset/main/csv/datacomment_cnn_fix.csv
#
# keep_default_na=False: with the default NA parsing, cleaned comments that happen to
# read "nan", "null" or "na" are loaded as float NaN, which TfidfVectorizer rejects.
df = pd.read_csv('datacomment_cnn_fix.csv', keep_default_na=False)
df.head()

Unnamed: 0,publishedAt,user,comment,likeCount,comment (clean)
0,2023-05-12T19:33:49Z,Cekwanaceh8074gmai Wan,Surfe gak juga benar <br>Rakyat yg menentukan,0,surfe enggak juga benar rakyat yang menentukan
1,2023-05-10T15:01:33Z,Anto Anto,"Lembaga survei GK ada yg independent skr,,semu...",0,lembaga survei enggak ada yang independent sek...
2,2023-05-10T05:35:54Z,SITI MARHAMAH,"Ibu Khofifah dan Bpk Prabowo sangattt okeee,ay...",0,ibu khofifah dan bpk prabowo sangat oke ayo du...
3,2023-05-09T22:16:57Z,M. Jen,Semoga Pak Anies selalu sehat wal afiat....,0,semoga pak anies selalu sehat wal afiat
4,2023-05-08T14:19:52Z,Redmi 4x,Pokoknya hanguskan quick count dan survey lain...,0,pokoknya hanguskan quick count dan survey lain...


### Modelling

In [None]:
# Build the document x term matrix (TF-IDF weights)
tokenizer = RegexpTokenizer(r'\w+')
vectorizer = TfidfVectorizer(lowercase=True,
                        stop_words=stopwords,
                        tokenizer = tokenizer.tokenize)

tfidf_matrix = vectorizer.fit_transform(df['comment (clean)'])

# Decompose the matrix with truncated SVD (4 components = 4 topics).
# The default solver is randomized, so the seed is fixed to make the topics
# and document weights reproducible across runs.
svd_model = TruncatedSVD(n_components=4, random_state=42)
lsa_matrix = svd_model.fit_transform(tfidf_matrix)

#### Bobot kata terhadap masing-masing topik

In [None]:
# Weight of each term within every topic: show the three highest-weighted terms per component
terms = vectorizer.get_feature_names_out()

for index, component in enumerate(svd_model.components_):
    # Pair every term with its weight and order the pairs from heaviest to lightest
    ranked = sorted(zip(terms, component), key=lambda pair: pair[1], reverse=True)
    top_terms_key = ranked[:3]
    print(f"Topic {index}: ", top_terms_key)

Topic 0:  [('prabowo', 0.8203382976852225), ('ganjar', 0.2850973180946724), ('mahfud', 0.20752516584365116)]
Topic 1:  [('ganjar', 0.7596031036029476), ('pilih', 0.20994495257834747), ('no', 0.1749220341807083)]
Topic 2:  [('mahfud', 0.6708995403327359), ('md', 0.5385923559756101), ('ganjar', 0.27115911329727865)]
Topic 3:  [('presiden', 0.4967871475051675), ('anies', 0.4161682445661496), ('anis', 0.4027404802045691)]


#### bobot setiap topik terhadap dokumen

In [None]:
# Weight of every topic for each document
# One "Topik i" column per SVD component, derived from the matrix itself
topic_columns = [f"Topik {i}" for i in range(lsa_matrix.shape[1])]
# Reuse df's index so the rows line up with the comments even if df is not 0..n-1
df_lsa = pd.DataFrame(lsa_matrix, columns=topic_columns, index=df.index)
df_lsa = pd.concat([df["comment (clean)"], df_lsa], axis=1)
# Dominant topic of a document = position of its largest topic weight
df_lsa['Topik'] = lsa_matrix.argmax(axis=1)

df_lsa

Unnamed: 0,comment (clean),Topik 0,Topik 1,Topik 2,Topik 3,Topik
0,surfe enggak juga benar rakyat yang menentukan,0.010964,0.013961,0.000303,0.039396,3
1,lembaga survei enggak ada yang independent sek...,0.009863,0.019416,0.004447,0.058923,3
2,ibu khofifah dan bpk prabowo sangat oke ayo du...,0.158999,-0.013885,-0.072652,0.058483,0
3,semoga pak anies selalu sehat wal afiat,0.021590,0.029059,-0.013129,0.127406,3
4,pokoknya hanguskan quick count dan survey lain...,0.087800,-0.018042,-0.022393,0.012986,0
...,...,...,...,...,...,...
3001,yang penting jangan pilih pks demokrat karena ...,0.041839,0.069477,0.013132,0.061926,1
3002,sebagai pencinta timnas saya siap turun kejala...,0.021244,0.045233,0.015232,0.011087,1
3003,pdip pks jangan di pilih mereka mementingkan b...,0.059363,0.103096,0.019127,0.073883,1
3004,kontolodon aktor utama ganjar koster bong wkw...,0.038260,0.104071,0.037042,-0.038650,1


In [None]:
# Number of documents assigned to each dominant topic, most frequent first
df_lsa.loc[:, 'Topik'].value_counts(sort=True, ascending=False)

0    1211
1     862
3     794
2     139
Name: Topik, dtype: int64