# **1. Import Library**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import re
import string
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from wordcloud import WordCloud
import nltk
from collections import Counter
import csv
import requests
from io import StringIO
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM, SimpleRNN

# **2. Import Dataset hasil scrapping**

In [2]:
df = pd.read_csv('7dsgc_review.csv')

In [3]:
df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,061a9c37-9da0-4afb-923d-b9b4f120a1e7,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,event nya banyak tambah lagi event nya terus,5,0,2.73.0,2025-03-28 12:04:51,,,2.73.0
1,b17544bc-2268-49a5-afa7-747f4484fd59,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,lebih bagus lagi ini game mod miring deh,2,0,2.73.0,2025-03-28 01:34:11,,,2.73.0
2,a6dcf278-4c53-4afd-ba34-ddbf9031d62e,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,bagus banget gamenya,5,0,,2025-03-27 22:11:08,,,
3,6517e761-e80e-4c8f-9e7d-40bf5092d455,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,bagus dan menghibur 3d sangat enak di lihat mata,5,0,2.73.0,2025-03-27 18:30:01,,,2.73.0
4,0943c392-06e9-49f1-a361-aacf643c8880,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Tolong di jelaskan kenapa akun saya di ban den...,1,0,2.73.0,2025-03-27 17:12:54,,,2.73.0


# **3. Text Preprocessing**

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17958 entries, 0 to 17957
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              17958 non-null  object
 1   userName              17958 non-null  object
 2   userImage             17958 non-null  object
 3   content               17958 non-null  object
 4   score                 17958 non-null  int64 
 5   thumbsUpCount         17958 non-null  int64 
 6   reviewCreatedVersion  13806 non-null  object
 7   at                    17958 non-null  object
 8   replyContent          408 non-null    object
 9   repliedAt             408 non-null    object
 10  appVersion            13806 non-null  object
dtypes: int64(2), object(9)
memory usage: 1.5+ MB


In [5]:
df.isnull().sum()

reviewId                    0
userName                    0
userImage                   0
content                     0
score                       0
thumbsUpCount               0
reviewCreatedVersion     4152
at                          0
replyContent            17550
repliedAt               17550
appVersion               4152
dtype: int64

In [6]:
df.duplicated().sum()

0

In [7]:
df.drop(columns=['replyContent', 'repliedAt','reviewCreatedVersion','appVersion'], inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17958 entries, 0 to 17957
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   reviewId       17958 non-null  object
 1   userName       17958 non-null  object
 2   userImage      17958 non-null  object
 3   content        17958 non-null  object
 4   score          17958 non-null  int64 
 5   thumbsUpCount  17958 non-null  int64 
 6   at             17958 non-null  object
dtypes: int64(2), object(5)
memory usage: 982.2+ KB


## Text Cleaning

In [9]:
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # menghapus mention
    text = re.sub(r'#[A-Za-z0-9]+', '', text) # menghapus hashtag
    text = re.sub(r'RT[\s]', '', text) # menghapus RT
    text = re.sub(r"http\S+", '', text) # menghapus link
    text = re.sub(r'[0-9]+', '', text) # menghapus angka
    text = re.sub(r'[^\w\s]', '', text) # menghapus karakter selain huruf dan angka
 
    text = text.replace('\n', ' ') # mengganti baris baru dengan spasi
    text = text.translate(str.maketrans('', '', string.punctuation)) # menghapus semua tanda baca
    text = text.strip(' ') # menghapus karakter spasi dari kiri dan kanan teks
    return text

def casefoldingText(text): # Mengubah semua karakter dalam teks menjadi huruf kecil
    text = text.lower()
    return text

# Terapkan cleaning pada kolom 'content'
df['cleaned_content'] = df['content'].apply(cleaningText)
df['cleaned_casefolding_content'] = df['cleaned_content'].apply(casefoldingText)

In [10]:
slang_dict = {"gk": "tidak","ga": "tidak","nggak": "tidak","ngga": "tidak","bgt": "banget","bener": "benar","tp": "tapi","dgn": "dengan",
    "udh": "sudah", "aja": "saja", "blm": "belum", "sy": "saya", "dg": "dengan", "krn": "karena", "dr": "dari", "sm": "sama", "trs": "terus", 
    "jg": "juga", "ny": "nya", 
}

def fixing_slangwords(text):
    words = text.split()
    fixed_words = []
 
    for word in words:
        if word.lower() in slang_dict:
            fixed_words.append(slang_dict[word.lower()])
        else:
            fixed_words.append(word)
 
    fixed_text = ' '.join(fixed_words)
    return fixed_text

df['cleaned_slang_content'] = df['cleaned_casefolding_content'].apply(fixing_slangwords)

In [11]:
def tokenizingText(text): # Memecah atau membagi string, teks menjadi daftar token
    text = word_tokenize(text)
    return text

df['tokenized_content'] = df['cleaned_slang_content'].apply(tokenizingText)

In [12]:
def filteringText(text): # Menghapus stopwords dalam teks
    listStopwords = set(stopwords.words('indonesian'))
    listStopwords1 = set(stopwords.words('english'))
    listStopwords.update(listStopwords1)
    listStopwords.update(['iya','yaa','gak','nya','na','sih','ku',"di","ga","ya","gaa","loh","kah","woi","woii","woy"])
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text

df['stopword_tokenized_content'] = df['tokenized_content'].apply(filteringText)

In [14]:
def stemmingText(text): # Mengurangi kata ke bentuk dasarnya yang menghilangkan imbuhan awalan dan akhiran atau ke akar kata
    # Membuat objek stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    # Memecah teks menjadi daftar kata
    words = text

    # Menerapkan stemming pada setiap kata dalam daftar
    stemmed_words = [stemmer.stem(word) for word in words]

    # Menggabungkan kata-kata yang telah distem
    stemmed_text = ' '.join(stemmed_words)

    return stemmed_text

df['final_content'] = df['stopword_tokenized_content'].apply(stemmingText)

KeyboardInterrupt: 

In [16]:
df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,at,cleaned_content,cleaned_casefolding_content,cleaned_slang_content,tokenized_content,stopword_tokenized_content,stemmed_tokenized_content,final_content
0,061a9c37-9da0-4afb-923d-b9b4f120a1e7,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,event nya banyak tambah lagi event nya terus,5,0,2025-03-28 12:04:51,event nya banyak tambah lagi event nya terus,event nya banyak tambah lagi event nya terus,event nya banyak tambah lagi event nya terus,"[event, nya, banyak, tambah, lagi, event, nya,...","[event, event]",event event,e v e n t e v e n t
1,b17544bc-2268-49a5-afa7-747f4484fd59,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,lebih bagus lagi ini game mod miring deh,2,0,2025-03-28 01:34:11,lebih bagus lagi ini game mod miring deh,lebih bagus lagi ini game mod miring deh,lebih bagus lagi ini game mod miring deh,"[lebih, bagus, lagi, ini, game, mod, miring, deh]","[bagus, game, mod, miring, deh]",bagus game mod miring deh,b a g u s g a m e m o d m i r i n g d e h
2,a6dcf278-4c53-4afd-ba34-ddbf9031d62e,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,bagus banget gamenya,5,0,2025-03-27 22:11:08,bagus banget gamenya,bagus banget gamenya,bagus banget gamenya,"[bagus, banget, gamenya]","[bagus, banget, gamenya]",bagus banget gamenya,b a g u s b a n g e t g a m e n y a
3,6517e761-e80e-4c8f-9e7d-40bf5092d455,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,bagus dan menghibur 3d sangat enak di lihat mata,5,0,2025-03-27 18:30:01,bagus dan menghibur d sangat enak di lihat mata,bagus dan menghibur d sangat enak di lihat mata,bagus dan menghibur d sangat enak di lihat mata,"[bagus, dan, menghibur, d, sangat, enak, di, l...","[bagus, menghibur, enak, lihat, mata]",bagus hibur enak lihat mata,b a g u s h i b u r e n a k l i h a t ...
4,0943c392-06e9-49f1-a361-aacf643c8880,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Tolong di jelaskan kenapa akun saya di ban den...,1,0,2025-03-27 17:12:54,Tolong di jelaskan kenapa akun saya di ban den...,tolong di jelaskan kenapa akun saya di ban den...,tolong di jelaskan kenapa akun saya di ban den...,"[tolong, di, jelaskan, kenapa, akun, saya, di,...","[tolong, akun, ban, alasan, penyalahgunaan, ke...",tolong akun ban alas penyalahgunaan mana banding,t o l o n g a k u n b a n a l a s p e ...


In [18]:
# Membaca data kamus kata-kata positif dari GitHub
lexicon_positive = dict()
 
response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv')
# Mengirim permintaan HTTP untuk mendapatkan file CSV dari GitHub
 
if response.status_code == 200:
    # Jika permintaan berhasil
    reader = csv.reader(StringIO(response.text), delimiter=',')
    # Membaca teks respons sebagai file CSV menggunakan pembaca CSV dengan pemisah koma
 
    for row in reader:
        # Mengulangi setiap baris dalam file CSV
        lexicon_positive[row[0]] = int(row[1])
        # Menambahkan kata-kata positif dan skornya ke dalam kamus lexicon_positive
else:
    print("Failed to fetch positive lexicon data")
 
# Membaca data kamus kata-kata negatif dari GitHub
lexicon_negative = dict()
 
response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_negative.csv')
# Mengirim permintaan HTTP untuk mendapatkan file CSV dari GitHub
 
if response.status_code == 200:
    # Jika permintaan berhasil
    reader = csv.reader(StringIO(response.text), delimiter=',')
    # Membaca teks respons sebagai file CSV menggunakan pembaca CSV dengan pemisah koma
 
    for row in reader:
        # Mengulangi setiap baris dalam file CSV
        lexicon_negative[row[0]] = int(row[1])
        # Menambahkan kata-kata negatif dan skornya dalam kamus lexicon_negative
else:
    print("Failed to fetch negative lexicon data")

In [19]:
def sentiment_analysis_lexicon_indonesia(text):
    score = 0
    # Inisialisasi skor sentimen ke 0

    for word in text:
        # Mengulangi setiap kata dalam teks

        if word in lexicon_positive:
            score += lexicon_positive[word]
            # Jika kata ada dalam kamus positif, tambahkan skornya ke skor sentimen

    for word in text:
        # Mengulangi setiap kata dalam teks (sekali lagi)

        if word in lexicon_negative:
            score += lexicon_negative[word]
            # Jika kata ada dalam kamus negatif, kurangkan skornya dari skor sentimen

    polarity = ''
    # Inisialisasi variabel polaritas

    # Menambahkan kategori netral
    if score > 0:
        polarity = 'positive'
        # Jika skor sentimen lebih besar dari 0, maka polaritas adalah positif
    elif score < 0:
        polarity = 'negative'
        # Jika skor sentimen kurang dari 0, maka polaritas adalah negatif
    else:
        polarity = 'neutral'
        # Jika skor sentimen sama dengan 0, maka polaritas adalah netral

    return score, polarity
    # Mengembalikan skor sentimen dan polaritas teks

In [20]:
results = df['stemmed_tokenized_content'].apply(sentiment_analysis_lexicon_indonesia)
results = list(zip(*results))
df['polarity_score'] = results[0]
df['polarity'] = results[1]
print(df['polarity'].value_counts())

polarity
neutral    17958
Name: count, dtype: int64


# **Data Splitting**

In [None]:
# type here

# **Model Building**

### CNN Model 

In [None]:
model_cnn = Sequential()
model_cnn.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model_cnn.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model_cnn.add(MaxPooling1D(pool_size=2))
model_cnn.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model_cnn.add(MaxPooling1D(pool_size=2))
model_cnn.add(Flatten())
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dropout(0.2))
model_cnn.add(Dense(3, activation='softmax'))
model_cnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn.build(input_shape=(None, max_len))

### LSTM Model 

In [None]:
model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model_lstm.add(LSTM(128, return_sequences=True))
model_lstm.add(LSTM(64))
model_lstm.add(Dense(64, activation='relu'))
model_lstm.add(Dropout(0.2))
model_lstm.add(Dense(3, activation='softmax'))
model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.build(input_shape=(None, max_len))

### RNN Model 

In [None]:
model_rnn = Sequential()
model_rnn.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model_rnn.add(SimpleRNN(128, return_sequences=True))
model_rnn.add(SimpleRNN(64))
model_rnn.add(Dense(64, activation='relu'))
model_rnn.add(Dropout(0.2))
model_rnn.add(Dense(3, activation='softmax'))
model_rnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_rnn.build(input_shape=(None, max_len))

# Model Training

# Model Evaluation