In [None]:
!pip install PySastrawi

Collecting PySastrawi
  Downloading PySastrawi-1.2.0-py2.py3-none-any.whl (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.6/210.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PySastrawi
Successfully installed PySastrawi-1.2.0


In [None]:

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemover, ArrayDictionary
import string
import regex as re
from sklearn.utils import resample

In [None]:
# Load the scraped reviews, drop the identifier columns, then remove exact
# duplicate rows and rows with missing values.
reviews = (
    pd.read_csv("tokocrypto_reviews.csv")
    .drop(columns=["reviewId", "userName"])
    .drop_duplicates()
    .dropna()
)

# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
reviews.info()
print(reviews.shape)
print(reviews.head(10))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4999 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  4999 non-null   object
 1   score    4999 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 117.2+ KB
None
(4999, 2)
                                             content  score
0  Kecewa karena terlalu banyak potongan pajak: 1...      1
1  Scam? GA waktu login ulang berhasil, tapi pada...      1
2  Ribet. Withdraw dalam waktu 60 detik harus mem...      1
3  Exchanger konyol, udah reset GA, giliran mau d...      1
4  Pada saat deposit, akun saya sedang login. Nam...      1
5  Kenapa perdagangan crypto yang tersedia tidak ...      1
6  Tiba2 ga bisa login, muter2, sms , GA, email s...      1
7  Sangat Sulit Menarik Dana Dari Aplikasi Ini.. ...      1
8  Jangan di download jelek bikin rugi,beli xrp h...      1
9  Tolong dev tokocrypto, GA SALAH MULU, APAKAH T...      1


In [None]:
def returnSentiment(score):
    """Map a numeric review score to a sentiment label.

    Returns "positive" for a score of 4 or more and "negative" for a score of
    3 or less. A value for which neither comparison holds (for example NaN or
    3.5) yields None.
    """
    if score >= 4:
        return "positive"
    return "negative" if score <= 3 else None

In [None]:
# Label every review by passing its score through returnSentiment.
reviews["sentiment"] = reviews["score"].map(returnSentiment)

In [None]:
# Display the frame to check the newly added "sentiment" column.
reviews

Unnamed: 0,content,score,sentiment
0,Kecewa karena terlalu banyak potongan pajak: 1...,1,negative
1,"Scam? GA waktu login ulang berhasil, tapi pada...",1,negative
2,Ribet. Withdraw dalam waktu 60 detik harus mem...,1,negative
3,"Exchanger konyol, udah reset GA, giliran mau d...",1,negative
4,"Pada saat deposit, akun saya sedang login. Nam...",1,negative
...,...,...,...
4995,Aplikasi yang sangat bagus.Keamanan transaksi ...,5,positive
4996,"sy lebih suka trading disini, fitur2nya lebih ...",5,positive
4997,Aplikasi nya mudah banget dipelajari buat pemu...,5,positive
4998,Baru pertama main trading. Kirain susah ternya...,5,positive


In [None]:
def remove_emojis(text):
    """Strip every non-ASCII character (emojis included) from ``text``.

    The text is encoded to ASCII with unencodable characters ignored, then
    decoded back so the caller receives a plain ``str``. Wrapping the encoded
    bytes in ``str()`` instead would return their repr (``"b'...'"``), leaking
    a ``b`` prefix, quotes and backslash escapes into the text.
    """
    return text.encode('ascii', 'ignore').decode('ascii')


# Compiled once: matches any single ASCII punctuation character.
punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))

def remove_punctuation(text):
    """Replace each punctuation character in ``text`` with a space."""
    return punctuation_pattern.sub(' ', text)


# Stop-word removal (Sastrawi's default stop-word remover)
stopword_factory = StopWordRemoverFactory()
stopword = stopword_factory.create_stop_word_remover()

def remove_stopwords_sastrawi(text):
    """Return ``text`` with stop words removed by the Sastrawi remover."""
    return stopword.remove(text)


# Stemming (Sastrawi stemmer)
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

def stem_words(text):
    """Return ``text`` after running it through the Sastrawi stemmer."""
    return stemmer.stem(text)


# Normalization of non-standard (slang) words: the CSV has no header row, its
# first column holds the slang form and its second column the replacement.
slang_dict = pd.read_csv('new_kamusalay.csv', encoding='latin-1', header=None)
slang_dict = slang_dict.rename(columns={0: 'original',
                                        1: 'replacement'})
slang_dict_map = dict(zip(slang_dict['original'], slang_dict['replacement']))

def normalize_slang(text):
    """Replace each space-separated word found in ``slang_dict_map``.

    Words that are not in the mapping are kept unchanged.
    """
    return ' '.join(slang_dict_map.get(word, word) for word in text.split(' '))


def preprocess(text):
    """Clean one raw review and return the normalized text.

    Steps, in order: lower-casing, non-ASCII removal, digit removal,
    newline/whitespace flattening, punctuation removal, slang normalization,
    stemming, stop-word removal and a final strip.
    """
    lowered = text.lower()                          # case folding
    ascii_only = remove_emojis(lowered)
    no_digits = re.sub(r"\d+", "", ascii_only)      # remove numbers
    # Turn real newlines/tabs into single spaces so that slang lookup sees
    # separate words; the literal backslash-n replacement is kept in case the
    # raw text itself carries escaped newlines.
    flattened = re.sub(r"\s+", " ", no_digits).replace('\\n', ' ')
    no_punctuation = remove_punctuation(flattened)
    normalized = normalize_slang(no_punctuation)
    stemmed = stem_words(normalized)
    without_stopwords = remove_stopwords_sastrawi(stemmed)
    return without_stopwords.strip()                # remove surrounding whitespace


reviews["preprocessed"] = reviews['content'].apply(preprocess)

In [None]:
# Display the frame to compare the raw "content" with the "preprocessed" text.
reviews

Unnamed: 0,content,score,sentiment,preprocessed
0,Kecewa karena terlalu banyak potongan pajak: 1...,1,negative,kecewa potong pajak waktu beli coin waktu jual...
1,"Scam? GA waktu login ulang berhasil, tapi pada...",1,negative,scam waktu login ulang hasil wd dadak gagal so...
2,Ribet. Withdraw dalam waktu 60 detik harus mem...,1,negative,ribet withdraw waktu detik kode verifikasi ema...
3,"Exchanger konyol, udah reset GA, giliran mau d...",1,negative,exchanger konyol reset gilir re connect wd waj...
4,"Pada saat deposit, akun saya sedang login. Nam...",1,negative,deposit akun login deposit akun log out kecewa...
...,...,...,...,...
4995,Aplikasi yang sangat bagus.Keamanan transaksi ...,5,positive,aplikasi bagus aman transaksi jamin proses tar...
4996,"sy lebih suka trading disini, fitur2nya lebih ...",5,positive,suka trading fiturnya gampang gera candlenya w...
4997,Aplikasi nya mudah banget dipelajari buat pemu...,5,positive,aplikasi mudah banget ajar oi simple trading f...
4998,Baru pertama main trading. Kirain susah ternya...,5,positive,main trading susah frendly banget security dat...


In [None]:
# Class balance check: number of reviews per sentiment label.
reviews["sentiment"].value_counts()

negative    3000
positive    1999
Name: sentiment, dtype: int64

In [None]:
# Split the data into two DataFrames by sentiment label
positive_reviews = reviews[reviews['sentiment'] == 'positive']
negative_reviews = reviews[reviews['sentiment'] == 'negative']

# Undersample the negative reviews down to the number of positive reviews.
# The target size is taken from the data instead of being hard-coded, so the
# cell stays correct if the input file changes. Assumes negative is the larger
# class: without replacement, resample cannot draw more rows than exist.
undersampled_negative = resample(
    negative_reviews,
    replace=False,
    n_samples=len(positive_reviews),
    random_state=42,
)

# Combine the positive reviews with the undersampled negative reviews
undersampled_reviews = pd.concat([positive_reviews, undersampled_negative])

# Show the class counts after undersampling
print(undersampled_reviews['sentiment'].value_counts())

positive    1999
negative    1999
Name: sentiment, dtype: int64


In [None]:
# Display the balanced frame (positive rows followed by the sampled negative rows).
undersampled_reviews

Unnamed: 0,content,score,sentiment,preprocessed
3000,"Tolong dong min tambahkan pilihan menu SL/TP, ...",4,positive,tolong min pilih menu sl trader min
3001,kalau bisa coin yang ada harus fash update bia...,4,positive,coin fash biar tinggal exchange masaah
3002,Sejauh ini fiturnya oke dan fee nya juga kompe...,4,positive,fiturnya oke fee kompetitif tolong unggah doku...
3003,"Tolong chart dibuat seperti binance, ada hitun...",4,positive,tolong chart binance hitung mundur waktu
3004,"Alamat wallet saya kok failed ini min,, Coba s...",4,positive,alamat wallet failed min coba setor aplikasi b...
...,...,...,...,...
1854,Saya sudah 3 kali konfirmasi kode verifikasi e...,2,negative,konfirmasi kode verifikasi email copy paste an...
1024,"Weak: Lag parah, dan harus sering close dulu b...",2,negative,weak lag parah close harga koin cocok fast tra...
824,Ribet mo wd aj susah amat.. alamat salah alama...,1,negative,ribet wd susah alamat salah alamat salah puyeng
982,Hallo untuk tokocrypto yg terhormat. Kenapa ve...,1,negative,tokocrypto hormat verifikasi akun bantu kembang
