In [21]:
import pandas as pd
from tqdm.notebook import tqdm
import re
import string
from indoNLP.preprocessing import *
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [9]:
# Load the raw review dataset; later cells read its 'Rating' and 'Review' columns.
df = pd.read_csv('../dataset/KaggleReview.csv')

In [10]:
def ratingToSentiment(row):
  """Map a review row's rating to a sentiment class label.

  Args:
    row: Mapping-like object (e.g. a DataFrame row) with a 'Rating' entry.

  Returns:
    0 when the rating is at most 2, 1 when it equals 3, and 2 for any
    other value.
  """
  rating = row['Rating']
  if rating <= 2:
    return 0
  if rating == 3:
    return 1
  return 2

# Derive the rating-based label (0, 1 or 2) for every row; axis=1 passes each row to the function.
df['Sentiment'] = df.apply(ratingToSentiment, axis=1)

In [86]:

def get_stopwords(file_path):
  """Read a stopword list from a text file with one word per line.

  Args:
    file_path: Path of the stopword file.

  Returns:
    A list with one entry per line of the file, in file order, each
    stripped of surrounding whitespace.

  Raises:
    OSError: If the file cannot be opened or read.
  """
  # The context manager closes the file even when reading fails midway.
  with open(file_path, 'r') as file_stopwords:
    return [row.strip() for row in file_stopwords]

def handle_stopwords(review, stopwords):
  """Drop stopwords and non-alphanumeric tokens from a review.

  A token is kept only when it is not a stopword and consists of a letter
  followed by letters or digits.

  Args:
    review: Review text.
    stopwords: Iterable of words to remove.

  Returns:
    A tuple ``(feature_vector, for_stemming)``: the list of kept tokens and
    the same tokens joined with single spaces.
  """
  # Set membership is O(1) per word instead of scanning the list each time.
  stopword_set = set(stopwords)
  # Keep only tokens that start with a letter and contain letters/digits.
  token_pattern = re.compile(r"^[a-zA-Z][a-zA-Z0-9]*$")
  # split() without an argument separates on any whitespace, so words divided
  # by tabs or newlines are not glued together and then rejected by the pattern.
  feature_vector = [
    word for word in review.split()
    if word not in stopword_set and token_pattern.search(word)
  ]
  for_stemming = ' '.join(feature_vector)
  return feature_vector, for_stemming

def handle_negative(review):
  """Mark words that directly follow the negation word 'enggak'.

  Every word whose immediate predecessor is 'enggak' gets the prefix
  'tidak_'; all other words are kept unchanged.

  Args:
    review: Review text with words separated by single spaces.

  Returns:
    The review with negated words prefixed, joined with single spaces.
  """
  negative_review = []
  # Track the predecessor explicitly: indexing with i-1 would wrap around to
  # the last word for the first position and wrongly negate the first word
  # whenever the review ends with 'enggak'.
  previous = None
  for word in review.split(' '):
    if previous == 'enggak':
      negative_review.append('tidak_' + word)
    else:
      negative_review.append(word)
    previous = word
  return ' '.join(negative_review)

In [87]:
# Text cleaning pipeline: markup/URL removal, punctuation stripping,
# normalisation, stopword filtering, negation marking and stemming.

_STOPWORDS_PATH = '../dataset/stopwords-indo.txt'
# Filled on first use so the stopword file is read and the stemmer is built
# once, instead of once per review.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
  """Return the shared (stopwords, stemmer) pair, creating it on first use."""
  if not _PREPROCESS_RESOURCES:
    _PREPROCESS_RESOURCES['stopwords'] = get_stopwords(_STOPWORDS_PATH)
    _PREPROCESS_RESOURCES['stemmer'] = StemmerFactory().create_stemmer()
  return _PREPROCESS_RESOURCES['stopwords'], _PREPROCESS_RESOURCES['stemmer']


def preprocess(text):
  """Clean a raw review string and return its stemmed form.

  Args:
    text: Raw review text.

  Returns:
    The cleaned, stemmed review as a single space-separated string.
  """
  # Case folding to lowercase
  text = text.lower()

  # Remove HTML tags and URLs while their punctuation ('<', '>', '://', '.')
  # is still present; after punctuation stripping they can no longer be found.
  text = remove_html(text)
  text = remove_url(text)

  # Remove punctuation
  text = text.translate(str.maketrans('', '', string.punctuation))

  text = replace_word_elongation(text)  # collapse elongated words
  text = replace_slang(text)  # replace slang words

  stopwords, stemmer = _get_preprocess_resources()
  _, text = handle_stopwords(text, stopwords)
  text = handle_negative(text)

  # Remove numbers
  text = text.translate(str.maketrans('', '', string.digits))
  # Collapse repeated whitespace and trim both ends
  text = ' '.join(text.split())

  # Stemming
  return stemmer.stem(text)

In [88]:
# Run the cleaning pipeline over every review, with a progress bar, and store the result.
df["Cleaned"] = [preprocess(x) for x in tqdm(df['Review'].values)]

  0%|          | 0/8646 [00:00<?, ?it/s]

In [77]:
# Load the SenticNet lexicon spreadsheet and preview its first rows.
senticnet = pd.read_excel('senticnet.xlsx')
senticnet.head()

Unnamed: 0,CONCEPT,INTROSPECTION,TEMPER,ATTITUDE,SENSITIVITY,PRIMARY EMOTION,SECONDAY EMOTION,POLARITY VALUE,POLARITY INTENSITY,SEMANTICS,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
0,aah,-0.514,0.0,0.0,0.0,#sadness,,negative,-0.514,berlari,absquatulate,li_na_lamont,intisari,deru
1,abadi,0.0,0.0,0.918,0.0,#delight,,positive,0.918,elang,permukaan_trotoar,pahlawan_tanpa_tanda_jasa,elite,tunica_albuginea
2,abdominoplasti,0.0,0.0,0.232,0.0,#acceptance,,positive,0.232,lelucon,mengemudi_dengan_aman,off_color,buruh_tani,menimbulkan_korosi_tungau
3,aberrate,0.0,0.0,-0.564,0.0,#disgust,,negative,-0.564,gelembung_warp,warp_repulse,warp_tug,tekuk,membungkuk_slub
4,abey,0.0,0.0,0.0,-0.329,#anxiety,,negative,-0.329,hemat,penghargaan_pascabayar,setelan_bagus,tradisi_rakyat,mollify_tantrum


In [76]:
# Build a lookup from each concept to its polarity intensity value.
senticnet_dict = dict(zip(senticnet['CONCEPT'], senticnet['POLARITY INTENSITY']))
# NOTE(review): the bare expression below displays the whole dict, which yields a
# very large output; consider previewing only a few entries.
senticnet_dict

{'aah': -0.514,
 'abadi': 0.918,
 'abdominoplasti': 0.232,
 'aberrate': -0.564,
 'abey': -0.329,
 'abeyance_berair': 0.205,
 'ablare': -0.976,
 'ablasi_eyestalk': -0.832,
 'ablasi_kateter': -0.921,
 'ablast': -0.895,
 'abnormal': -0.299,
 'abomasum': -0.27,
 'abortifasient': 0.554,
 'abortisida': -0.883,
 'aboulia': -0.978,
 'aboulik': 0.885,
 'abrasi': -0.194,
 'abreaksi': 0.528,
 'absen': -0.978,
 'absentmind': -0.846,
 'absis': -0.844,
 'absolutepunk': -0.652,
 'absolutif': -0.877,
 'absolutist': -0.86,
 'absolvitasi': 0.93,
 'absquatulate': -0.872,
 'abstain_dari': -0.489,
 'abstensi': 0.505,
 'abstraktif': -0.811,
 'abu_abu': -0.306,
 'abu_abu_belang': -0.861,
 'abu_abu_silver': 0.569,
 'abu_abu_suram': -0.488,
 'abulia': -0.891,
 'ac_id': -0.838,
 'acak': -0.863,
 'acalculia': -0.644,
 'acanthamoeba': -0.183,
 'acanthocephalan': -0.838,
 'acara_sosial': 0.562,
 'acara_sosial_meromorfik': -0.825,
 'acaricide': -0.483,
 'acausal': 0.792,
 'accelerator_proton': -0.865,
 'ace_in_hole

In [70]:
def calculate_sentiment_score(review, lexicon=None, verbose=True):
  """Score a cleaned review with a word-polarity lexicon.

  Each word found in the lexicon contributes its polarity value; the value
  is sign-flipped when the word 'tidak' appears among the up to three words
  immediately before it.

  Args:
    review: Cleaned review text; it is split on whitespace into words.
    lexicon: Mapping from word to polarity value. Defaults to the
      module-level ``senticnet_dict``.
    verbose: When True (the default), print every matched word with the
      value it contributed. Pass False when scoring many reviews.

  Returns:
    A tuple ``(score, sentiment)`` where ``score`` is the summed polarity
    and ``sentiment`` is 2 (positive) for a score above zero, 1 (neutral)
    for exactly zero and 0 (negative) otherwise.
  """
  if lexicon is None:
    # Resolved at call time so the function can be defined before the dict.
    lexicon = senticnet_dict
  score = 0
  words = review.split()  # split the review into words
  for index, word in enumerate(words):
    if word not in lexicon:  # only words known to the lexicon contribute
      continue
    cur_score = lexicon[word]
    # Negation window: the (up to) three words preceding the current one.
    if 'tidak' in words[max(index - 3, 0):index]:
      cur_score *= -1
    if verbose:
      print(word, cur_score)
    score += cur_score
  if score > 0:
    sentiment = 2  # positive
  elif score == 0:
    sentiment = 1  # neutral
  else:
    sentiment = 0  # negative
  return score, sentiment

In [80]:
# Spot-check one review: show the raw text, its cleaned form and its lexicon score.
# Read the cleaned text from `df` (which already holds the 'Cleaned' column):
# `lexicon_df` is only created in a later cell, so using it here fails with a
# NameError when the notebook is run top to bottom on a fresh kernel.
ori = df['Review'].values[2]
test = df['Cleaned'].values[2]
print(ori)
print(test)
calculate_sentiment_score(test)

[SOLD]baru pertama kali nyoba dan beli karna liat yt di famaledaily trs aku pake pertama kali dimalam hari besok pagi nya bruntusan☹️ trs aku pake lagi dipagi dan malam eh lusa nya makin memerah😭 jadi aku mutusin untuk berhenti karna spt nya gacocok di aku dan aku mau ... Read more
soldbaru kali coba beli lihat yt famaledaily pakai kali malam besok pagi nya pakai pagi malam eh lusa nya putus henti nya gacocok read more
lihat 0.287
pakai 0.923
malam 0.808
pakai 0.923
malam 0.808
putus -0.873


(2.8759999999999994, 2)

In [67]:
# Score every cleaned review; each call returns (score, sentiment), stored in the two new columns.
df[["Score", "Sentiment-Lexicon"]] = [calculate_sentiment_score(x) for x in tqdm(df['Cleaned'].values)]

  0%|          | 0/8646 [00:00<?, ?it/s]

In [68]:
# Keep only the cleaned text and the lexicon-based results, then preview the first rows.
lexicon_df = df[['Cleaned', 'Sentiment-Lexicon', 'Score']]
lexicon_df.head()

Unnamed: 0,Cleaned,Sentiment-Lexicon,Score
0,benar kerja melembabkan banget tidak lebaygak ...,2.0,0.197
1,tidak cocok banget bikin muka bruntusan langsu...,0.0,-0.136
2,soldbaru kali coba beli lihat yt famaledaily p...,2.0,2.876
3,enak sih muka kering lembab pagi bangun kenyal...,2.0,4.637
4,sold out preloved halo preloved ya pakai mingg...,0.0,-0.52
