# Analisis Sentimen Aplikasi Pintu 

In [2]:
# Lokasi dataset yang sudah diberi label POSITIF atau NEGATIF
# variabel ulasan, rating dan nilai
# Membaca file dataset csv

import pandas as pd
data_path=("../data/dataset-pintu-labeling.csv")
read_data=pd.read_csv(data_path)
read_data.head()

Unnamed: 0,ulasan,rating,nilai
0,Good job buat pintu saya kasih 5 ..kembangkan ...,5,POSITIF
1,"Aplikasi ringan, deposit dan wd banyak pilihan...",5,POSITIF
2,Kenapa utk Crypto Solana tdk bisa dilakukan pe...,3,NEGATIF
3,"Mau bikin koin hebat, apa cari cuan aja, emang...",4,POSITIF
4,"Semakin susah kirim idrt ke binance,ngga seper...",3,NEGATIF


In [3]:
# Memfilter dataset yang sudah diberi label menggunakan variabel ulasan dan nilai
# Menyimpan dataset dengan variabel ulasan dan nilai

dataset= read_data[['ulasan','nilai']]
dataset.head()
dataset.to_csv("../data/dataset-pintu-labeling-filter.csv", index=False)

##### Preprocessing

In [4]:
# Install library NLTK : Library python membantu mempermudah dalam memproses teks seperti tokenization, filtering, dll
# Install library sastrawi : Library python yang dapat mengubah kata berimbuhan menjadi kata dasar
# Import tipe data string : Salah satu jenis tipe data pada bahasa pemrograman python dan biasanyan berisi karakter
# Import library Pipeline : Library python yang mempermudah dalam membuat jalur data dengan bantuan peta, filter, dll
# Import library numpy : Libarary python yang fokus pada scientific computing
# Import library pandas : Library python yang fokus pada analisis data seperti manipulasi data, persiapan data dan pembersihan data
# Import library re: Regular expression 

%pip install nltk
%pip install sastrawi
import string
import pandas as pd
import numpy as np
import re


# Untuk proses filtering
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Untuk proses stemming
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory



Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# fungsi casefolding untuk lowercase, removing number, removing punctuation, removing whitespace tujuannya untuk membersihkan data


def casefolding(ulasan):
  #Lowercase : untuk merubah format teks menjadi hurud kecil semua
  ulasan = ulasan.lower()

  #Removing Number : untuk menghapus karakter angka
  ulasan = re.sub("\d+", "", ulasan)

  #Removing Punctuation : untuk menghapus karakter tanda baca
  ulasan = ulasan.translate(str.maketrans("","",string.punctuation))

  #Removing Whitespace : untuk menghapus karakter kosong
  ulasan = ulasan.strip()
  
  return ulasan

dataset['ulasan']=dataset['ulasan'].apply(casefolding)
dataset.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['ulasan']=dataset['ulasan'].apply(casefolding)


Unnamed: 0,ulasan,nilai
0,good job buat pintu saya kasih kembangkan ter...,POSITIF
1,aplikasi ringan deposit dan wd banyak pilihan ...,POSITIF
2,kenapa utk crypto solana tdk bisa dilakukan pe...,NEGATIF
3,mau bikin koin hebat apa cari cuan aja emang m...,POSITIF
4,semakin susah kirim idrt ke binancengga sepert...,NEGATIF


In [6]:
# Fungsi tokenizing untuk memisahkan kalimat menjadi perkata 

def tokenizing(ulasan):
  # Untuk memisahkan perkata
  ulasan = ulasan.split()

  return ulasan 

dataset['ulasan']=dataset['ulasan'].apply(tokenizing)
dataset.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['ulasan']=dataset['ulasan'].apply(tokenizing)


Unnamed: 0,ulasan,nilai
0,"[good, job, buat, pintu, saya, kasih, kembangk...",POSITIF
1,"[aplikasi, ringan, deposit, dan, wd, banyak, p...",POSITIF
2,"[kenapa, utk, crypto, solana, tdk, bisa, dilak...",NEGATIF
3,"[mau, bikin, koin, hebat, apa, cari, cuan, aja...",POSITIF
4,"[semakin, susah, kirim, idrt, ke, binancengga,...",NEGATIF


In [7]:
# fungsi filtering untuk menghilangkan kata yang tidak penting

def removed(ulasan):
  filtering = stopwords.words('indonesian','english')
  x = []
  data = []
  def myFungsi(x):
    if x in filtering:
      return False
    else:
      return True
  fit = filter(myFungsi, ulasan)
  for x in fit:
    data.append(x)
  return data

dataset['ulasan']=dataset['ulasan'].apply(removed)
dataset.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['ulasan']=dataset['ulasan'].apply(removed)


Unnamed: 0,ulasan,nilai
0,"[good, job, pintu, kasih, kembangkan, sukses]",POSITIF
1,"[aplikasi, ringan, deposit, wd, pilihan, mudah...",POSITIF
2,"[utk, crypto, solana, tdk, pembelian]",NEGATIF
3,"[bikin, koin, hebat, cari, cuan, aja, emang, k...",POSITIF
4,"[susah, kirim, idrt, binancengga]",NEGATIF


In [8]:
# fungsi stemming untuk merubah kata berimbuhan menjadi kata dasar

def stemming(ulasan):
  factory = StemmerFactory()
  stemmer = factory.create_stemmer()
  do = []
  for x in ulasan:
    dt = stemmer.stem(x)
    do.append(dt)
  data_clean = []
  data_clean = " ".join(do)
  return data_clean

dataset['ulasan']=dataset['ulasan'].apply(stemming)
dataset.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['ulasan']=dataset['ulasan'].apply(stemming)


Unnamed: 0,ulasan,nilai
0,good job pintu kasih kembang sukses,POSITIF
1,aplikasi ringan deposit wd pilih mudah ajar co...,POSITIF
2,utk crypto solana tdk beli,NEGATIF
3,bikin koin hebat cari cuan aja emang kaya bkin...,POSITIF
4,susah kirim idrt binancengga,NEGATIF


In [10]:
# Menyimpan dataset hasil proses preprocessing 
dataset.to_csv("../output/dataset-pintu-cleaning.csv", index=False)

##### Memisahkan Dataset 


In [11]:
# Import library pandes : Library python yang fokus pada analisis data seperti manipulasi data, persiapan data dan pembersihan data

import pandas as pd

In [12]:
# Lokasi dataset yang sudah bersih 
# Membaca dataset

data_path=("../output/dataset-pintu-cleaning.csv")
read_data=pd.read_csv(data_path)
read_data.head()

Unnamed: 0,ulasan,nilai
0,good job pintu kasih kembang sukses,POSITIF
1,aplikasi ringan deposit wd pilih mudah ajar co...,POSITIF
2,utk crypto solana tdk beli,NEGATIF
3,bikin koin hebat cari cuan aja emang kaya bkin...,POSITIF
4,susah kirim idrt binancengga,NEGATIF


In [13]:
# Merubah typedata pada variabel agar selanjutnya dapat melakukan proses TF-IDF

read_data = read_data.astype({'ulasan':'string'})
read_data = read_data.astype({'nilai':'category'})
read_data.dtypes

ulasan      string
nilai     category
dtype: object

In [14]:
# Melakukan proses TF-IDF

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

tfIdf=TfidfVectorizer()
text_tfIdf = tfIdf.fit_transform(read_data['ulasan'].astype('U'))
text_tfIdf

<1000x1940 sparse matrix of type '<class 'numpy.float64'>'
	with 6162 stored elements in Compressed Sparse Row format>

In [15]:
# Memisahkan dataset menjadi data training (70%) dan data testing (30%)
# random state = 42, maka berapa kali mengeksekusi kode hasilnya akan sama 

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(text_tfIdf, read_data['nilai'], test_size=0.7, random_state=42)

##### Algoritma Naive Bayes

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

clf = MultinomialNB().fit(x_train, y_train)
prediksi = clf.predict(x_test)
print("Accurancy : ", accuracy_score(y_test, prediksi))
print("Precision : ", precision_score(y_test, prediksi, average="binary", pos_label="NEGATIF"))
print("Recall : ", recall_score(y_test, prediksi, average="binary", pos_label="NEGATIF"))
print("F1 : ", f1_score(y_test, prediksi, average="binary", pos_label="NEGATIF"))

print(f'confusion_matrix:\n {confusion_matrix(y_test, prediksi)}')

Accurancy :  0.7342857142857143
Precision :  0.92
Recall :  0.1111111111111111
F1 :  0.1982758620689655
confusion_matrix:
 [[ 23 184]
 [  2 491]]
