In [24]:
import pandas as pd
import re
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [25]:
# Absolute path to the Amazon unlocked-mobile reviews CSV
# (looks like a Colab Google Drive mount -- adjust when running elsewhere).
dataset = '/content/drive/MyDrive/NLP/Amazon_Unlocked_Mobile.csv'
# Read the whole CSV into a DataFrame.
data = pd.read_csv(dataset)
# Trailing expression: the notebook renders the frame as this cell's output.
data

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0
...,...,...,...,...,...,...
413835,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,5,another great deal great price,0.0
413836,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,3,Ok,0.0
413837,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,5,Passes every drop test onto porcelain tile!,0.0
413838,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,3,I returned it because it did not meet my needs...,0.0


In [26]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape

(413840, 6)

In [27]:
# Build the working subset in one chained expression:
#   1. keep only Samsung rows and the columns from 'Brand Name' through 'Reviews',
#   2. take the first 1000 of those rows,
#   3. keep only the review text and its star rating,
#   4. drop rows where either value is missing.
# The final .copy() makes `data` an independent frame, so adding columns to it
# later cannot trigger pandas' SettingWithCopyWarning (the original mutated a
# frame derived from a slice with inplace=True).
# NOTE: this cell rebinds `data`; re-run the loading cell before re-running it,
# because the 'Brand Name' column no longer exists afterwards.
data = (
    data.loc[data['Brand Name'] == 'Samsung', 'Brand Name':'Reviews']
    .iloc[0:1000]
    .loc[:, ['Reviews', 'Rating']]
    .dropna()
    .copy()
)

In [5]:
# Display the filtered frame (pandas truncates the rendering to head and tail rows).
data

Unnamed: 0,Reviews,Rating
0,I feel so LUCKY to have found this used (phone...,5
1,"nice phone, nice up grade from my pantach revu...",4
2,Very pleased,5
3,It works good but it goes slow sometimes but i...,4
4,Great phone to replace my lost phone. The only...,4
...,...,...
184881,Advertised as compatible with Verizon network....,1
184882,Not a user friendly at all. The gal at the sto...,1
184883,As expected!,4
184884,Bought this unlocked phone to take overseas. O...,1


In [6]:
# Print column dtypes, non-null counts and memory usage of the working frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 999 entries, 0 to 184885
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Reviews  999 non-null    object
 1   Rating   999 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 23.4+ KB


In [7]:
# Evaluates to True if any cell in the frame is missing, False otherwise.
data.isna().values.any() # detect whether any empty (NaN) values remain

False

In [8]:
data[data.duplicated()] # show only the secondary duplicates (repeats after the first occurrence)

Unnamed: 0,Reviews,Rating
182883,excelente,5
182896,Great phone,5
182938,Good,5
183032,Nice,5
183044,Very good,5
...,...,...
183554,Excellent choice of cellphone.,5
183555,Great price for a great phone. Even though it ...,5
183556,Outstanding phone at a great price!,5
184779,As expected,5


In [9]:
data.duplicated().value_counts() # count duplicated (True) versus non-duplicated (False) rows

False    859
True     140
dtype: int64

In [10]:
# Remove repeated (Reviews, Rating) rows, keeping the first occurrence of each.
# Rebinding to an explicit copy instead of mutating with inplace=True gives an
# independent frame, so later column assignments do not raise
# SettingWithCopyWarning; the resulting rows are the same as before.
data = data.drop_duplicates().copy()
data.shape


(859, 2)

In [11]:
# Fetch the NLTK resources used by the cleaning step:
# the stop-word lists, the 'punkt' tokenizer models and the WordNet corpus.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [12]:
# WordNet lemmatizer used by CleanReview to reduce tokens to a base form.
lemma = WordNetLemmatizer()
# Porter stemmer instance; not referenced by any cell visible in this notebook section.
stemmer = PorterStemmer()
# English stop words held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def CleanReview(txt):
  """Normalise one raw review into a space-separated string of cleaned tokens.

  Steps, in order: coerce to text, strip URLs, replace every non-letter
  character with spaces, lower-case, tokenize, drop English stop words,
  lemmatize each token as a verb, and discard tokens of two characters or fewer.

  Args:
    txt: Raw review text. Non-string values are converted with ``str()``.

  Returns:
    The surviving tokens joined by single spaces (may be an empty string).
  """
  # Coerce first: re.sub raises TypeError on non-string input, so the str()
  # conversion has to happen before any regex is applied.
  txt = str(txt)
  # The text is still mixed-case here, so match URLs case-insensitively
  # ('HTTP://...' would otherwise survive and be split into junk tokens).
  txt = re.sub(r'http\S+', ' ', txt, flags=re.IGNORECASE)
  txt = re.sub('[^a-zA-Z]', '  ', txt)   # replace anything that is not an ASCII letter
  txt = txt.lower()
  tokens = word_tokenize(txt)
  tokens = [tok for tok in tokens if tok not in stop_words]          # drop stop words
  tokens = [lemma.lemmatize(word=tok, pos='v') for tok in tokens]    # verb lemmatization
  tokens = [tok for tok in tokens if len(tok) > 2]                   # keep tokens of 3+ characters
  return ' '.join(tokens)

# Run the cleaning function over every review and store the result in a new column.
data['CleanReview'] = data['Reviews'].apply(CleanReview)

In [13]:
from sklearn.model_selection import train_test_split # pembagi dataset menjadi training dan testing set
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report # evaluasi performa model

In [14]:
# Label the data with two sentiment categories.
def pelabelan(rate, threshold=3):
  """Map a numeric rating to a binary sentiment label.

  Args:
    rate: Rating value to classify.
    threshold: Ratings strictly below this value are labelled negative.
      Defaults to 3, the cutoff that was previously hard-coded.

  Returns:
    'negatif' when ``rate < threshold``, otherwise 'positif'.
  """
  return 'negatif' if rate < threshold else 'positif'

# Derive the sentiment label from each star rating and store it in a new column.
data['Label'] = data['Rating'].apply(pelabelan)
# Preview the first rows to check the new columns.
data.head()

Unnamed: 0,Reviews,Rating,CleanReview,Label
0,I feel so LUCKY to have found this used (phone...,5,feel lucky find use phone use hard phone line ...,positif
1,"nice phone, nice up grade from my pantach revu...",4,nice phone nice grade pantach revue clean set ...,positif
2,Very pleased,5,please,positif
3,It works good but it goes slow sometimes but i...,4,work good slow sometimes good phone love,positif
4,Great phone to replace my lost phone. The only...,4,great phone replace lose phone thing volume bu...,positif


In [15]:
# Separate the features (cleaned review text) from the target (sentiment label).
x = data['CleanReview']
y = data['Label']
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so the split (and every metric computed from it)
# is the same on each run; stratify=y keeps the positive/negative proportion
# equal in the training and test sets.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [16]:
# Display the held-out labels.
y_test

182994    positif
182923    negatif
182926    positif
183499    positif
183408    positif
           ...   
182781    positif
183463    positif
183006    positif
183196    positif
183265    positif
Name: Label, Length: 172, dtype: object

In [17]:
# Build a bag-of-words vectorizer and fit it on the training text only,
# so the test set is not used when the vectorizer is fitted.
vectorizer = CountVectorizer()
vectorizer.fit(x_train)

CountVectorizer()

In [18]:
# Replace the raw text splits with the output of the fitted vectorizer.
# NOTE(review): x_train / x_test are rebound here, so this cell is not idempotent --
# re-run the split cell first before running it again.
x_train = vectorizer.transform(x_train)
x_test = vectorizer.transform(x_test)

In [19]:
# Show the vectorized training features as a dense array (inspection only).
x_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

Klasifikasi Menggunakan Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
# Train a 100-tree random forest on the vectorized training reviews.
# random_state fixes the bootstrap sampling and feature selection so the
# fitted model, and the accuracy computed from it, is reproducible across runs.
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(x_train, y_train)
# Predicted labels for the held-out reviews.
pred_rf = model_rf.predict(x_test)

In [22]:
# Evaluate every model in the list on the held-out set and collect its accuracy.
# The list currently holds only the random forest.
models = [model_rf]
accuracy_scores = []
for model in models:
    # Predict on the test features and compare with the true labels.
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    
# One accuracy value per model, in the same order as `models`.
print(accuracy_scores)

[0.8488372093023255]
