In [120]:
import re
import spacy
import pandas as pd

from string import punctuation
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer

In [121]:
df = pd.read_csv('sampled_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,text,rating,category,product_name,product_id,sold,shop_id,product_url
0,3253,thanks..... .................................,4,elektronik,Rexco 18 Contact Cleaner / Pembersih Komponen ...,184464874,29,2100844,https://www.tokopedia.com/miniperkakas/rexco-1...
1,3559,oke barang sudah di terima dengan baik...,4,elektronik,Alfalink EI 428 - Kamus Elektronik Electronic ...,24896339,283,588756,https://www.tokopedia.com/omegaelectronic/alfa...
2,3131,"brg udh dtg #pontianak dgn slmat gan, mksih..",5,elektronik,Alfalink EI 212 - Kamus Elektronik,20602038,383,588756,https://www.tokopedia.com/omegaelectronic/alfa...
3,3658,Cepat dan tepat! Recommended seller,5,elektronik,TI-84 Plus CE,38187136,53,844505,https://www.tokopedia.com/datasempoa/ti-84-plu...
4,3252,"mantaf,. langsung test.. residu hilang\nkirima...",4,elektronik,Rexco 18 Contact Cleaner / Pembersih Komponen ...,184464874,29,2100844,https://www.tokopedia.com/miniperkakas/rexco-1...


In [122]:
print(df['category'].value_counts())

category
elektronik     1000
fashion        1000
olahraga       1000
handphone      1000
pertukangan    1000
Name: count, dtype: int64


# Pre-Processing

In [123]:
df = df.drop(columns=['Unnamed: 0', 'product_id', 'shop_id', 'sold', 'product_url'])
df

Unnamed: 0,text,rating,category,product_name
0,thanks..... .................................,4,elektronik,Rexco 18 Contact Cleaner / Pembersih Komponen ...
1,oke barang sudah di terima dengan baik...,4,elektronik,Alfalink EI 428 - Kamus Elektronik Electronic ...
2,"brg udh dtg #pontianak dgn slmat gan, mksih..",5,elektronik,Alfalink EI 212 - Kamus Elektronik
3,Cepat dan tepat! Recommended seller,5,elektronik,TI-84 Plus CE
4,"mantaf,. langsung test.. residu hilang\nkirima...",4,elektronik,Rexco 18 Contact Cleaner / Pembersih Komponen ...
...,...,...,...,...
4995,Trims gan paket telah di terima .,4,pertukangan,Staples Gun Tacker Mollar 3 in 1 / Staple Jok ...
4996,terimakasih pengiriman cepat.....................,5,pertukangan,staples tembak HOMASTER / steples tembak / sta...
4997,terima kasih n maaf baru konfirmasi..,5,pertukangan,staples tembak homaster / staple gun
4998,Terima kasih packing rapi,4,pertukangan,staples tembak staple gun tacker staples jok m...


In [118]:
slang_df = pd.read_csv("slang_dict.csv", sep=';')
slang_dict = pd.Series(slang_df['baku'].values, index=slang_df['slang'].values).to_dict()

factory = StemmerFactory()
stemmer = factory.create_stemmer()
nlp = spacy.load("xx_ent_wiki_sm")  # Pastikan model ini cocok

def processtext(text):
    text = text.lower()
    text = re.sub(r"\&\w*;", "", text)  # Menghapus HTML entities
    text = re.sub(r"@[^\s]+", "", text)  # Menghapus mention @
    text = re.sub(r"\$\w*", "", text)  # Menghapus simbol uang
    text = re.sub(r"https?:\/\/.*\/\w*", "", text)  # Menghapus URL
    text = re.sub(r"#\w*", "", text)  # Menghapus hashtag
    text = re.sub(r"[{}]+".format(string.punctuation), " ", text)  # Menghapus tanda baca
    text = re.sub(r"\b\w{1,2}\b", "", text)  # Menghapus kata pendek
    text = re.sub(r"\s+", " ", text).strip()  # Normalisasi spasi
    text = "".join(c for c in text if c <= "\uFFFF")  # Menghapus karakter tidak valid
    return text

def replace_slang(text):
    words = text.split()
    replaced_words = [slang_dict.get(word, word) for word in words] 
    print(replaced_words)  # Debugging: lihat hasil penggantian slang
    return " ".join(replaced_words)


def stem(text):
    return stemmer.stem(text)

def remove_rare_words(text, word_counts, min_freq=3):
    return " ".join([word for word in text.split() if word_counts[word] >= min_freq])

def lemmatize(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

def remove_stopwords(text, stopwords_file="id_stopwords.txt"):
    with open(stopwords_file, "r") as f:
        stopwords_list = f.read().splitlines()
        words = text.split()
        print(words)  # Debugging: lihat kata-kata sebelum menghapus stopwords
        words = [word for word in words if word not in stopwords_list]
        print(words)  # Debugging: lihat kata-kata setelah menghapus stopwords
        return " ".join(words)

# Pipeline Preprocessing
df['cleaned_text'] = df['text'].apply(processtext)
print(df[['text', 'cleaned_text']].head())  # Cek hasil cleaned_text

word_counts = Counter(" ".join(df['cleaned_text']).split())
df['text_no_rare'] = df['cleaned_text'].apply(lambda x: remove_rare_words(x, word_counts))
print(df[['cleaned_text', 'text_no_rare']].head())  # Cek hasil text_no_rare

df['lemmatized_text'] = df['text_no_rare'].apply(lemmatize)
print(df[['text_no_rare', 'lemmatized_text']].head())  # Cek hasil lemmatized_text

df['text_with_slang_replaced'] = df['lemmatized_text'].apply(replace_slang)
print(df[['lemmatized_text', 'text_with_slang_replaced']].head())  # Cek hasil text_with_slang_replaced

df['final_text'] = df['text_with_slang_replaced'].apply(remove_stopwords)
print(df[['text_with_slang_replaced', 'final_text']].head())  # Cek hasil final_text


                                                text  \
0      thanks..... .................................   
1          oke barang sudah di terima dengan baik...   
2      brg udh dtg #pontianak dgn slmat gan, mksih..   
3                Cepat dan tepat! Recommended seller   
4  mantaf,. langsung test.. residu hilang\nkirima...   

                                        cleaned_text  
0                                             thanks  
1                oke barang sudah terima dengan baik  
2                    brg udh dtg dgn slmat gan mksih  
3                 cepat dan tepat recommended seller  
4  mantaf langsung test residu hilang kiriman aga...  
                                        cleaned_text  \
0                                             thanks   
1                oke barang sudah terima dengan baik   
2                    brg udh dtg dgn slmat gan mksih   
3                 cepat dan tepat recommended seller   
4  mantaf langsung test residu hilang kiriman aga... 

In [131]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import word_tokenize

In [137]:
positive_words = {"barang", "sesuai", "cepat", "bagus", "sudah", "sampai", "terima", "baik", "mantap", "puas", "terima kasih", "murah", "recommended", "aman", "bersih", "original", "rapi", "mulus", "cocok", "nyaman", "terpercaya", "top"}
negative_words = {"tidak", "kurang", "rusak", "salah", "lama", "terlambat", "jelek", "cacat", "kecewa", "penipuan", "palsu", "buruk", "lelet", "kotor", "hancur", "lambat", "menyesal", "tidak sesuai", "bohong"}

# Function to label using custom dictionary
def custom_label_sentiment(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    tokens = text.split()
    pos_count = sum(1 for word in tokens if word in positive_words)
    neg_count = sum(1 for word in tokens if word in negative_words)
    
    # Assign label based on word count
    if pos_count > neg_count:
        return 'positive'
    elif neg_count > pos_count:
        return 'negative'
    else:
        return 'neutral'

# Apply custom labeling function
df['label_custom'] = df['text'].apply(custom_label_sentiment)

# Preprocessing: Tokenization and removing punctuation for better performance
def preprocess_text(text):
    # Menghapus tanda baca dan mengubah teks menjadi huruf kecil
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Memisahkan teks menjadi kata-kata
    return ' '.join(text.split())

df['cleaned_text'] = df['text'].apply(preprocess_text)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_text'])

X_train, X_test, y_train_custom, y_test_custom = train_test_split(X, df['label_custom'], test_size=0.2, random_state=42)

# Logistic Regression Classification using Custom Labeling
logreg_custom = LogisticRegression(max_iter=1000)  # Added max_iter for convergence
logreg_custom.fit(X_train, y_train_custom)
y_pred_custom_logreg = logreg_custom.predict(X_test)
print("\nLogistic Regression with Custom Labeling:")
print("Accuracy:", accuracy_score(y_test_custom, y_pred_custom_logreg))
print(classification_report(y_test_custom, y_pred_custom_logreg))

# SVM Classification using Custom Labeling
svm_custom = SVC()
svm_custom.fit(X_train, y_train_custom)
y_pred_custom_svm = svm_custom.predict(X_test)
print("\nSVM with Custom Labeling:")
print("Accuracy:", accuracy_score(y_test_custom, y_pred_custom_svm))
print(classification_report(y_test_custom, y_pred_custom_svm))


Logistic Regression with Custom Labeling:
Accuracy: 0.988
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         1
     neutral       0.99      1.00      0.99       972
    positive       1.00      0.59      0.74        27

    accuracy                           0.99      1000
   macro avg       0.66      0.53      0.58      1000
weighted avg       0.99      0.99      0.99      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



SVM with Custom Labeling:
Accuracy: 0.992
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         1
     neutral       0.99      1.00      1.00       972
    positive       1.00      0.74      0.85        27

    accuracy                           0.99      1000
   macro avg       0.66      0.58      0.62      1000
weighted avg       0.99      0.99      0.99      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Split Data

In [119]:
X = df['final_text']  # Fitur
y = df['rating']       # Label 

# Membagi data menjadi training set dan test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Menampilkan jumlah data
print(f'Jumlah data training: {len(X_train)}')
print(f'Jumlah data test: {len(X_test)}')

KeyError: 'rating'

In [91]:
# Ekstraksi fitur dengan TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [93]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # Coba menggunakan bigram juga


In [94]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300)
model.fit(X_train_tfidf, y_train)




In [80]:


# Melatih model Naive Bayes
from sklearn.utils import class_weight
import numpy as np

# Menghitung bobot kelas
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
model = MultinomialNB(class_prior=class_weights)

model.fit(X_train_tfidf, y_train)

# Memprediksi data test
y_pred = model.predict(X_test_tfidf)

# Menampilkan laporan klasifikasi
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.08      0.47      0.14        15
           2       0.02      0.80      0.04        10
           3       0.08      0.06      0.07        52
           4       0.26      0.28      0.27       232
           5       0.79      0.26      0.39       691

    accuracy                           0.26      1000
   macro avg       0.25      0.37      0.18      1000
weighted avg       0.61      0.26      0.34      1000



In [81]:
# Ekstraksi fitur dengan TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


# Menghitung bobot kelas
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train_tfidf, y_train)


model.fit(X_train_tfidf, y_train)

# Memprediksi data test
y_pred = model.predict(X_test_tfidf)

# Menampilkan laporan klasifikasi
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.33      0.07      0.11        15
           2       0.00      0.00      0.00        10
           3       0.29      0.04      0.07        52
           4       0.30      0.11      0.16       232
           5       0.72      0.94      0.81       691

    accuracy                           0.68      1000
   macro avg       0.33      0.23      0.23      1000
weighted avg       0.59      0.68      0.60      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [82]:
# Ekstraksi fitur dengan TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


from sklearn.svm import SVC

model = SVC(kernel='linear', class_weight='balanced')  # Gunakan 'linear' kernel untuk teks
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)


model.fit(X_train_tfidf, y_train)

# Memprediksi data test
y_pred = model.predict(X_test_tfidf)

# Menampilkan laporan klasifikasi
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.08      0.13      0.10        15
           2       0.03      0.10      0.05        10
           3       0.12      0.23      0.16        52
           4       0.28      0.41      0.33       232
           5       0.79      0.58      0.67       691

    accuracy                           0.51      1000
   macro avg       0.26      0.29      0.26      1000
weighted avg       0.62      0.51      0.55      1000



In [84]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(class_weight='balanced')
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

model.fit(X_train_tfidf, y_train)

# Memprediksi data test
y_pred = model.predict(X_test_tfidf)

# Menampilkan laporan klasifikasi
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.05      0.27      0.09        15
           2       0.00      0.00      0.00        10
           3       0.08      0.13      0.10        52
           4       0.27      0.30      0.28       232
           5       0.75      0.61      0.67       691

    accuracy                           0.50      1000
   macro avg       0.23      0.26      0.23      1000
weighted avg       0.59      0.50      0.54      1000



In [85]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(class_weight='balanced')
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

model.fit(X_train_tfidf, y_train)

# Memprediksi data test
y_pred = model.predict(X_test_tfidf)

# Menampilkan laporan klasifikasi
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.11      0.20      0.14        15
           2       0.10      0.30      0.15        10
           3       0.12      0.29      0.17        52
           4       0.30      0.36      0.32       232
           5       0.80      0.63      0.70       691

    accuracy                           0.54      1000
   macro avg       0.29      0.35      0.30      1000
weighted avg       0.63      0.54      0.57      1000

