# **Import Library yang Dibutuhkan**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from wordcloud import WordCloud
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

# **Membuka Dataset Hasil Stemming**

In [None]:
df = pd.read_csv('dataset_swch_roberta_ayame.csv',delimiter=';')
df

Unnamed: 0.1,Unnamed: 0,Stemming,label
0,0,shelaemeliana sehat iya sandwich generation,Neutral
1,1,film rumah swet loan recomended gusy nonton ta...,Neutral
2,2,tanyakanrl teman sandwich generation hedon,Neutral
3,3,kabar iya kakak kakak garuda soft spoken sandw...,Neutral
4,4,film bagus kali menang huhu misuhmisuhinkeluar...,Positive
...,...,...,...
1407,1407,idola perempuanperempuan dady isue sandwich ge...,Neutral
1408,1408,convomf anak of kursus sandwich generation,Neutral
1409,1409,convomf anak of kursus sandwich generation,Neutral
1410,1410,inipita biar enak bada badane suka gel gel san...,Positive


Melakukan preprocess sebelum data diolah

In [None]:
del df ['Unnamed: 0']
df

Unnamed: 0,Stemming,label
0,shelaemeliana sehat iya sandwich generation,Neutral
1,film rumah swet loan recomended gusy nonton ta...,Neutral
2,tanyakanrl teman sandwich generation hedon,Neutral
3,kabar iya kakak kakak garuda soft spoken sandw...,Neutral
4,film bagus kali menang huhu misuhmisuhinkeluar...,Positive
...,...,...
1407,idola perempuanperempuan dady isue sandwich ge...,Neutral
1408,convomf anak of kursus sandwich generation,Neutral
1409,convomf anak of kursus sandwich generation,Neutral
1410,inipita biar enak bada badane suka gel gel san...,Positive


In [None]:
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,Stemming,label
0,shelaemeliana sehat iya sandwich generation,Neutral
1,film rumah swet loan recomended gusy nonton ta...,Neutral
2,tanyakanrl teman sandwich generation hedon,Neutral
3,kabar iya kakak kakak garuda soft spoken sandw...,Neutral
4,film bagus kali menang huhu misuhmisuhinkeluar...,Positive
...,...,...
1407,idola perempuanperempuan dady isue sandwich ge...,Neutral
1408,convomf anak of kursus sandwich generation,Neutral
1409,convomf anak of kursus sandwich generation,Neutral
1410,inipita biar enak bada badane suka gel gel san...,Positive


In [None]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Neutral,640
Negative,513
Positive,259


# **Data Augmentasi Untuk Memperkaya Dataset**

In [None]:
import random
import nltk
from nltk.corpus import wordnet

In [None]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
def synonym_replacement(text):
    words = nltk.word_tokenize(text)  # Tokenisasi kalimat menjadi kata-kata
    new_words = words.copy()

    for word in words:
        # Mengambil sinonim dari WordNet
        synonyms = wordnet.synsets(word)
        if synonyms:
            # Ambil satu sinonim secara acak
            synonym = random.choice(synonyms).lemmas()[0].name()  # Ambil sinonim pertama
            new_words = [synonym if w == word else w for w in new_words]

    return ' '.join(new_words)

#Lakukan augmentasi pada dataset
augmented_texts = []
for text in df['Stemming']:
    augmented_texts.append(synonym_replacement(text))

#Buat DataFrame baru untuk menampung teks augmented
augmented_df = pd.DataFrame({
    'Stemming': augmented_texts,
    'label': df['label']  # Tetap menggunakan label asli
})

#Gabungkan dengan DataFrame asli
combined_df = pd.concat([df, augmented_df], ignore_index=True)

#Tampilkan hasil augmentasi
print(combined_df)

                                               Stemming     label
0           shelaemeliana sehat iya sandwich generation   Neutral
1     film rumah swet loan recomended gusy nonton ta...   Neutral
2            tanyakanrl teman sandwich generation hedon   Neutral
3     kabar iya kakak kakak garuda soft spoken sandw...   Neutral
4     film bagus kali menang huhu misuhmisuhinkeluar...  Positive
...                                                 ...       ...
2819  idola perempuanperempuan dady isue sandwich co...   Neutral
2820         convomf anak of kursus sandwich generation   Neutral
2821         convomf anak of kursus sandwich generation   Neutral
2822  inipita biar enak bada badane suka mousse mous...  Positive
2823  mend kost actinium not actinium asumsi gaji ju...  Negative

[2824 rows x 2 columns]


# Konversi data menjadi numerik

In [None]:
#Konversi label menjadi format numerik
X= combined_df['Stemming']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(combined_df['label'])
y_encoded

array([1, 1, 1, ..., 1, 2, 0])

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
#Menggunakan TF-IDF untuk merubah teks menjadi vektor
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# Menggunakan Smote Untuk Melakukan Pemerataan Pada Data

In [None]:
from imblearn.over_sampling import SMOTE
#Inisialisasi SMOTE
smote = SMOTE(sampling_strategy='auto')
#Terapkan SMOTE pada data pelatihan
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_tfidf, y_train)

# Modelling Dengan Naive-Bayes

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
#Inisialisasi model Naive Bayes
model_nb = MultinomialNB()

#Tuning hyperparameter dengan Grid Search
param_grid = {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]}
grid_search = GridSearchCV(model_nb, param_grid, cv=5)
grid_search.fit(X_train_balanced, y_train_balanced)

#Tampilkan parameter terbaik
print(f"Best alpha: {grid_search.best_params_['alpha']}")
model_nb = grid_search.best_estimator_

#Evaluasi model
y_val_pred = model_nb.predict(X_val_tfidf)

#Tampilkan laporan klasifikasi
print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_))

#Cross-validation untuk akurasi
cv_scores = cross_val_score(model_nb, X_train_balanced, y_train_balanced, cv=5)
print(f"Mean CV Accuracy: {cv_scores.mean()}")

Best alpha: 0.1
              precision    recall  f1-score   support

    Negative       0.84      0.91      0.87       205
     Neutral       0.93      0.83      0.88       256
    Positive       0.81      0.89      0.85       104

    accuracy                           0.87       565
   macro avg       0.86      0.88      0.87       565
weighted avg       0.87      0.87      0.87       565

Mean CV Accuracy: 0.8971462620163659
