<a href="https://colab.research.google.com/github/codjere/belajarIPYNBJere/blob/main/04_Klasifikasi_Model_dan_Evaluasi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages that the later cells import.
!pip install nltk
!pip install wordcloud
!pip install Sastrawi
import nltk
# Stop-word lists, read later through stopwords.words('indonesian')
nltk.download('stopwords', quiet=True)
# presumably the tokenizer data that word_tokenize needs — TODO confirm
nltk.download('punkt_tab')
# WordNet data for the WordNetLemmatizer used in the lemmatization cell
nltk.download('wordnet')



# Pembacaan Dataset


In [None]:
import pandas as pd

In [None]:
# Load the CSV at /content/Sentiment1.csv into a DataFrame, decoding it as ISO-8859-1.
data = pd.read_csv("/content/Sentiment1.csv", encoding='ISO-8859-1')

# EDA (Exploratory Data Analysis / Analisis Data Eksploratif)
Tujuan: Pemahaman data (bentuk data, distribusi, missing values, data duplikat, distribusi label, dan kata-kata yang sering muncul).

### Data Info

In [None]:
# cek bentuk data
data.head()

In [None]:
# melihat informasi didalam data
data.info()

In [None]:
# Discard the Date, Username and Length_Text columns.
unused_columns = ['Date', 'Username', 'Length_Text']
data = data.drop(columns=unused_columns)

In [None]:
data.head()

In [None]:
# melihat data deskripsi
data.describe()

### Distribusi Label

In [None]:
# Melihat distribusi persebaran label
data.Sentiment.value_counts()

In [None]:
# Report each sentiment class as a percentage of all rows.
total_rows = len(data)
for label_text, sentiment in (("positif", "Positive"),
                              ("negatif", "Negative"),
                              ("netral", "Neutral")):
    class_rows = len(data[data['Sentiment'] == sentiment])
    print(f"persentase data {label_text}: {class_rows/total_rows*100:.2f}%")

### Melihat data sampel

In [None]:
# Show the text stored under index label 0 (the first row of the loaded data)
data.Text[0]

### Data Missing dan Data Duplikat

In [None]:
# Count the missing values (nulls) in the Text column
data.Text.isnull().sum()

In [None]:
# Collect the rows that pandas flags as duplicates (all columns compared) and show them.
is_duplicate = data.duplicated()
duplicate_rows = data[is_duplicate]
print("Duplicate rows based on all columns:\n", duplicate_rows)

In [None]:
# menghitung total data duplikat
data.duplicated().sum()

In [None]:
# The duplicated rows have to be removed; the index is renumbered from 0 afterwards
data = data.drop_duplicates().reset_index(drop=True)
# The line below would also work (it does not renumber the index)
# data = data.drop_duplicates(keep='first')

In [None]:
data.duplicated().sum()

In [None]:
data.Sentiment.value_counts()

In [None]:
# NOTE(review): both counts are hard-coded, presumably copied from the value_counts() outputs of a recorded run — they go stale if the CSV changes
988 - 979
# de-duplication reduced the Positive class by 9 rows

### Sampling Technique

In [None]:
from sklearn.utils import resample

# Prepare class balancing by undersampling:
# split the data into one frame per sentiment class.
df_pos, df_neu, df_neg = (
    data[data['Sentiment'] == label]
    for label in ("Positive", "Neutral", "Negative")
)

In [None]:
# Print the (rows, columns) shape of each per-class frame.
for label, frame in (("positive", df_pos), ("neutral", df_neu), ("negative", df_neg)):
    print(f"data {label} : {frame.shape}")

In [None]:
# Target number of rows per class: the size of the smallest class.
# The Positive class is part of the comparison so that down-sampling it without
# replacement can never request more rows than it actually has.
min_count = min(len(df_pos), len(df_neu), len(df_neg))

In [None]:
min_count

In [None]:
# Downsample the majority class (Positive) to min_count rows
df_pos_down = resample(df_pos,
                       replace=False,   # sample without replacement, so no row is duplicated
                       n_samples=min_count,
                       random_state=42)

In [None]:
df_pos_down.shape

In [None]:
# Build the balanced dataset: every class contributes exactly min_count rows.
# Neutral and Negative are sampled down too, because concatenating them in full
# leaves the classes unequal whenever their sizes differ.
df_neu_down = resample(df_neu, replace=False, n_samples=min_count, random_state=42)
df_neg_down = resample(df_neg, replace=False, n_samples=min_count, random_state=42)
df_balanced = pd.concat([df_pos_down, df_neu_down, df_neg_down])

In [None]:
df_balanced.shape

In [None]:
type(df_balanced)

In [None]:
df_balanced.head()

In [None]:
# Shuffle the rows reproducibly, then renumber the index from 0.
shuffled = df_balanced.sample(frac=1, random_state=42)
df_balanced = shuffled.reset_index(drop=True)

In [None]:
df_balanced.head()

In [None]:
print(df_balanced['Sentiment'].value_counts())

## Preprocessing

In [None]:
text = df_balanced.Text.iloc[0]

In [None]:
text

In [None]:
# Convert the text to lowercase
text = text.lower()


In [None]:
text

In [None]:
# Remove punctuation and digits
import re

# Every character that is not an ASCII letter or whitespace is deleted
text = re.sub(r'[^a-zA-Z\s]', '', text)

In [None]:
text

In [None]:
# Remove stop words
from nltk.corpus import stopwords

# NLTK's Indonesian stop-word list, stored as a set
stop_words = set(stopwords.words('indonesian'))
# Split on whitespace and keep only the words that are not stop words
tokens = [w for w in text.split() if w not in stop_words]
text = " ".join(tokens)

In [None]:
text

In [None]:
# Tokenize the cleaned text with NLTK's word_tokenize
from nltk.tokenize import word_tokenize

tokens = word_tokenize(text)

In [None]:
tokens

In [None]:
# Apply stemming
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Created once at module level; the preprocess() function further down reuses it
stemmer = StemmerFactory().create_stemmer()
# The stemmer receives the whole string, not the token list
text = stemmer.stem(text)


In [None]:
text

In [None]:
# Apply lemmatization

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# `tokens` was built before the stemming step, so the unstemmed words are lemmatized here
# NOTE(review): WordNet is an English resource — presumably most Indonesian tokens come back unchanged; confirm
tokens_lem = [lemmatizer.lemmatize(w) for w in tokens]


In [None]:
tokens_lem

In [None]:
# Wrap the preprocessing steps demonstrated above into one reusable function
def preprocess(text):
    """Clean one raw text and return its stemmed, stop-word-free form.

    Steps, in order: lowercase; strip URLs, @mentions and #hashtags; delete
    every character that is not an ASCII letter or whitespace; collapse
    whitespace; tokenize; remove stop words; stem.

    Uses the module-level ``stop_words`` set and ``stemmer`` object that the
    earlier cells create.

    Args:
        text: Raw text. Any value that is not a ``str`` (for example the
            float ``NaN`` that stands for a missing value) is treated as an
            empty document.

    Returns:
        The string produced by the stemmer for the cleaned text, or ``""``
        when ``text`` is not a string.
    """
    # A missing value has no .lower(); map it to an empty document instead of
    # letting one bad row abort a whole DataFrame.apply().
    if not isinstance(text, str):
        return ""

    # 1. Lowercase
    text = text.lower()

    # 2. Remove URLs, mentions and hashtags first, while '@', '#' and ':' are
    #    still present to mark them
    text = re.sub(r'http\S+|@\w+|#[A-Za-z0-9_]+|www\.\S+', '', text)
    # 3. Remove digits and punctuation (anything but ASCII letters and whitespace)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    #    and collapse runs of whitespace into single spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # 4. Tokenize
    tokens = word_tokenize(text)

    # 5. Stop-word removal
    tokens = [w for w in tokens if w not in stop_words]

    # 6. Stemming — the stemmer works on a string, so re-join the tokens first
    text = " ".join(tokens)
    text = stemmer.stem(text)

    return text

In [None]:
df_balanced.Text = df_balanced.Text.apply(preprocess)

In [None]:
df_balanced.head()

In [None]:
from wordcloud import WordCloud, STOPWORDS
from matplotlib import pyplot as plt

def word_clod_plot(feature_name:str, data:pd.DataFrame):
    """Draw a word cloud of the texts that belong to one sentiment class.

    The rows whose ``Sentiment`` equals ``feature_name`` are selected, their
    ``Text`` values are joined with spaces, and the resulting cloud is drawn
    with matplotlib (titled, axes hidden). ``plt.show()`` is not called here.

    Args:
        feature_name (str): Value of the ``Sentiment`` column to select.
        data (pd.DataFrame): Frame with ``Sentiment`` and ``Text`` columns.
    """
    class_texts = data.loc[data['Sentiment'] == feature_name, 'Text']
    corpus = ' '.join(class_texts)

    cloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        stopwords=STOPWORDS,  # the stop-word set shipped with the wordcloud package
    )
    cloud_image = cloud.generate(corpus)

    plt.imshow(cloud_image, interpolation='bilinear')
    plt.title(f'Word Cloud for {feature_name} Text')
    plt.axis('off')

In [None]:
word_clod_plot(feature_name='Positive', data=df_balanced)

In [None]:
word_clod_plot(feature_name='Negative', data=df_balanced)

In [None]:
word_clod_plot(feature_name='Neutral', data=df_balanced)

In [None]:
from nltk.tokenize import word_tokenize
# tokenize the text
def text_tokenize(text:str)->list:
    """Return the tokens that NLTK's ``word_tokenize`` produces for *text*."""
    tokens = word_tokenize(text)
    return tokens

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on every preprocessed text and turn the corpus into a document-term matrix,
# using text_tokenize as the analyzer.
# NOTE(review): this matrix is fitted on the whole corpus, i.e. before any train/test split,
# so its vocabulary and IDF weights are derived from every document.
tfidf_df_balanced = TfidfVectorizer(analyzer=text_tokenize).fit_transform(df_balanced['Text'])
# tfidf_transformer_X_test = TfidfVectorizer(analyzer=text_tokenize).fit_transform(df_balanced['Text'])

In [None]:
# type(tfidf_transformer_X_train)
print(tfidf_df_balanced.toarray())

In [None]:
tfidf_df_balanced.toarray().shape
# rows    = number of documents (texts in the dataset); 407 in the recorded run
# columns = number of features (unique words after preprocessing); 2105 in the recorded run

In [None]:
df = pd.DataFrame(tfidf_df_balanced.toarray())
df.head()

## Machine Learning Modeling

In [None]:
df.shape

In [None]:
df_balanced['Sentiment'].shape

In [None]:
from sklearn.model_selection import train_test_split

# X stays the whole-corpus TF-IDF matrix built earlier; y holds the class labels.
X = tfidf_df_balanced
y = df_balanced['Sentiment']

# Split the preprocessed texts rather than the ready-made matrix, so that the
# vectorizer below learns its vocabulary and IDF weights from the training
# documents only. Fitting on all documents first would let the held-out texts
# influence the features the model is trained on.
text_train, text_test, y_train, y_test = train_test_split(
    df_balanced['Text'], y, test_size=0.2, random_state=0)

# Fit on the training texts, then only transform the held-out texts; tokens that
# never occur in training are simply ignored at test time.
tfidf_vectorizer = TfidfVectorizer(analyzer=text_tokenize)
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

In [None]:
# Print the shapes of the train/test features and labels, one per line.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the training split.
multiNB = MultinomialNB()
multiNB.fit(X_train, y_train)

In [None]:
multiNB.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the training split; the seed is fixed for reproducibility.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)

In [None]:
rfc.score(X_test, y_test)

## Model Evaluation

In [None]:
y_pred = rfc.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# Text report of the per-class metrics for the predictions in y_pred.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Label the matrix with the real class names instead of placeholder digits.
# The sorted union of true and predicted labels is passed to confusion_matrix
# explicitly, so the rows/columns and the tick labels are guaranteed to be in
# the same order and to have the same length.
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

ax = sns.heatmap(cm,
                 annot=True,
                 fmt='g',
                 xticklabels=class_labels,
                 yticklabels=class_labels)
ax.set_ylabel('Actual', fontsize=13)
ax.set_title('Confusion Matrix', fontsize=17, pad=20)
# Move the x-axis label and ticks above the matrix
ax.xaxis.set_label_position('top')
ax.set_xlabel('Prediction', fontsize=13)
ax.xaxis.tick_top()

# Make room below the axes and repeat the axis name there
ax.figure.subplots_adjust(bottom=0.2)
ax.figure.text(0.5, 0.05, 'Prediction', ha='center', fontsize=13)
plt.show()