<a href="https://colab.research.google.com/github/dhani43/KNN-Model-TFRF-Dinamic-Crawling-Youtube/blob/main/Prepocessing_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. INSTALL REQUIREMENTS**

In [None]:
!pip install pandas numpy scikit-learn openpyxl nltk google-api-python-client Sastrawi

**2. IMPORT REQUIREMENTS**

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import pickle
import os
import time
from datetime import datetime
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from google.colab import files, drive
from googleapiclient.discovery import build

**3. MENGHUBUNGKAN DENGAN GOOGLE DRIVE**

In [None]:
# Mount Google Drive at /content/drive
drive.mount('/content/drive', force_remount=True)

# Folder on Google Drive where the model is stored
drive_path = "/content/drive/My Drive/Model_Sentimen/"
os.makedirs(drive_path, exist_ok=True)  # Create the folder if it does not exist yet

**4. UNDUH RESOURCE NLTK**

In [None]:
# Download the NLTK resources if they are not available yet.
# 'stopwords' backs the stopwords corpus imported above; 'punkt' / 'punkt_tab'
# are presumably the tokenizer data word_tokenize needs — TODO confirm for the
# installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

**5. UPLOAD DAN MENAMPILKAN DATASET**

In [None]:
# Upload the dataset file from the local machine
uploaded = files.upload()

# If nothing was uploaded (e.g. the dialog was closed), stop here with a clear
# message instead of an IndexError from indexing an empty list of file names.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose the Excel dataset.")

# Use the name of the first uploaded file as the dataset path
dataset_path = next(iter(uploaded))
df = pd.read_excel(dataset_path)

display(df.head())

**6. PREPROCESSING DATA**

In [None]:
# Names of the dataset columns: comment text, sentiment label, function label
text_column, sentiment_column, function_column = 'Comment', 'Sentimen', 'Fungsi'

# Indonesian stopwords from NLTK, kept as a set for membership checks
stop_words = set(stopwords.words('indonesian'))

# Sastrawi stemmer, created through its factory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Preprocessing
def preprocesses_text(text):
    """Run one comment through the full text pre-processing pipeline.

    Stages, in order: case folding, cleaning, tokenizing, stopword removal,
    stemming, and joining the stems back into one string. Uses the
    notebook-level ``stop_words`` set and ``stemmer`` object.

    Args:
        text: Raw comment. A missing value (None / NaN / pd.NA) is treated as
            an empty comment; any other non-string value is converted with
            ``str()``.

    Returns:
        dict with the output of every stage under the keys
        'casefolded_text', 'cleaned_text', 'tokens', 'filtered', 'stemmed'
        and 'final_text'.
    """
    if not isinstance(text, str):
        # A missing cell must neither crash .lower() nor become the word "nan".
        missing = pd.api.types.is_scalar(text) and pd.isna(text)
        text = '' if missing else str(text)

    # 1. Case folding
    casefolded_text = text.lower()
    # 2. Cleaning: mentions, URLs, HTML tags and every character that is
    #    neither a word character nor whitespace are replaced by a space
    cleaned_text = re.sub(r'@\w+|http\S+|www\.\S+|<.*?>|[^\w\s]', ' ', casefolded_text)
    cleaned_text = cleaned_text.strip()
    # 3. Tokenizing
    tokens = word_tokenize(cleaned_text)
    # 4. Stopword removal
    filtered = [word for word in tokens if word not in stop_words]
    # 5. Stemming. NOTE(review): the stemmer appears to be able to return an
    #    empty string for some tokens (e.g. ones without ASCII letters/digits)
    #    — confirm; empty stems are dropped so they leave no blank entries in
    #    the list and no doubled spaces in the final text.
    stemmed = [stem for stem in map(stemmer.stem, filtered) if stem]
    # 6. Final text
    final_text = ' '.join(stemmed)

    return {
        'casefolded_text': casefolded_text,
        'cleaned_text': cleaned_text,
        'tokens': tokens,
        'filtered': filtered,
        'stemmed': stemmed,
        'final_text': final_text
    }

# Apply the pre-processing.
# The raw comment column is read but never overwritten, so this cell gives the
# same result when it is run again. Missing comments become empty strings
# instead of the literal word "nan" that astype(str) would produce.
raw_comments = df[text_column].map(
    lambda value: '' if pd.api.types.is_scalar(value) and pd.isna(value) else str(value)
)
processed = raw_comments.apply(preprocesses_text)

# One DataFrame column per pipeline stage (case folding, cleaning, tokenizing,
# stopword removal, stemming, final text), taken from the per-comment result dicts
for stage in ('casefolded_text', 'cleaned_text', 'tokens', 'filtered', 'stemmed', 'final_text'):
    df[stage] = processed.apply(lambda result, key=stage: result[key])

# Encoding labels
sentiment_encoder = LabelEncoder()
function_encoder = LabelEncoder()
df['sentiment_label'] = sentiment_encoder.fit_transform(df[sentiment_column])
df['function_label'] = function_encoder.fit_transform(df[function_column])

# Show the pre-processing results stage by stage: each entry pairs the printed
# heading with the DataFrame columns previewed underneath it.
stage_previews = [
    ("\n✅ 1. HASIL CASE FOLDING :", ['casefolded_text']),
    ("\n✅ 2. HASIL CLEANING :", ['cleaned_text']),
    ("\n✅ 3. HASIL TOKENIZING :", ['tokens']),
    ("\n✅ 4. HASIL STOPWORD REMOVAL :", ['filtered']),
    ("\n✅ 5. HASIL STEMMING :", ['stemmed']),
    ("\n✅ 6. TEKS FINAL :", ['final_text']),
    ("\n✅ 7. ENCODING LABEL SENTIMEN :", [sentiment_column, 'sentiment_label']),
    ("\n✅ 8. ENCODING LABEL FUNGSI :", [function_column, 'function_label']),
]

for heading, preview_columns in stage_previews:
    print(heading)
    display(df[preview_columns].head())