# JAWABAN UAS DEEP LEARNING - SOAL 1
**NAMA:** FARIS ALI HUSAMUDDIN  
**NPM:** 20241310055

**Mata Kuliah:** Deep Learning
**Topik:** Sentiment Analysis (YouTube Comments)

In [None]:
!pip install pandas numpy seaborn matplotlib scikit-learn nltk PySastrawi

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import re
import pickle

# Fetch the NLTK data packages this notebook relies on: the stopword lists,
# the tokenizer models and the WordNet data used by the lemmatizer.
# quiet=True suppresses the download progress output.
for resource_name in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource_name, quiet=True)
print("Library berhasil diimport!")

## Definisikan Fungsi Preprocessing (6 Poin Lengkap)

In [None]:
# Minimal Indonesian stopword list, used only when NLTK's Indonesian list
# cannot be loaded.
_NPM_20241310055_FALLBACK_STOPWORDS_ID = (
    'yang', 'dan', 'di', 'ke', 'dari', 'ini', 'itu',
)

# Filled on first use so the stopword corpora are read once per session
# instead of once per comment.
_NPM_20241310055_STOPWORD_CACHE = {}


def _npm_20241310055_get_stopwords():
    """Return the combined Indonesian + English stopwords as a frozenset."""
    combined = _NPM_20241310055_STOPWORD_CACHE.get('combined')
    if combined is None:
        try:
            stopwords_id = stopwords.words('indonesian')
        except (LookupError, OSError):
            # Only the "corpus/file not available" failures are tolerated;
            # anything else (including KeyboardInterrupt) propagates.
            stopwords_id = _NPM_20241310055_FALLBACK_STOPWORDS_ID
        stopwords_en = stopwords.words('english')
        # A set gives O(1) membership tests in the per-token filter.
        combined = frozenset(stopwords_id) | frozenset(stopwords_en)
        _NPM_20241310055_STOPWORD_CACHE['combined'] = combined
    return combined


def npm_20241310055_preprocessing_lengkap(text):
    """Clean one raw comment and return it as a space-joined token string.

    Pipeline: case folding, regex parsing (URLs, mentions, non-letters),
    NLTK tokenization, Indonesian + English stopword removal and WordNet
    lemmatization. Stemming is intentionally not applied.

    Args:
        text: The raw comment. Any value is accepted; it is coerced with
            ``str()`` first.

    Returns:
        str: The surviving tokens joined by single spaces (may be empty).

    Raises:
        LookupError: Raised by NLTK when a required resource (English
            stopwords, tokenizer models, WordNet) has not been downloaded.
    """
    # 1. Normalization (case folding)
    text = str(text).lower()

    # 2. Parsing (regex patterns)
    text = re.sub(r'http\S+', '', text)      # remove URLs
    # Handles may contain underscores, dots and hyphens; match the whole
    # handle so no fragment of it survives as a token.
    text = re.sub(r'@[\w.-]+', '', text)     # remove mentions
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove digits & symbols

    # 3. Tokenization (NLTK)
    words = nltk.word_tokenize(text)

    # 4. Stopword removal
    stopword_set = _npm_20241310055_get_stopwords()
    words = [w for w in words if w not in stopword_set]

    # 5. Lemmatization (WordNet)
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]

    # 6. Stemming is skipped on purpose: an English stemmer mangles
    #    Indonesian words.

    return " ".join(words)

## Proses 1: Input Data Training

In [None]:
# Load the labeled training data and keep only the two sentiment classes.
filename = 'dataset_labeled_final.csv'

try:
    # Keep the try body to the one call whose failure is handled below.
    df = pd.read_csv(filename)
except FileNotFoundError:
    print(f"[ERROR] File '{filename}' belum ada. Silakan upload dulu ke Colab!")
else:
    print(f"[BERHASIL] Dataset dimuat: {len(df)} baris")
    print(df.head())

    # FILTER: positive & negative rows only (binary classification).
    # .copy() detaches the filtered rows from the original frame so that
    # later column assignments on `df` do not raise SettingWithCopyWarning.
    df = df[df['label'].isin(['positif', 'negatif'])].copy()
    print(f"Total Data (Binary Only): {len(df)}")

## Proses 2: Preprocessing Data
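Preprocessing mencakup: Normalization, Tokenization, Parsing, Stopword Removal, Lemmatization, dan Stemming. Catatan: langkah Stemming dinonaktifkan di dalam kode karena stemmer bahasa Inggris merusak kata-kata bahasa Indonesia.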

In [None]:
# Run the preprocessing function over every comment; skipped when the
# dataset was never loaded (no `df` in the namespace).
if 'df' in locals():
    print("Sedang melakukan preprocessing lengkap (6 Poin)...")

    # Force the raw column to strings, then derive the cleaned column from it.
    raw_comments = df['text'].astype(str)
    df['text'] = raw_comments
    df['clean_text'] = raw_comments.apply(npm_20241310055_preprocessing_lengkap)

    print("Preprocessing Selesai.")
    preview_columns = ['text', 'clean_text', 'label']
    print(df[preview_columns].head())

## Proses 3: Modeling dengan Metode Logistic Regression

In [None]:
# Split the cleaned data, build TF-IDF features and fit the classifier;
# skipped when the dataset was never loaded (no `df` in the namespace).
if 'df' in locals():
    X = df['clean_text']
    y = df['label']

    # Split Data (80% train / 20% test, fixed seed for reproducibility).
    # Stratifying on the label keeps the positive/negative ratio the same in
    # both splits, so the evaluation is not skewed by an unlucky draw on an
    # imbalanced dataset.
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
    except ValueError:
        # Stratification is impossible for very small or single-example
        # classes; fall back to the plain random split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

    # Feature Extraction (TF-IDF): fit on the training split only, then reuse
    # the fitted vocabulary for the test split.
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train Model (Logistic Regression)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_vec, y_train)
    print("Model Logistic Regression Berhasil Dilatih!")

## Proses 4: Evaluation

In [None]:
# Evaluate the trained model on the held-out split; skipped when no model
# has been trained (no `model` in the namespace).
if 'model' in locals():
    y_pred = model.predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy Score: {acc*100:.2f}%")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion Matrix Visualization.
    # Pin the row/column order to the model's classes and show the class names
    # on both axes, so each cell can be read without guessing which index is
    # which label.
    class_names = list(model.classes_)
    cm = confusion_matrix(y_test, y_pred, labels=class_names)
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title('Confusion Matrix')
    plt.show()

## Proses 5: End Process: Prediction of Data Testing

In [None]:
# Persist the fitted model and vectorizer, then run two hand-written
# sentences through the same preprocessing + model pipeline.
if 'model' in locals():
    # Save Model: each artifact goes to its own pickle file.
    artifacts = (
        ('model_uas_20241310055.pkl', model),
        ('vectorizer_uas_20241310055.pkl', vectorizer),
    )
    for artifact_path, artifact in artifacts:
        with open(artifact_path, 'wb') as handle:
            pickle.dump(artifact, handle)
    print("Model Tersimpan sebagai 'model_uas_20241310055.pkl'")

    # Manual prediction check
    print("\n--- HASIL PREDIKSI (PURE MODEL) ---")
    kalimat_1 = "Debatnya sangat berbobot dan informatif"
    kalimat_2 = "siapa pun presidennya kinerjanya sangat buruk, hancur indonesia"

    test_cases = [kalimat_1, kalimat_2]

    # Clean every sentence, then vectorize and classify them in one batch;
    # the output is the model's prediction with no post-processing.
    cleaned_cases = [
        npm_20241310055_preprocessing_lengkap(sentence) for sentence in test_cases
    ]
    predictions = model.predict(vectorizer.transform(cleaned_cases))

    for sentence, predicted_label in zip(test_cases, predictions):
        print(f"\nKalimat: '{sentence}'")
        print(f"Sentiment: {predicted_label.upper()}")