<a href="https://colab.research.google.com/github/dhani43/KNN-Model-TFRF-Dinamic-Crawling-Youtube/blob/main/Skripsi_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. INSTALL REQUIREMENTS**

In [None]:
# Install the third-party packages used by this notebook (shell command run from the cell)
!pip install pandas numpy scikit-learn openpyxl nltk google-api-python-client Sastrawi

**2. IMPORT REQUIREMENTS**

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import pickle
import os
import time
from datetime import datetime
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from google.colab import files, drive
from googleapiclient.discovery import build

**3. MENGHUBUNGKAN DENGAN GOOGLE DRIVE**

In [None]:
# Mount Google Drive at /content/drive
drive.mount('/content/drive')

# Drive folder where the pickled models, vectorizer and encoders are written and read.
# Keep the trailing slash: file names are appended to this string directly.
drive_path = "/content/drive/My Drive/Model_Sentimen/"
os.makedirs(drive_path, exist_ok=True)  # create the folder if it does not exist yet

**4. UNDUH RESOURCE NLTK**

In [None]:
# Download the NLTK resources if they are not present yet:
# 'stopwords' backs stopwords.words(...); 'punkt' / 'punkt_tab' are tokenizer data
# (presumably what word_tokenize needs — confirm against the installed NLTK version)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

**5. UPLOAD DAN MENAMPILKAN DATASET**

In [None]:
# Ask for the dataset file through the Colab upload widget
uploaded = files.upload()

# `uploaded` is indexed by file name below; when nothing was uploaded the old
# `list(uploaded.keys())[0]` failed with an opaque IndexError, so fail with a
# message that says what to do instead.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the Excel dataset.")

# Use the first uploaded file as the dataset
dataset_path = next(iter(uploaded))
df = pd.read_excel(dataset_path)

display(df.head())

**6. PREPROCESSING DATA**

In [None]:
# Names of the dataset columns used below: comment text, sentiment label, function label
text_column = 'Comment'
sentiment_column = 'Sentimen'
function_column = 'Fungsi'

# Set up the Sastrawi stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Case folding + cleaning
def preprocesses_text(text):
    """Lower-case ``text`` and blank out noise.

    Mentions (``@name``), URLs (``http...`` / ``www....``), tag-like
    ``<...>`` spans and every character that is neither a word character nor
    whitespace are each replaced by one space. Leading and trailing
    whitespace is then trimmed; runs of spaces inside the text are kept.

    Args:
        text (str): Raw comment text.

    Returns:
        str: The cleaned, lower-cased text.
    """
    noise_pattern = r'@\w+|http\S+|www\.\S+|<.*?>|[^\w\s]'
    lowered = text.lower()
    return re.sub(noise_pattern, ' ', lowered).strip()

# Apply case folding + cleaning to the comment column (values are cast to str first)
cleaned_comments = df[text_column].astype(str).apply(preprocesses_text)
df[text_column] = cleaned_comments

# Tokenizing
df['tokens'] = df[text_column].apply(word_tokenize)

# Stop-word removal (Indonesian list from NLTK)
stop_words = set(stopwords.words('indonesian'))
df['filtered'] = df['tokens'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

# Stemming, then re-join the stems into one string per comment
df['stemmed'] = df['filtered'].apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)
df['final_text'] = df['stemmed'].apply(lambda stems: ' '.join(stems))

# Encode the two label columns as integers, one encoder per target
sentiment_encoder, function_encoder = LabelEncoder(), LabelEncoder()
df['sentiment_label'] = sentiment_encoder.fit_transform(df[sentiment_column])
df['function_label'] = function_encoder.fit_transform(df[function_column])

# Show the result of each pre-processing stage in order
preview_steps = [
    ("\n‚úÖ 1. HASIL CASE FOLDING & CLEANING :", [text_column]),
    ("\n‚úÖ 2. HASIL TOKENIZING :", ['tokens']),
    ("\n‚úÖ 3. HASIL STOPWORD REMOVAL :", ['filtered']),
    ("\n‚úÖ 4. HASIL STEMMING :", ['stemmed']),
    ("\n‚úÖ 5. TEKS FINAL :", ['final_text']),
    ("\n‚úÖ 6. ENCODING LABEL SENTIMEN :", [sentiment_column, 'sentiment_label']),
    ("\n‚úÖ 7. ENCODING LABEL FUNGSI :", [function_column, 'function_label']),
]
for heading, columns in preview_steps:
    print(heading)
    display(df[columns].head())

**7. PEMBOBOTAN KATA (TF-RF)**

In [None]:
# TF-RF vectorization: term-frequency features scaled by a per-term rf weight
vectorizer = CountVectorizer(ngram_range=(1,2))  # unigrams and bigrams
X_counts = vectorizer.fit_transform(df['final_text'])

# use_idf=False: the transformer applies no IDF weighting to the counts
tf_transformer = TfidfTransformer(use_idf=False).fit(X_counts)
X_tf = tf_transformer.transform(X_counts)

# Number of documents containing each term. Counted on the sparse matrix:
# the previous `X_counts.toarray() > 0` materialised a dense
# (documents x terms) array first, which grows quickly with a bigram vocabulary.
df_counts = np.asarray((X_counts > 0).sum(axis=0)).ravel()
n_docs = X_counts.shape[0]
b = df_counts      # documents that contain the term
c = n_docs - b     # documents that do not contain the term

# NOTE(review): relevance frequency is usually defined per class as
# log2(2 + a / max(1, c)), with a / c counted over positive / negative class
# documents. Here b and c ignore the labels and the log is natural — confirm
# this variant is intended. The formula itself is left unchanged.
rf = np.log(2 + b / np.maximum(1, c))
rf = rf.reshape(1, -1)  # row vector so it broadcasts over every document row
X_tfrf = X_tf.multiply(rf)

**8. SPLIT DATA, MELATIH MODEL, MENAMPILKAN HASIL EVALUASI MODEL, MENYIMPAN MODEL**

In [None]:
# Split data: 30% of the rows are held out for evaluation. Each target gets its
# own call with the same test_size and random_state.
X_train, X_test, y_train_sentiment, y_test_sentiment = train_test_split(
    X_tfrf, df['sentiment_label'], test_size=0.3, random_state=42)

X_train_func, X_test_func, y_train_function, y_test_function = train_test_split(
    X_tfrf, df['function_label'], test_size=0.3, random_state=42)

# Train one 3-nearest-neighbour classifier (cosine metric) per target
knn_sentiment = KNeighborsClassifier(n_neighbors=3, metric='cosine')
knn_sentiment.fit(X_train, y_train_sentiment)

knn_function = KNeighborsClassifier(n_neighbors=3, metric='cosine')
knn_function.fit(X_train_func, y_train_function)

# Evaluation on the held-out rows
y_pred_sentiment = knn_sentiment.predict(X_test)
y_pred_function = knn_function.predict(X_test_func)


def _print_label_report(y_true, y_pred, encoder):
    """Print a classification report limited to the labels that actually occur.

    Passing every ``encoder.classes_`` entry as ``target_names`` fails when a
    class is missing from both ``y_true`` and ``y_pred`` (the name count no
    longer matches the label count), so the names are filtered to the labels
    present and the labels are passed explicitly.

    Args:
        y_true: Encoded ground-truth labels of the test rows.
        y_pred: Encoded labels predicted for the same rows.
        encoder: Fitted LabelEncoder used to turn label ids back into names.
    """
    present_labels = np.unique(np.concatenate((y_true, y_pred)))
    present_names = [str(encoder.classes_[label]) for label in present_labels]
    print(classification_report(y_true, y_pred, labels=present_labels, target_names=present_names))


print("\nüéØ HASIL EVALUASI MODEL SENTIMEN:")
_print_label_report(y_test_sentiment, y_pred_sentiment, sentiment_encoder)

# Same guarded report for the function labels (previously unguarded)
print("\nüéØ HASIL EVALUASI MODEL FUNGSI:")
_print_label_report(y_test_function, y_pred_function, function_encoder)

# Save models. Each file is written inside a `with` block so the handle is
# flushed and closed before the next one is opened.
artifacts = {
    "knn_sentiment.pkl": knn_sentiment,
    "knn_function.pkl": knn_function,
    "vectorizer.pkl": vectorizer,
    "tf_transformer.pkl": tf_transformer,
    "sentiment_encoder.pkl": sentiment_encoder,
    "function_encoder.pkl": function_encoder,
    # The classifiers above are fit on TF x rf features, so the rf weights are
    # needed to build comparable feature vectors for new text later on.
    "rf_weights.pkl": rf,
}
for filename, artifact in artifacts.items():
    with open(drive_path + filename, "wb") as f:
        pickle.dump(artifact, f)

print(f"\n‚úÖ Model berhasil disimpan di Google Drive: {drive_path}")

**9. FUNGSI LOAD MODEL KNN, PREPROCESSING TEXT, CRAWLING DATA YOUTUBE, DAN PREDIKSI SENTIMEN**

In [None]:
# Load the pickled models, vectorizer, TF transformer and label encoders
def load_models():
    """Unpickle the six saved artifacts found under ``drive_path``.

    NOTE: ``pickle.load`` executes code embedded in the file; only load
    artifacts written by this notebook.

    Returns:
        tuple: ``(sentiment_model, function_model, vectorizer, tf_transformer,
        sentiment_encoder, function_encoder)``, in that order.
    """
    artifact_files = (
        "knn_sentiment.pkl",
        "knn_function.pkl",
        "vectorizer.pkl",
        "tf_transformer.pkl",
        "sentiment_encoder.pkl",
        "function_encoder.pkl",
    )

    loaded = []
    for filename in artifact_files:
        with open(drive_path + filename, "rb") as handle:
            loaded.append(pickle.load(handle))

    print("‚úÖ Model dan vectorizer berhasil dimuat!")
    return tuple(loaded)

# Text-cleaning helper
def preprocesses_text(text):
    """Return ``text`` lower-cased with mentions, links, tags and symbols blanked.

    Every ``@name``, ``http...`` / ``www....`` link, ``<...>`` span and every
    character that is neither a word character nor whitespace becomes a single
    space. The result is stripped at both ends; inner runs of spaces remain.

    Args:
        text (str): Raw comment text.

    Returns:
        str: The cleaned, lower-cased text.
    """
    noise_pattern = r'@\w+|http\S+|www\.\S+|<.*?>|[^\w\s]'
    return re.sub(noise_pattern, ' ', text.lower()).strip()

# Indonesian stop-word list
stop_words = set(stopwords.words('indonesian'))

# The preprocessing cell already builds these columns from the same inputs with
# the same steps, so running the pipeline again here only repeated the work
# (including stemming every token a second time) and made this cell raise
# NameError when no dataset is loaded. Rebuild the columns only when a dataset
# is present and they are missing.
if 'df' in globals() and 'final_text' not in df.columns:
    df[text_column] = df[text_column].astype(str).apply(preprocesses_text)
    df['tokens'] = df[text_column].apply(word_tokenize)
    df['filtered'] = df['tokens'].apply(lambda x: [word for word in x if word not in stop_words])
    df['stemmed'] = df['filtered'].apply(lambda x: [stemmer.stem(word) for word in x])
    df['final_text'] = df['stemmed'].apply(lambda x: ' '.join(x))

# Fetch the top-level comments of a YouTube video
def get_video_comments(api_key, video_id):
    """Download and clean the top-level comments of a video.

    Pages through ``commentThreads().list`` until the response carries no
    ``nextPageToken``; every comment text is passed through
    ``preprocesses_text`` before being collected.

    Args:
        api_key: Developer key handed to the YouTube Data API v3 client.
        video_id: ID of the video whose comment threads are listed.

    Returns:
        list[str]: Cleaned comment texts in the order received. When a request
        fails, the error is printed and the comments collected before the
        failure are returned (an empty list if the first request fails).
    """
    all_comments = []
    try:
        youtube = build('youtube', 'v3', developerKey=api_key)
        next_page_token = None

        while True:
            response = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                textFormat="plainText",
                maxResults=100,  # largest page the API allows; fewer requests per run
                pageToken=next_page_token
            ).execute()

            all_comments.extend(
                preprocesses_text(item["snippet"]["topLevelComment"]["snippet"]["textDisplay"])
                for item in response.get("items", [])
            )

            next_page_token = response.get("nextPageToken")
            if not next_page_token:
                break
    except Exception as e:
        # Best effort: report the failure but keep the pages already fetched
        # instead of throwing them away.
        print(f"‚ùå Terjadi kesalahan saat mengambil komentar: {e}")
        return all_comments

    print(f"\n‚úÖ Berhasil mengambil {len(all_comments)} komentar dari video YouTube.")

    if not all_comments:
        print("‚ö† Tidak ada komentar yang ditemukan.")

    return all_comments

# Predict sentiment and function labels for a batch of comments
def _prepare_for_vectorizer(comments):
    """Clean, tokenise, drop Indonesian stop words and stem each comment."""
    indonesian_stop_words = set(stopwords.words('indonesian'))
    comment_stemmer = StemmerFactory().create_stemmer()

    prepared = []
    for comment in comments:
        tokens = word_tokenize(preprocesses_text(comment))
        kept = [token for token in tokens if token not in indonesian_stop_words]
        prepared.append(' '.join(comment_stemmer.stem(token) for token in kept))
    return prepared


def predict_comments(comments, sentiment_model, function_model, vectorizer, tf_transformer,
                     sentiment_encoder, function_encoder, rf=None):
    """Classify comments and print/return the predicted labels.

    The vectorizer is fit on text that was cleaned, tokenised, stripped of
    stop words and stemmed. Comments were previously vectorised after cleaning
    only, so their n-grams did not line up with the fitted vocabulary; the
    same steps are now applied here before vectorising.

    Args:
        comments: Comment texts (raw or already cleaned).
        sentiment_model: Fitted classifier for the sentiment label.
        function_model: Fitted classifier for the function label.
        vectorizer: Fitted count vectorizer.
        tf_transformer: Fitted TF transformer applied to the counts.
        sentiment_encoder: Encoder used to decode sentiment predictions.
        function_encoder: Encoder used to decode function predictions.
        rf: Optional per-term weights multiplied into the TF features.
            NOTE(review): the classifiers in this notebook are fit on TF x rf
            features; with the default ``None`` the features stay TF-only as
            before — pass the training weights to match.

    Returns:
        list[tuple]: ``(comment, sentiment_label, function_label)`` per input
        comment, with ``comment`` exactly as passed in; ``[]`` when
        ``comments`` is empty.
    """
    if not comments:
        print("‚ö† Tidak ada komentar yang dapat diprediksi.")
        return []

    print("\nüîπ Melakukan preprocessing untuk komentar yang diambil...")
    prepared_texts = _prepare_for_vectorizer(comments)

    # Vectorise the prepared text
    X_counts = vectorizer.transform(prepared_texts)
    X_features = tf_transformer.transform(X_counts)
    if rf is not None:
        X_features = X_features.multiply(rf).tocsr()

    # Sentiment prediction
    sentiment_predictions = sentiment_model.predict(X_features)
    sentiment_labels = sentiment_encoder.inverse_transform(sentiment_predictions)

    # Function prediction
    function_predictions = function_model.predict(X_features)
    function_labels = function_encoder.inverse_transform(function_predictions)

    results = list(zip(comments, sentiment_labels, function_labels))

    print("\nüéØ HASIL PREDIKSI SENTIMEN DAN FUNGSI:")
    for comment, sentiment, function in results:
        print(f"üó® Komentar: {comment}\nüîπ Sentimen: {sentiment}\nüîπ Fungsi: {function}\n")

    return results

**10. PREDIKSI SENTIMEN GADGET**

In [None]:
# Entry point: fetch and classify the video's comments periodically
def main(api_key=None, video_id=None, interval=150, max_cycles=None):
    """Poll a YouTube video's comments and classify them on a fixed interval.

    Args:
        api_key: YouTube Data API key. When omitted, the ``YOUTUBE_API_KEY``
            environment variable is used, so the key does not have to be
            written into the notebook; the placeholder remains the last resort.
        video_id: Video to monitor. When omitted, ``YOUTUBE_VIDEO_ID`` from
            the environment is used, then the placeholder.
        interval: Seconds to wait between two fetches.
        max_cycles: Number of fetch/predict rounds to run; ``None`` keeps
            polling until interrupted.
    """
    if api_key is None:
        api_key = os.environ.get("YOUTUBE_API_KEY", "Change With Your API KEY")
    if video_id is None:
        video_id = os.environ.get("YOUTUBE_VIDEO_ID", "Change With Your Video Id")

    print("üöÄ Memuat model dan vectorizer...")
    sentiment_model, function_model, vectorizer, tf_transformer, sentiment_encoder, function_encoder = load_models()

    completed_cycles = 0
    try:
        while max_cycles is None or completed_cycles < max_cycles:
            print(f"\n‚è≥ Mengambil komentar pada {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}...")
            comments = get_video_comments(api_key, video_id)

            if not comments:
                print("‚ö† Tidak ada komentar yang diambil. Menunggu periode berikutnya...")
            else:
                print("\nüìä Melakukan prediksi sentimen dan fungsi...")
                predict_comments(comments, sentiment_model, function_model, vectorizer, tf_transformer, sentiment_encoder, function_encoder)
                print("\n‚úÖ Prediksi selesai!")

            completed_cycles += 1
            if max_cycles is not None and completed_cycles >= max_cycles:
                break  # no point in sleeping after the final round

            print(f"üïí Menunggu {interval / 60} menit sebelum mengambil komentar lagi...\n")
            time.sleep(interval)  # wait before fetching comments again
    except KeyboardInterrupt:
        # Interrupting the kernel is the normal way to stop an unbounded run
        print("\nPemantauan dihentikan oleh pengguna.")

# Start the polling loop when this code is executed directly rather than imported
if __name__ == "__main__":
    main()