<a href="https://colab.research.google.com/github/dhani43/KNN-Model-TFRF-Dinamic-Crawling-Youtube/blob/main/Skripsi_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. INSTALL REQUIREMENT**

In [None]:
!pip install pandas numpy scikit-learn openpyxl nltk google-api-python-client Sastrawi

**2. IMPORT REQUIREMENT**

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import pickle
import os
import time
from datetime import datetime
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from google.colab import files, drive
from googleapiclient.discovery import build

**3. CRAWLING DATA**

In [None]:
def _parse_published_at(value):
    """Parse a ``publishedAt`` UTC timestamp, with or without fractional seconds.

    Returns a naive ``datetime`` (no tzinfo), as the original parsing did.
    Raises ``ValueError`` when the value matches neither supported layout.
    """
    for fmt in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%fZ"):
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    raise ValueError(f"Unrecognised publishedAt timestamp: {value!r}")


def _comment_row(comment):
    """Turn one comment resource (top-level comment or reply) into an output row."""
    snippet = comment["snippet"]
    return {
        "Time": _parse_published_at(snippet["publishedAt"]),
        "Author": snippet["authorDisplayName"],
        "Comment": snippet["textDisplay"],
    }


def _rows_from_comment_page(response):
    """Flatten one commentThreads page into rows: each top-level comment, then its replies."""
    rows = []
    for item in response.get("items", []):
        rows.append(_comment_row(item["snippet"]["topLevelComment"]))
        # "replies" is only present when the thread has replies and the
        # request asked for the "replies" part.
        for reply in item.get("replies", {}).get("comments", []):
            rows.append(_comment_row(reply))
    return rows


def get_video_comments(api_key, video_ids, max_retries=3, retry_delay=30, output_path="NewComments.xlsx"):
    """Crawl the comments of several YouTube videos and save them to an Excel file.

    Every page of comment threads is requested for each video. Top-level
    comments and the replies returned with each thread are collected as rows
    with the columns ``Time``, ``Author`` and ``Comment``.

    Args:
        api_key: YouTube Data API key passed to ``build``.
        video_ids: Iterable of video ids to crawl.
        max_retries: How many consecutive failures of the same page are
            retried before the current video is abandoned.
        retry_delay: Seconds to wait between retries.
        output_path: Path of the Excel workbook that is written.

    Returns:
        None. The rows are written to ``output_path``.
    """
    youtube = build('youtube', 'v3', developerKey=api_key)

    # Rows gathered from every video.
    all_comments = []

    for video_id in video_ids:
        next_page_token = None
        failed_attempts = 0

        while True:
            try:
                response = youtube.commentThreads().list(
                    # "replies" must be requested explicitly, otherwise the
                    # threads carry no replies at all.
                    # NOTE(review): the API may return only a subset of a
                    # thread's replies here - confirm if all replies are needed.
                    part="snippet,replies",
                    videoId=video_id,
                    textFormat="plainText",
                    maxResults=100,  # largest page size: fewer requests per video
                    pageToken=next_page_token
                ).execute()
                # Build the page's rows before touching all_comments, so a
                # failure never leaves a half-added page that a retry would
                # then duplicate.
                page_rows = _rows_from_comment_page(response)
            except Exception as e:
                failed_attempts += 1
                print(f"An error occurred: {e}")
                if failed_attempts > max_retries:
                    # Permanent failures (e.g. comments disabled) must not
                    # loop forever; move on to the next video.
                    print(f"Giving up on video '{video_id}' after {max_retries} retries.")
                    break
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
                continue

            failed_attempts = 0
            all_comments.extend(page_rows)

            # Continue with the next page, if any.
            next_page_token = response.get("nextPageToken")
            if not next_page_token:
                break

    # Explicit columns keep the workbook schema stable even when nothing was fetched.
    df = pd.DataFrame(all_comments, columns=["Time", "Author", "Comment"])
    df.to_excel(output_path, index=False)
    print(f"Comments saved to '{output_path}'")

# Main function
def main():
    """Crawl the comments of the configured videos into an Excel workbook.

    The API key is read from the ``YOUTUBE_API_KEY`` environment variable and,
    when that is unset or empty, requested interactively, so that no
    credential is stored in the notebook.
    """
    import getpass

    # A key that was ever committed together with the notebook should be
    # treated as leaked and rotated.
    api_key = os.environ.get("YOUTUBE_API_KEY") or getpass.getpass("YouTube Data API key: ")
    video_ids = ["c545HEI7OAU", "x7wSRgwMIpU", "l_kOERYYUkg"]
    get_video_comments(api_key, video_ids)

if __name__ == "__main__":
    main()

**4. MENGHUBUNGKAN DENGAN GOOGLE DRIVE**

In [None]:
# Google Drive folder in which the trained artefacts are stored.
# The trailing slash matters: file names are appended to this string as-is.
drive_path = "/content/drive/My Drive/Model_Sentimen/"

# (Re)mount Google Drive, then create the target folder when it is missing.
drive.mount('/content/drive', force_remount=True)
os.makedirs(drive_path, exist_ok=True)

**5. UNDUH RESOURCE NLTK**

In [None]:
# Download the NLTK resources this notebook uses.
for nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

**6. UPLOAD DAN MENAMPILKAN DATASET**

In [None]:
# Ask the user to upload the dataset workbook.
uploaded = files.upload()
if not uploaded:
    # Fail with a clear message instead of an IndexError on the lookup below.
    raise ValueError("No file was uploaded; upload the dataset workbook and re-run this cell.")

# The first uploaded file is used as the dataset.
dataset_path = next(iter(uploaded))
df = pd.read_excel(dataset_path)

display(df.head())

**7. PREPROCESSING DATA**

In [None]:
# Dataset columns: the comment text and its two label columns.
text_column, sentiment_column, function_column = 'Comment', 'Sentimen', 'Fungsi'

# NLTK's Indonesian stopword list, held as a set for membership tests.
stop_words = {*stopwords.words('indonesian')}

# Sastrawi stemmer shared by the preprocessing function.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Preprocessing
def preprocesses_text(text):
    """Run one comment through the preprocessing pipeline, keeping every stage.

    Args:
        text: The comment as a string.

    Returns:
        A dict with the result of each stage: ``casefolded_text`` (lower-cased
        input), ``cleaned_text`` (after pattern removal and stripping),
        ``tokens``, ``filtered`` (tokens that are not stopwords), ``stemmed``
        (stemmed ``filtered`` tokens) and ``final_text`` (``stemmed`` joined
        by single spaces).
    """
    lowered = text.lower()

    # Mentions, URLs, tag-like "<...>" spans and every character that is
    # neither a word character nor whitespace are replaced by a space.
    scrubbed = re.sub(r'@\w+|http\S+|www\.\S+|<.*?>|[^\w\s]', ' ', lowered).strip()

    words = word_tokenize(scrubbed)

    # Drop stopwords and stem the surviving words in a single pass.
    kept_words = []
    root_words = []
    for word in words:
        if word in stop_words:
            continue
        kept_words.append(word)
        root_words.append(stemmer.stem(word))

    return {
        'casefolded_text': lowered,
        'cleaned_text': scrubbed,
        'tokens': words,
        'filtered': kept_words,
        'stemmed': root_words,
        'final_text': ' '.join(root_words)
    }

# Apply the preprocessing pipeline. The per-comment result dicts live in their
# own Series, so the raw text column is left untouched: this cell can be
# re-run, and later cells that read the text column still see the comments.
preprocessed = df[text_column].astype(str).apply(preprocesses_text)

# One DataFrame column per pipeline stage.
for stage in ('casefolded_text', 'cleaned_text', 'tokens', 'filtered', 'stemmed', 'final_text'):
    df[stage] = preprocessed.apply(lambda result, key=stage: result[key])

# Encode both label columns as integers.
sentiment_encoder = LabelEncoder()
function_encoder = LabelEncoder()
df['sentiment_label'] = sentiment_encoder.fit_transform(df[sentiment_column])
df['function_label'] = function_encoder.fit_transform(df[function_column])

# Show the result of every step, in pipeline order.
for heading, shown_columns in (
    ("\n✅ 1. HASIL CASE FOLDING :", ['casefolded_text']),
    ("\n✅ 2. HASIL CLEANING :", ['cleaned_text']),
    ("\n✅ 3. HASIL TOKENIZING :", ['tokens']),
    ("\n✅ 4. HASIL STOPWORD REMOVAL :", ['filtered']),
    ("\n✅ 5. HASIL STEMMING :", ['stemmed']),
    ("\n✅ 6. TEKS FINAL :", ['final_text']),
    ("\n✅ 7. ENCODING LABEL SENTIMEN :", [sentiment_column, 'sentiment_label']),
    ("\n✅ 8. ENCODING LABEL FUNGSI :", [function_column, 'function_label']),
):
    print(heading)
    display(df[shown_columns].head())

**8. PEMBOBOTAN KATA (TF-RF)**

In [None]:
# TF-RF vectorization: term counts over unigrams and bigrams of the final text.
vectorizer = CountVectorizer(ngram_range=(1,2))
X_counts = vectorizer.fit_transform(df['final_text'])

# TF part: TfidfTransformer with the IDF factor switched off.
tf_transformer = TfidfTransformer(use_idf=False).fit(X_counts)
X_tf = tf_transformer.transform(X_counts)

# Number of documents containing each feature, counted on the sparse matrix
# itself so the full count matrix is never materialised as a dense array.
df_counts = np.asarray((X_counts > 0).sum(axis=0)).ravel()
n_docs = X_counts.shape[0]
b = df_counts          # documents that contain the feature
c = n_docs - b         # documents that do not contain it
rf = np.log(2 + b / np.maximum(1, c))  # max(1, c) avoids division by zero
rf = rf.reshape(1, -1)

# Scale every TF column by its weight; CSR keeps the result row-indexable.
# NOTE: `rf` is derived from this corpus. Features built later for prediction
# need the same scaling to be comparable with X_tfrf.
X_tfrf = X_tf.multiply(rf).tocsr()

pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', True)
# Feature names (words / phrases) learned by the vectorizer.
feature_names = vectorizer.get_feature_names_out()
# Dense copy of the TF-RF matrix, used only to show it as a table.
tfrf_array = X_tfrf.toarray()
df_tfrf = pd.DataFrame(tfrf_array, columns=feature_names)
print(df_tfrf)

**9. SPLIT DATA, MELATIH MODEL, MENAMPILKAN HASIL EVALUASI MODEL, MENYIMPAN MODEL**

In [None]:
# Split data once: the feature rows and both label vectors are split with the
# same seed, so the sentiment and function models see the same train/test rows.
(X_train, X_test,
 y_train_sentiment, y_test_sentiment,
 y_train_function, y_test_function) = train_test_split(
    X_tfrf, df['sentiment_label'], df['function_label'], test_size=0.3, random_state=42)
# Kept under their previous names as well.
X_train_func, X_test_func = X_train, X_test

# Train KNN models
knn_sentiment = KNeighborsClassifier(n_neighbors=3, metric='cosine')
knn_sentiment.fit(X_train, y_train_sentiment)

knn_function = KNeighborsClassifier(n_neighbors=3, metric='cosine')
knn_function.fit(X_train_func, y_train_function)

# Evaluation
y_pred_sentiment = knn_sentiment.predict(X_test)
y_pred_function = knn_function.predict(X_test_func)


def _print_report(y_true, y_pred, encoder):
    """Print a classification report restricted to the labels that actually occur.

    Passing every encoder class as target names fails when a class is absent
    from both the test labels and the predictions, so labels and names are
    derived from the data.
    """
    present = np.unique(np.concatenate((y_true, y_pred)))
    names = list(encoder.classes_[present])
    print(classification_report(y_true, y_pred, labels=present, target_names=names))


print("\n🎯 HASIL EVALUASI MODEL SENTIMEN:")
_print_report(y_test_sentiment, y_pred_sentiment, sentiment_encoder)

print("\n🎯 HASIL EVALUASI MODEL FUNGSI:")
_print_report(y_test_function, y_pred_function, function_encoder)

# Save models. Each file is written inside a `with` block so the handle is
# closed (and the data flushed) before the success message is printed.
artifacts = {
    "knn_sentiment.pkl": knn_sentiment,
    "knn_function.pkl": knn_function,
    "vectorizer.pkl": vectorizer,
    "tf_transformer.pkl": tf_transformer,
    "sentiment_encoder.pkl": sentiment_encoder,
    "function_encoder.pkl": function_encoder,
}
for artifact_name, artifact in artifacts.items():
    with open(drive_path + artifact_name, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)

print(f"\n✅ Model berhasil disimpan di Google Drive: {drive_path}")

**10. FUNGSI LOAD MODEL KNN, PREPROCESSING TEXT, CRAWLING DATA YOUTUBE, DAN PREDIKSI SENTIMEN**

In [None]:
# Fungsi untuk memuat model dan vectorizer
def load_models():
    """Load the pickled models, vectorizer, TF transformer and label encoders.

    The files are read from the folder named by the module-level ``drive_path``.

    Returns:
        A 6-tuple ``(sentiment_model, function_model, vectorizer,
        tf_transformer, sentiment_encoder, function_encoder)``.
    """
    # Listed in the order in which the objects are returned.
    artifact_files = (
        "knn_sentiment.pkl",
        "knn_function.pkl",
        "vectorizer.pkl",
        "tf_transformer.pkl",
        "sentiment_encoder.pkl",
        "function_encoder.pkl",
    )

    loaded = []
    for artifact_name in artifact_files:
        with open(drive_path + artifact_name, "rb") as artifact_file:
            loaded.append(pickle.load(artifact_file))

    print("✅ Model dan vectorizer berhasil dimuat!")
    return tuple(loaded)

# Fungsi untuk membersihkan teks
def preprocesses_text(text):
    """Preprocess one comment and return the output of every pipeline stage.

    Stages, in order: lower-casing, pattern-based cleaning, tokenizing,
    stopword removal, stemming, and joining the stems into the final text.

    Args:
        text: The comment as a string.

    Returns:
        A dict with the keys ``casefolded_text``, ``cleaned_text``,
        ``tokens``, ``filtered``, ``stemmed`` and ``final_text``.
    """
    # Matches mentions, URLs, tag-like "<...>" spans and any character that is
    # neither a word character nor whitespace.
    noise_pattern = r'@\w+|http\S+|www\.\S+|<.*?>|[^\w\s]'

    stages = {'casefolded_text': text.lower()}
    stages['cleaned_text'] = re.sub(noise_pattern, ' ', stages['casefolded_text']).strip()
    stages['tokens'] = word_tokenize(stages['cleaned_text'])
    stages['filtered'] = [token for token in stages['tokens'] if token not in stop_words]
    stages['stemmed'] = list(map(stemmer.stem, stages['filtered']))
    stages['final_text'] = ' '.join(stages['stemmed'])
    return stages

def _preprocess_cell(value):
    """Return the preprocessing result for one cell of the text column.

    A cell that already holds a preprocessing result dict is reused as it is;
    running the pipeline on the string form of such a dict would produce
    garbage text. Anything else is converted to ``str`` and preprocessed.
    """
    if isinstance(value, dict) and 'final_text' in value:
        return value
    return preprocesses_text(str(value))


# Rebuild the per-stage columns from the text column.
results = df[text_column].apply(_preprocess_cell)
for stage in ('casefolded_text', 'cleaned_text', 'tokens', 'filtered', 'stemmed', 'final_text'):
    df[stage] = results.apply(lambda result, key=stage: result[key])

# Fungsi untuk mengambil komentar dari YouTube
def get_video_comments(api_key, video_id):
    """Fetch and preprocess the top-level comments of one YouTube video.

    Note: this definition shares its name with the multi-video crawler defined
    earlier in the notebook and replaces it once this cell has run.

    Args:
        api_key: YouTube Data API key passed to ``build``.
        video_id: Id of the video whose comment threads are listed.

    Returns:
        A list with the ``final_text`` of every fetched top-level comment.
        When a request fails, the error is printed and the comments fetched
        up to that point are returned (an empty list if none were).
    """
    # Created before the try block so an error does not discard earlier pages.
    all_comments = []

    try:
        youtube = build('youtube', 'v3', developerKey=api_key)
        next_page_token = None

        while True:
            response = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                textFormat="plainText",
                maxResults=100,  # largest page size: fewer requests per poll
                pageToken=next_page_token
            ).execute()

            for item in response.get("items", []):
                comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
                all_comments.append(preprocesses_text(comment)['final_text'])

            next_page_token = response.get("nextPageToken")
            if not next_page_token:
                break
    except Exception as e:
        print(f"❌ Terjadi kesalahan saat mengambil komentar: {e}")
        return all_comments

    print(f"\n✅ Berhasil mengambil {len(all_comments)} komentar dari video YouTube.")

    if not all_comments:
        print("⚠ Tidak ada komentar yang ditemukan.")

    return all_comments

# Fungsi untuk melakukan prediksi sentimen dan fungsi
def predict_comments(comments, sentiment_model, function_model, vectorizer, tf_transformer, sentiment_encoder, function_encoder, rf_weights=None):
    """Predict the sentiment and the function label of preprocessed comments.

    Args:
        comments: List of preprocessed comment strings.
        sentiment_model: Fitted classifier for the sentiment label.
        function_model: Fitted classifier for the function label.
        vectorizer: Fitted count vectorizer.
        tf_transformer: Fitted TF transformer applied to the counts.
        sentiment_encoder: Label encoder used to decode sentiment predictions.
        function_encoder: Label encoder used to decode function predictions.
        rf_weights: Optional per-feature weights (shape ``(1, n_features)``)
            multiplied into the TF matrix before prediction. Models trained on
            TF x RF features need the same weights here, otherwise the query
            vectors are scaled differently from the training vectors. With the
            default ``None`` the plain TF features are used, as before.

    Returns:
        A list of dicts with the keys ``comment``, ``predicted_sentiment`` and
        ``predicted_function``; an empty list when ``comments`` is empty.
    """
    if not comments:
        print("⚠ Tidak ada komentar yang dapat diprediksi.")
        return []

    # Vectorize the comments with the fitted vectorizer and TF transformer.
    X_counts = vectorizer.transform(comments)
    features = tf_transformer.transform(X_counts)
    if rf_weights is not None:
        # Same column-wise scaling as applied to the training features.
        features = features.multiply(rf_weights).tocsr()

    # Predict both targets and decode them back to their original labels.
    sentiment_labels = sentiment_encoder.inverse_transform(sentiment_model.predict(features))
    function_labels = function_encoder.inverse_transform(function_model.predict(features))

    print("\n🎯 HASIL PREDIKSI SENTIMEN DAN FUNGSI:")
    results = []
    for comment, sentiment, function in zip(comments, sentiment_labels, function_labels):
        print(f"🗨 Komentar: {comment}\n🔹 Sentimen: {sentiment}\n🔹 Fungsi: {function}\n")
        results.append({
            'comment': comment,
            'predicted_sentiment': sentiment,
            'predicted_function': function
        })
    return results

**11. PREDIKSI SENTIMEN GADGET**

In [None]:
def main():
    """Poll one YouTube video forever and report predictions for its comments.

    On every cycle the comments are fetched, the sentiment and function of
    each comment are predicted, and the number of comments predicted as
    'Positif' is printed per function. The loop then sleeps for ``interval``
    seconds and has no exit condition of its own.

    The API key is read from the ``YOUTUBE_API_KEY`` environment variable and,
    when that is unset or empty, requested interactively, so that no
    credential is stored in the notebook.
    """
    import getpass

    # A key that was ever committed together with the notebook should be
    # treated as leaked and rotated.
    api_key = os.environ.get("YOUTUBE_API_KEY") or getpass.getpass("YouTube Data API key: ")
    video_id = "Mari1pJzhWM"
    interval = 150  # Waiting time between two polls, in seconds

    print("🚀 Memuat model dan vectorizer...")
    sentiment_model, function_model, vectorizer, tf_transformer, sentiment_encoder, function_encoder = load_models()

    while True:
        print(f"\n⏳ Mengambil komentar pada {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}...")
        comments = get_video_comments(api_key, video_id)

        if not comments:
            print("⚠ Tidak ada komentar yang diambil. Menunggu periode berikutnya...")
        else:
            print("\n📊 Melakukan prediksi sentimen dan fungsi...")
            predictions = predict_comments(comments, sentiment_model, function_model, vectorizer, tf_transformer, sentiment_encoder, function_encoder)
            predictions_df = pd.DataFrame(predictions)
            print("\n✅ Prediksi selesai!")

            if not predictions_df.empty:
                # Only comments predicted as positive are counted, grouped by function.
                positif_df = predictions_df[predictions_df['predicted_sentiment'] == 'Positif']
                counts = positif_df['predicted_function'].value_counts()

                print("\n📈 Jumlah komentar dengan sentimen positif per fungsi gadget:")
                for fungsi, jumlah in counts.items():
                    print(f"   - {fungsi}: {jumlah} komentar positif")

        print(f"🕒 Menunggu {interval / 60} menit sebelum mengambil komentar lagi...\n")
        time.sleep(interval)

if __name__ == "__main__":
    main()