In [1]:
!pip install scikit-learn sentence-transformers faiss-cpu transformers





Instalasi Library dan Pra-pemrosesan Teks

In [2]:
import pandas as pd
import re
import numpy as np
import os
os.makedirs("data", exist_ok=True) # Pastikan folder data ada

# 1. Muat data aktual
df = pd.read_csv("sample_skripsi_it_variatif_100.csv")
df['id'] = df.index # Tambahkan ID

def preprocess_text(text):
    """Normalisasi teks: lowercase, hapus non-alphanumeric, dan hapus spasi berlebih."""
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s\.]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def extract_methods_rule_based(text):
    """
    Ekstraksi Metode: Menggunakan rule-based (regex) untuk mencari metode populer.
    """
    methods_keywords = [
        r'cnn', r'svm', r'k-means', r'rnn', r'lstm', r'transformer', 
        r'yolov8', r'a\* search', r'agile', r'waterfall', r'scrumban', 
        r'naive bayes', r'deep learning', r'machine learning', r'aes'
    ]
    
    extracted = []
    text_lower = text.lower()
    
    for method in methods_keywords:
        # Cari metode sebagai kata terpisah
        if re.search(r'\b' + method + r'\b', text_lower):
            extracted.append(method.replace(r'\b', '').replace(r'\*', '*'))
            
    return ", ".join(sorted(list(set(extracted))))

# Terapkan pra-pemrosesan dan ekstraksi
df['abstract_clean'] = df['abstrak'].apply(preprocess_text)
df['methods_extracted'] = df['abstrak'].apply(extract_methods_rule_based)
df['title_clean'] = df['judul'].apply(preprocess_text)

# *** LANGKAH PENTING: SIMPAN HASIL KE CSV ***
TEMP_DATA_PATH = "data/thesis_data_temp_c1.csv"
df.to_csv(TEMP_DATA_PATH, index=False)

print(f"Preprocessing dan Ekstraksi Metode Selesai. Data disimpan di: {TEMP_DATA_PATH}")
print(df[['judul', 'methods_extracted']].head())

Preprocessing dan Ekstraksi Metode Selesai. Data disimpan di: data/thesis_data_temp_c1.csv
                                               judul methods_extracted
0  Klasifikasi Musik Tradisional Menggunakan Meto...               svm
1  Klasifikasi Musik Tradisional Menggunakan Meto...       transformer
2  Clusterisasi Data Wisatawan Menggunakan Metode...               rnn
3  Deteksi Serangan Siber Menggunakan Metode LSTM...              lstm
4  Prediksi Cuaca Harian Menggunakan Metode RNN b...               rnn


In [3]:
# Muat data hasil Cell 1
df = pd.read_csv("D:\SistemChatbot\sample_skripsi_it_variatif_100.csv") 

Semi-Otomatis Labeling Topik

In [4]:
import pandas as pd

# *** LANGKAH PENTING: MUAT KEMBALI DATA DARI FILE PERANTARA ***
TEMP_DATA_PATH = "data/thesis_data_temp_c1.csv"
df = pd.read_csv(TEMP_DATA_PATH)

# 1. Definisi Topik dan Kata Kunci
topic_mapping = {
    0: {"name": "Klasifikasi/Deteksi", "keywords": ["klasifikasi", "deteksi", "prediksi", "citra", "musik", "penyakit"]},
    1: {"name": "Sistem Informasi/Aplikasi", "keywords": ["sistem informasi", "rancang bangun", "aplikasi", "akademik", "e-commerce", "pengembangan aplikasi"]},
    2: {"name": "Clusterisasi/Rekomendasi", "keywords": ["clusterisasi", "rekomendasi", "wisatawan", "pelanggan"]},
    3: {"name": "Jaringan/Siber", "keywords": ["jaringan", "siber", "kerentanan", "keamanan", "iot", "cloud"]},
}
DEFAULT_LABEL_ID = 4
topic_mapping[DEFAULT_LABEL_ID] = {"name": "Lain-lain/Umum", "keywords": []}


def semi_automatic_labeling(title, abstract):
    """Memberikan label ID berdasarkan kata kunci pada judul dan abstrak."""
    # Pastikan menggunakan data yang sudah di-clean/lowercase untuk pencarian
    text = (title + " " + abstract).lower() 
    
    # Prioritaskan Topik 0-3
    for label_id, data in topic_mapping.items():
        if label_id == DEFAULT_LABEL_ID: continue
            
        for kw in data['keywords']:
            if kw in text:
                return label_id
    
    return DEFAULT_LABEL_ID

# 2. Terapkan labeling
df['topic_label_id'] = df.apply(lambda row: semi_automatic_labeling(row['title_clean'], row['abstract_clean']), axis=1)

# 3. Simpan mapping label ke CSV
topic_labels_df = pd.DataFrame([
    {'topic_label_id': id, 'topic_name': data['name']} 
    for id, data in topic_mapping.items()
])
topic_labels_df.to_csv("topic_mapping.csv", index=False)

# 4. Simpan data yang sudah diperkaya fitur (FINAL FEATURE DATA)
df.to_csv("thesis_data_features.csv", index=False)

print("Semi-Otomatis Labeling Topik Selesai.")
print(df[['judul', 'methods_extracted', 'topic_label_id']].head(10))

Semi-Otomatis Labeling Topik Selesai.
                                               judul methods_extracted  \
0  Klasifikasi Musik Tradisional Menggunakan Meto...               svm   
1  Klasifikasi Musik Tradisional Menggunakan Meto...       transformer   
2  Clusterisasi Data Wisatawan Menggunakan Metode...               rnn   
3  Deteksi Serangan Siber Menggunakan Metode LSTM...              lstm   
4  Prediksi Cuaca Harian Menggunakan Metode RNN b...               rnn   
5  Deteksi Emosi pada Teks Menggunakan Metode YOL...            yolov8   
6  Deteksi Emosi pada Teks Menggunakan Metode Nai...       naive bayes   
7  Pengenalan Wajah untuk Absensi Menggunakan Met...               svm   
8  Deteksi Serangan Siber Menggunakan Metode LSTM...              lstm   
9  Deteksi Emosi pada Teks Menggunakan Metode ARI...               NaN   

   topic_label_id  
0               0  
1               0  
2               2  
3               0  
4               0  
5               0  
6      

Membangun Indexing (Retrieval Module)

In [5]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import torch
import os

# 1. Konfigurasi
SBERT_MODEL_PATH = "models/sbert_model"
FAISS_INDEX_PATH = "data/faiss_index.bin"
os.makedirs(SBERT_MODEL_PATH, exist_ok=True)
os.makedirs("data", exist_ok=True)

# Muat data yang sudah dibersihkan
df = pd.read_csv("thesis_data_features.csv")
texts = df['abstract_clean'].tolist()

# 2. Muat dan Simpan Model SBERT
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2') 
model.save(SBERT_MODEL_PATH)
print(f"Model SBERT berhasil dimuat dan disimpan di: {SBERT_MODEL_PATH}")

# 3. Buat Embeddings
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

print(f"Membuat embeddings untuk {len(texts)} abstrak menggunakan {device}...")
with torch.no_grad():
    embeddings = model.encode(texts, 
                              convert_to_numpy=True, 
                              show_progress_bar=True,
                              device=device)

embeddings = np.asarray(embeddings, dtype=np.float32) 
D = embeddings.shape[1] 
print(f"Embeddings selesai. Dimensi vektor: {D}")

# 4. Membangun dan Menyimpan Index FAISS
index = faiss.IndexFlatL2(D)
index.add(embeddings)

faiss.write_index(index, FAISS_INDEX_PATH)

print(f"Index FAISS berhasil dibangun (N={index.ntotal}) dan disimpan di: {FAISS_INDEX_PATH}")

Model SBERT berhasil dimuat dan disimpan di: models/sbert_model
Membuat embeddings untuk 100 abstrak menggunakan cpu...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Embeddings selesai. Dimensi vektor: 768
Index FAISS berhasil dibangun (N=100) dan disimpan di: data/faiss_index.bin


Membangun Klasifikasi Topik

In [6]:
!pip install --upgrade transformers datasets





In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np
import torch
import os 

# Muat data dengan label topik baru
df = pd.read_csv("thesis_data_features.csv")
topic_labels_df = pd.read_csv("topic_mapping.csv")

# 1. Pembagian Data (Fix Imbalance/Single Class Error)
X = df['abstract_clean'].tolist()
y = df['topic_label_id'].tolist()

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42) 

# 2. Setup Model dan Tokenizer
MODEL_NAME = "indolem/indobert-base-uncased"
NUM_LABELS = topic_labels_df.shape[0] 
CLASSIFIER_MODEL_PATH = "models/topic_classifier"
os.makedirs(CLASSIFIER_MODEL_PATH, exist_ok=True)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

# --- PERBAIKAN FUNGSI TOKENISASI UTAMA ---
def tokenize_and_add_labels(examples):
    """
    Tokenisasi teks dan secara eksplisit menambahkan kolom 'labels' ke output.
    """
    tokenized_output = tokenizer(examples['text'], truncation=True, padding='max_length')
    tokenized_output['labels'] = examples['labels'] 
    return tokenized_output
# ----------------------------------------

# 3. Konversi ke Hugging Face Dataset (PERBAIKAN UTAMA DI SINI)
train_df = pd.DataFrame({'text': X_train, 'labels': y_train}) 
val_df = pd.DataFrame({'text': X_val, 'labels': y_val})

train_dataset = Dataset.from_pandas(train_df).map(
    tokenize_and_add_labels, 
    batched=True, 
    remove_columns=["text"] # HANYA HAPUS 'text'
)

val_dataset = Dataset.from_pandas(val_df).map(
    tokenize_and_add_labels, 
    batched=True, 
    remove_columns=["text"] # HANYA HAPUS 'text'
)
# ----------------------------------------

# 4. Definisikan Metrik Evaluasi
def compute_metrics(p):
    logits = p.predictions
    predictions = np.argmax(logits, axis=1)
    f1_macro = f1_score(p.label_ids, predictions, average='macro', zero_division=0)
    acc = accuracy_score(p.label_ids, predictions)
    return {'accuracy': acc, 'f1_macro': f1_macro}

# 5. Definisikan Hyperparameters Training (Minimal untuk menghindari TypeErrors)
training_args = TrainingArguments(
    output_dir=CLASSIFIER_MODEL_PATH,
    num_train_epochs=5,                          
    per_device_train_batch_size=8,             
    per_device_eval_batch_size=8,
    learning_rate=3e-5,                        
    logging_steps=100,
    do_eval=True 
)

# 6. Setup Trainer 
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

print("Setup Fine-Tuning Klasifikasi Topik IndoBERT selesai.")
print("Semua konfigurasi data dan trainer sudah diperbaiki. Silakan jalankan trainer.train() di cell berikutnya.")

# Setelah pelatihan, Anda dapat melanjutkan ke Tahap 5 (Generation Setup).

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Setup Fine-Tuning Klasifikasi Topik IndoBERT selesai.
Semua konfigurasi data dan trainer sudah diperbaiki. Silakan jalankan trainer.train() di cell berikutnya.


  trainer = Trainer(


Title Suggestion / Generation Module

In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import os

# Muat data yang sudah dibersihkan
df = pd.read_csv("thesis_data_features.csv")

# 1. Pembagian Data (Train/Val)
X = df['abstract_clean'].tolist()
y = df['title_clean'].tolist()

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Setup Model dan Tokenizer
MODEL_NAME = "google/mt5-small" 
GENERATOR_MODEL_PATH = "models/title_generator"
os.makedirs(GENERATOR_MODEL_PATH, exist_ok=True)

# Tokenizer mT5 memerlukan SentencePiece
# Jika error SentencePiece masih muncul, coba tambahkan use_fast=False (meskipun tidak perlu setelah SentencePiece terinstal)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) 
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# 3. Fungsi Tokenisasi untuk T5
def preprocess_function(examples, tokenizer):
    inputs = [f"generate title: {abstract}" for abstract in examples['input_text']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    labels = tokenizer(examples['target_text'], max_length=64, truncation=True)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# 4. Konversi ke Hugging Face Dataset
train_df = pd.DataFrame({'input_text': X_train, 'target_text': y_train})
val_df = pd.DataFrame({'input_text': X_val, 'target_text': y_val})

train_dataset = Dataset.from_pandas(train_df).map(
    lambda examples: preprocess_function(examples, tokenizer), 
    batched=True, 
    remove_columns=["input_text", "target_text", "__index_level_0__"] 
)

val_dataset = Dataset.from_pandas(val_df).map(
    lambda examples: preprocess_function(examples, tokenizer), 
    batched=True, 
    remove_columns=["input_text", "target_text", "__index_level_0__"]
)

# 5. Definisikan Hyperparameters Training (Minimal)
training_args = Seq2SeqTrainingArguments(
    output_dir=GENERATOR_MODEL_PATH,
    num_train_epochs=5,                       
    per_device_train_batch_size=4,          
    per_device_eval_batch_size=4,
    learning_rate=3e-4,                       
    logging_steps=100,
    do_eval=True,
    predict_with_generate=True,
)

# 6. Setup Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

print("Setup Fine-Tuning Title Generation (T5) Selesai.")
print("Sekarang Anda dapat menjalankan trainer.train() di cell berikutnya untuk memulai pelatihan.")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

ValueError: Column to remove ['__index_level_0__'] not in the dataset. Current columns in the dataset: ['input_text', 'target_text']

In [None]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp311-cp311-win_amd64.whl.metadata (10 kB)
Downloading sentencepiece-0.2.1-cp311-cp311-win_amd64.whl (1.1 MB)
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   ------------------- -------------------- 0.5/1.1 MB 2.8 MB/s eta 0:00:01
   ---------------------------------------- 1.1/1.1 MB 2.4 MB/s  0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.1


In [None]:
# Pastikan Anda telah menjalankan semua kode setup Tahap 5
# dan objek 'trainer', 'tokenizer', dan 'CLASSIFIER_MODEL_PATH' sudah terdefinisi.

# Mulai pelatihan
trainer.train()

# Simpan model terbaik setelah pelatihan selesai
trainer.save_model(GENERATOR_MODEL_PATH)
tokenizer.save_pretrained(GENERATOR_MODEL_PATH)

print(f"Pelatihan Title Generation Selesai. Model disimpan di: {GENERATOR_MODEL_PATH}")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Step,Training Loss


Pelatihan Title Generation Selesai. Model disimpan di: models/title_generator
