<a href="https://colab.research.google.com/github/faisalrizqin/UAS-STKI/blob/main/UAS_STKI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd

# 1. Load the CSV dataset from its storage location
dataset_path = "bbc-text.csv"
df = pd.read_csv(dataset_path)

# 2. Show the first 5 rows of the dataset
print("Contoh 5 baris data:")
print(df.head())

# 3. Show the structure of the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
print("\nInformasi struktur dataset:")
df.info()

# 4. Show the number of rows per category (optional, helps the initial analysis)
print("\nDistribusi kategori:")
print(df['category'].value_counts())


Contoh 5 baris data:
        category                                               text
0           tech  tv future in the hands of viewers with home th...
1       business  worldcom boss  left books alone  former worldc...
2          sport  tigers wary of farrell  gamble  leicester say ...
3          sport  yeading face newcastle in fa cup premiership s...
4  entertainment  ocean s twelve raids box office ocean s twelve...

Informasi struktur dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  2225 non-null   object
 1   text      2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB
None

Distribusi kategori:
category
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64


In [5]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Fetch every NLTK resource the preprocessing step needs (tokenizer models
# and the stopword corpus), in the same order as before.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# 1. Build the preprocessing tools: a Porter stemmer and the English stopword set
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character. Built once
# here instead of on every call, since preprocess_text runs once per document.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize one document into a space-joined string of stemmed tokens.

    Pipeline: lowercase -> delete ASCII punctuation -> tokenize with
    ``word_tokenize`` -> drop tokens present in ``stop_words`` -> stem each
    remaining token with ``stemmer``.

    Args:
        text: Raw document text. Non-string values (for example ``None`` or
            the float NaN pandas uses for missing cells) are treated as an
            empty document instead of raising ``AttributeError``.

    Returns:
        str: The processed tokens joined by single spaces; ``""`` when the
        input is not a string or no token survives filtering.
    """
    # Missing cells arrive as None/NaN, which have no .lower(); treat as empty.
    if not isinstance(text, str):
        return ""

    # Lowercase, then strip punctuation in a single pass
    text = text.lower().translate(_PUNCTUATION_TABLE)

    # Tokenize
    tokens = word_tokenize(text)

    # Drop stopwords first, then stem what is left, and join back into a string
    return " ".join(stemmer.stem(word) for word in tokens if word not in stop_words)

# Apply the preprocessing to every document and store the result in a new column
df["clean_text"] = df["text"].apply(preprocess_text)

# Sanity check: show raw and cleaned text side by side for the first rows
df[["text", "clean_text"]].head()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,text,clean_text
0,tv future in the hands of viewers with home th...,tv futur hand viewer home theatr system plasma...
1,worldcom boss left books alone former worldc...,worldcom boss left book alon former worldcom b...
2,tigers wary of farrell gamble leicester say ...,tiger wari farrel gambl leicest say rush make ...
3,yeading face newcastle in fa cup premiership s...,yead face newcastl fa cup premiership side new...
4,ocean s twelve raids box office ocean s twelve...,ocean twelv raid box offic ocean twelv crime c...


In [6]:
# Persist the preprocessed dataset (without the index column)
output_path = "bbc-text-preprocessed.csv"
df.to_csv(output_path, index=False)

# Report where the file was written: message on one line, path on the next
print("File preprocessing berhasil disimpan di:", output_path, sep="\n")

# Preview the first 5 rows to confirm the result looks right
df.head()

File preprocessing berhasil disimpan di:
bbc-text-preprocessed.csv


Unnamed: 0,category,text,clean_text
0,tech,tv future in the hands of viewers with home th...,tv futur hand viewer home theatr system plasma...
1,business,worldcom boss left books alone former worldc...,worldcom boss left book alon former worldcom b...
2,sport,tigers wary of farrell gamble leicester say ...,tiger wari farrel gambl leicest say rush make ...
3,sport,yeading face newcastle in fa cup premiership s...,yead face newcastl fa cup premiership side new...
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean twelv raid box offic ocean twelv crime c...


In [7]:
# ============================================================
# — TOKENISASI GPT + PADDING & TRUNCATION + TRAIN-TEST SPLIT
# ============================================================

from transformers import GPT2Tokenizer
from sklearn.model_selection import train_test_split
import torch

# -------------------------------
# 1. Load the GPT-2 tokenizer
# -------------------------------
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 has no pad token, so reuse the end-of-sequence token for padding
tokenizer.pad_token = tokenizer.eos_token

# Sequence length (in tokens) used for truncation/padding
MAX_LEN = 128


# -------------------------------
# 2. Fungsi tokenisasi ke seluruh dataset
# -------------------------------
def tokenize_batch(text_list):
    """Tokenize a list of texts with the module-level GPT-2 ``tokenizer``.

    Each text is padded to ``MAX_LEN`` tokens and truncated when longer, and
    the result is returned as PyTorch tensors.

    Args:
        text_list: List of strings to tokenize.

    Returns:
        Whatever ``tokenizer(...)`` returns for these options (the encodings
        holding the token id tensors).
    """
    tokenizer_options = {
        "padding": "max_length",   # pad every sequence up to MAX_LEN
        "truncation": True,        # cut texts that are too long
        "max_length": MAX_LEN,
        "return_tensors": "pt",    # PyTorch tensors as output
    }
    encodings = tokenizer(text_list, **tokenizer_options)
    return encodings


# -------------------------------
# 3. Train-test split
# -------------------------------
# 80/20 split of the preprocessed text and its category labels, with a fixed
# random_state so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"],
    df["category"],
    test_size=0.2,
    random_state=42,
    stratify=df["category"]     # keep the label distribution the same in both splits
)

# -------------------------------
# 4. Tokenize the train & test data
# -------------------------------
train_encodings = tokenize_batch(X_train.tolist())
test_encodings = tokenize_batch(X_test.tolist())

# Convert the string labels to numeric tensors
from sklearn.preprocessing import LabelEncoder

# The encoder is fitted on the training labels only and then reused for the
# test labels, so both splits share the same class-to-integer mapping.
label_encoder = LabelEncoder()
y_train_encoded = torch.tensor(label_encoder.fit_transform(y_train.values))
y_test_encoded = torch.tensor(label_encoder.transform(y_test.values))


# -------------------------------
# 5. Show a summary of the result
# -------------------------------
# Split sizes, plus the first 30 token ids of the first training example
print("Jumlah data train:", len(X_train))
print("Jumlah data test :", len(X_test))
print("Contoh token IDs:\n", train_encodings["input_ids"][0][:30])

Jumlah data train: 1780
Jumlah data test : 445
Contoh token IDs:
 tensor([20077,  5750, 21419, 17945,   823,   666,    66,  3096,   773,   666,
        38092,   823,   666,    66,  2477,  2648,  2822,  1891,  3753,  1245,
         1176,  3569,    75,  1630,  1145,  2403,  2822,  1891,  2210, 10145])


In [8]:
# ============================================================
# — IMPLEMENTASI MODEL GPT-2 UNTUK KLASIFIKASI TEKS
# ============================================================

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import GPT2ForSequenceClassification

# --------------------------------------------
# 1. Dataset Class untuk PyTorch
# --------------------------------------------
class BBCDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable container of
            per-example values (anything exposing ``.items()``).
        labels: Indexable, sized container with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the labels container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of every encoding field plus ``"labels"``."""
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        sample["labels"] = self.labels[idx]
        return sample


# --------------------------------------------
# 2. Persiapan Dataset untuk DataLoader
# --------------------------------------------
# Wrap the tokenized splits and their encoded labels as PyTorch datasets
train_dataset = BBCDataset(train_encodings, y_train_encoded)
test_dataset = BBCDataset(test_encodings, y_test_encoded)

# NOTE: train_loader / test_loader are created in the training-loop section
# (step 5), where BATCH_SIZE is defined. Building them here as well with a
# different batch size would be dead code, because step 5 rebinds both names
# before either loader is iterated.


# --------------------------------------------
# 3. Load the GPT-2 model for classification
# --------------------------------------------
# Seed PyTorch before anything random happens: the classification head
# (`score.weight`) is newly initialized when the checkpoint is loaded, and the
# training DataLoader shuffles the data. Same seed value as the train/test
# split. NOTE: some GPU kernels can still be non-deterministic.
torch.manual_seed(42)

model = GPT2ForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=len(label_encoder.classes_)   # one output per category
)

# GPT-2 has no pad token -> tell the model to treat EOS as padding, matching
# the tokenizer configuration
model.config.pad_token_id = tokenizer.eos_token_id

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# --------------------------------------------
# 4. AdamW optimizer from PyTorch
# --------------------------------------------
# Optimizes every model parameter with a learning rate of 5e-5
optimizer = AdamW(model.parameters(), lr=5e-5)


# --------------------------------------------
# 5. Training loop
# --------------------------------------------
EPOCHS = 1
BATCH_SIZE = 1

# Loaders consumed by the training loop and the evaluation step; only the
# training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0  # running sum of per-batch losses for this epoch

    for batch in train_loader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Move the batch to the same device as the model
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Passing labels makes the model return the loss along with its outputs
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagate and update the weights
        loss.backward()
        optimizer.step()

    # Report the mean loss per batch for the epoch
    print(f"[Epoch {epoch+1}] Training Loss: {total_loss / len(train_loader):.4f}")


# --------------------------------------------
# 6. Evaluation
# --------------------------------------------
model.eval()
correct = 0  # number of test examples predicted correctly
total = 0    # number of test examples seen

# No gradients are needed while scoring the test set
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Labels are not passed here: only the logits are needed
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Predicted class = index of the largest logit for each example
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)

        correct += (predictions == labels).sum().item()
        total += labels.size(0)

# Fraction of correct predictions, printed as a percentage
accuracy = correct / total
print(f"Akurasi data uji: {accuracy * 100:.2f}%")


# --------------------------------------------
# 7. Prediksi Teks Baru
# --------------------------------------------
def predict(text):
    """Predict the category of a single raw text.

    The text goes through the same pipeline as the training data:
    ``preprocess_text`` (the model was trained on the ``clean_text`` column,
    not on raw text) followed by ``tokenize_batch`` (same padding, truncation
    and ``MAX_LEN`` as training). Skipping either step would feed the model
    inputs unlike anything it was trained on.

    Args:
        text: Raw, unprocessed input text.

    Returns:
        The predicted category label, decoded with ``label_encoder``.
    """
    model.eval()

    # Same cleaning as the training data, then a batch of one through the
    # shared tokenization helper
    clean_text = preprocess_text(text)
    tokens = tokenize_batch([clean_text]).to(device)

    with torch.no_grad():
        output = model(**tokens)
        # Index of the largest logit is the predicted class id
        prediction = torch.argmax(output.logits, dim=1).item()

    # Map the numeric class id back to its category name
    return label_encoder.inverse_transform([prediction])[0]


# Example prediction on a new sentence
sample_text = "The football team won their match with a strong performance."
print("Prediksi kategori:", predict(sample_text))


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 1] Training Loss: 0.4217
Akurasi data uji: 93.93%
Prediksi kategori: sport
