# ðŸ“Š Pemodelan Klasifikasi

## 1. Import Library

In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib


## 2. Load Dataset

In [2]:
# Ganti 'dataset.csv' dengan nama file dataset kamu
df = pd.read_csv('Data Siswa.csv', delimiter=';;;', engine='python')
# Split the combined column into 'Nilai Akademik' and 'Kehadiran(%)'
df[['Nilai Akademik', 'Kehadiran(%)']] = df['Nilai Akademik;;Kehadiran(%)'].str.split(';;', expand=True)
# Convert relevant columns to numeric, coercing errors
df['Nilai Akademik'] = pd.to_numeric(df['Nilai Akademik'], errors='coerce')
df['Kehadiran(%)'] = pd.to_numeric(df['Kehadiran(%)'], errors='coerce')
# Drop the original combined column and any rows with NaN values introduced by coercion
df = df.drop('Nilai Akademik;;Kehadiran(%)', axis=1).dropna()
df.head()

Unnamed: 0,Nama Siswa,Nilai Akademik,Kehadiran(%)
1,Aidan Muhammad Prasetya,83.0,94.0
2,Aluna Zahra,90.0,99.0
3,Adeliana Syakira Maharani,88.0,97.0
4,Adnan Rifki,79.0,92.0
5,Alvaro Pradipta Rahman,85.0,97.0


## 3. Eksplorasi Data

In [3]:
df.info()
df.describe()
df.isnull().sum()
# Assuming 'Hasil Klasifikasi' is the target column based on typical classification tasks
if 'Hasil Klasifikasi' in df.columns:
    display(df['Hasil Klasifikasi'].value_counts())
else:
    print("Kolom 'Hasil Klasifikasi' tidak ditemukan. Mohon sesuaikan nama kolom target jika berbeda.")

<class 'pandas.core.frame.DataFrame'>
Index: 35 entries, 1 to 35
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Nama Siswa      35 non-null     object 
 1   Nilai Akademik  35 non-null     float64
 2   Kehadiran(%)    35 non-null     float64
dtypes: float64(2), object(1)
memory usage: 1.1+ KB
Kolom 'Hasil Klasifikasi' tidak ditemukan. Mohon sesuaikan nama kolom target jika berbeda.


## 4. Preprocessing Data

In [4]:
# Mengisi missing value jika ada - This step is now handled after splitting columns
# df = df.fillna(df.mode().iloc[0])

# Label Encoding for categorical columns - Now only applied to 'Nama Siswa'
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    # Exclude target column if it were categorical and not 'Nama Siswa'
    if col == 'Nama Siswa':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# Define the target column based on your criteria
# Example: Create a binary target 'Klasifikasi' based on 'Nilai Akademik' and 'Kehadiran(%)'
# Replace this with your actual logic for creating the target column
df['Klasifikasi'] = ((df['Nilai Akademik'] >= 80) & (df['Kehadiran(%)'] >= 90)).astype(int)


# Pisahkan fitur dan target
X = df.drop('Klasifikasi', axis=1) # Drop the created target column
y = df['Klasifikasi']

# Normalisasi
scaler = StandardScaler()
# Select only the numeric columns for scaling
numeric_cols = X.select_dtypes(include=np.number).columns
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

## 5. Split Data

In [5]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## 6. Pemodelan

In [6]:
from sklearn.cluster import KMeans

# Replace RandomForestClassifier with KMeans
# Choose a suitable number of clusters (e.g., 3 or 4)
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10) # Set n_init explicitly
kmeans.fit(X_train)

In [7]:
# Calculate and print inertia
inertia = kmeans.inertia_
print(f"Inertia: {inertia}")

# Predict cluster labels for the test set
y_pred_clusters = kmeans.predict(X_test)

# Calculate and print silhouette score
from sklearn.metrics import silhouette_score
silhouette_avg = silhouette_score(X_test, y_pred_clusters)
print(f"Silhouette Score: {silhouette_avg}")

Inertia: 31.375700470287963
Silhouette Score: 0.05474649608190407


## 7. Evaluasi Model

In [9]:
y_pred = kmeans.predict(X_test)

print("Akurasi:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Akurasi: 0.42857142857142855

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.43      0.60         7
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0

    accuracy                           0.43         7
   macro avg       0.33      0.14      0.20         7
weighted avg       1.00      0.43      0.60         7


Confusion Matrix:
 [[3 1 3]
 [0 0 0]
 [0 0 0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 8. Simpan Model

In [10]:
joblib.dump(kmeans, 'kmeans_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')
print("KMeans model, scaler, and encoder have been saved successfully.")

KMeans model, scaler, and encoder have been saved successfully.


## 9. Pemakaian Model

In [11]:
# Load model, scaler, and encoder
kmeans_loaded = joblib.load('kmeans_model.pkl')
scaler_loaded = joblib.load('scaler.pkl')
label_encoders_loaded = joblib.load('label_encoders.pkl')

# Example new data
# Make sure the column names match the training data features (excluding 'Klasifikasi')
data_baru = pd.DataFrame({
    'Nama Siswa': ['Siswa Baru'],
    'Nilai Akademik': [88.0],
    'Kehadiran(%)': [95.0]
})

# Apply label encoding to 'Nama Siswa' if it was used in training
if 'Nama Siswa' in label_encoders_loaded:
    le = label_encoders_loaded['Nama Siswa']
    # Add the new label to the encoder's classes and then transform
    # This is a workaround for demonstration; in a real scenario, handle unseen labels properly
    # by either retraining the encoder with new data or using a different approach like one-hot encoding
    # with handle_unknown='ignore'.
    le.classes_ = np.append(le.classes_, data_baru['Nama Siswa'].unique())
    data_baru['Nama Siswa'] = le.transform(data_baru['Nama Siswa'])


# Ensure the order of columns in data_baru matches the training data X
# Get the order of columns from the original X used for training
# Assuming X is available in the environment after previous cell execution
try:
    original_X_columns = X.columns.tolist()
    # Reindex data_baru to match the column order of original_X
    data_baru = data_baru.reindex(columns=original_X_columns)
except NameError:
    print("Error: Original training data 'X' not found. Please run the preprocessing steps first.")
    # Handle the error or exit if X is not available
    pass # Keep going for demonstration if X is not critical for the rest of the fix


# Scale the features - the scaler expects all features that were used during fit, including encoded 'Nama Siswa'
data_baru_scaled = scaler_loaded.transform(data_baru)

# Predict the cluster label
hasil_prediksi_cluster = kmeans_loaded.predict(data_baru_scaled)

print("Hasil Prediksi Cluster:", hasil_prediksi_cluster)

Hasil Prediksi Cluster: [1]




In [12]:
# Load model, scaler, and encoder
kmeans_loaded = joblib.load('kmeans_model.pkl')
scaler_loaded = joblib.load('scaler.pkl')
label_encoders_loaded = joblib.load('label_encoders.pkl')

# Example new data
# Make sure the column names match the training data features (excluding 'Klasifikasi')
data_baru = pd.DataFrame({
    'Nama Siswa': ['Siswa Baru'],
    'Nilai Akademik': [88.0],
    'Kehadiran(%)': [95.0]
})

# Apply label encoding to 'Nama Siswa' if it was used in training
if 'Nama Siswa' in label_encoders_loaded:
    le = label_encoders_loaded['Nama Siswa']
    # Add the new label to the encoder's classes and then transform
    # This is a workaround for demonstration; in a real scenario, handle unseen labels properly
    # by either retraining the encoder with new data or using a different approach like one-hot encoding
    # with handle_unknown='ignore'.
    le.classes_ = np.append(le.classes_, data_baru['Nama Siswa'].unique())
    data_baru['Nama Siswa'] = le.transform(data_baru['Nama Siswa'])


# Ensure the order of columns in data_baru matches the training data X
# Get the order of columns from the original X used for training
original_X_columns = X.columns.tolist()

# Reindex data_baru to match the column order of original_X
data_baru = data_baru.reindex(columns=original_X_columns)


# Scale the features - the scaler expects all features that were used during fit, including encoded 'Nama Siswa'
data_baru_scaled = scaler_loaded.transform(data_baru)

# Predict the cluster label
hasil_prediksi_cluster = kmeans_loaded.predict(data_baru_scaled)

print("Hasil Prediksi Cluster:", hasil_prediksi_cluster)

Hasil Prediksi Cluster: [1]


