# Predict

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import joblib

## Muat Model dan X_train


In [2]:
# Artifact locations, relative to the notebook's working directory.
MODEL_PATH = 'xgboost_model.pkl'
X_TRAIN_PATH = 'dataset/X_train.csv'

# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load model artifacts that come from a trusted source.
xgb_model = joblib.load(MODEL_PATH)

# Training feature matrix. Later cells use it to fit the scaler, to derive the
# imputation / outlier statistics and to fix the final column order.
X_train = pd.read_csv(X_TRAIN_PATH)

## Definisi Kolom dan Kategori


In [3]:
# Raw input schema, in the order the values are collected from the user.
raw_columns = [
    'age',
    'gender',
    'major',
    'study_hours_per_day',
    'social_media_hours',
    'netflix_hours',
    'part_time_job',
    'attendance_percentage',
    'sleep_hours',
    'diet_quality',
    'exercise_frequency',
    'parental_education_level',
    'internet_quality',
    'mental_health_rating',
    'extracurricular_participation',
    'previous_gpa',
    'semester',
    'stress_level',
    'dropout_risk',
    'social_activity',
    'screen_time',
    'study_environment',
    'access_to_tutoring',
    'family_income_range',
    'parental_support_level',
    'motivation_level',
    'exam_anxiety_score',
    'learning_style',
    'time_management_score',
]

# The four groups below partition raw_columns by how each column is encoded.

# Continuous / count features: mean-imputed, IQR-clipped, then standardised.
numerical_cols = [
    'age',
    'study_hours_per_day',
    'social_media_hours',
    'netflix_hours',
    'attendance_percentage',
    'sleep_hours',
    'previous_gpa',
    'semester',
    'screen_time',
    'exam_anxiety_score',
    'time_management_score',
]

# Yes/No style features mapped to 1/0.
boolean_cols = [
    'part_time_job',
    'extracurricular_participation',
    'access_to_tutoring',
]

# Unordered categorical features, one-hot encoded.
nominal_cols = [
    'gender',
    'major',
    'study_environment',
    'learning_style',
]

# Level-based categorical features, label encoded to a single integer column.
ordinal_cols = [
    'diet_quality',
    'exercise_frequency',
    'parental_education_level',
    'internet_quality',
    'mental_health_rating',
    'stress_level',
    'dropout_risk',
    'social_activity',
    'family_income_range',
    'parental_support_level',
    'motivation_level',
]

# Kategori berdasarkan kolom one-hot di X_train


In [4]:
# Level sets shared by several columns; each use below takes its own list copy.
_POOR_AVERAGE_GOOD = ('Poor', 'Average', 'Good')
_LOW_MEDIUM_HIGH = ('Low', 'Medium', 'High')

# Allowed levels for every categorical input column.
# For the four nominal columns the first level listed is the one removed by the
# one-hot encoder's drop='first'. Per the original notes, 'Female' is the known
# dropped level for gender; the first level of major, study_environment and
# learning_style is an assumption.
categories = dict(
    gender=['Female', 'Male', 'Other'],
    major=['Mathematics', 'Biology', 'Business', 'Computer Science', 'Engineering', 'Psychology'],
    study_environment=['Home', 'Co-Learning Group', 'Dorm', 'Library', 'Quiet Room'],
    learning_style=['Auditory', 'Kinesthetic', 'Reading', 'Visual'],
    diet_quality=list(_POOR_AVERAGE_GOOD),
    exercise_frequency=['Never', 'Rarely', 'Sometimes', 'Often'],
    parental_education_level=['High School', 'Bachelor', 'Master', 'PhD'],
    internet_quality=list(_POOR_AVERAGE_GOOD),
    mental_health_rating=['Poor', 'Fair', 'Good', 'Excellent'],
    stress_level=list(_LOW_MEDIUM_HIGH),
    dropout_risk=list(_LOW_MEDIUM_HIGH),
    social_activity=list(_LOW_MEDIUM_HIGH),
    family_income_range=list(_LOW_MEDIUM_HIGH),
    parental_support_level=list(_LOW_MEDIUM_HIGH),
    motivation_level=list(_LOW_MEDIUM_HIGH),
)

## Buat Objek Preprocessing


In [5]:
# Numeric features: standardise with statistics learned from the training matrix.
# NOTE(review): if X_train.csv already stores standardised values, this scaler
# learns mean ~0 / std ~1 and will not rescale raw user input -- confirm what
# X_train.csv contains.
scaler = StandardScaler().fit(X_train[numerical_cols])

# Nominal features: one-hot encode against the explicit level lists, dropping
# the first level of each column. With handle_unknown='ignore' an unseen level
# is encoded as all zeros, i.e. the same as the dropped first level.
nominal_levels = [categories[name] for name in nominal_cols]
onehot_encoder = OneHotEncoder(
    categories=nominal_levels,
    sparse_output=False,
    drop='first',
    handle_unknown='ignore',
)
# The levels are supplied explicitly, so a single dummy row (the first level of
# every nominal column) is enough to initialise the encoder.
dummy_data = pd.DataFrame({name: [categories[name][0]] for name in nominal_cols})
onehot_encoder.fit(dummy_data)

# Ordinal features: one LabelEncoder per column.
# NOTE(review): LabelEncoder assigns codes by sorted class order, not by the
# order of the lists in `categories` (e.g. 'Average' < 'Good' < 'Poor') --
# verify this matches how the training data was encoded.
label_encoders = {
    name: LabelEncoder().fit(categories[name])
    for name in ordinal_cols
}

In [None]:









## Hitung Mean, Modus, dan IQR dari X_train
# Column means of the training matrix, used to fill missing numeric inputs.
# NOTE(review): these statistics (and the IQR fences below) come from X_train
# as stored on disk; if that file is already standardised they are not on the
# same scale as raw user input -- confirm.
means = X_train[numerical_cols].mean()

# Fallback values for missing or unrecognised non-numeric inputs. These are
# hard-coded assumptions about the modes, not values computed from the data;
# adjust them if the true modes are known.
modes = {
    # boolean columns (already in encoded 0/1 form)
    'part_time_job': 0,
    'extracurricular_participation': 0,
    'access_to_tutoring': 0,
    # nominal columns
    'gender': 'Female',
    'major': 'Mathematics',
    'study_environment': 'Home',
    'learning_style': 'Auditory',
    # ordinal columns
    'diet_quality': 'Average',
    'exercise_frequency': 'Sometimes',
    'parental_education_level': 'Bachelor',
    'internet_quality': 'Average',
    'mental_health_rating': 'Fair',
    'stress_level': 'Medium',
    'dropout_risk': 'Low',
    'social_activity': 'Medium',
    'family_income_range': 'Medium',
    'parental_support_level': 'Medium',
    'motivation_level': 'Medium',
}

# Outlier fences per numeric column: [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
iqr_bounds = {}
for name in numerical_cols:
    first_quartile = X_train[name].quantile(0.25)
    third_quartile = X_train[name].quantile(0.75)
    spread = third_quartile - first_quartile
    iqr_bounds[name] = {
        'lower': first_quartile - 1.5 * spread,
        'upper': third_quartile + 1.5 * spread,
    }

## Fungsi untuk Preprocessing Satu Data
def preprocess_single_data(input_data, raw_columns, means, modes, iqr_bounds):
    """Turn one raw record into a single-row frame with the columns of ``X_train``.

    Steps, in order: impute missing values, clip numeric values to the IQR
    fences, encode boolean / nominal / ordinal columns, standardise the numeric
    columns and reorder the columns to match ``X_train``.

    Besides its arguments the function reads the notebook-level names
    ``numerical_cols``, ``boolean_cols``, ``nominal_cols``, ``ordinal_cols``,
    ``scaler``, ``onehot_encoder``, ``label_encoders`` and ``X_train``.

    Parameters
    ----------
    input_data : dict
        Raw values keyed by column name; a missing value is NaN/None.
    raw_columns : list of str
        Column names (and order) used to build the one-row frame.
    means : mapping
        Fill value per numeric column, used when the input is missing.
    modes : mapping
        Fill value per boolean / nominal / ordinal column, used when the input
        is missing or not recognised.
    iqr_bounds : dict
        ``{column: {'lower': ..., 'upper': ...}}`` clipping fences. A column or
        bound that is absent means "no clipping on that side".

    Returns
    -------
    pandas.DataFrame
        One row whose columns are exactly ``X_train.columns``, in that order.

    Raises
    ------
    KeyError
        If ``X_train`` has a column that the steps above did not produce.
    """
    data = pd.DataFrame([input_data], columns=raw_columns)

    # Impute missing values: mean for numeric columns, mode for everything else.
    for col in numerical_cols:
        if col in data.columns and pd.isna(data[col].iloc[0]):
            data[col] = means[col]

    for col in boolean_cols + nominal_cols + ordinal_cols:
        if col in data.columns and pd.isna(data[col].iloc[0]):
            data[col] = modes[col]

    # Clip numeric values to the IQR fences.
    for col in numerical_cols:
        if col in data.columns:
            bounds = iqr_bounds.get(col, {})
            data[col] = np.clip(
                data[col].iloc[0],
                bounds.get('lower', -np.inf),
                bounds.get('upper', np.inf),
            )

    # Boolean encoding.
    truthy = {'yes', 'true', '1'}
    falsy = {'no', 'false', '0'}
    numeric_types = (bool, np.bool_, int, np.integer, float, np.floating)
    for col in boolean_cols:
        if col not in data.columns:
            continue
        value = data[col].iloc[0]
        # A value that is already 0/1 (e.g. an imputed mode) is accepted as is,
        # so imputation does not trigger the "not recognised" warning.
        if isinstance(value, numeric_types) and value in (0, 1):
            data[col] = int(value)
            continue
        # Text is matched case-insensitively and ignoring surrounding spaces.
        token = str(value).strip().lower()
        if token in truthy:
            data[col] = 1
        elif token in falsy:
            data[col] = 0
        else:
            print(f"Warning: Nilai '{value}' di kolom {col} tidak dikenali. Menggunakan modus.")
            data[col] = modes[col]

    # One-hot encoding for nominal columns. The encoder is configured with
    # handle_unknown='ignore', so an unknown level becomes all zeros.
    if nominal_cols:
        encoded_nominal = onehot_encoder.transform(data[nominal_cols])
        encoded_nominal_df = pd.DataFrame(
            encoded_nominal,
            columns=onehot_encoder.get_feature_names_out(nominal_cols),
            index=data.index,  # keep the rows aligned for the concat below
        )
        data = data.drop(columns=nominal_cols)
        data = pd.concat([data, encoded_nominal_df], axis=1)

    # Label encoding for ordinal columns; unknown levels fall back to the mode.
    for col in ordinal_cols:
        if col in data.columns:
            encoder = label_encoders[col]
            raw_value = data[col].iloc[0]
            try:
                code = encoder.transform([raw_value])[0]
            except ValueError:
                print(f"Warning: Nilai '{raw_value}' di kolom {col} tidak dikenali. Menggunakan modus.")
                code = encoder.transform([modes[col]])[0]
            data[col] = code

    # Standardise numeric features with the fitted scaler.
    if numerical_cols:
        data[numerical_cols] = scaler.transform(data[numerical_cols])

    # Return the columns in the order of X_train.
    return data[X_train.columns]

## Input Pengguna
# Prompt for every raw column. An empty answer means "missing" and is imputed
# later by preprocess_single_data.
print("Masukkan data untuk prediksi exam_score:")
input_data = {}
for col in raw_columns:
    while True:
        try:
            # Strip once so stray spaces cannot make a valid category unrecognised.
            value = input(f"Masukkan {col} (tekan Enter untuk menggunakan nilai default): ").strip()
            if value == "":
                input_data[col] = np.nan
            elif col in numerical_cols:
                input_data[col] = float(value)
            else:
                input_data[col] = value
            break
        except ValueError:
            # float() rejected the text: report it and ask for this column again.
            print(f"Input tidak valid untuk {col}. Coba lagi.")

## Preprocess the input
processed_data = preprocess_single_data(input_data, raw_columns, means, modes, iqr_bounds)

## Predict
prediction = xgb_model.predict(processed_data)[0]

## Show the result
print("\n=== Hasil Prediksi ===")
print(f"Prediksi Exam Score: {prediction:.2f}")

ValueError: All arrays must be of the same length