In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# import logistic regression, random forest, decision tree, gradient boosting and SVM
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
import warnings

warnings.filterwarnings("ignore")

In [3]:
# Location of the raw student-performance dataset (semicolon-separated CSV).
url = (
    "https://raw.githubusercontent.com/dicodingacademy/dicoding_dataset/"
    "main/students_performance/data.csv"
)

# Read the remote file into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer=url, sep=";")
df.head(n=5)

Unnamed: 0,Marital_status,Application_mode,Application_order,Course,Daytime_evening_attendance,Previous_qualification,Previous_qualification_grade,Nacionality,Mothers_qualification,Fathers_qualification,...,Curricular_units_2nd_sem_credited,Curricular_units_2nd_sem_enrolled,Curricular_units_2nd_sem_evaluations,Curricular_units_2nd_sem_approved,Curricular_units_2nd_sem_grade,Curricular_units_2nd_sem_without_evaluations,Unemployment_rate,Inflation_rate,GDP,Status
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [4]:
# Select the features that have a reasonably high correlation with the target.
selected_features = [
    "Curricular_units_2nd_sem_approved",
    "Curricular_units_2nd_sem_grade",
    "Curricular_units_1st_sem_approved",
    "Curricular_units_1st_sem_grade",
    "Tuition_fees_up_to_date",
    "Scholarship_holder",
    "Age_at_enrollment",
    "Debtor",
    "Admission_grade",
    "Application_mode",
    "Displaced",
    "Previous_qualification_grade",
]

# Keep the chosen features plus the "Status" target column.
# Take an explicit copy: later cells assign new values to columns of
# `df_selected`, and writing to a column subset of `df` without `.copy()`
# triggers pandas' SettingWithCopyWarning.
df_selected = df[selected_features + ["Status"]].copy()
df_selected

Unnamed: 0,Curricular_units_2nd_sem_approved,Curricular_units_2nd_sem_grade,Curricular_units_1st_sem_approved,Curricular_units_1st_sem_grade,Tuition_fees_up_to_date,Scholarship_holder,Age_at_enrollment,Debtor,Admission_grade,Application_mode,Displaced,Previous_qualification_grade,Status
0,0,0.000000,0,0.000000,1,0,20,0,127.3,17,1,122.0,Dropout
1,6,13.666667,6,14.000000,0,0,19,0,142.5,15,1,160.0,Graduate
2,0,0.000000,0,0.000000,0,0,19,0,124.8,1,1,122.0,Dropout
3,5,12.400000,6,13.428571,1,0,20,0,119.6,17,1,122.0,Graduate
4,6,13.000000,5,12.333333,1,0,45,0,141.5,39,0,100.0,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,5,12.666667,5,13.600000,1,0,19,0,122.2,1,0,125.0,Graduate
4420,2,11.000000,6,12.000000,0,0,18,1,119.0,1,1,120.0,Dropout
4421,1,13.500000,7,14.912500,1,1,30,0,149.5,1,1,154.0,Dropout
4422,5,12.000000,5,13.800000,1,1,20,0,153.8,1,1,180.0,Graduate


In [5]:
# Human-readable labels for the 0/1 flag columns.
binary_mapping = {0: "No", 1: "Yes"}

# Flag columns that are re-labelled with `binary_mapping`.
binary_columns = [
    "Tuition_fees_up_to_date",
    "Scholarship_holder",
    "Displaced",
    "Debtor",
]

# Map from the raw `df` columns instead of from `df_selected` itself so the cell
# is idempotent: re-running it would otherwise feed "Yes"/"No" back through the
# mapping and turn every value into NaN. `assign` returns a new frame (column
# order is preserved), which also avoids writing into a slice of `df`.
df_selected = df_selected.assign(
    **{column: df[column].map(binary_mapping) for column in binary_columns}
)

In [6]:
# Numeric application-mode code -> descriptive label.
application_mode_mapping = {
    1: "1st phase - general contingent",
    2: "Ordinance No. 612/93",
    5: "1st phase - special contingent (Azores Island)",
    7: "Holders of other higher courses",
    10: "Ordinance No. 854-B/99",
    15: "International student (bachelor)",
    16: "1st phase - special contingent (Madeira Island)",
    17: "2nd phase - general contingent",
    18: "3rd phase - general contingent",
    26: "Ordinance No. 533-A/99, item b2) (Different Plan)",
    27: "Ordinance No. 533-A/99, item b3 (Other Institution)",
    39: "Over 23 years old",
    42: "Transfer",
    43: "Change of course",
    44: "Technological specialization diploma holders",
    51: "Change of institution/course",
    53: "Short cycle diploma holders",
    57: "Change of institution/course (International)",
}

# Map from the raw numeric codes in `df` rather than from `df_selected`, so that
# re-running this cell does not push already-mapped labels through the mapping
# again (which would produce NaN for every row). Codes absent from the mapping
# become NaN. `assign` returns a new frame and keeps the column position.
df_selected = df_selected.assign(
    Application_mode=df["Application_mode"].map(application_mode_mapping)
)

In [7]:
# Preview the first five rows to check the re-labelled columns.
df_selected.head(n=5)

Unnamed: 0,Curricular_units_2nd_sem_approved,Curricular_units_2nd_sem_grade,Curricular_units_1st_sem_approved,Curricular_units_1st_sem_grade,Tuition_fees_up_to_date,Scholarship_holder,Age_at_enrollment,Debtor,Admission_grade,Application_mode,Displaced,Previous_qualification_grade,Status
0,0,0.0,0,0.0,Yes,No,20,No,127.3,2nd phase - general contingent,Yes,122.0,Dropout
1,6,13.666667,6,14.0,No,No,19,No,142.5,International student (bachelor),Yes,160.0,Graduate
2,0,0.0,0,0.0,No,No,19,No,124.8,1st phase - general contingent,Yes,122.0,Dropout
3,5,12.4,6,13.428571,Yes,No,20,No,119.6,2nd phase - general contingent,Yes,122.0,Graduate
4,6,13.0,5,12.333333,Yes,No,45,No,141.5,Over 23 years old,No,100.0,Graduate


In [9]:
# Save df_selected as a CSV file, leaving out the row index.
df_selected.to_csv(
    path_or_buf="df_jaya_institut_selected.csv",
    index=False,
)