In [1]:
import pandas as pd
import numpy as np
import os

# Show every column when previewing wide frames.
pd.set_option("display.max_columns", None)

# Load the FUOYE dataset (path is relative to the notebook's location).
fuoye_df = pd.read_csv("../../Data/FUOYE.csv")

print("Shape:", fuoye_df.shape)

# A bare expression is only rendered when it is the LAST statement of a cell,
# so the previews below must go through display() or they are silently dropped.
display(fuoye_df.head())
fuoye_df.info()  # .info() prints directly; no display() needed
display(fuoye_df.describe())

# Basic data-quality checks: per-column null counts and fully duplicated rows.
print("Missing values:\n", fuoye_df.isnull().sum())
print("Duplicates:", fuoye_df.duplicated().sum())


Shape: (5000, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Faculty           5000 non-null   object 
 1   Department        5000 non-null   object 
 2   UTME_Score        5000 non-null   int64  
 3   UTME_Cutoff       5000 non-null   int64  
 4   Aggregate_Score   5000 non-null   float64
 5   Aggregate_Cutoff  5000 non-null   float64
 6   Olevel_Valid      5000 non-null   bool   
 7   Sittings          5000 non-null   int64  
 8   Admission_Status  5000 non-null   object 
 9   Olevel_Grades     5000 non-null   object 
dtypes: bool(1), float64(2), int64(3), object(4)
memory usage: 356.6+ KB
Missing values:
 Faculty             0
Department          0
UTME_Score          0
UTME_Cutoff         0
Aggregate_Score     0
Aggregate_Cutoff    0
Olevel_Valid        0
Sittings            0
Admission_Status    0
Olevel_Grades       0
d

In [4]:
# Encode Admission Status (target variable): 1 = admitted, 0 = not admitted.
# Labels are trimmed and lower-cased before mapping so that variants such as
# "Admitted" or "not admitted " are not silently turned into NaN by .map().
status_to_numeric = {
    "admitted": 1,
    "not admitted": 0
}
status_normalised = fuoye_df['Admission_Status'].str.strip().str.lower()
fuoye_df['admitted_numeric'] = status_normalised.map(status_to_numeric)

# Fail fast on labels outside the mapping: a NaN target would otherwise be
# written straight into the processed CSV.
unmapped_mask = fuoye_df['admitted_numeric'].isna()
if unmapped_mask.any():
    unknown_labels = fuoye_df.loc[unmapped_mask, 'Admission_Status'].unique().tolist()
    raise ValueError(f"Unrecognised Admission_Status values: {unknown_labels}")

print(fuoye_df['admitted_numeric'].value_counts(dropna=False))

# Convert boolean column to numeric (True -> 1, False -> 0).
fuoye_df['Olevel_Valid'] = fuoye_df['Olevel_Valid'].astype(int)

# display() is required: a bare mid-cell expression produces no output.
display(fuoye_df[['Olevel_Valid']].head())

# Points awarded per O-level grade; built once instead of on every call.
_OLEVEL_GRADE_POINTS = {
    "A1": 6,
    "B2": 5,
    "B3": 4,
    "C4": 3,
    "C5": 2,
    "C6": 1
}


def olevel_avg(grades_str):
    """Return the mean grade points for a comma-separated list of O-level grades.

    Each grade is converted to points (A1=6, B2=5, B3=4, C4=3, C5=2, C6=1)
    and the arithmetic mean of those points is returned.

    Parsing is tolerant of formatting differences: grades may be separated by
    a comma with any amount of surrounding whitespace ("A1,B2", "A1,  B2"),
    may be in any letter case, and empty entries (e.g. from a trailing comma)
    are ignored.

    Parameters
    ----------
    grades_str : str
        Comma-separated grades, e.g. "A1, B3, C6".

    Returns
    -------
    float
        Mean points across the listed grades, or NaN when the string
        contains no grades at all.

    Raises
    ------
    KeyError
        If a grade is not one of A1, B2, B3, C4, C5, C6.
    """
    # Split on the bare comma and strip each token so the result does not
    # depend on the exact ", " separator being present.
    tokens = (piece.strip().upper() for piece in grades_str.split(","))
    grades = [token for token in tokens if token]

    if not grades:
        # Nothing to average; return NaN explicitly rather than letting
        # np.mean([]) emit a RuntimeWarning.
        return np.nan

    return np.mean([_OLEVEL_GRADE_POINTS[g] for g in grades])

# O-level feature: mean grade points per row, computed from the grade string.
fuoye_df['olevel_avg_points'] = fuoye_df['Olevel_Grades'].apply(olevel_avg)

# display() is required for the previews in this cell: a bare expression is
# only rendered when it is the last statement of a cell, so mid-cell .head()
# calls would otherwise produce no output.
display(fuoye_df[['Olevel_Grades', 'olevel_avg_points']].head())

# Gap-to-cutoff features: positive means the score is above the cutoff.
fuoye_df['utme_gap'] = fuoye_df['UTME_Score'] - fuoye_df['UTME_Cutoff']
# Rounded to 2 decimals to remove float subtraction noise.
fuoye_df['aggregate_gap'] = (fuoye_df['Aggregate_Score'] - fuoye_df['Aggregate_Cutoff']).round(2)

display(fuoye_df[['utme_gap', 'aggregate_gap']].head())

# Columns kept in the processed dataset (raw text columns such as
# Admission_Status and Olevel_Grades are replaced by their numeric encodings).
processed_columns = [
    "Faculty",
    "Department",
    "UTME_Score",
    "UTME_Cutoff",
    "Aggregate_Score",
    "Aggregate_Cutoff",
    "utme_gap",
    "aggregate_gap",
    "Olevel_Valid",
    "olevel_avg_points",
    "admitted_numeric"
]
processed_df = fuoye_df[processed_columns]

display(processed_df.head())

# Ensure the output folder exists and save. The folder is derived from the
# output path so the two cannot drift apart.
processed_path = "../../Data/FUOYE_processed.csv"
os.makedirs(os.path.dirname(processed_path), exist_ok=True)
processed_df.to_csv(processed_path, index=False)

print(f"Processed dataset saved to {processed_path}")


admitted_numeric
0    2604
1    2396
Name: count, dtype: int64
Processed dataset saved to ../../Data/FUOYE_processed.csv
