In [2]:
import pandas as pd
import numpy as np
import os

# Display option: never truncate columns when a frame is printed.
pd.set_option("display.max_columns", None)

# Read the raw UNILAG CSV (path is relative to the notebook's working directory).
raw_csv_path = "../../Data/UNILAG.csv"
unilag_df = pd.read_csv(raw_csv_path)

# First look: dimensions, leading rows, dtypes / non-null counts, numeric summary.
print("Shape:", unilag_df.shape)
print(unilag_df.head())
unilag_df.info()
print(unilag_df.describe())

# Data-quality counts: nulls per column and number of fully duplicated rows.
missing_per_column = unilag_df.isnull().sum()
duplicate_row_count = unilag_df.duplicated().sum()
print("Missing values:\n", missing_per_column)
print("Duplicates:", duplicate_row_count)

# Turn the textual Admission_Status into a numeric target with three classes:
#   0 = "not admitted", 1 = "considered", 2 = "admitted".
# Series.map yields NaN for any label that is not a key of the mapping, so the
# dropna=False count below makes such rows visible.
status_to_code = {"not admitted": 0, "considered": 1, "admitted": 2}
unilag_df['admitted_numeric'] = unilag_df['Admission_Status'].map(status_to_code)
print(unilag_df['admitted_numeric'].value_counts(dropna=False))

# Make sure an integer Olevel_Valid flag (1 = valid, 0 = not valid) exists.
if "Olevel_Valid" in unilag_df.columns:
    # Column was saved with the dataset: convert True/False to 1/0.
    unilag_df['Olevel_Valid'] = unilag_df['Olevel_Valid'].astype(int)
else:
    # Column was not saved: infer it from the grade string
    # (valid = every listed subject is graded C6 or better).
    def olevel_valid(grades_str):
        """Return 1 if every grade in ``grades_str`` is C6 or better, else 0.

        Parameters
        ----------
        grades_str : str
            Comma-separated grade codes, e.g. ``"B2, B2, C6, B2, B3"``.
            Whitespace around each code and letter case are ignored.

        Returns
        -------
        int
            1 when all codes appear in the credit scale below, 0 otherwise.
            Codes missing from the scale (anything under C6, empty tokens,
            typos) and non-string cells such as NaN count as not valid.
        """
        scale = {"A1":4.0,"B2":3.6,"B3":3.2,"C4":2.8,"C5":2.4,"C6":2.0}
        if not isinstance(grades_str, str):
            # A missing cell arrives as NaN (float); it cannot be a valid result.
            return 0
        grades = [g.strip().upper() for g in grades_str.split(",")]
        # .get(..., 0.0) is what lets the check fail: a plain scale[g] lookup
        # can only return values >= 2.0 or raise KeyError, never produce 0.
        return int(all(scale.get(g, 0.0) >= 2.0 for g in grades))
    unilag_df['Olevel_Valid'] = unilag_df['Olevel_Grades'].apply(olevel_valid)

# Compute average O’Level points
def olevel_avg(grades_str):
    """Return the mean O'Level grade points of one grade string, rounded to 2 dp.

    Parameters
    ----------
    grades_str : str
        Comma-separated grade codes, e.g. ``"B2, B2, C6, B2, B3"``.
        Whitespace around each code and letter case are ignored, so
        ``"b2,B2 , c6"`` is parsed the same way as ``"B2, B2, C6"``.

    Returns
    -------
    float
        Arithmetic mean of the per-grade points (NumPy float scalar),
        rounded to two decimal places.

    Raises
    ------
    KeyError
        If a code is not a key of the scale below. No point value is defined
        here for grades under C6, so those are rejected rather than guessed.
    """
    scale = {"A1":4.0,"B2":3.6,"B3":3.2,"C4":2.8,"C5":2.4,"C6":2.0}
    # Split on the bare comma and normalise each token so that spacing or
    # case differences in the CSV do not turn a real grade into a KeyError.
    grades = [g.strip().upper() for g in grades_str.split(",")]
    return round(np.mean([scale[g] for g in grades]), 2)

# Score every row's grade string with olevel_avg, store the result as a new
# column, then preview the raw grades next to the derived average.
olevel_points = unilag_df['Olevel_Grades'].apply(olevel_avg)
unilag_df['olevel_avg_points'] = olevel_points
preview_cols = ['Olevel_Grades', 'olevel_avg_points']
print(unilag_df[preview_cols].head())

# Columns kept in the processed dataset (order here is the order in the CSV).
feature_columns = [
    "Faculty", "Department", "UTME_Score", "Post_UTME_Score",
    "Aggregate", "Cutoff", "Olevel_Valid", "olevel_avg_points",
    "admitted_numeric",
]
processed_df = unilag_df[feature_columns]

print(processed_df.head())

# Write the processed table to the data directory; the row index is not saved.
processed_path = "../../Data/UNILAG_processed.csv"
os.makedirs("../../Data", exist_ok=True)
processed_df.to_csv(processed_path, index=False)
print(f"Processed dataset saved to {processed_path}")


Shape: (10000, 8)
                  Faculty          Department  UTME_Score       Olevel_Grades  \
0       Clinical Sciences       Physiotherapy         325  B2, B2, C6, B2, B3   
1  Basic Medical Sciences        Pharmacology         184  C6, A1, B2, C5, B2   
2       Clinical Sciences             Nursing         243  B2, C4, B3, B2, B3   
3         Dental Sciences           Dentistry         348  C4, C6, C4, B3, C5   
4         Social Sciences  Mass Communication         352  C5, C6, B3, A1, B3   

   Post_UTME_Score  Aggregate  Cutoff Admission_Status  
0               22      78.62  73.750         admitted  
1                1      39.60  72.625     not admitted  
2               24      70.78  70.875       considered  
3               18      74.70  77.400       considered  
4               19      77.80  72.350         admitted  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  