In [2]:
import pandas as pd
import numpy as np

# Display configuration: show every column instead of eliding wide frames.
pd.set_option("display.max_columns", None)

# Load the raw EKSU table (path is relative to the notebook's working directory).
eksu_df = pd.read_csv("../../Data/EKSU.csv")

# Structural overview of the frame.
frame_shape = eksu_df.shape
print("Shape:", frame_shape)
eksu_df.head()      # NOTE: mid-cell expression, so its result is not rendered
eksu_df.info()      # writes column dtypes / non-null counts to stdout
eksu_df.describe()  # NOTE: result discarded for the same reason as head()

# Data-quality checks: nulls per column and count of fully duplicated rows.
null_counts = eksu_df.isnull().sum()
duplicate_rows = eksu_df.duplicated().sum()
print("Missing values:\n", null_counts)
print("Duplicates:", duplicate_rows)


Shape: (10000, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Faculty           10000 non-null  object 
 1   Department        10000 non-null  object 
 2   UTME_Score        10000 non-null  int64  
 3   Screening_Score   10000 non-null  float64
 4   Olevel_Valid      10000 non-null  bool   
 5   Sittings          10000 non-null  int64  
 6   Admission_Status  10000 non-null  object 
 7   Olevel_Grades     10000 non-null  object 
dtypes: bool(1), float64(1), int64(2), object(4)
memory usage: 556.8+ KB
Missing values:
 Faculty             0
Department          0
UTME_Score          0
Screening_Score     0
Olevel_Valid        0
Sittings            0
Admission_Status    0
Olevel_Grades       0
dtype: int64
Duplicates: 0


In [3]:
# Encode the target as 1/0. Labels are normalised (trimmed, lower-cased) before
# mapping so that variants such as "Admitted " are not silently turned into NaN.
status_codes = {"admitted": 1, "not admitted": 0}
status_clean = eksu_df['Admission_Status'].str.strip().str.lower()
eksu_df['admitted_numeric'] = status_clean.map(status_codes)

# dropna=False keeps any unmapped (NaN) rows visible in the tally.
print(eksu_df['admitted_numeric'].value_counts(dropna=False))

# Fail loudly instead of carrying a target column that contains NaN forward.
unmapped_labels = eksu_df.loc[
    eksu_df['admitted_numeric'].isna(), 'Admission_Status'
].unique()
if len(unmapped_labels):
    raise ValueError(
        f"Unrecognised Admission_Status values: {list(unmapped_labels)}"
    )

# Convert True/False to 1/0 (re-running is safe: int -> int is a no-op)
eksu_df['Olevel_Valid'] = eksu_df['Olevel_Valid'].astype(int)

# Points awarded per O'level grade (higher is better). Defined once here rather
# than rebuilt on every call of olevel_avg.
# NOTE(review): grades below D7 are not listed and raise KeyError — confirm
# that they cannot occur in the data.
OLEVEL_POINTS = {
    "A1": 8, "B2": 7, "B3": 6,
    "C4": 5, "C5": 4, "C6": 3,
    "D7": 2,
}


def olevel_avg(grades_str):
    """Return the mean point value of a comma-separated list of O'level grades.

    Parameters
    ----------
    grades_str : str
        Grades separated by commas, e.g. "A1, B2, C6". Whitespace around each
        grade and letter case are ignored.

    Returns
    -------
    numpy.float64
        Arithmetic mean of the point values of the listed grades.

    Raises
    ------
    KeyError
        If any entry (including an empty one) is not in OLEVEL_POINTS.
    """
    # Split on the bare comma and trim each token so that "A1,B2" and
    # "A1 ,  B2" are read the same way as "A1, B2".
    grades = [token.strip().upper() for token in grades_str.split(",")]
    return np.mean([OLEVEL_POINTS[grade] for grade in grades])

# Score every row's grade string and store the result as a new column.
grade_strings = eksu_df['Olevel_Grades']
eksu_df['olevel_avg_points'] = grade_strings.apply(olevel_avg)
eksu_df[['Olevel_Grades', 'olevel_avg_points']].head()  # mid-cell: not rendered

# Columns kept in the processed table. Sittings, Admission_Status and
# Olevel_Grades are not carried over.
processed_columns = [
    "Faculty",
    "Department",
    "UTME_Score",
    "Screening_Score",
    "Olevel_Valid",
    "olevel_avg_points",
    "admitted_numeric",
]
processed_df = eksu_df[processed_columns]
processed_df.head()  # mid-cell: not rendered

# Save Processed Dataset
import os

processed_path = "../../Data/EKSU_processed.csv"

# Derive the directory from processed_path so that the folder being created is
# always the one the CSV is written to.
os.makedirs(os.path.dirname(processed_path), exist_ok=True)
processed_df.to_csv(processed_path, index=False)
print(f"Processed dataset saved to {processed_path}")


admitted_numeric
0    5666
1    4334
Name: count, dtype: int64
Processed dataset saved to ../../Data/EKSU_processed.csv
