## Loading data

In [82]:
from pathlib import Path
import csv
import pandas as pd
import numpy as np

# --- Cell 1: Load rows ---
# The file is read as ARFF-style text: lines starting with '%' are comments,
# everything before the '@data' marker is header, and every non-blank line
# after the marker is one comma-separated record.
p = Path("../../data/irish.csv")
rows = []
data_section = False

# errors="ignore" drops undecodable bytes instead of failing the load.
with open(p, "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        line = line.strip()
        if not line or line.startswith("%"):
            continue  # blank line or comment
        if line.lower().startswith("@data"):
            data_section = True
            continue
        if data_section:
            rows.append(line)

# --- Cell 2: Parse rows ---
# csv.reader (instead of str.split) keeps commas inside double-quoted fields
# intact. Fields are also stripped so that padded tokens such as " ?" or " 113"
# are stored as "?" / "113" and are recognised by the later cleaning cells.
parsed = [
    [field.strip() for field in record]
    for record in csv.reader(rows, skipinitialspace=True)
]

# --- Cell 3: Create the DataFrame with its final column names ---
colnames = [
    "Sex",          # field 0
    "DVRT",         # field 1
    "Education",    # field 2
    "Course",       # field 3
    "Score",        # field 4
    "Outcome"       # field 5
]

# Fail early with a readable message instead of an IndexError on parsed[0]
# or a pandas length-mismatch error further down.
if not parsed:
    raise ValueError(f"No data rows found after '@data' in {p}")

n_cols = len(parsed[0])
if n_cols != len(colnames):
    raise ValueError(
        f"Expected {len(colnames)} fields per row but the first row has {n_cols}"
    )

df = pd.DataFrame(parsed, columns=colnames)

df.head()

Unnamed: 0,Sex,DVRT,Education,Course,Score,Outcome
0,male,113,Junior_cycle_incomplete-secondary_school,not_taken,28,secondary
1,male,101,Primary_terminal_leaver,not_taken,28,primary_terminal_leaver
2,male,110,Senior_cycle_terminal_leaver-secondary_school,taken,69,secondary
3,male,121,Junior_cycle_terminal_leaver-secondary_school,not_taken,57,secondary
4,male,82,Junior_cycle_terminal_leaver-vocational_school,not_taken,18,vocational


In [83]:
# Cast the two measurement columns to numbers; anything that cannot be parsed
# becomes NaN because of errors="coerce".
for numeric_col in ("DVRT", "Score"):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

# Sequential row identifier (0 .. len(df)-1), mirroring the loan notebook's format
df["ID"] = range(len(df))

print(df.dtypes)

Sex           object
DVRT           int64
Education     object
Course        object
Score        float64
Outcome       object
ID             int64
dtype: object


In [84]:
# Turn every cell that is exactly "?" into NaN so the imputers can handle it
df = df.replace(to_replace="?", value=np.nan)

In [85]:
from sklearn.preprocessing import LabelEncoder

# 'Course' is the label we want to predict
target_name = "Course"

# LabelEncoder assigns integer codes in sorted class order, so here:
#   not_taken -> 0
#   taken     -> 1
le = LabelEncoder().fit(df[target_name])
y = le.transform(df[target_name])
print(f"Target classes: {le.classes_}")

# Feature frame: everything except the target itself and 'Outcome',
# which is excluded because it leaks information about the target
excluded_columns = [target_name, "Outcome"]
X = df.drop(excluded_columns, axis="columns")

print(f"\nX shape: {X.shape}, y shape: {y.shape}")
X.head()

Target classes: ['not_taken' 'taken']

X shape: (500, 5), y shape: (500,)


Unnamed: 0,Sex,DVRT,Education,Score,ID
0,male,113,Junior_cycle_incomplete-secondary_school,28.0,0
1,male,101,Primary_terminal_leaver,28.0,1
2,male,110,Senior_cycle_terminal_leaver-secondary_school,69.0,2
3,male,121,Junior_cycle_terminal_leaver-secondary_school,57.0,3
4,male,82,Junior_cycle_terminal_leaver-vocational_school,18.0,4


In [86]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split with a fixed seed so the partition is reproducible
split_parts = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_test, y_train, y_test = split_parts

# --- Create the Solution Set ---
# Two columns (ID, Course), matching the 'loan-10k.sol.ex.csv' format.
# The frame keeps X_test's row index; y_test is attached positionally.
df_sol = X_test[["ID"]].assign(Course=y_test)

# X_train / X_test stay DataFrames for now so that the 'ID' column
# can be split off by name later on.
for label, frame in (("X_train shape:", X_train), ("X_test shape: ", X_test), ("df_sol shape: ", df_sol)):
    print(f"{label} {frame.shape}")

df_sol.head()

X_train shape: (400, 5)
X_test shape:  (100, 5)
df_sol shape:  (100, 2)


Unnamed: 0,ID,Course
439,439,1
417,417,0
140,140,1
123,123,0
466,466,0


## Preprocessing

In [87]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd

class IQRWinsorizer(BaseEstimator, TransformerMixin):
    """Clip numeric columns to IQR-based fences learned during ``fit``.

    For every numeric column the fences are
    ``[Q1 - factor * IQR, Q3 + factor * IQR]`` where ``IQR = Q3 - Q1``.
    ``transform`` clips values that fall outside those fences.

    Parameters
    ----------
    factor : float, default=1.5
        Multiplier applied to the IQR when computing the fences.

    Attributes
    ----------
    caps_ : dict
        Maps column label -> ``(lower, upper)`` fence. Created by ``fit``.
        When the input is a plain array the labels are the integer positions
        that ``pd.DataFrame`` assigns.
    """

    def __init__(self, factor=1.5):
        # Only hyper-parameters are stored here. Learned state (caps_) is
        # created in fit(), so an unfitted instance is not mistaken for a
        # fitted one by scikit-learn's fitted-state checks.
        self.factor = factor

    def fit(self, X, y=None):
        """Learn the lower/upper fence of every numeric column of ``X``.

        Parameters
        ----------
        X : DataFrame or array-like
            Training data; wrapped with ``pd.DataFrame`` before inspection.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        X = pd.DataFrame(X)
        # Build a fresh mapping on every call so that refitting on data with
        # different columns does not keep fences from a previous fit.
        caps = {}
        for col in X.columns:
            if pd.api.types.is_numeric_dtype(X[col]):
                q1 = X[col].quantile(0.25)
                q3 = X[col].quantile(0.75)
                iqr = q3 - q1
                lower = q1 - self.factor * iqr
                upper = q3 + self.factor * iqr
                caps[col] = (lower, upper)
        self.caps_ = caps
        return self

    def transform(self, X):
        """Return a copy of ``X`` (as a DataFrame) with fitted columns clipped.

        Columns seen during ``fit`` but absent from ``X`` are skipped.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        # Local import keeps this block self-contained.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "caps_")
        X = pd.DataFrame(X).copy()  # copy so the caller's data is never modified
        for col, (lower, upper) in self.caps_.items():
            if col in X.columns:
                X[col] = X[col].clip(lower=lower, upper=upper)
        return X

In [88]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler # Changed to RobustScaler
from sklearn.impute import SimpleImputer

# Split the feature columns by dtype. 'ID' is numeric but only a row key,
# so it is removed from the numeric feature list.
cat_cols = X_train.select_dtypes(include="object").columns.tolist()
numeric_index = X_train.select_dtypes(include=np.number).columns
num_cols = numeric_index.drop("ID").tolist()

print(f"Categorical columns: {cat_cols}")
print(f"Numeric columns: {num_cols}")

# --- Numeric branch: median-impute -> clip to IQR fences -> robust scaling ---
numeric_steps = [
    ("impute", SimpleImputer(strategy="median")),
    ("winsor", IQRWinsorizer(factor=1.5)),
    ("scaler", RobustScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# --- Categorical branch: fill missing with "Unknown" -> one-hot encode ---
categorical_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    # sparse_output=False makes the encoder return a dense array
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# --- Combine both branches; columns not listed (e.g. 'ID') are dropped ---
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ],
    remainder="drop",
)

Categorical columns: ['Sex', 'Education']
Numeric columns: ['DVRT', 'Score']


In [89]:
# Keep the identifiers aside; they are re-attached after preprocessing
X_train_ids, X_test_ids = X_train["ID"], X_test["ID"]

# Feature-only views: 'ID' must not reach the transformers
X_train_features = X_train.drop(columns="ID")
X_test_features = X_test.drop(columns="ID")

# Learn all preprocessing statistics from the training split only
print("Fitting preprocessor on training data...")
X_train_processed = preprocessor.fit_transform(X_train_features)

# The test split is transformed with the statistics learned above
print("Transforming test data...")
X_test_processed = preprocessor.transform(X_test_features)

# --- Debug: both splits should end up with the same number of columns ---
print(f"Processed X_train shape: {X_train_processed.shape}")
print(f"Processed X_test shape: {X_test_processed.shape}")

Fitting preprocessor on training data...
Transforming test data...
Processed X_train shape: (400, 15)
Processed X_test shape: (100, 15)


In [90]:
# Rebuild the output column names: numeric columns keep their names,
# one-hot columns are named by the fitted encoder.
encoder = preprocessor.named_transformers_["cat"].named_steps["encoder"]
cat_feature_names = encoder.get_feature_names_out(cat_cols)
all_feature_names = [*num_cols, *cat_feature_names]

print(f"Total features names generated: {len(all_feature_names)}")

# --- Create the preprocessed Train DataFrame ---
# The processed matrix is already dense (sparse_output=False), so it can be
# wrapped directly. Final layout: ID first, features, target last.
train_set = pd.DataFrame(X_train_processed, columns=all_feature_names)
# .values drops the original row index so the IDs line up with the new 0..N-1 index
train_set.insert(0, "ID", X_train_ids.values)
train_set["Course"] = y_train


# --- Create the preprocessed Test DataFrame ---
# Same layout as the training frame but without the target column.
test_set = pd.DataFrame(X_test_processed, columns=all_feature_names)
test_set.insert(0, "ID", X_test_ids.values)


print("\nPreprocessed Training Set:")
print(train_set.head())
print("\nPreprocessed Test Set:")
print(test_set.head())

Total features names generated: 15

Preprocessed Training Set:
    ID      DVRT     Score  Sex_female  Sex_male  \
0  434  0.571429  1.611111         0.0       1.0   
1  314  0.380952  0.000000         0.0       1.0   
2  273  0.761905 -1.055556         0.0       1.0   
3  478  0.809524  1.888889         0.0       1.0   
4  499  1.523810  0.000000         1.0       0.0   

   Education_3rd_level_complete  Education_3rd_level_incomplete  \
0                           0.0                             0.0   
1                           0.0                             0.0   
2                           0.0                             0.0   
3                           1.0                             0.0   
4                           1.0                             0.0   

   Education_Junior_cycle_incomplete-secondary_school  \
0                                                0.0    
1                                                0.0    
2                                                0

In [91]:
# Define output paths
output_path_train = "../../data/processed/irish-preprocessed-train.csv"
output_path_test= "../../data/processed/irish-preprocessed-test.csv"
output_path_sol= "../../data/processed/irish-preprocessed-sol.csv"

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder of every output file exists before writing.
for out_path in (output_path_train, output_path_test, output_path_sol):
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

# Save the files (index=False: the row index is not part of the output format)
train_set.to_csv(output_path_train, index=False)
test_set.to_csv(output_path_test, index=False)
df_sol.to_csv(output_path_sol, index=False)

print("\nSuccessfully saved preprocessed files:")
print(f"Train: {output_path_train}")
print(f"Test:  {output_path_test}")
print(f"Sol:   {output_path_sol}")


Successfully saved preprocessed files:
Train: ../../data/processed/irish-preprocessed-train.csv
Test:  ../../data/processed/irish-preprocessed-test.csv
Sol:   ../../data/processed/irish-preprocessed-sol.csv
