## Implement a Parser to read the Data

In [14]:
%run ls

Exception: File `'ls'` not found.

In [16]:
%run venv/Scripts/activate.bat

SyntaxError: unterminated string literal (detected at line 21) (activate.bat, line 21)

In [10]:
!pip install numpy pandas scipy scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.7.2-cp312-cp312-win_amd64.whl (8.7 MB)
Using cached joblib-1.5.2-py3-none-any.whl (308 kB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn

   ------------- -------------------------- 1/3 [joblib]
   ------------- -------------------------- 1/3 [joblib]
   ------------- -------------------------- 1/3 [joblib]
   ------------- -------------------------- 1/3 [joblib]
   ------------- -------------------------- 1/3 [joblib]
   ------------- -------------------------- 1/3 [joblib]
   ------------- -------------------------- 1/3 [joblib]
   -----------------------

In [7]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
from scipy.io import arff

# Function to load ARFF files
def load_arff_dataset(base_path, dataset_name, num_splits=10):
    """
    Load a dataset from ARFF files with multiple splits.

    Parameters:
    - base_path: str, base folder containing datasets
    - dataset_name: str, name of the dataset folder (e.g., 'adult')
    - num_splits: int, number of folds to load

    Returns:
    - data_splits: list of tuples, each tuple is (train_df, test_df)
    """
    data_splits = []

    dataset_path = os.path.join(base_path, dataset_name)

    for i in range(num_splits):
        fold_num = f"{i:06d}"
        train_file = os.path.join(dataset_path, f"{dataset_name}.fold.{fold_num}.train.arff")
        test_file = os.path.join(dataset_path, f"{dataset_name}.fold.{fold_num}.test.arff")

        # Load ARFF files
        train_data, train_meta = arff.loadarff(train_file)
        test_data, test_meta = arff.loadarff(test_file)

        # Convert to pandas DataFrame
        train_df = pd.DataFrame(train_data)
        test_df = pd.DataFrame(test_data)

        # Append to list
        data_splits.append((train_df, test_df))

    return data_splits

# Example usage
base_path = "datasetsCBR/datasetsCBR"
dataset_name = "adult"
num_splits = 5  # Change to the number of splits you want

splits = load_arff_dataset(base_path, dataset_name, num_splits)

# Access first split
train_df, test_df = splits[0]
print(train_df.head())
print(train_df.describe())



    age            workclass    fnlwgt        education  education-num  \
0  66.0  b'Self-emp-not-inc'  174788.0  b'Some-college'           10.0   
1  68.0           b'Private'  211162.0       b'HS-grad'            9.0   
2  90.0           b'Private'  139660.0  b'Some-college'           10.0   
3  52.0           b'Private'  230657.0       b'HS-grad'            9.0   
4  25.0           b'Private'   66622.0  b'Some-college'           10.0   

          marital-status            occupation      relationship      race  \
0       b'Never-married'              b'Sales'  b'Not-in-family'  b'White'   
1  b'Married-civ-spouse'    b'Exec-managerial'        b'Husband'  b'White'   
2            b'Divorced'              b'Sales'      b'Unmarried'  b'Black'   
3  b'Married-civ-spouse'  b'Machine-op-inspct'        b'Husband'  b'Other'   
4       b'Never-married'      b'Other-service'      b'Own-child'  b'White'   

         sex  capital-gain  capital-loss  hours-per-week    native-country  \
0  b'Fem

###  Handle missing values

In [8]:
# Fill numeric NaNs with column mean
train_df = train_df.fillna(train_df.mean(numeric_only=True))
test_df = test_df.fillna(test_df.mean(numeric_only=True))

### Normalization / Scaling

In [11]:
from sklearn.preprocessing import (
    scale, StandardScaler, MinMaxScaler, Normalizer
)
numeric_cols = train_df.select_dtypes(include=["number"]).columns

# Standardisation (Z-score)
train_standardised = scale(train_df[numeric_cols])
test_standardised = scale(test_df[numeric_cols])

# Mean normalisation (μ=0, values ~ -1 to 1)
scaler_mean = StandardScaler(with_mean=True, with_std=False)
train_mean_norm = scaler_mean.fit_transform(train_df[numeric_cols])
test_mean_norm = scaler_mean.transform(test_df[numeric_cols])

# Min-Max scaling (0 to 1)
scaler_minmax = MinMaxScaler()
train_minmax = scaler_minmax.fit_transform(train_df[numeric_cols])
test_minmax = scaler_minmax.transform(test_df[numeric_cols])

# Unit vector normalization
scaler_unit = Normalizer()
train_unit = scaler_unit.fit_transform(train_df[numeric_cols])
test_unit = scaler_unit.transform(test_df[numeric_cols])