# Classification Task – Dataset Loading & Preparation

# 1. Notebook Preparation

## 1.1. Install necessary libraries and dependencies

In [19]:
# Install or upgrade the Jupyter tooling: notebook, widgets, notebook format
# and notebook-import helpers. Note: this does not upgrade pip itself.
%pip install --upgrade notebook ipywidgets nbformat nbimporter import_ipynb --quiet

# Install the ucimlrepo package for accessing UCI ML datasets
%pip install ucimlrepo --quiet

# Install the kaggle package (provides the `kaggle` CLI used by the download cell)
%pip install kaggle --quiet

# Install the data-handling, modelling and plotting libraries
%pip install pandas numpy scikit-learn plotly seaborn --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## 1.2 Import necessary libraries

In [27]:
import os
import sys
import logging

import pandas as pd
import numpy as np

from ucimlrepo import fetch_ucirepo



## 1.3. Create Utils & Helpers

### 1.3.1. Logger Helper Function

In [21]:
import logging

class Logger:
    """
    Thin wrapper around a named `logging.Logger` that writes to the console.

    Any attribute that is not defined on the wrapper itself (e.g. `info`,
    `debug`, `warning`) is forwarded to the wrapped logger, so an instance
    can be used exactly like a standard logger.
    """

    def __init__(self, name):
        """
        Initializes a logger for the class with the given name.

        Args:
            name (str): The name to be used for the logger. It is typically
                        the name of the module or class.

        This method configures the logger to write log messages to the console
        using a StreamHandler and sets the log level to DEBUG. It ensures that
        handlers are added only once to prevent duplicate log entries.
        """

        # Create (or fetch the already existing) logger with the provided name
        self.logger = logging.getLogger(name)

        # Look only at this logger's OWN handlers. `hasHandlers()` also reports
        # handlers of ancestor loggers (e.g. a configured root logger), which
        # would silently skip the console handler and the DEBUG level below.
        if not self.logger.handlers:
            # Create a stream handler to log messages to the console
            handler = logging.StreamHandler()

            # Set up a log message format: timestamp, logger name, log level, and message
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)  # Apply the formatter to the handler

            # Add the handler to the logger
            self.logger.addHandler(handler)

        # Always enforce DEBUG so all messages of level DEBUG and above are shown,
        # regardless of how the root logger is configured
        self.logger.setLevel(logging.DEBUG)

        # This logger owns its console handler; do not hand records on to
        # ancestor handlers as well, otherwise every message is printed twice
        # whenever the root logger has a handler of its own.
        self.logger.propagate = False

    def __getattr__(self, attr):
        """
        Delegates attribute access to the internal logger instance.

        This method is invoked when an attribute that doesn't exist in the
        current object is accessed. It forwards the request to the logger
        instance, allowing access to all logging-related attributes and methods
        as if they were part of the object.

        Args:
            attr (str): The name of the attribute being accessed.

        Returns:
            The value of the attribute from the internal logger instance.

        Raises:
            AttributeError: If the wrapped logger has not been set yet, or if
                            the wrapped logger has no such attribute.
        """

        # `self.logger` is missing while an instance is being rebuilt by
        # copy/pickle (or was created via __new__). Looking it up would call
        # __getattr__ again and recurse without end, so fail fast instead.
        if attr == "logger":
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute 'logger'"
            )

        # Forward the attribute access to the logger object
        return getattr(self.logger, attr)

In [22]:
# Create the notebook-wide logger used by the helper functions and cells below.
# `__name__` becomes the logger name (the recorded outputs show it as `__main__`).
logger = Logger(__name__)  

### 1.3.2. Data Set Analysis Helper Function

In [None]:
def get_dataset_dimensions(df: pd.DataFrame) -> tuple:
    """
    Logs and returns the shape of a DataFrame.

    Parameters:
        - df (pd.DataFrame): The DataFrame to inspect.

    Returns:
        - tuple: (number of rows, number of columns).
    """
    shape = df.shape  # (rows, columns)
    logger.info(f"Dataset dimensions: {shape}")
    return shape

In [None]:
def count_unique_values(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Builds a table of value frequencies for the requested columns.

    Parameters:
        - df (pd.DataFrame): The input DataFrame.
        - columns (list): Names of the columns whose values should be counted.

    Returns:
        - pd.DataFrame: One row per requested column (index named 'feature'),
        one column per distinct value found in any of them (missing values
        included), holding integer counts. Values that do not occur in a
        given column are reported as 0.

    Raises:
        - KeyError: If a requested column is not present in the DataFrame.
    """
    per_column = {}

    for name in columns:
        # Guard clause: reject unknown columns before counting anything for them
        if name not in df.columns:
            raise KeyError(f"Column '{name}' does not exist in the DataFrame.")
        per_column[name] = df[name].value_counts(dropna=False)

    # Columns of the intermediate frame are the features; values absent from a
    # feature show up as NaN and are turned into a count of 0
    table = pd.DataFrame(per_column)
    table = table.fillna(0).astype(int)

    # Flip so that every feature becomes a row
    result = table.T
    result.index.name = 'feature'
    return result

In [None]:
def summarize_missing(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reports, per column, how many values are missing and what share that is.

    Parameters:
        - df (pd.DataFrame): The input DataFrame.

    Returns:
        - pd.DataFrame: Indexed by the column names of `df`, with two columns:
        - 'missing_count': Number of missing values in the column.
        - 'missing_pct': Missing values as a percentage of the row count.
    """
    n_rows = len(df)

    # Start from the per-column NaN counts and derive the percentage from them
    summary = df.isna().sum().to_frame("missing_count")
    return summary.assign(
        missing_pct=lambda frame: frame["missing_count"] / n_rows * 100
    )

### 1.3.3. Missing-Value Processing Helper Functions

In [None]:
def replace_values(df: pd.DataFrame, columns: list, replace_map: dict) -> pd.DataFrame:
    """
    Replaces specified values in given columns according to a replacement map.

    Parameters:
        - df (pd.DataFrame): The input DataFrame. It is not modified.
        - columns (list): List of column names where replacements should occur.
        - replace_map (dict): Dictionary specifying which values to replace with what.
        Example: {'unknown': np.nan, 'yes': 1, 'no': 0}

    Returns:
        - pd.DataFrame: A new DataFrame with the values replaced.

    Raises:
        - KeyError: If any of the given columns does not exist in the DataFrame.
    """
    columns = list(columns)

    # Validate up front so a wrong column name fails before any copying or
    # replacing has been done
    for col in columns:
        if col not in df.columns:
            raise KeyError(f"Column '{col}' does not exist in the DataFrame.")

    # Work on a copy to avoid modifying the original DataFrame
    df_copy = df.copy()

    for col in columns:
        replaced = df_copy[col].replace(replace_map)

        # Let pandas pick a better dtype for object columns after the
        # replacement (e.g. strings mapped to numbers). The result is assigned
        # back, so the `copy=` keyword is not needed; leaving it out keeps the
        # call valid on pandas versions that lack or deprecate that keyword.
        df_copy[col] = replaced.infer_objects()

    return df_copy

In [None]:
def drop_columns_missing(df: pd.DataFrame, threshold: float) -> pd.DataFrame:
    """
    Removes every column whose share of missing values is above a threshold.

    Parameters:
        - df (pd.DataFrame): The input DataFrame. It is not modified.
        - threshold (float): Maximum tolerated fraction of missing values (0.0 - 1.0).
        A column is dropped only if its fraction is strictly greater than this.

    Returns:
        - pd.DataFrame: A copy of the DataFrame without the columns that exceed the threshold.
    """
    # Fraction of NaN per column (mean of the boolean NaN mask)
    na_share = df.isna().mean()

    # Labels of the columns that are strictly above the threshold
    exceeds = (na_share > threshold).to_numpy()
    sparse_columns = na_share.index[exceeds]

    return df.copy().drop(columns=sparse_columns)


In [None]:
def drop_rows_missing(df: pd.DataFrame, threshold: float) -> pd.DataFrame:
    """
    Removes every row whose share of missing values is above a threshold.

    Parameters:
        - df (pd.DataFrame): The input DataFrame. It is not modified.
        - threshold (float): Maximum tolerated fraction of missing values (0.0 - 1.0).
        A row is kept if its fraction is less than or equal to this.

    Returns:
        - pd.DataFrame: A copy of the DataFrame containing only the rows that were kept.
    """
    working = df.copy()

    # Fraction of NaN per row (mean of the boolean NaN mask across the columns)
    row_na_share = working.isna().mean(axis=1)
    keep_rows = row_na_share.le(threshold)

    kept = working.loc[keep_rows]
    return kept.copy()



In [None]:
def impute_mode(df: pd.DataFrame, columns: list = None) -> pd.DataFrame:
    """
    Imputes missing values in the specified columns using the mode (most frequent value).

    The imputation is done column by column with pandas, so the dtype of each
    column is left to pandas instead of being forced through a single array.
    If several values are equally frequent, the first value returned by
    `Series.mode` is used. Columns that contain no missing values, or that
    contain only missing values (no mode exists), are left unchanged.

    Parameters:
        - df (pd.DataFrame): The input DataFrame. It is not modified.
        - columns (list, optional): A list of column names where missing values should be imputed.
        If None, all columns with missing values will be considered.

    Returns:
        - pd.DataFrame: A copy of the DataFrame with missing values replaced by the mode.

    Raises:
        - KeyError: If any of the given columns does not exist in the DataFrame.
    """
    df_copy = df.copy()  # Work with a copy to avoid modifying the original DataFrame

    if columns is None:
        columns = df_copy.columns.tolist()  # If no columns are specified, use all columns
    else:
        columns = list(columns)

    # Fail early on unknown columns, with the same message the other helpers use
    for col in columns:
        if col not in df_copy.columns:
            raise KeyError(f"Column '{col}' does not exist in the DataFrame.")

    for col in columns:
        series = df_copy[col]

        # Nothing to fill in this column
        if not series.isna().any():
            continue

        # Most frequent non-missing value(s); empty when the column is all-NaN
        modes = series.mode(dropna=True)
        if modes.empty:
            continue

        df_copy[col] = series.fillna(modes.iloc[0])

    return df_copy  # Return the DataFrame with imputed values


## 1.4 Prepare the folder structure

In [23]:
# Clean the data directory: remove the downloaded archives and the raw data
# so that the download cells below start from an empty state.
!rm -rf ../data/archiv/
!rm -rf ../data/raw/
# The processed-data folder is left untouched; uncomment to remove it as well.
# !rm -rf ../data/processed

In [24]:
# Prepare the folder structure: one folder for downloaded archives and one
# raw-data folder per dataset (-p also creates missing parent folders).
!mkdir -p ../data/archiv
!mkdir -p ../data/raw/kaggle/reviews
!mkdir -p ../data/raw/kaggle/congress
# NOTE(review): this folder receives the Maternal Health Risk CSV in the UCI
# download cell although it is named "mental"; rename both places together.
!mkdir -p ../data/raw/uci/mental_health_risk
!mkdir -p ../data/raw/uci/autistic_spectrum

## 1.5. Load the Datasets

- From UCI Machine Learning Repository
  - Maternal Health Risk -> https://archive.ics.uci.edu/dataset/863/maternal+health+risk
  - Autistic Spectrum Disorder Screening Data for Children -> https://archive.ics.uci.edu/dataset/419/autistic+spectrum+disorder+screening+data+for+children
- From Kaggle
  - 184.702 TU ML 2025S - Reviews -> https://www.kaggle.com/competitions/184-702-tu-ml-2025-s-reviews/data
  - 184.702 TU ML 2025S - Congressional Voting -> https://www.kaggle.com/competitions/184-702-tu-ml-2025-s-congressional-voting/data

### 1.5.1. Download the Kaggle Datasets

In [25]:
# Download the Reviews competition archive into ../data/archiv/kaggle/ and
# extract it into ../data/raw/kaggle/reviews/ (unzip -q: quiet, -o: overwrite).
# NOTE(review): the kaggle CLI presumably needs API credentials to be set up
# beforehand — confirm for the target environment.
!kaggle competitions download -c 184-702-tu-ml-2025-s-reviews -p ../data/archiv/kaggle/
!unzip -qo ../data/archiv/kaggle/184-702-tu-ml-2025-s-reviews.zip -d ../data/raw/kaggle/reviews/
logger.info("Reviews dataset downloaded and extracted.")

# Download the Congressional Voting competition archive into ../data/archiv/kaggle/
# and extract it into ../data/raw/kaggle/congress/
!kaggle competitions download -c 184-702-tu-ml-2025-s-congressional-voting -p ../data/archiv/kaggle/
!unzip -qo ../data/archiv/kaggle/184-702-tu-ml-2025-s-congressional-voting.zip -d ../data/raw/kaggle/congress/
logger.info("Congressional Voting Records dataset downloaded and extracted.")

# Load the extracted `.lrn.csv` files of both datasets into pandas DataFrames
df_reviews_raw = pd.read_csv("../data/raw/kaggle/reviews/amazon_review_ID.shuf.lrn.csv")
df_congress_raw = pd.read_csv("../data/raw/kaggle/congress/CongressionalVotingID.shuf.lrn.csv")


2025-04-27 19:10:53,954 - __main__ - INFO - Reviews dataset downloaded and extracted.
2025-04-27 19:10:55,376 - __main__ - INFO - Congressional Voting Records dataset downloaded and extracted.


### 1.5.2. Download the UCI Datasets

In [28]:
def _download_uci_dataset(dataset_id: int, csv_path: str) -> tuple:
    """
    Fetches a dataset from the UCI ML Repository and stores it as one CSV file.

    Parameters:
        - dataset_id (int): The UCI repository id passed to `fetch_ucirepo`.
        - csv_path (str): Path of the CSV file to write. Missing parent
        folders are created.

    Returns:
        - tuple: (dataset, features, targets, combined) where `dataset` is the
        object returned by `fetch_ucirepo`, `features` and `targets` are its
        `data.features` / `data.targets`, and `combined` is both concatenated
        column-wise (the frame that was written to `csv_path`).
    """
    dataset = fetch_ucirepo(id=dataset_id)

    # Features and targets come separately; store them side by side
    features = dataset.data.features
    targets = dataset.data.targets
    combined = pd.concat([features, targets], axis=1)

    # Do not rely on the folder-preparation cell having been run
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    combined.to_csv(csv_path, index=False)

    return dataset, features, targets, combined


# ------- Maternal Health Risk ------- #
# Download the Maternal Health Risk dataset (UCI id 863) and save it as CSV.
# The target folder is named "mental_health_risk"; the path is kept unchanged
# so it still matches the folder-preparation cell.
maternal_path = '../data/raw/uci/mental_health_risk/maternal_health_risk.csv'
maternal_health_risk, X_maternal, y_maternal, df_maternal = _download_uci_dataset(863, maternal_path)
logger.info(f"Maternal Health Risk dataset successfully saved at {maternal_path}")


# ------- Autistic Spectrum ------- #
# Download the Autistic Spectrum Disorder screening dataset (UCI id 419) and save it as CSV
asd_path = '../data/raw/uci/autistic_spectrum/asd_screening.csv'
asd_data, X_asd, y_asd, df_asd = _download_uci_dataset(419, asd_path)
logger.info(f"Autistic Spectrum dataset successfully saved at {asd_path}")

2025-04-27 19:11:29,934 - __main__ - INFO - Maternal Health Risk dataset successfully saved at ../data/raw/uci/mental_health_risk/maternal_health_risk.csv
2025-04-27 19:11:33,055 - __main__ - INFO - Autistic Spectrum dataset successfully saved at ../data/raw/uci/autistic_spectrum/asd_screening.csv


# 2. Data Analysis & Preparation

## 2.1. Maternal Health Risk Data Set

## 2.2. Autistic Spectrum Disorder Screening Data for Children

## 2.3. Amazon Review Data Set

## 2.4. Congressional Voting Data Set