In [18]:
# Generate dummy data

import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler

In [19]:
# Create a dummy dataset
np.random.seed(0)
dummy_data = {
    'Feature1': np.random.normal(100, 10, 100).tolist() + [np.nan, 200],  # Normally distributed with an outlier
    'Feature2': np.random.randint(0, 100, 102).tolist(),  # Random integers
    'Category': ['A', 'B', 'C', 'D'] * 25 + [np.nan, 'A'],  # Categorical with some missing values
    'Target': np.random.choice([0, 1], 102).tolist()  # Binary target variable
}

# Convert the dictionary to a pandas DataFrame
df_dummy = pd.DataFrame(dummy_data)

# Display the first few rows of the dummy dataset
print(df_dummy.head())

     Feature1  Feature2 Category  Target
0  117.640523        32        A       1
1  104.001572        70        B       1
2  109.787380        85        C       0
3  122.408932        31        D       1
4  118.675580        13        A       0


In [20]:
# Preprocessing function definition

def load_data(df):
    return df

def handle_missing_values(df):
    """
    Handles missing values in a DataFrame.
    For numeric columns, fills missing values with the mean of that column.
    For non-numeric columns, fills missing values with the mode of that column,
    or a specified placeholder if mode is not suitable or missing.

    Args:
        df (pd.DataFrame): The input DataFrame with potential missing values.

    Returns:
        pd.DataFrame: The DataFrame with missing values handled.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame.")

    df_copy = df.copy() # Work on a copy to avoid modifying the original DataFrame

    # Handle numeric columns
    numeric_cols = df_copy.select_dtypes(include=['number']).columns
    if not numeric_cols.empty:
        try:
            df_copy[numeric_cols] = df_copy[numeric_cols].fillna(df_copy[numeric_cols].mean())
        except Exception as e:
            print(f"Error handling missing values in numeric columns: {e}")
            # Depending on your needs, you might want to re-raise the exception,
            # fill with a different value, or log it.

    # Handle non-numeric (object/string) columns
    # For non-numeric columns, filling with the mode is often a reasonable strategy.
    # You might want to customize this based on the specific data.
    object_cols = df_copy.select_dtypes(include=['object', 'category']).columns
    if not object_cols.empty:
        for col in object_cols:
            if df_copy[col].isnull().any(): # Check if there are any missing values
                try:
                    mode_val = df_copy[col].mode()[0]
                    df_copy[col] = df_copy[col].fillna(mode_val)
                except IndexError:
                    # This can happen if a column is entirely NaN or has no mode
                    print(f"Warning: Could not determine mode for column '{col}'. Filling with 'Unknown'.")
                    df_copy[col] = df_copy[col].fillna('Unknown')
                except Exception as e:
                    print(f"Error handling missing values in non-numeric column '{col}': {e}")
                    # Fallback for other errors in non-numeric columns
                    df_copy[col] = df_copy[col].fillna('ErrorPlaceholder') # Or a more suitable placeholder

    # You might also want to consider datetime columns or other specific dtypes
    # and add handling for them if necessary.

    return df_copy

def remove_outliers(df):
    z_scores = np.abs(stats.zscore(df.select_dtypes(include=[np.number])))
    return df[(z_scores < 3).all(axis=1)]  # Remove rows with any outliers

def scale_data(df):
    scaler = StandardScaler()
    df[df.select_dtypes(include=[np.number]).columns] = scaler.fit_transform(df.select_dtypes(include=[np.number]))
    return df

def encode_categorical(df, categorical_columns):
    return pd.get_dummies(df, columns=categorical_columns)

def save_data(df, output_filepath):
    df.to_csv(output_filepath, index=False)

In [21]:
df_dummy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102 entries, 0 to 101
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Feature1  101 non-null    float64
 1   Feature2  102 non-null    int64  
 2   Category  101 non-null    object 
 3   Target    102 non-null    int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 3.3+ KB


In [22]:
# Clean and pre-process data

# Load the data
df_preprocessed = load_data(df_dummy)

# Handle missing values
df_preprocessed = handle_missing_values(df_preprocessed)

# Remove outliers
df_preprocessed = remove_outliers(df_preprocessed)

# Scale the data
df_preprocessed = scale_data(df_preprocessed)

# Encode categorical variables
df_preprocessed = encode_categorical(df_preprocessed, ['Category'])

# Display the preprocessed data
print(df_preprocessed.head())

   Feature1  Feature2    Target  Category_A  Category_B  Category_C  \
0  1.698298 -0.519379  0.932936        True       False       False   
1  0.338384  0.887380  0.932936       False        True       False   
2  0.915276  1.442679 -1.071884       False       False        True   
3  2.173747 -0.556399  0.932936       False       False       False   
4  1.801501 -1.222759 -1.071884        True       False       False   

   Category_D  
0       False  
1       False  
2       False  
3        True  
4       False  
