## Data Cleaning

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Seed NumPy's global random number generator so NumPy-based randomness later
# in the file is repeatable (the split below is pinned separately through
# its own random_state argument).
np.random.seed(3354354524)

# Download the Palmer Penguins dataset
url = 'https://philchodrow.github.io/PIC16A/datasets/palmer_penguins.csv'
penguins = pd.read_csv(url)

# Shorten each species label to its first whitespace-separated word
penguins["Species"] = penguins["Species"].str.split().str[0]

# Keep only the relevant columns
selected_columns = [
    "Species",
    "Island",
    "Region",
    "Culmen Length (mm)",
    "Culmen Depth (mm)",
    "Flipper Length (mm)",
    "Body Mass (g)",
    "Sex",
    "Delta 13 C (o/oo)",
    "Delta 15 N (o/oo)",
]
penguins = penguins[selected_columns]

# Separate the target ('Species') from the features, then hold out 20% of the
# rows as a test set *before* any cleaning is applied
y = penguins['Species'].copy()
X = penguins.drop(columns='Species').copy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define the cleaning function
def clean_data(X, y=None):
    """
    Return a cleaned copy of X, with y (if given) restricted to the same rows.

    Cleaning consists of two steps:
      1. Drop every row that contains at least one missing value.
      2. If a 'Sex' column is present, drop rows whose value is the
         placeholder "." (all other values are kept as they are).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table to clean. It is not modified.
    y : pandas.Series, optional
        Target values; must contain every index label that survives the
        cleaning of X.

    Returns
    -------
    X_cleaned : pandas.DataFrame
        An independent copy of the surviving rows of X, so callers can
        add or overwrite columns without affecting X.
    y : pandas.Series or None
        The entries of y selected by X_cleaned's index, or None when no
        target was supplied.
    """
    # Drop rows with missing values
    X_cleaned = X.dropna()

    # Remove rows whose 'Sex' entry is the "." placeholder
    if "Sex" in X_cleaned.columns:
        X_cleaned = X_cleaned[X_cleaned["Sex"] != "."]

    # The boolean-mask selection above is tracked by pandas as a copy of a
    # slice; assigning to a column of it later would raise
    # SettingWithCopyWarning. Hand back an explicit, independent copy instead.
    X_cleaned = X_cleaned.copy()

    # Keep the target aligned with the rows that survived
    if y is not None:
        y = y.loc[X_cleaned.index]

    return X_cleaned, y

# Run the cleaning step on each split separately, keeping targets aligned
X_train_cleaned, y_train_cleaned = clean_data(X=X_train, y=y_train)
X_test_cleaned, y_test_cleaned = clean_data(X=X_test, y=y_test)