# Data Load

In [2]:
import pandas as pd
from sklearn.datasets import fetch_california_housing, load_iris
import os

# Make sure the output directory exists before any CSV is written
os.makedirs("datasets", exist_ok=True)

# California Housing: feature matrix plus the target stored as a PRICE column
california = fetch_california_housing()
california_df = pd.DataFrame(
    california.data, columns=california.feature_names
).assign(PRICE=california.target)
california_df.to_csv("datasets/california_housing.csv", index=False)
print("California Housing 데이터셋이 저장되었습니다.")

# Iris: feature matrix, the integer target, and a categorical species label
# derived from the target codes and the dataset's target names
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    target=iris.target,
    species=pd.Categorical.from_codes(iris.target, iris.target_names),
)
iris_df.to_csv("datasets/iris.csv", index=False)
print("Iris 데이터셋이 저장되었습니다.")

California Housing 데이터셋이 저장되었습니다.
Iris 데이터셋이 저장되었습니다.


In [4]:
# Preview the first rows of the Iris frame (features, target, species)
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,species
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


# Data Preprocessing

In [6]:
import numpy as np
import pandas as pd


class DataPreprocessing:
    """
    Small collection of DataFrame preprocessing helpers.

    Attributes:
    categorical_maps (dict): For every column processed by ``onehot_encode``,
        a ``{category value: integer index}`` mapping, indexed in the order
        the values first appear. Re-encoding a column overwrites its entry.
    """

    def __init__(self):
        # column name -> {category value: index in first-appearance order}
        self.categorical_maps = {}

    def scale_features(self, data, columns):
        """Placeholder, not implemented yet; currently does nothing and returns None."""
        pass

    def onehot_encode(self, data, columns):
        """
        Perform one-hot encoding on categorical variables.

        Each listed column is replaced by one 0/1 indicator column per
        distinct value, named ``"<column>_<value>"`` and appended at the end
        of the frame. The value-to-index mapping of every encoded column is
        recorded in ``self.categorical_maps``.

        Missing values (NaN/None/NA) are treated as their own category: the
        corresponding indicator column is 1 exactly on the missing rows.

        Args:
        data (pd.DataFrame): Input data (not modified)
        columns (list): List of column names to encode; repeated names are
            encoded once

        Returns:
        pd.DataFrame: Data with one-hot encoded categories

        Raises:
        KeyError: If a name in ``columns`` is not a column of ``data``
        """
        # Work on a copy so the caller's DataFrame stays untouched
        encoded = data.copy()

        # dict.fromkeys de-duplicates while keeping order; a repeated name
        # would otherwise fail when the already-dropped column is dropped again
        for column in dict.fromkeys(columns):
            # Distinct values in order of first appearance (may include NaN)
            unique_values = data[column].unique()

            # Remember which integer index each category was assigned
            self.categorical_maps[column] = {val: i for i, val in enumerate(unique_values)}

            for value in unique_values:
                if pd.api.types.is_scalar(value) and pd.isna(value):
                    # `== NaN` is never true, so flag missing rows explicitly
                    indicator = data[column].isna()
                else:
                    # Nullable dtypes yield <NA> for missing rows; count them
                    # as "no match" so the integer cast below cannot fail
                    indicator = (data[column] == value).fillna(False)
                encoded[f"{column}_{value}"] = indicator.astype(int)

            # The indicator columns replace the original column
            encoded = encoded.drop(columns=column)

        return encoded

    def normalize(self, data, columns):
        """Placeholder, not implemented yet; currently does nothing and returns None."""
        pass

In [10]:
# Encode the species column; the returned frame is discarded here because
# only the recorded category-to-index mapping is inspected.
preprocessing = DataPreprocessing()
preprocessing.onehot_encode(iris_df, columns=["species"])
print("Categorical maps:", preprocessing.categorical_maps, sep="\n")

Categorical maps:
{'species': {'setosa': 0, 'versicolor': 1, 'virginica': 2}}
