In [3]:
from kamp.preprocess import KampDataLoader

DATA_PATH = './data/경진대회용 주조 공정최적화 데이터셋.csv'

# Configure the full preprocessing pipeline in one place. What each option
# does internally is defined by KampDataLoader in kamp.preprocess.
data_loader = KampDataLoader(
    path=DATA_PATH,

    # Count-based handling is switched off because leaving it out scored better:
    #   without it : 0.944
    #   with it    : at most 0.922
    do_count_trend=False,
    drop_count=False,

    # p-value based feature filtering and its cut-off.
    get_useful_p_data=True,
    p_threshold=0.10,

    # Outlier removal ('iso' is logged as IsoForest in the output below).
    outlier_method='iso',
    iso_outlier_rate=0.008,

    # Class re-balancing settings.
    do_resample=True,
    downsampled_pass_rate=1.0,
    upsampled_fail_rate_about_pass=0.30,
    upsample_method='adasyn',

    # PCA is disabled; variance_rate would only matter if it were enabled.
    do_pca=False,
    # variance_rate=0.99
)

# Run the pipeline, then fetch the resulting splits.
data_loader.process()
data = data_loader.load()

x_train, y_train, x_test, y_test = (
    data[key]
    for key in ('train_data', 'train_label', 'test_data', 'test_label')
)


[Process Log] Loading Raw Data...
[Process Log] Done

[Process Log] Processing Nan Value...
[Process Log] Done

[Process Log] Encoding Categorical Features...
[Process Log] Done

[Process Log] Removing Outliers (IsoForest)...
[Outlier-Remover Log] With Outliers Shape : (89753, 23)
[Outlier-Remover Log] Without Outliers Shape : (89034, 23)
[Process Log] Done

[Process Log] T-Testing...
[Process Log] Done

[Process Log] Data Scaling (MinMaxScaler)...
[Process Log] Done

[Process Log] Train Test Spliting...
[Process Log] Done

[Process Log] Data Resampling (adasyn)...
[Process Log] Done



In [4]:
# Show the shapes of the splits in (x_train, y_train, x_test, y_test) order.
tuple(split.shape for split in (x_train, y_train, x_test, y_test))

((89214, 17), (89214,), (17807, 17), (17807,))

In [None]:
import pandas as pd
from kamp.preprocess import load_data, NanProcessor, CatFeatureEncoder, check_fail_rate
from kamp.preprocess import NAN_GRID, ENCODE_GRID
from kamp.preprocess import remove_outliers_by_isoforest, DataResampler

# Relative path to the competition casting-process dataset CSV that load_data reads below.
DATA_PATH = './data/경진대회용 주조 공정최적화 데이터셋.csv'

In [27]:
# load_data returns a mapping; pull out the frame and the two feature-name groups.
data_configs = load_data(DATA_PATH)

data, numeric_features, object_features = (
    data_configs[key]
    for key in ('data', 'numeric_features', 'object_features')
)

In [28]:
# Missing-value step: NanProcessor is configured by NAN_GRID (rules live in kamp.preprocess).
data = (
    NanProcessor(NAN_GRID)
    .process(data)
)

In [29]:
class FeatureEngineer:
    """Adds hand-crafted features to the dataset.

    Args:
        do_count_trend: When True, `process` adds the `count_trend` column
            derived from the `count` column. When False, `process` returns
            the data unchanged.
    """

    def __init__(self, do_count_trend=True):
        # Honour the caller's choice (this used to be hard-coded to True,
        # so do_count_trend=False had no effect).
        self.do_count_trend = do_count_trend

    def get_count_trend_feature(self, data):
        """Add a `count_trend` column bucketing `count` into three levels.

        Buckets (bounds inclusive):
            1 <= count <= 5   -> 2
            6 <= count <= 10  -> 1
            anything else     -> 0  (includes values between 5 and 6, and NaN)

        Args:
            data: DataFrame with a `count` column. It is modified in place.

        Returns:
            The same DataFrame object, with `count_trend` added.
        """
        counts = data['count']

        in_first_bucket = counts.between(1, 5).astype(int)
        in_second_bucket = counts.between(6, 10).astype(int)

        # The buckets are disjoint, so the weighted sum yields 2, 1 or 0.
        # to_numpy() assigns positionally, exactly like the original list.
        data['count_trend'] = (in_first_bucket * 2 + in_second_bucket).to_numpy()

        return data

    def process(self, data):
        """Apply every enabled feature-engineering step and return the data."""
        if self.do_count_trend:
            data = self.get_count_trend_feature(data)

        return data

In [30]:
# Feature-engineering step, with the count_trend feature requested.
data = (
    FeatureEngineer(do_count_trend=True)
    .process(data)
)

In [31]:
# Categorical-encoding step: CatFeatureEncoder is configured by ENCODE_GRID (rules live in kamp.preprocess).
data = (
    CatFeatureEncoder(ENCODE_GRID)
    .process(data)
)

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

# 1. Load the data (Iris dataset used as an example).
# NOTE(review): this rebinds the notebook-level name `data`, which the
# preceding cells use for the casting-process DataFrame.
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# 2. Standardize the feature columns (everything except the label).
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.drop(columns='target'))

# 3. Fit a PCA model that keeps every component.
pca = PCA().fit(scaled_data)

# 4. Compute the cumulative explained-variance ratio.
explained_variance = pca.explained_variance_ratio_
cumulative_variance = explained_variance.cumsum()

# 5. Find how many leading components are needed to explain 95% of the variance
#    (index of the first cumulative value >= 0.95, converted to a count).
n_components_95 = (cumulative_variance >= 0.95).argmax() + 1

print(f"Number of components to explain 95% of variance: {n_components_95}")

In [None]:
class PCAProcessor:
    """Reduce data to the fewest principal components reaching a target explained-variance ratio.

    Args:
        variance_rate: Target cumulative explained-variance ratio, as a
            fraction (e.g. 0.95 for 95%).
    """

    def __init__(self, variance_rate):
        self.variance_rate = variance_rate

        # Starts as an unrestricted PCA; `process` replaces it with one
        # limited to the selected number of components.
        self.pca_computer = PCA()

    def process(self, data):
        """Pick the component count for `variance_rate` and project `data` onto it.

        After this runs, `self.pca_computer` is the fitted, reduced PCA.

        NOTE(review): no `return` statement is visible in this part of the
        notebook; the result DataFrame is built as the last visible statement.
        """
        # First pass: fit every component just to measure how variance is distributed.
        self.pca_computer.fit(data)

        # Use the *ratio* (fractions summing to ~1), not raw variances:
        # `variance_rate` is a fraction, so only ratios are comparable to it.
        explained_variance = self.pca_computer.explained_variance_ratio_
        cumulative_variance = np.cumsum(explained_variance)

        # Index of the first cumulative value >= variance_rate, as a count.
        # The cumulative sum is non-decreasing, so searchsorted matches
        # "first True" and, unlike argmax on an all-False mask, does not
        # collapse to 1 component when rounding keeps the total below the
        # target. Clamp to the number of available components.
        first_hit = int(np.searchsorted(cumulative_variance, self.variance_rate))
        n_components = min(first_hit + 1, len(cumulative_variance))

        # Second pass: refit with only the selected components and project.
        self.pca_computer = PCA(n_components=n_components)

        pca_result = self.pca_computer.fit_transform(data)
        # Name columns from the count chosen here, not from an unrelated notebook global.
        pca_result = pd.DataFrame(data=pca_result, columns=[f'PC{i+1}' for i in range(n_components)])