In [1]:
from kamp.preprocess import KampDataLoader

DATA_PATH = './data/경진대회용 주조 공정최적화 데이터셋.csv'


data_loader = KampDataLoader(path = DATA_PATH,

                             outlier_method='iso',
                             p_threshold=0.05,
                             get_useful_p_data=True,
                             iso_outlier_rate=0.015,
                             
                             do_resample=True,
                             downsampled_pass_rate=0.6,
                             upsampled_total_fail_rate=0.15)

data_loader.process()

data = data_loader.load()

x_train = data['train_data']
y_train = data['train_label']
x_test = data['test_data']
y_test=  data['test_label']


[process Log] Loading Raw Data...
[process Log] Done

[process Log] Processing Nan Value...
[process Log] Done

[process Log] Encoding Categorical Features...
[process Log] Done

[process Log] Removing Outliers (IsoForest)...
[Outlier-Remover Log] With Outliers Shape : (92014, 23)
[Outlier-Remover Log] Without Outliers Shape : (90633, 23)
[process Log] Done

[process Log] T-Testing...
[process Log] Done

[process Log] Data Scaling (MinMaxScaler)...
[process Log] Done

[process Log] Train Test Spliting...
[process Log] Done

[process Log] Data Resampling...
[process Log] Done



In [2]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((48571, 17), (48571,), (46284, 17), (46284,))

In [99]:
import pandas as pd
from kamp.preprocess import load_data, NanProcessor, CatFeatureEncoder, check_fail_rate
from kamp.preprocess import NAN_GRID, ENCODE_GRID

In [103]:
data_configs = load_data(DATA_PATH)

data = data_configs['data']
numeric_features = data_configs['numeric_features']
object_features = data_configs['object_features']

In [104]:
class NanProcessor:
    def __init__(self, nan_grid):
        self.drop_features = nan_grid['drop_features']
        self.simple_fill_dict = nan_grid['simple_fill_dict']
        self.mode_fill_features = nan_grid['mode_fill_features']
        self.mode_criterion = nan_grid['mode_criterion']
    
    def process(self, data):
        data = data.dropna(subset=['passorfail'])

        for feature, fill_val in self.simple_fill_dict.items():
            if feature == 'heating_furnace':
                condition = (data[feature].isna()) & (data['molten_volume'].isna())
                data.loc[condition] = data.loc[condition].fillna('D').astype('object')
            data[feature] = data[feature].fillna(fill_val)
        
        data = data.drop(columns=self.drop_features)
        
        for feature in self.mode_fill_features:
            data[feature] = data.groupby(self.mode_criterion)[feature].transform(
                lambda x : x.fillna(x.mode()[0] if not x.mode().empty else x.mean())
            )
        
        data = data.reset_index(drop=True)

        return data

In [105]:
data = NanProcessor(nan_grid=NAN_GRID).process(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feature] = data[feature].fillna(fill_val)
  data.loc[condition] = data.loc[condition].fillna('D').astype('object')
  data.loc[condition] = data.loc[condition].fillna('D').astype('object')
  data.loc[condition] = data.loc[condition].fillna('D').astype('object')
  data.loc[condition] = data.loc[condition].fillna('D').astype('object')
  data.loc[condition] = data.loc[condition].fillna('D').astype('object')
  data.loc[condition] = data.loc[condition].fillna('D').astype('object')
  data.loc[condition] = data.loc[condition].fillna('D').astype('object')
  data.loc[condition] = data.loc[condition].fillna('D').astype('object')
  data.loc[condition] = data.loc[condition].fillna('D').astype('object')
  data.loc[condition] = data.lo

In [7]:
data = CatFeatureEncoder(encode_grid=ENCODE_GRID).process(data)

In [8]:
data.head()

Unnamed: 0,count,working,molten_temp,facility_operation_cycleTime,production_cycletime,low_section_speed,high_section_speed,cast_pressure,biscuit_thickness,upper_mold_temp1,...,lower_mold_temp2,lower_mold_temp3,sleeve_temperature,physical_strength,Coolant_temperature,EMS_operation_time,passorfail,tryshot_signal,mold_code,heating_furnace
0,258,1.0,731.0,119,120,110.0,112.0,331.0,35.0,198.0,...,316.0,1449.0,550.0,700.0,34.0,23,0.0,0.0,5.0,2.0
1,243,1.0,720.0,98,125,109.0,109.0,309.0,48.0,250.0,...,163.0,1449.0,481.0,0.0,30.0,25,0.0,0.0,0.0,1.0
2,244,1.0,721.0,98,122,109.0,109.0,309.0,48.0,250.0,...,163.0,1449.0,481.0,0.0,30.0,25,0.0,0.0,0.0,1.0
3,245,1.0,721.0,100,125,112.0,109.0,308.0,49.0,268.0,...,179.0,1449.0,483.0,0.0,30.0,25,0.0,0.0,0.0,1.0
4,246,1.0,721.0,99,123,109.0,110.0,309.0,49.0,276.0,...,187.0,1449.0,486.0,0.0,30.0,25,0.0,0.0,0.0,1.0


In [9]:
numeric_features

Index(['count', 'molten_temp', 'facility_operation_cycleTime',
       'production_cycletime', 'low_section_speed', 'high_section_speed',
       'molten_volume', 'cast_pressure', 'biscuit_thickness',
       'upper_mold_temp1', 'upper_mold_temp2', 'upper_mold_temp3',
       'lower_mold_temp1', 'lower_mold_temp2', 'lower_mold_temp3',
       'sleeve_temperature', 'physical_strength', 'Coolant_temperature',
       'EMS_operation_time', 'passorfail', 'mold_code'],
      dtype='object')