In [5]:
import yaml
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as scs
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

## 1. Todo List

1. Missing value handling</br>
1.1. pm10      : mean kelas</br>
1.2. pm25      : mean kelas</br>
1.3. so2       : mean</br>
1.4. co        : median</br>
1.5. o3        : median</br>
1.6. no2       : median</br>

2. Join kategori sedang dan tidak sehat menjadi tidak baik

3. Balancing label baik dan tidak baik

4. Outlier removal

## 2. Import Params

In [7]:
params_dir = "C:/Users/farha/Documents/pacmann_mlp/config/config.yaml"

In [8]:
def load_params(param_dir):
    with open(param_dir, "r") as file:
        params = yaml.safe_load(file)
        
    return params

In [9]:
def params_updater(key, value, params, params_dir):
    params = params.copy()
    params[key] = value

    with open(params_dir, "w") as file:
        yaml.dump(params, file)
    
    params = load_params(params_dir)

    return params

In [10]:
params = load_params(params_dir)

In [11]:
params

{'dataset_dir': 'C:/Users/farha/Documents/pacmann_mlp/data/raw/',
 'datetime_columns': ['tanggal'],
 'int32_columns': ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'max'],
 'label': 'categori',
 'label_categories': ['BAIK', 'SEDANG', 'TIDAK SEHAT'],
 'label_categories_new': ['BAIK', 'TIDAK BAIK'],
 'missing_value_co': 11,
 'missing_value_no2': 18,
 'missing_value_o3': 29,
 'missing_value_pm10': {'BAIK': 28, 'TIDAK BAIK': 55},
 'missing_value_pm25': {'BAIK': 38, 'TIDAK BAIK': 82},
 'missing_value_so2': 35,
 'object_columns': ['stasiun', 'critical', 'categori'],
 'predictors': ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2'],
 'range_co': [-1, 100],
 'range_no2': [-1, 100],
 'range_o3': [-1, 160],
 'range_pm10': [-1, 800],
 'range_pm25': [-1, 400],
 'range_so2': [-1, 500],
 'range_stasiun': ['DKI1 (Bunderan HI)',
  'DKI2 (Kelapa Gading)',
  'DKI3 (Jagakarsa)',
  'DKI4 (Lubang Buaya)',
  'DKI5 (Kebon Jeruk) Jakarta Barat']}

## 3. Load Dataset

In [12]:
x_train = joblib.load("C:/Users/farha/Documents/pacmann_mlp/data/processed/x_train.pkl")
y_train = joblib.load("C:/Users/farha/Documents/pacmann_mlp/data/processed/y_train.pkl")

x_valid = joblib.load("C:/Users/farha/Documents/pacmann_mlp/data/processed/x_valid.pkl")
y_valid = joblib.load("C:/Users/farha/Documents/pacmann_mlp/data/processed/y_valid.pkl")

x_test = joblib.load("C:/Users/farha/Documents/pacmann_mlp/data/processed/x_test.pkl")
y_test = joblib.load("C:/Users/farha/Documents/pacmann_mlp/data/processed/y_test.pkl")

In [13]:
dataset = pd.concat([x_train, y_train], axis = 1)

valid_set = pd.concat([x_valid, y_valid], axis = 1)

test_set = pd.concat([x_test, y_test], axis = 1)

## 4. Join Categories

In [14]:
def join_cat(set_data, params):
    if params["label"] in set_data.columns.to_list():
        set_data = set_data.copy()
        set_data.categori.replace(params["label_categories"][1], params["label_categories"][2], inplace = True)
        set_data.categori.replace(params["label_categories"][2], params["label_categories_new"][1], inplace = True)
        return set_data
    else:
        raise RuntimeError("Kolom label tidak terdeteksi pada set data yang diberikan!")

In [15]:
params = params_updater("label", "categori", params, params_dir)
params = params_updater("label_categories", ["BAIK", "SEDANG", "TIDAK SEHAT"], params, params_dir)
params = params_updater("label_categories_new", ["BAIK", "TIDAK BAIK"], params, params_dir)

In [16]:
params

{'dataset_dir': 'C:/Users/farha/Documents/pacmann_mlp/data/raw/',
 'datetime_columns': ['tanggal'],
 'int32_columns': ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'max'],
 'label': 'categori',
 'label_categories': ['BAIK', 'SEDANG', 'TIDAK SEHAT'],
 'label_categories_new': ['BAIK', 'TIDAK BAIK'],
 'missing_value_co': 11,
 'missing_value_no2': 18,
 'missing_value_o3': 29,
 'missing_value_pm10': {'BAIK': 28, 'TIDAK BAIK': 55},
 'missing_value_pm25': {'BAIK': 38, 'TIDAK BAIK': 82},
 'missing_value_so2': 35,
 'object_columns': ['stasiun', 'critical', 'categori'],
 'predictors': ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2'],
 'range_co': [-1, 100],
 'range_no2': [-1, 100],
 'range_o3': [-1, 160],
 'range_pm10': [-1, 800],
 'range_pm25': [-1, 400],
 'range_so2': [-1, 500],
 'range_stasiun': ['DKI1 (Bunderan HI)',
  'DKI2 (Kelapa Gading)',
  'DKI3 (Jagakarsa)',
  'DKI4 (Lubang Buaya)',
  'DKI5 (Kebon Jeruk) Jakarta Barat']}

### 4.1. Train set

In [17]:
dataset.categori.value_counts()

categori
SEDANG         914
TIDAK SEHAT    223
BAIK           132
Name: count, dtype: int64

In [18]:
dataset = join_cat(dataset, params)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  set_data.categori.replace(params["label_categories"][1], params["label_categories"][2], inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  set_data.categori.replace(params["label_categories"][2], params["label_categories_new"][1], inplace = True)


In [19]:
dataset.categori.value_counts()

categori
TIDAK BAIK    1137
BAIK           132
Name: count, dtype: int64

### 4.2. Valid set

In [20]:
valid_set.categori.value_counts()

categori
SEDANG         196
TIDAK SEHAT     48
BAIK            28
Name: count, dtype: int64

In [21]:
valid_set = join_cat(valid_set, params)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  set_data.categori.replace(params["label_categories"][1], params["label_categories"][2], inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  set_data.categori.replace(params["label_categories"][2], params["label_categories_new"][1], inplace = True)


In [22]:
valid_set.categori.value_counts()

categori
TIDAK BAIK    244
BAIK           28
Name: count, dtype: int64

### 4.3. Test set

In [23]:
test_set.categori.value_counts()

categori
SEDANG         195
TIDAK SEHAT     48
BAIK            29
Name: count, dtype: int64

In [24]:
test_set = join_cat(test_set, params)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  set_data.categori.replace(params["label_categories"][1], params["label_categories"][2], inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  set_data.categori.replace(params["label_categories"][2], params["label_categories_new"][1], inplace = True)


In [25]:
test_set.categori.value_counts()

categori
TIDAK BAIK    243
BAIK           29
Name: count, dtype: int64

## 5. Handling Missing Value

In [26]:
def nan_detector(set_data):
    set_data = set_data.copy()
    set_data.replace(-1, np.nan, inplace = True)
    return set_data

### 5.1. Convert -1 to NaN

#### 5.1.1. Trainset

In [27]:
dataset.describe()

Unnamed: 0,pm10,pm25,so2,co,o3,no2
count,1269.0,1269.0,1269.0,1269.0,1269.0,1269.0
mean,51.152088,75.731284,33.360126,11.536643,30.835303,19.267928
std,17.246488,29.30646,14.710573,5.045186,15.319329,9.231295
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,42.0,60.0,25.0,8.0,21.0,13.0
50%,54.0,77.0,34.0,11.0,28.0,18.0
75%,62.0,93.0,43.0,14.0,37.0,25.0
max,100.0,174.0,82.0,44.0,151.0,65.0


In [28]:
dataset.isnull().sum()

stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
categori    0
dtype: int64

In [29]:
dataset = nan_detector(dataset)

In [30]:
dataset.isnull().sum()

stasiun      0
pm10        35
pm25        55
so2         68
co          10
o3          35
no2          9
categori     0
dtype: int64

#### 5.1.2. Validset

In [31]:
valid_set.describe()

Unnamed: 0,pm10,pm25,so2,co,o3,no2
count,272.0,272.0,272.0,272.0,272.0,272.0
mean,51.496324,75.459559,34.341912,11.290441,32.533088,18.786765
std,17.315394,28.64051,15.026324,5.297794,14.238053,9.295802
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,43.0,60.0,26.0,8.0,23.0,12.0
50%,54.0,77.0,34.0,10.5,30.0,17.0
75%,62.0,95.0,45.0,13.0,41.0,24.25
max,94.0,150.0,80.0,47.0,85.0,62.0


In [32]:
valid_set.isnull().sum()

stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
categori    0
dtype: int64

In [33]:
valid_set = nan_detector(valid_set)

In [34]:
valid_set.isnull().sum()

stasiun      0
pm10         9
pm25        13
so2         15
co           3
o3           3
no2          4
categori     0
dtype: int64

#### 5.1.1. Testset

In [35]:
test_set.describe()

Unnamed: 0,pm10,pm25,so2,co,o3,no2
count,272.0,272.0,272.0,272.0,272.0,272.0
mean,51.121324,73.444853,32.360294,11.816176,31.194853,18.669118
std,18.589466,29.945903,14.567148,5.170456,16.550122,8.879725
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,40.0,59.0,23.0,9.0,21.0,13.0
50%,54.0,76.0,32.0,11.0,28.0,18.0
75%,61.0,91.0,43.0,14.0,38.25,24.0
max,179.0,150.0,64.0,44.0,93.0,49.0


In [36]:
test_set.isnull().sum()

stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
categori    0
dtype: int64

In [37]:
test_set = nan_detector(test_set)

In [38]:
test_set.isnull().sum()

stasiun      0
pm10         9
pm25        18
so2         14
co           3
o3          10
no2          6
categori     0
dtype: int64

### 5.2. Handling in PM10

#### 5.2.1. Trainset

In [39]:
impute_1 = int(dataset[dataset.categori == "BAIK"].pm10.mean())
impute_0 = int(dataset[dataset.categori == "TIDAK BAIK"].pm10.mean())

In [40]:
impute_1, impute_0

(28, 55)

In [41]:
params = params_updater("missing_value_pm10", {"BAIK": 28, "TIDAK BAIK": 55}, params, params_dir)

In [42]:
dataset[dataset.categori == "BAIK"].pm10.isnull().sum(), dataset[dataset.categori == "TIDAK BAIK"].pm10.isnull().sum()

(np.int64(5), np.int64(30))

In [43]:
dataset.loc[dataset[(dataset.categori == "BAIK") & (dataset.pm10.isnull() == True)].index, "pm10"] = impute_1
dataset.loc[dataset[(dataset.categori == "TIDAK BAIK") & (dataset.pm10.isnull() == True)].index, "pm10"] = impute_0

In [44]:
dataset[dataset.categori == "BAIK"].pm10.isnull().sum(), dataset[dataset.categori == "TIDAK BAIK"].pm10.isnull().sum()

(np.int64(0), np.int64(0))

#### 5.2.2. Validset

In [45]:
valid_set[valid_set.categori == "BAIK"].pm10.isnull().sum(), valid_set[valid_set.categori == "TIDAK BAIK"].pm10.isnull().sum()

(np.int64(2), np.int64(7))

In [46]:
valid_set.loc[valid_set[(valid_set.categori == "BAIK") & (valid_set.pm10.isnull() == True)].index, "pm10"] = impute_1
valid_set.loc[valid_set[(valid_set.categori == "TIDAK BAIK") & (valid_set.pm10.isnull() == True)].index, "pm10"] = impute_0

In [47]:
valid_set[valid_set.categori == "BAIK"].pm10.isnull().sum(), valid_set[valid_set.categori == "TIDAK BAIK"].pm10.isnull().sum()

(np.int64(0), np.int64(0))

#### 5.2.3. Testset

In [48]:
test_set[test_set.categori == "BAIK"].pm10.isnull().sum(), test_set[test_set.categori == "TIDAK BAIK"].pm10.isnull().sum()

(np.int64(3), np.int64(6))

In [49]:
test_set.loc[test_set[(test_set.categori == "BAIK") & (test_set.pm10.isnull() == True)].index, "pm10"] = impute_1
test_set.loc[test_set[(test_set.categori == "TIDAK BAIK") & (test_set.pm10.isnull() == True)].index, "pm10"] = impute_0

In [50]:
test_set[test_set.categori == "BAIK"].pm10.isnull().sum(), test_set[test_set.categori == "TIDAK BAIK"].pm10.isnull().sum()

(np.int64(0), np.int64(0))

### 5.3. Handling in PM25

#### 5.3.1. Trainset

In [51]:
impute_1 = int(dataset[dataset.categori == "BAIK"].pm25.mean())
impute_0 = int(dataset[dataset.categori == "TIDAK BAIK"].pm25.mean())

In [52]:
impute_1, impute_0

(38, 82)

In [53]:
params = params_updater("missing_value_pm25", {"BAIK": 38, "TIDAK BAIK": 82}, params, params_dir)

In [54]:
dataset[dataset.categori == "BAIK"].pm25.isnull().sum(), dataset[dataset.categori == "TIDAK BAIK"].pm25.isnull().sum()

(np.int64(35), np.int64(20))

In [55]:
dataset.loc[dataset[(dataset.categori == "BAIK") & (dataset.pm25.isnull() == True)].index, "pm25"] = impute_1
dataset.loc[dataset[(dataset.categori == "TIDAK BAIK") & (dataset.pm25.isnull() == True)].index, "pm25"] = impute_0

In [56]:
dataset[dataset.categori == "BAIK"].pm10.isnull().sum(), dataset[dataset.categori == "TIDAK BAIK"].pm10.isnull().sum()

(np.int64(0), np.int64(0))

#### 5.3.2. Validset

In [57]:
valid_set[valid_set.categori == "BAIK"].pm25.isnull().sum(), valid_set[valid_set.categori == "TIDAK BAIK"].pm25.isnull().sum()

(np.int64(5), np.int64(8))

In [58]:
valid_set.loc[valid_set[(valid_set.categori == "BAIK") & (valid_set.pm25.isnull() == True)].index, "pm25"] = impute_1
valid_set.loc[valid_set[(valid_set.categori == "TIDAK BAIK") & (valid_set.pm25.isnull() == True)].index, "pm25"] = impute_0

In [59]:
dataset[dataset.categori == "BAIK"].pm10.isnull().sum(), dataset[dataset.categori == "TIDAK BAIK"].pm10.isnull().sum()

(np.int64(0), np.int64(0))

#### 5.3.3. Testset

In [60]:
test_set[test_set.categori == "BAIK"].pm25.isnull().sum(), test_set[test_set.categori == "TIDAK BAIK"].pm25.isnull().sum()

(np.int64(10), np.int64(8))

In [61]:
test_set.loc[test_set[(test_set.categori == "BAIK") & (test_set.pm25.isnull() == True)].index, "pm25"] = impute_1
test_set.loc[test_set[(test_set.categori == "TIDAK BAIK") & (test_set.pm25.isnull() == True)].index, "pm25"] = impute_0

In [62]:
test_set[test_set.categori == "BAIK"].pm10.isnull().sum(), test_set[test_set.categori == "TIDAK BAIK"].pm10.isnull().sum()

(np.int64(0), np.int64(0))

### 5.4. Handling in SO2, CO, O3, dan NO2

#### 5.4.1. Trainset

In [65]:
impute_so2 = int(dataset.so2.mean())
impute_co = int(dataset.co.median())
impute_o3 = int(dataset.o3.median())
impute_no2 = int(dataset.no2.median())

In [66]:
impute_values = {"so2" : impute_so2, "co" : impute_co, "o3" : impute_o3, "no2" : impute_no2}

In [67]:
impute_values

{'so2': 35, 'co': 11, 'o3': 29, 'no2': 18}

In [68]:
params = params_updater("missing_value_so2", 35, params, params_dir)
params = params_updater("missing_value_co", 11, params, params_dir)
params = params_updater("missing_value_o3", 29, params, params_dir)
params = params_updater("missing_value_no2", 18, params, params_dir)

In [69]:
dataset.isnull().sum()

stasiun      0
pm10         0
pm25         0
so2         68
co          10
o3          35
no2          9
categori     0
dtype: int64

In [70]:
dataset.fillna(value = impute_values, inplace = True)

In [71]:
dataset.isnull().sum()

stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
categori    0
dtype: int64

#### 5.4.2. Validset

In [72]:
valid_set.isnull().sum()

stasiun      0
pm10         0
pm25         0
so2         15
co           3
o3           3
no2          4
categori     0
dtype: int64

In [73]:
valid_set.fillna(value = impute_values, inplace = True)

In [74]:
valid_set.isnull().sum()

stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
categori    0
dtype: int64

#### 5.4.3. Testset

In [75]:
test_set.isnull().sum()

stasiun      0
pm10         0
pm25         0
so2         14
co           3
o3          10
no2          6
categori     0
dtype: int64

In [76]:
test_set.fillna(value = impute_values, inplace = True)

In [77]:
test_set.isnull().sum()

stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
categori    0
dtype: int64

## 6. Encoding Stasiun

In [1]:
ohe_statiun = OneHotEncoder(sparse_output = False)

NameError: name 'OneHotEncoder' is not defined

In [None]:
ohe_statiun.fit(np.array(params["range_stasiun"]).reshape(-1, 1))

In [None]:
ohe_statiun.categories_

In [None]:
joblib.dump(ohe_statiun, "C:/Users/farha/Documents/pacmann_mlp/models/ohe_stasiun.pkl")

### 6.1. Trainset

In [None]:
stasiun_features = ohe_statiun.transform(np.array(dataset.stasiun.to_list()).reshape(-1, 1))

In [None]:
stasiun_features = pd.DataFrame(stasiun_features, columns = params["range_stasiun"])

In [None]:
stasiun_features.set_index(dataset.index, inplace = True)

In [None]:
dataset = pd.concat([stasiun_features, dataset], axis = 1)

In [None]:
dataset.drop(columns = "stasiun", inplace = True)

### 6.1. Validset

In [None]:
stasiun_features = ohe_statiun.transform(np.array(valid_set.stasiun.to_list()).reshape(-1, 1))

In [None]:
stasiun_features = pd.DataFrame(stasiun_features, columns = params["range_stasiun"])

In [None]:
stasiun_features.set_index(valid_set.index, inplace = True)

In [None]:
valid_set = pd.concat([stasiun_features, valid_set], axis = 1)

In [None]:
valid_set.drop(columns = "stasiun", inplace = True)

### 6.1. Testset

In [None]:
stasiun_features = ohe_statiun.transform(np.array(test_set.stasiun.to_list()).reshape(-1, 1))

In [None]:
stasiun_features = pd.DataFrame(stasiun_features, columns = params["range_stasiun"])

In [None]:
stasiun_features.set_index(test_set.index, inplace = True)

In [None]:
test_set = pd.concat([stasiun_features, test_set], axis = 1)

In [None]:
test_set.drop(columns = "stasiun", inplace = True)

## 6. Balancing Label

In [None]:
sns.histplot(data = dataset, x = "categori", hue = "categori")

### 6.1. Undersampling

In [None]:
rus = RandomUnderSampler(random_state = 26)

In [None]:
x_rus, y_rus = rus.fit_resample(dataset.drop("categori", axis = 1), dataset.categori)

In [None]:
dataset_rus = pd.concat([x_rus, y_rus], axis = 1)

In [None]:
sns.histplot(dataset_rus, x = "categori", hue = "categori")

### 6.2. Oversampling

In [None]:
ros = RandomOverSampler(random_state = 11)

In [None]:
x_ros, y_ros = ros.fit_resample(dataset.drop("categori", axis = 1), dataset.categori)

In [None]:
dataset_ros = pd.concat([x_ros, y_ros], axis = 1)

In [None]:
sns.histplot(dataset_ros, x = "categori", hue = "categori")

### 6.3. SMOTE

In [None]:
sm = SMOTE(random_state = 112)

In [None]:
x_sm, y_sm = sm.fit_resample(dataset.drop("categori", axis = 1), dataset.categori)

In [None]:
dataset_ros = pd.concat([x_ros, y_ros], axis = 1)

In [None]:
sns.histplot(dataset_ros, x = "categori", hue = "categori")

## 7. Label Encoding

In [None]:
le_categori = LabelEncoder()

In [None]:
le_categori.fit(params["label_categories_new"])

In [None]:
joblib.dump(le_categori, "C:/Users/farha/Documents/pacmann_mlp/models/le_categori.pkl")

In [None]:
params["label_categories_new"]

### 7.1. Undersampling set

In [None]:
list(y_rus.unique()) == params["label_categories_new"]

In [None]:
y_rus = le_categori.transform(y_rus)

### 7.2. Oversampling set

In [None]:
list(y_ros.unique()) == params["label_categories_new"]

In [None]:
y_ros = le_categori.transform(y_ros)

#### 7.3. SMOTE

In [None]:
list(y_sm.unique()) == params["label_categories_new"]

In [None]:
y_sm = le_categori.transform(y_sm)

#### 7.4. Validation Set

In [None]:
len(set(valid_set.categori.unique()) - set(params["label_categories_new"])) == 0

In [None]:
valid_set.categori = le_categori.transform(valid_set.categori)

#### 7.5. Test Set

In [None]:
len(set(test_set.categori.unique()) - set(params["label_categories_new"])) == 0

In [None]:
test_set.categori = le_categori.transform(test_set.categori)

## 8. Dump Data Latih

In [None]:
joblib.dump(x_rus, "C:/Users/farha/Documents/pacmann_mlp/data/processed/x_rus.pkl")
joblib.dump(y_rus, "C:/Users/farha/Documents/pacmann_mlp/data/processed/y_rus.pkl")

joblib.dump(x_ros, "C:/Users/farha/Documents/pacmann_mlp/data/processed/x_ros.pkl")
joblib.dump(y_ros, "C:/Users/farha/Documents/pacmann_mlp/data/processed/y_ros.pkl")

joblib.dump(x_sm, "C:/Users/farha/Documents/pacmann_mlp/data/processed/x_sm.pkl")
joblib.dump(y_sm, "C:/Users/farha/Documents/pacmann_mlp/data/processed/y_sm.pkl")

joblib.dump(valid_set.drop(columns = "categori"), "C:/Users/farha/Documents/pacmann_mlp/data/processed/x_valid_feng.pkl")
joblib.dump(valid_set.categori, "C:/Users/farha/Documents/pacmann_mlp/data/processed/y_valid_feng.pkl")

joblib.dump(test_set.drop(columns = "categori"), "C:/Users/farha/Documents/pacmann_mlp/data/processed/x_test_feng.pkl")
joblib.dump(test_set.categori, "C:/Users/farha/Documents/pacmann_mlp/data/processed/y_test_feng.pkl")