In [1]:
import pandas as pd
import numpy as np
import joblib

pd.options.display.max_columns = None
pd.options.display.max_rows = None

import warnings
warnings.filterwarnings("ignore")  

from expected_disposal_model.config import raw_file_path, modelling_file_path, preprocessor_file_path
from expected_disposal_model.data_preparation.data_preprocessor import Preprocessor
from expected_disposal_model.data_preparation.preprocessing import convert_chains_to_schema
from expected_disposal_model.data_preparation.preprocessing import create_labels
from expected_disposal_model.data_preparation.preprocessing import get_stratified_train_test_val_columns
from expected_disposal_model.modelling_data_contract import ModellingDataContract


Load Data

In [None]:
# Read the raw CSV from the path configured in expected_disposal_model.config.
data = pd.read_csv(filepath_or_buffer=raw_file_path)

# Preview the first five rows as this cell's output.
data.head(n=5)

Preprocess Data

In [None]:
# Build the project Preprocessor with its default arguments and fit it on the
# complete raw frame; whatever fit() returns is shown as this cell's output.
# NOTE(review): the fit sees every row, including rows that the later split
# cell assigns to test/validation. If fit() learns statistics from the data,
# that would leak held-out information — confirm against Preprocessor.fit.
preproc = Preprocessor()
preproc.fit(data)

In [None]:
# Run the fitted preprocessor over the same raw frame it was fitted on; the
# result is kept as X and later joined column-wise with the labels.
X = preproc.transform(data)

Create Labels

In [None]:
# Build the labels from the raw (untransformed) frame via the project helper.
# NOTE(review): presumably y shares the index of `data` so it lines up with X
# when the two are concatenated — verify against create_labels.
y = create_labels(data)

Combine Data

In [None]:
# Join the transformed features and the labels side by side. pd.concat aligns
# on the index (outer join by default), so an index mismatch between X and y
# would silently add extra, NaN-padded rows instead of failing.
modelling_data = pd.concat([X, y], axis='columns')

# Guard the row-count invariant so a misaligned join stops the notebook here
# rather than flowing into the split and the exported modelling file.
assert len(modelling_data) == len(X) == len(y), (
    "Row count changed while combining features and labels: "
    f"X={len(X)}, y={len(y)}, combined={len(modelling_data)}"
)

Create Train Test Validation Split

In [None]:
# Pass the combined frame through the project's split helper, keyed on the
# response column named by the modelling data contract.
# NOTE(review): presumably this adds train/test/validation indicator columns
# stratified on the response — confirm against the helper. The result is
# assigned back to the same name, so re-running this cell feeds the helper its
# own previous output.
modelling_data = get_stratified_train_test_val_columns(
    modelling_data,
    response=ModellingDataContract.RESPONSE,
)

Export Data

In [None]:
# Write the modelling frame to the configured CSV path, leaving the index out
# of the file.
modelling_data.to_csv(
    path_or_buf=modelling_file_path,
    index=False,
)

Export Preprocessor

In [None]:
# Serialise the fitted preprocessor to the configured path with joblib; the
# value returned by dump() is shown as this cell's output.
joblib.dump(
    value=preproc,
    filename=preprocessor_file_path,
)