In [None]:
### --------- DATASET ASSEMBLY STEPS (1-5) --------- ###

from dataset_assembly.functional_check_step import cf_main
from dataset_assembly.calving_step import calving_main
from dataset_assembly.diseases_step import treat_main
from dataset_assembly.ele_conductivity_step import ec_main
from dataset_assembly.lactose_step import ltts_main
from dataset_assembly.anagraphic_step import ana_main

### -------- ML PROCESS STEPS (6-13) --------- ###

from ml_process.dataset_assembly import merge_main
from ml_process.transformer import pre_processing, dit
from ml_process.imputer import run_clinical_imputation, distribution_comparison
from ml_process.feature_engineer import create_wide
from ml_process.sampler import undersample_balanced
from ml_process.classifier_module import split_by_animal, call_xgb, call_rf, call_cat, call_lgbm, upload_classifiers
from ml_process.analyzer import assemble_feature_summary, save_feature_summary_txt

In [None]:
## 1. FUNCTIONAL CHECK STEP
# Runs cf_main, the entry point imported from dataset_assembly.functional_check_step.
# NOTE(review): the files this step reads and writes are decided inside that module
# and are not visible from this notebook — confirm before re-running.
cf_main()

In [None]:
## 2. CALVING STEP
# Runs calving_main, the entry point imported from dataset_assembly.calving_step.
# NOTE(review): inputs/outputs are defined inside that module, not here — confirm
# whether it depends on the output of the previous step.
calving_main()

In [None]:
## 3. DISEASES
# Runs treat_main, the entry point imported from dataset_assembly.diseases_step.
# NOTE(review): inputs/outputs are defined inside that module, not here — confirm.
treat_main()

In [None]:
## 4. EC & LACTOSE
# Runs two entry points back to back:
#   ec_main   — imported from dataset_assembly.ele_conductivity_step
#   ltts_main — imported from dataset_assembly.lactose_step
# NOTE(review): whether ltts_main depends on what ec_main produces cannot be told
# from this notebook; keep the call order unless the modules confirm independence.
ec_main()
ltts_main()

In [None]:
## 5. ANAGRAPHIC DATA
# Runs ana_main, the entry point imported from dataset_assembly.anagraphic_step.
# Last of the dataset-assembly steps (1-5) listed in the import cell.
# NOTE(review): inputs/outputs are defined inside that module, not here — confirm.
ana_main()

In [None]:
## 6. MERGING DATASETS
# Runs merge_main, the entry point imported from ml_process.dataset_assembly.
# NOTE(review): step 7 reads temporary_datasets\merged_dataset.parquet; presumably
# that file is written by this call — the path is not visible here, so confirm
# inside the module.
merge_main()

In [None]:
## 7. DOMAIN-INFORMED TRANSFORMER
# Two chained calls: the file passed to pre_processing as output_path is the same
# file passed to dit as input_path.

merged_path = r"temporary_datasets\merged_dataset.parquet"
first_imputation_path = r"temporary_datasets\first_imputation.parquet"
domain_informed_path = r"temporary_datasets\domain_informed_dataset.parquet"

pre_processing(input_path=merged_path, output_path=first_imputation_path)
dit(input_path=first_imputation_path, output_path=domain_informed_path)

In [None]:
## 8. MISSING DATA IMPUTER
# Both calls receive the same pair of files: the domain-informed dataset as
# input_path and the imputed dataset as output_path.
# NOTE(review): distribution_comparison takes the imputed file through a parameter
# named output_path — presumably it reads both files to compare them; confirm.

domain_informed_path = r"temporary_datasets\domain_informed_dataset.parquet"
imputed_path = r"temporary_datasets\imputed.parquet"

run_clinical_imputation(input_path=domain_informed_path, output_path=imputed_path)
distribution_comparison(input_path=domain_informed_path, output_path=imputed_path)

In [None]:
## 9. FEATURE ENGINEERING
# create_wide is called positionally: first the imputed dataset, then the wide one.

imputed_path = r"temporary_datasets\imputed.parquet"
wide_path = r"temporary_datasets\wide.parquet"

create_wide(imputed_path, wide_path)

In [None]:
## 10. UNDERSAMPLING
# Passes the wide dataset as input_path and the balanced dataset as output_path.

wide_path = r"temporary_datasets\wide.parquet"
balanced_path = r"temporary_datasets\balanced_dataset.parquet"

undersample_balanced(input_path=wide_path, output_path=balanced_path)

In [None]:
## 11. PRE-CLASSIFICATION DATASET PREPARATION
# Builds the train/test split and the per-row group labels that step 12 passes to
# every classifier call.

from libraries import pd

# Candidate reduced feature set kept for quick experiments; it is NOT applied below.
# features_for_test = ['id', 'mastitis', 'scs', 'scs_t-1', 'scs_t-2', 'ec', 'ec_t-1', 'lactose', 'fat', 'age', 'milk', 'milk_t-1']

# Single definition of the dataset path so the split and the group lookup cannot
# drift apart.
balanced_path = r"temporary_datasets\balanced_dataset.parquet"

X_train, y_train, X_test, y_test = split_by_animal(balanced_path)

# One 'id' value per training row, selected by X_train's index labels.
# Only the 'id' column is loaded instead of re-reading the whole dataset.
# NOTE(review): this assumes split_by_animal keeps the parquet's original index
# labels on X_train — confirm, otherwise .loc would pick the wrong rows or raise.
groups = pd.read_parquet(balanced_path, columns=['id']).loc[X_train.index, 'id'].values

In [None]:
## 12. CLASSIFIER
# Invokes each classifier entry point in turn (xgb, rf, cat, lgbm) with the same
# split and group labels, then drops the large objects from the kernel namespace.

for run_classifier in (call_xgb, call_rf, call_cat, call_lgbm):
    run_classifier(X_train, y_train, X_test, y_test, groups)

del X_train, y_train, X_test, y_test, groups, run_classifier

In [None]:
## 13. FEATURE ANALYZER
# Loads the classifiers, rebuilds the test split, assembles a feature summary and
# saves it as text; the working objects are deleted afterwards.
# NOTE(review): the test split is recomputed here rather than reused from step 11 —
# presumably split_by_animal is deterministic; confirm, otherwise this test set
# may overlap the rows the models were fitted on.

classifiers = upload_classifiers("classifier")

# Only the last two of the four returned objects are needed here.
_, _, X_test, y_test = split_by_animal(r"temporary_datasets\balanced_dataset.parquet")

feature_summary = assemble_feature_summary(
    classifiers,
    base_model_name="lgbm",
    X_test=X_test,
    y_test=y_test,
    top_n=20,
)
save_feature_summary_txt(feature_summary, "output/feature_summary.txt")

del classifiers, feature_summary, X_test, y_test