In [44]:
## ~0.7 variantgamma_transmissibility+variantgamma_imports_factor+variantgamma_cross_protection_prob
## ~0.5 variable_alpha + variantgamma_transmissibility_factor - GradientBoostingRegressor
## ~0.6 variable_alpha + variantgamma_severity_factor - GradientBoostingRegressor

import numpy as np
import pandas as pd
from tqdm import tqdm
import os
from load_functions import *
from variable_assignation import *

In [45]:
from scipy import stats

# Variables

In [46]:
# Folder that holds the CSV exports read by this notebook.
input_files_folder = '../input_files'
# Root under which one sub-folder per variant is written. Outputs currently
# live next to the inputs; point this elsewhere to keep them separate.
output_files = input_files_folder

# Transform data
## Load info

In [48]:
# Read the two CSV inputs from the input folder: the per-job output table
# first, then the per-job parameter table (both carry a 'job_id' column that
# later cells use to match rows).
df_data, df_params = (
    pd.read_csv(f'{input_files_folder}/{csv_name}')
    for csv_name in ('fred_output.csv', 'FRED_parameters_out.csv')
)

In [49]:
# Distinct job identifiers, in order of first appearance in the output table.
job_list = pd.unique(df_data['job_id'])

## Process and format data

In [57]:
# For every variant, build one row per job in a nested layout:
#   'dim_0' -> Yeo-Johnson-transformed series for that job (stored as a pd.Series)
#   'y'     -> list of the parameter values named by variables_vars[variant]
# Two frames are produced per variant (case proportion, and case proportion
# weighted by total deaths) and handed to save_obj.
for variant in ['alpha','gamma','kappa','delta']:
    output_files_folder = f'{output_files}/{variant}'

    ## create folder
    if not os.path.exists(output_files_folder):
        os.makedirs(output_files_folder)
        print("Directory '%s' created" %output_files_folder)

    # Rows are collected in plain lists and each DataFrame is built once per
    # variant. Growing a DataFrame with pd.concat inside the loop re-copies
    # every previous row on each iteration (quadratic in the number of jobs).
    prop_rows = []
    prop_deaths_rows = []
    label_rows = []
    for job_id in tqdm(job_list):
        # Read-only slices of the rows that belong to this job.
        df_data_job = df_data[df_data['job_id'] == job_id]
        df_params_job = df_params[df_params['job_id'] == job_id]

        # Parameter values are taken from the first matching parameter row.
        labels = [df_params_job[var].to_numpy()[0] for var in variables_vars[variant]]

        # Share of this variant among the cases of the four tracked strains;
        # 0/0 (no cases at all) is mapped to 0.
        cases_var_vector = df_data_job[f'C_{dic_var[variant]}_mean']
        all_cases_vector = (
            df_data_job['C_1_mean'] + df_data_job['C_2_mean']
            + df_data_job['C_3_mean'] + df_data_job['C_4_mean']
        )
        prop_var = (cases_var_vector/all_cases_vector).fillna(0)

        # NOTE(review): despite its name, prop_deaths is the case proportion
        # multiplied by the summed death columns, not a death proportion. The
        # variant's own 'CF_<n>_mean' column was read here but never used, so
        # that dead assignment has been removed. The sum also adds 'CF_mean'
        # on top of the four per-strain columns -- confirm both are intended.
        all_deaths_vector = (
            df_data_job['CF_mean'] + df_data_job['CF_1_mean'] + df_data_job['CF_2_mean']
            + df_data_job['CF_3_mean'] + df_data_job['CF_4_mean']
        )
        prop_deaths = (prop_var * all_deaths_vector).fillna(0)

        # Each series gets its own Yeo-Johnson lambda, fitted on that series.
        lambda_prop = stats.yeojohnson_normmax(prop_var)
        transformed_prop_var = stats.yeojohnson(prop_var, lmbda=lambda_prop)

        lambda_deaths = stats.yeojohnson_normmax(prop_deaths)
        transformed_prop_deaths = stats.yeojohnson(prop_deaths, lmbda=lambda_deaths)

        prop_rows.append(pd.Series(transformed_prop_var.tolist()))
        prop_deaths_rows.append(pd.Series(transformed_prop_deaths.tolist()))
        label_rows.append(labels)

    # A list of Series / list of lists becomes an object column with one
    # Series / list per cell; the index is already 0..n-1.
    df_data_prop_var = pd.DataFrame({'dim_0': prop_rows, 'y': label_rows})
    df_data_prop_deaths_var = pd.DataFrame({'dim_0': prop_deaths_rows, 'y': label_rows})

    save_obj(df_data_prop_var, f'{output_files_folder}/df_prop_{variant}')
    save_obj(df_data_prop_deaths_var, f'{output_files_folder}/df_prop_deaths_{variant}')

100%|██████████| 1979/1979 [04:27<00:00,  7.40it/s]
100%|██████████| 1979/1979 [04:22<00:00,  7.54it/s]
100%|██████████| 1979/1979 [04:24<00:00,  7.48it/s]
100%|██████████| 1979/1979 [04:21<00:00,  7.57it/s]
