# Build df_reduced and df_balanced

In [105]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

In [106]:
# Load the transformed dataset that the rest of the notebook subsets.
df = pd.read_csv("df_mice_transformed.csv")
# Names of the five target columns.
# NOTE(review): target_list and scaler_y are not referenced by any cell visible
# in this notebook — confirm whether they are still needed.
target_list = ['PMAX', 'Glucosio', 'Trigliceridi', 'Colesterolo_Hdl', 'BMI']
# Unfitted scaler instance; nothing is fitted or transformed in this cell.
scaler_y = StandardScaler()

In [107]:
# Sanity check: dimensions of the loaded frame as (rows, columns).
df.shape

(35853, 29)

In [108]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Alanina_aminotransferasi_alt,Basofili_perc,Colesterolo_totale,Creatinina,Distribuzione_di_volume,Ematocrito_hct,Emoglobina_conc_media_mchc,Eosinofili_perc,Ferritina,Ferro_totale,...,Date,DATA_NASCITA,PMAX,Glucosio,Trigliceridi,Colesterolo_Hdl,BMI,SESSO,Rh,AB0
0,-1.605074,-1.987875,-0.27885,-1.078466,-0.652746,-2.093719,1.55998,-1.430024,-0.504158,1.920742,...,2009-02-02,1961-10-16,4.795791,4.672829,4.127134,4.60517,3.057826,2,1,2
1,-0.998052,-0.731967,0.075782,-1.995228,0.527112,-1.538745,0.712536,-1.690351,-0.389658,0.523251,...,2010-01-11,1961-10-16,4.70953,4.65396,3.828641,4.574711,3.041815,2,1,2
2,1.002842,0.279145,0.785048,-1.471364,-0.530921,-0.185996,-0.69987,-0.986498,0.81756,1.094891,...,2011-08-27,1961-10-16,4.70953,4.962845,4.025352,4.584967,3.046779,2,1,2
3,-0.100276,-0.731967,0.016677,-1.733296,-0.77547,-0.74097,-0.040747,-1.985484,0.205837,-1.017107,...,2012-06-19,1961-10-16,4.70953,4.584967,3.7612,4.532599,3.057826,2,1,2
4,-0.661944,0.279145,0.134888,-2.584575,-0.289917,-0.532854,-0.88819,-0.794184,-0.318001,-0.318864,...,2013-07-22,1961-10-16,4.70953,4.691348,3.78419,4.543295,3.046779,2,1,2


## df_balanced

In [109]:
# Each patient is identified by CAI; a patient can appear on several rows,
# so count distinct identifiers rather than rows.
patients = df['CAI'].nunique()
print(f"Total number of (unique) patients: {patients}")

Total number of (unique) patients: 4329


In [110]:
# Count distinct patients (CAI) per SESSO code in the complete dataset.
# The selection cell further down treats SESSO == 1 as men and SESSO == 2 as women.
df.groupby('SESSO')['CAI'].nunique()

SESSO
1    3488
2     841
Name: CAI, dtype: int64

In [111]:
# Share of women and men among the unique patients, derived from the data
# instead of counts copied by hand from the previous cell's output (which
# would silently go stale if the input CSV changed).
# SESSO coding as used by the selection cell below: 1 = men, 2 = women.
patients_by_sex = df.groupby('SESSO')['CAI'].nunique()
n_patients = int(patients_by_sex.sum())

perc_women = int(patients_by_sex.loc[2]) / n_patients
perc_men = int(patients_by_sex.loc[1]) / n_patients

print(perc_women)
print(perc_men)

0.19427119427119427
0.8057288057288057


In [112]:
# One row per patient with its number of visits (rows in df). SESSO is part of
# the grouping key so that it remains available as a column for the split below.
visits = (
    df.groupby(['CAI', 'SESSO'])
    .size()
    .reset_index(name='NUM_VISITE')
)


def _ranked_by_visits(sex_code):
    """Return the patients with the given SESSO code, most-visited first."""
    same_sex = visits[visits['SESSO'] == sex_code]
    return same_sex.sort_values(by='NUM_VISITE', ascending=False)


# Split men (SESSO == 1) and women (SESSO == 2) into two groups, each ordered
# by decreasing number of visits.
men_top = _ranked_by_visits(1)
women_top = _ranked_by_visits(2)

# Extract the first 28 men and the first 7 women.
# NOTE(review): 28:7 is a 4:1 ratio, close to the men/women shares printed
# above — presumably chosen for that reason; confirm.
top_28_men = men_top.head(28)
top_7_women = women_top.head(7)

# Merge the two selections into a single DataFrame (men first, then women).
final_result = pd.concat([top_28_men, top_7_women])


In [113]:
# Expect 28 + 7 = 35 rows (one per selected patient) and the three columns
# CAI, SESSO, NUM_VISITE.
final_result.shape

(35, 3)

In [114]:
# Identifiers of the selected patients.
cai_target_list = final_result['CAI'].tolist()

# Keep every visit of the selected patients from the original dataset,
# ordered by sex and then by patient.
is_selected = df['CAI'].isin(cai_target_list)
df_pazienti_top = df.loc[is_selected].sort_values(by=['SESSO', 'CAI'])

# Verify the result: row count and number of distinct patients retained.
n_rows = len(df_pazienti_top)
n_selected = df_pazienti_top['CAI'].nunique()
print(f"Total number of rows: {n_rows}")
print(f"Number of patients in the reduced dataset: {n_selected}")

Total number of rows: 1021
Number of patients in the reduced dataset: 35


In [115]:
# Dimensions of the balanced subset (all visits of the selected patients).
df_pazienti_top.shape

(1021, 29)

In [116]:
# Persist the balanced subset; index=False keeps the row index out of the file.
df_pazienti_top.to_csv("df_balanced.csv", index=False)

## df_reduced

In [137]:
# Reload the balanced subset from the CSV written in the df_balanced section.
df_balanced = pd.read_csv("df_balanced.csv")

In [138]:
# Sanity check: dimensions of the reloaded frame as (rows, columns).
df_balanced.shape

(1021, 29)

In [139]:
# List the available columns before deciding which ones to drop.
df_balanced.columns

Index(['Alanina_aminotransferasi_alt', 'Basofili_perc', 'Colesterolo_totale',
       'Creatinina', 'Distribuzione_di_volume', 'Ematocrito_hct',
       'Emoglobina_conc_media_mchc', 'Eosinofili_perc', 'Ferritina',
       'Ferro_totale', 'Leucociti_wbc', 'Linfociti_perc', 'Monociti_perc',
       'Piastrine', 'Polso', 'Proteine_totali', 'Volume_medio', 'Eta', 'CAI',
       'Date', 'DATA_NASCITA', 'PMAX', 'Glucosio', 'Trigliceridi',
       'Colesterolo_Hdl', 'BMI', 'SESSO', 'Rh', 'AB0'],
      dtype='object')

In [140]:
# Build the model frame (df_model) and the arrays passed to Stan.
# numpy (np) and pandas (pd) come from the import cell at the top of the notebook.

# 1. COLUMN REMOVAL
# Remove non-significant columns, plus Date and DATA_NASCITA (related to covariate Eta).
cols_to_remove = [
    "Basofili_perc", "Creatinina", "Emoglobina_conc_media_mchc",
    "Ferritina", "Ferro_totale", "AB0", "Rh", "SESSO",
    "DATA_NASCITA", "Date",
]
# errors="ignore" skips columns that are already absent, so the cell can be re-run safely.
df_balanced = df_balanced.drop(columns=cols_to_remove, errors="ignore")

# 2. MODEL PARAMETERS
ID_COL = "CAI"
TARGET_COLS = ["Trigliceridi", "Colesterolo_Hdl", "Glucosio", "PMAX", "BMI"]

# 3. COVARIATE SELECTION
# Keep only numeric columns, excluding the targets and the ID.
# SESSO/Rh/AB0 have already been dropped, so no dummy variables are created.
numeric_cols = df_balanced.select_dtypes(include=[np.number]).columns.tolist()
COVARIATE_COLS = [c for c in numeric_cols if c not in TARGET_COLS + [ID_COL]]

# 4. MODEL DATAFRAME
# Column order is ID, targets, covariates; rows with any missing value are dropped.
# .copy() makes df_model independent of df_balanced before it is modified below.
cols_needed = [ID_COL] + TARGET_COLS + COVARIATE_COLS
df_model = df_balanced[cols_needed].dropna().copy()

# 5. Y MATRIX
# N = number of rows (visits), K = number of targets.
Y_mat = df_model[TARGET_COLS].to_numpy(dtype=float)
N, K = Y_mat.shape

# 6. JITTERING
# Add tiny uniform noise to every remaining covariate (all numeric/continuous).
# The noise for a column is drawn from [-w/2, w/2) with w = JITTER_SCALE * column std.
# A dedicated, seeded generator makes df_model (and the CSV written from it)
# identical on every run; the unseeded global np.random state did not.
JITTER_SCALE = 1e-5
JITTER_SEED = 42
jitter_rng = np.random.default_rng(JITTER_SEED)
for col in COVARIATE_COLS:
    std_dev = df_model[col].std()
    # Constant columns (std == 0) and undefined std (NaN) are left untouched.
    if std_dev > 0:
        scale = JITTER_SCALE * std_dev
        noise = jitter_rng.uniform(low=-scale / 2, high=scale / 2, size=len(df_model))
        df_model[col] = df_model[col] + noise

# 7. X MATRIX AND STAN DATA
# P = number of covariates; X_mat is built after jittering, so it contains the noise.
X_mat = df_model[COVARIATE_COLS].to_numpy(dtype=float)
_, P = X_mat.shape
# donor_ids: sorted unique IDs; id_index: for each row, the position of its ID in donor_ids.
donor_ids, id_index = np.unique(df_model[ID_COL].to_numpy(), return_inverse=True)
I = len(donor_ids)
# Shift to 1-based indices, since Stan indexes arrays from 1.
id_stan = id_index + 1

In [141]:
# Dimensions of the model frame after column selection and dropna().
df_model.shape

(1021, 19)

In [142]:
# Column order follows cols_needed: ID first, then targets, then covariates.
df_model.columns

Index(['CAI', 'Trigliceridi', 'Colesterolo_Hdl', 'Glucosio', 'PMAX', 'BMI',
       'Alanina_aminotransferasi_alt', 'Colesterolo_totale',
       'Distribuzione_di_volume', 'Ematocrito_hct', 'Eosinofili_perc',
       'Leucociti_wbc', 'Linfociti_perc', 'Monociti_perc', 'Piastrine',
       'Polso', 'Proteine_totali', 'Volume_medio', 'Eta'],
      dtype='object')

In [144]:
# Write the reduced dataset (ID, targets and jittered covariates) to CSV;
# index=False keeps the row index out of the file.
output_file_name = "df_reduced.csv"
df_model.to_csv(output_file_name, index=False)