In [22]:
import pandas as pd
# Load the predictions table that the following cells augment with synthetic
# advisory columns.
# NOTE(review): the `to_parquet` cell later in this notebook writes back to this
# same path, so this read sees whatever the previous run saved.
df = pd.read_parquet('diabetes-v2/data/predictions.parquet')

In [23]:
# List the column names of the loaded frame (displayed as the cell output).
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'medical_specialty', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'tolazamide',
       'insulin', 'glyburide-metformin', 'change', 'diabetesMed',
       'Risk30DayReadmission', 'RiskAnyReadmission',
       'Risk30DayReadmissionPercentile', 'RiskLongStay',
       'ExpectedHospitalStay', 'ScoringTime'],
      dtype='object')

In [24]:
import numpy as np

## This is the one used
# Score column that drives every synthetic "30Day" advisory column in this cell.
column = 'Risk30DayReadmission'
# Answer scale drawn from by np.random.choice; pd.NA is one of the possible
# draws (presumably modelling a missing answer -- confirm).
options = ['Very Poor', 'Poor', 'Neutral', 'Good', 'Excellent', pd.NA]

# Seed NumPy's global RNG so the np.random.choice draws made by this cell are
# reproducible when the cell is re-run (previously every run produced a
# different synthetic data set).
np.random.seed(42)

# Calculate percentiles
# NOTE: these module-level names are read at call time by the functions below
# and are rebound by the RiskAnyReadmission cell that follows.
percentile_30 = df[column].quantile(0.30)
percentile_50 = df[column].quantile(0.50)
percentile_80 = df[column].quantile(0.80)
percentile_90 = df[column].quantile(0.90)  # not referenced elsewhere in this cell

# Create column1: a coarse advisory label derived from the risk score
def calculate_column1(score):
    """Map a risk score to 'low risk', 'neutral' or 'high risk'.

    The cutoffs are the module-level ``percentile_30`` and ``percentile_80``,
    looked up each time the function is called.

    * score below ``percentile_30``: always 'low risk' (no random draw).
    * score below ``percentile_80``: weighted draw, p = [0.45, 0.45, 0.1].
    * otherwise: weighted draw, p = [0.1, 0.6, 0.3].

    Probabilities are listed in the order low risk / neutral / high risk.
    """
    labels = ['low risk', 'neutral', 'high risk']
    if score < percentile_30:
        return labels[0]

    if score < percentile_80:
        label_weights = [0.45, 0.45, 0.1]
    else:
        label_weights = [0.1, 0.6, 0.3]
    return np.random.choice(labels, p=label_weights)

# Label every row from its 30-day readmission risk score.
df['advisory_30Day_Readmission'] = df[column].map(calculate_column1)

# Draw one survey answer for a score, using a weight row picked by score bucket
def calculate_advisory(score, percentile_low, percentile_high, weights):
    """Return one weighted random draw from the module-level ``options``.

    Parameters
    ----------
    score : number
        Risk score for the row.
    percentile_low, percentile_high : number
        Cutoffs separating the low / mid / high buckets.
    weights : sequence of three probability vectors
        ``weights[0]`` is used when ``score < percentile_low``, ``weights[1]``
        when ``score < percentile_high``, and ``weights[2]`` otherwise. Each
        vector is passed as ``p=`` to ``np.random.choice`` over ``options``.
    """
    if score < percentile_low:
        bucket = 0
    elif score < percentile_high:
        bucket = 1
    else:
        bucket = 2
    return np.random.choice(options, p=weights[bucket])

# Define weights for different questions and advisory columns
#
# Each table holds three probability vectors that calculate_advisory indexes as
# weights[0] / weights[1] / weights[2] (score below the low cutoff, below the
# high cutoff, or at/above it). Every vector is passed as `p=` to
# np.random.choice over `options`, so its six entries follow the order
# Very Poor, Poor, Neutral, Good, Excellent, pd.NA.
#
# NOTE(review): the apply calls in this cell only pass the weights_30_* tables;
# the weights_50_* tables are defined here but not used by this cell.
weights_30_effectiveness = [
    [0.05, 0.05, 0.10, 0.20, 0.30, 0.30],  # Low percentile
    [0.10, 0.10, 0.20, 0.30, 0.20, 0.10],  # Mid percentile
    [0.05, 0.05, 0.10, 0.20, 0.25, 0.35]   # High percentile
]

weights_30_impact = [
    [0.05, 0.05, 0.10, 0.20, 0.25, 0.35],  # Low percentile
    [0.10, 0.10, 0.20, 0.30, 0.20, 0.10],  # Mid percentile
    [0.05, 0.05, 0.10, 0.20, 0.30, 0.30]   # High percentile
]

weights_30_clarity = [
    [0.05, 0.05, 0.10, 0.20, 0.25, 0.35],  # Low percentile
    [0.10, 0.10, 0.20, 0.30, 0.20, 0.10],  # Mid percentile
    [0.05, 0.05, 0.10, 0.20, 0.25, 0.35]   # High percentile
]

weights_50_effectiveness = [
    [0.05, 0.05, 0.15, 0.25, 0.40, 0.10],   # Low percentile
    [0.15, 0.15, 0.25, 0.25, 0.15, 0.05],   # Mid percentile
    [0.05, 0.05, 0.15, 0.25, 0.40, 0.10]    # High percentile
]

weights_50_impact = [
    [0.05, 0.05, 0.15, 0.25, 0.40, 0.10],   # Low percentile
    [0.15, 0.15, 0.25, 0.25, 0.15, 0.05],   # Mid percentile
    [0.05, 0.05, 0.15, 0.25, 0.40, 0.10]    # High percentile
]

weights_50_clarity = [
    [0.05, 0.05, 0.15, 0.25, 0.40, 0.10],   # Low percentile
    [0.15, 0.15, 0.25, 0.25, 0.15, 0.05],   # Mid percentile
    [0.05, 0.05, 0.15, 0.25, 0.40, 0.10]    # High percentile
]

# Draw the three 30-day survey answers for every row. The extra positional
# arguments (low cutoff, high cutoff, weight table) are forwarded by
# Series.apply to calculate_advisory after the score itself.
df['advisory_30Day_effectiveness'] = df[column].apply(
    calculate_advisory,
    args=(percentile_30, percentile_80, weights_30_effectiveness),
)
df['advisory_30Day_impact'] = df[column].apply(
    calculate_advisory,
    args=(percentile_30, percentile_80, weights_30_impact),
)
df['advisory_30Day_clarity'] = df[column].apply(
    calculate_advisory,
    args=(percentile_30, percentile_80, weights_30_clarity),
)

# Create satisfaction columns: the draw is correlated with the other survey
# answers on the row and with the row's score bucket
def calculate_satisfaction(row, advisory_columns, percentiles):
    """Return one weighted random draw from ``options`` for a row.

    The probability vector is selected in two steps:

    1. Sentiment of the non-missing answers in ``advisory_columns``:
       at least two 'Very Poor'/'Poor' answers selects the unfavourable table;
       otherwise at least two 'Good'/'Excellent' answers selects the favourable
       table; otherwise the default table is used.
    2. Score bucket: ``row[column]`` is compared with ``percentiles[1]`` and
       then ``percentiles[2]``.

    NOTE(review): ``percentiles[0]`` is never read -- confirm that is intended.

    ``column`` and ``options`` are module-level names looked up at call time.
    """
    unfavourable_labels = ('Very Poor', 'Poor')
    favourable_labels = ('Good', 'Excellent')

    # Only answered (non-missing) questions take part in the counts.
    answered = [row[name] for name in advisory_columns if pd.notna(row[name])]
    unfavourable = sum(answer in unfavourable_labels for answer in answered)
    favourable = sum(answer in favourable_labels for answer in answered)

    # Each table is (low bucket, mid bucket, high bucket); entries follow the
    # order of `options`.
    if unfavourable >= 2:
        by_bucket = (
            [0.20, 0.15, 0.15, 0.05, 0.05, 0.40],
            [0.25, 0.20, 0.15, 0.05, 0.05, 0.30],
            [0.15, 0.15, 0.10, 0.05, 0.05, 0.50],
        )
    elif favourable >= 2:
        by_bucket = (
            [0.05, 0.05, 0.10, 0.15, 0.15, 0.50],
            [0.05, 0.10, 0.20, 0.20, 0.15, 0.30],
            [0.05, 0.05, 0.20, 0.15, 0.15, 0.40],
        )
    else:
        by_bucket = (
            [0.05, 0.05, 0.10, 0.15, 0.15, 0.50],
            [0.10, 0.15, 0.20, 0.15, 0.10, 0.30],
            [0.05, 0.05, 0.15, 0.15, 0.10, 0.50],
        )

    score = row[column]
    if score < percentiles[1]:
        chosen = by_bucket[0]
    elif score < percentiles[2]:
        chosen = by_bucket[1]
    else:
        chosen = by_bucket[2]
    return np.random.choice(options, p=chosen)

# Draw the 30-day satisfaction answer row by row. DataFrame.apply forwards the
# two extra positional arguments (question columns, cutoffs) after the row.
# NOTE(review): calculate_satisfaction only reads the 2nd and 3rd cutoff.
df['advisory_30Day_satisfaction'] = df.apply(
    calculate_satisfaction,
    axis=1,
    args=(
        ['advisory_30Day_effectiveness', 'advisory_30Day_impact', 'advisory_30Day_clarity'],
        [percentile_30, percentile_50, percentile_80],
    ),
)

# Blank every listed column on a row whose last listed column is missing
def ensure_na(row, advisory_columns):
    """Set all ``advisory_columns`` to ``pd.NA`` when the last one is missing.

    Parameters
    ----------
    row : mapping-like, e.g. a row Series
        Modified in place and also returned.
    advisory_columns : sequence of str
        Column labels; the last entry is the one tested with ``pd.isna``.

    Returns
    -------
    The same ``row`` object, unchanged if the last column holds a value.
    """
    trigger_column = advisory_columns[-1]
    if not pd.isna(row[trigger_column]):
        return row

    for name in advisory_columns:
        row[name] = pd.NA
    return row

# Blank all four 30-day survey columns on rows whose satisfaction answer is
# missing. This uses a boolean mask instead of a row-wise
# `df.apply(..., axis=1)`: the result in these columns is the same, but it
# avoids a Python-level pass over every full row of the frame just to touch
# four columns.
df.loc[
    df['advisory_30Day_satisfaction'].isna(),
    ['advisory_30Day_effectiveness', 'advisory_30Day_impact', 'advisory_30Day_clarity', 'advisory_30Day_satisfaction'],
] = pd.NA

In [25]:
import numpy as np

## This is the one used
# Score column that drives every synthetic "Any"/"any" advisory column in this cell.
column = 'RiskAnyReadmission'
# Answer scale drawn from by np.random.choice; pd.NA is one of the possible
# draws (presumably modelling a missing answer -- confirm).
options = ['Very Poor', 'Poor', 'Neutral', 'Good', 'Excellent', pd.NA]

# Seed NumPy's global RNG so the np.random.choice draws made by this cell are
# reproducible when the cell is re-run (previously every run produced a
# different synthetic data set). The value differs from the conventional 42 so
# this cell does not replay a stream seeded with 42 elsewhere.
np.random.seed(43)

# Calculate percentiles
# NOTE: these module-level names are read at call time by the functions below;
# this cell rebinds the names the Risk30DayReadmission cell defined.
percentile_30 = df[column].quantile(0.30)  # not referenced elsewhere in this cell
percentile_50 = df[column].quantile(0.50)
percentile_80 = df[column].quantile(0.80)
percentile_90 = df[column].quantile(0.90)

# Create column2: a coarse advisory label derived from the risk score
def calculate_column2(score):
    """Map a risk score to 'low risk', 'neutral' or 'high risk'.

    The cutoffs are the module-level ``percentile_50`` and ``percentile_90``,
    looked up each time the function is called.

    * score below ``percentile_50``: always 'low risk' (no random draw).
    * score below ``percentile_90``: weighted draw, p = [0.3, 0.6, 0.1].
    * otherwise: weighted draw, p = [0.1, 0.4, 0.5].

    Probabilities are listed in the order low risk / neutral / high risk.
    """
    labels = ['low risk', 'neutral', 'high risk']
    if score < percentile_50:
        return labels[0]

    if score < percentile_90:
        label_weights = [0.3, 0.6, 0.1]
    else:
        label_weights = [0.1, 0.4, 0.5]
    return np.random.choice(labels, p=label_weights)

# Label every row from its any-readmission risk score.
df['advisory_Any_Readmission'] = df[column].map(calculate_column2)

# Draw one survey answer for a score, using a weight row picked by score bucket
def calculate_advisory(score, percentile_low, percentile_high, weights):
    """Return one weighted random draw from the module-level ``options``.

    Parameters
    ----------
    score : number
        Risk score for the row.
    percentile_low, percentile_high : number
        Cutoffs separating the low / mid / high buckets.
    weights : sequence of three probability vectors
        ``weights[0]`` is used when ``score < percentile_low``, ``weights[1]``
        when ``score < percentile_high``, and ``weights[2]`` otherwise. Each
        vector is passed as ``p=`` to ``np.random.choice`` over ``options``.
    """
    if score < percentile_low:
        bucket = 0
    elif score < percentile_high:
        bucket = 1
    else:
        bucket = 2
    return np.random.choice(options, p=weights[bucket])

# Define weights for different questions and advisory columns
#
# Each table holds three probability vectors that calculate_advisory indexes as
# weights[0] / weights[1] / weights[2] (score below the low cutoff, below the
# high cutoff, or at/above it). Every vector is passed as `p=` to
# np.random.choice over `options`, so its six entries follow the order
# Very Poor, Poor, Neutral, Good, Excellent, pd.NA.
#
# NOTE(review): the apply calls in this cell only pass the weights_50_* tables;
# the weights_30_* tables are defined here but not used by this cell.
weights_30_effectiveness = [
    [0.05, 0.05, 0.10, 0.20, 0.30, 0.30],  # Low percentile
    [0.10, 0.10, 0.20, 0.30, 0.20, 0.10],  # Mid percentile
    [0.05, 0.05, 0.10, 0.20, 0.25, 0.35]   # High percentile
]

weights_30_impact = [
    [0.05, 0.05, 0.10, 0.20, 0.25, 0.35],  # Low percentile
    [0.10, 0.10, 0.20, 0.30, 0.20, 0.10],  # Mid percentile
    [0.05, 0.05, 0.10, 0.20, 0.30, 0.30]   # High percentile
]

weights_30_clarity = [
    [0.05, 0.05, 0.10, 0.20, 0.25, 0.35],  # Low percentile
    [0.10, 0.10, 0.20, 0.30, 0.20, 0.10],  # Mid percentile
    [0.05, 0.05, 0.10, 0.20, 0.25, 0.35]   # High percentile
]

weights_50_effectiveness = [
    [0.05, 0.05, 0.15, 0.25, 0.40, 0.10],   # Low percentile
    [0.15, 0.15, 0.25, 0.25, 0.15, 0.05],   # Mid percentile
    [0.05, 0.05, 0.15, 0.25, 0.40, 0.10]    # High percentile
]

weights_50_impact = [
    [0.05, 0.05, 0.15, 0.25, 0.40, 0.10],   # Low percentile
    [0.15, 0.15, 0.25, 0.25, 0.15, 0.05],   # Mid percentile
    [0.05, 0.05, 0.15, 0.25, 0.40, 0.10]    # High percentile
]

weights_50_clarity = [
    [0.05, 0.05, 0.15, 0.25, 0.40, 0.10],   # Low percentile
    [0.15, 0.15, 0.25, 0.25, 0.15, 0.05],   # Mid percentile
    [0.05, 0.05, 0.15, 0.25, 0.40, 0.10]    # High percentile
]

# Draw the three any-readmission survey answers for every row. The extra
# positional arguments (low cutoff, high cutoff, weight table) are forwarded by
# Series.apply to calculate_advisory after the score itself.
df['advisory_any_effectiveness'] = df[column].apply(
    calculate_advisory,
    args=(percentile_50, percentile_90, weights_50_effectiveness),
)
df['advisory_any_impact'] = df[column].apply(
    calculate_advisory,
    args=(percentile_50, percentile_90, weights_50_impact),
)
df['advisory_any_clarity'] = df[column].apply(
    calculate_advisory,
    args=(percentile_50, percentile_90, weights_50_clarity),
)

# Create satisfaction columns: the draw is correlated with the other survey
# answers on the row and with the row's score bucket
def calculate_satisfaction(row, advisory_columns, percentiles):
    """Return one weighted random draw from ``options`` for a row.

    The probability vector is selected in two steps:

    1. Sentiment of the non-missing answers in ``advisory_columns``:
       at least two 'Very Poor'/'Poor' answers selects the unfavourable table;
       otherwise at least two 'Good'/'Excellent' answers selects the favourable
       table; otherwise the default table is used.
    2. Score bucket: ``row[column]`` is compared with ``percentiles[1]`` and
       then ``percentiles[2]``.

    NOTE(review): ``percentiles[0]`` is never read -- confirm that is intended.

    ``column`` and ``options`` are module-level names looked up at call time.
    """
    unfavourable_labels = ('Very Poor', 'Poor')
    favourable_labels = ('Good', 'Excellent')

    # Only answered (non-missing) questions take part in the counts.
    answered = [row[name] for name in advisory_columns if pd.notna(row[name])]
    unfavourable = sum(answer in unfavourable_labels for answer in answered)
    favourable = sum(answer in favourable_labels for answer in answered)

    # Each table is (low bucket, mid bucket, high bucket); entries follow the
    # order of `options`.
    if unfavourable >= 2:
        by_bucket = (
            [0.20, 0.15, 0.15, 0.05, 0.05, 0.40],
            [0.25, 0.20, 0.15, 0.05, 0.05, 0.30],
            [0.15, 0.15, 0.10, 0.05, 0.05, 0.50],
        )
    elif favourable >= 2:
        by_bucket = (
            [0.05, 0.05, 0.10, 0.15, 0.15, 0.50],
            [0.05, 0.10, 0.20, 0.20, 0.15, 0.30],
            [0.05, 0.05, 0.20, 0.15, 0.15, 0.40],
        )
    else:
        by_bucket = (
            [0.05, 0.05, 0.10, 0.15, 0.15, 0.50],
            [0.10, 0.15, 0.20, 0.15, 0.10, 0.30],
            [0.05, 0.05, 0.15, 0.15, 0.10, 0.50],
        )

    score = row[column]
    if score < percentiles[1]:
        chosen = by_bucket[0]
    elif score < percentiles[2]:
        chosen = by_bucket[1]
    else:
        chosen = by_bucket[2]
    return np.random.choice(options, p=chosen)

# Draw the any-readmission satisfaction answer row by row. DataFrame.apply
# forwards the two extra positional arguments (question columns, cutoffs) after
# the row.
# NOTE(review): calculate_satisfaction only reads the 2nd and 3rd cutoff.
df['advisory_any_satisfaction'] = df.apply(
    calculate_satisfaction,
    axis=1,
    args=(
        ['advisory_any_effectiveness', 'advisory_any_impact', 'advisory_any_clarity'],
        [percentile_50, percentile_80, percentile_90],
    ),
)

# Blank every listed column on a row whose last listed column is missing
def ensure_na(row, advisory_columns):
    """Set all ``advisory_columns`` to ``pd.NA`` when the last one is missing.

    Parameters
    ----------
    row : mapping-like, e.g. a row Series
        Modified in place and also returned.
    advisory_columns : sequence of str
        Column labels; the last entry is the one tested with ``pd.isna``.

    Returns
    -------
    The same ``row`` object, unchanged if the last column holds a value.
    """
    trigger_column = advisory_columns[-1]
    if not pd.isna(row[trigger_column]):
        return row

    for name in advisory_columns:
        row[name] = pd.NA
    return row

# Blank all four any-readmission survey columns on rows whose satisfaction
# answer is missing. This uses a boolean mask instead of a row-wise
# `df.apply(..., axis=1)`: the result in these columns is the same, but it
# avoids a Python-level pass over every full row of the frame just to touch
# four columns.
df.loc[
    df['advisory_any_satisfaction'].isna(),
    ['advisory_any_effectiveness', 'advisory_any_impact', 'advisory_any_clarity', 'advisory_any_satisfaction'],
] = pd.NA

In [27]:
# Persist the augmented frame.
# NOTE(review): this is the same path the notebook loads from in its first
# cell, so the input file is replaced by the augmented data on every run.
df.to_parquet('diabetes-v2/data/predictions.parquet')

In [26]:
# Display the augmented frame; the recorded output below is the truncated repr
# (first and last rows, with elided columns).
df

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,medical_specialty,num_lab_procedures,...,advisory_30Day_Readmission,advisory_30Day_effectiveness,advisory_30Day_impact,advisory_30Day_clarity,advisory_30Day_satisfaction,advisory_Any_Readmission,advisory_any_effectiveness,advisory_any_impact,advisory_any_clarity,advisory_any_satisfaction
0,2278392,8222157,Caucasian,Female,[0-10),Unknown,Unknown,Referral,Other,41,...,low risk,Excellent,Poor,Excellent,Very Poor,low risk,,,,
1,149190,55629189,Caucasian,Female,[10-20),Emergency,Discharged to Home,Emergency,,59,...,neutral,Good,Excellent,Good,Neutral,low risk,,,,
2,64410,86047875,AfricanAmerican,Female,[20-50),Emergency,Discharged to Home,Emergency,,11,...,neutral,,,,,high risk,Very Poor,Very Poor,Neutral,Neutral
3,500364,82442376,Caucasian,Male,[20-50),Emergency,Discharged to Home,Emergency,,44,...,high risk,,,,,neutral,,,,
4,16680,42519267,Caucasian,Male,[20-50),Emergency,Discharged to Home,Emergency,,51,...,low risk,Neutral,Neutral,Very Poor,Good,low risk,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99335,443847548,100162476,AfricanAmerican,Male,70+,Emergency,Other,Emergency,,51,...,low risk,Excellent,Neutral,,Poor,low risk,,Neutral,Excellent,Very Poor
99336,443847782,74694222,AfricanAmerican,Female,70+,Emergency,Other,Transfer,,33,...,low risk,Very Poor,Very Poor,Good,Very Poor,low risk,Good,Good,Neutral,Good
99337,443854148,41088789,Caucasian,Male,70+,Emergency,Discharged to Home,Emergency,,53,...,low risk,,,,,low risk,Excellent,,,Good
99338,443857166,31693671,Caucasian,Female,70+,Emergency,Other,Emergency,Surgery-General,45,...,neutral,Excellent,Good,Neutral,Excellent,low risk,,,,


In [21]:
# Check the dtype of one generated advisory column.
# NOTE(review): this cell's execution count (21) is lower than the load cell's
# (22), so the recorded output predates the cells above -- re-run in order to
# confirm it still holds.
df['advisory_30Day_effectiveness'].dtypes

dtype('O')