In [16]:
import numpy as np
import pandas as pd

def generate_augmented_data(df, n_samples):
    # Initialize an empty dataframe to hold the generated data
    df_generated = pd.DataFrame(columns=df.columns)
    
    # Set the continuous and categorical columns
    continuous_cols = ['los', 'age', 'prior_appts_attended', 'prior_dnas', 'fu_time', 'quintile', 'ethnicgroup']
    categorical_cols = list(set(df.columns) - set(continuous_cols) - {'death', 'id'})
    
    # Calculate the death ratio
    death_ratio = df['death'].mean()
    
    # Get the starting point for the id column
    id_start = df['id'].max() + 1
    
    for i in range(n_samples):
        # Randomly decide whether the person is dead or not
        is_dead = int(np.random.rand() < death_ratio)  # Convert to 0/1
        
        # Select the rows corresponding to the chosen 'death' value
        relevant_rows = df[df['death'] == is_dead]
        
        # Initialize a dictionary to hold the new row data
        new_row = {'death': is_dead, 'id': id_start + i}  # Assign the id
        
        # Generate values for continuous columns
        for col in continuous_cols:
            new_row[col] = np.random.choice(relevant_rows[col])
        
        # Generate values for categorical columns
        for col in categorical_cols:
            # Calculate the probabilities for each value of the column
            value_counts = relevant_rows[col].value_counts(normalize=True)
            values = value_counts.index
            probabilities = value_counts.values
            
            # Generate a value for the column
            new_row[col] = np.random.choice(values, p=probabilities)
        
        # Append the new row to the dataframe
        df_generated = df_generated.append(new_row, ignore_index=True)
    
    return df_generated


In [17]:
data = pd.read_csv('data.csv')
df_augmented = generate_augmented_data(data, n_samples=6000)
df_augmented.to_csv('df_augmented.csv', index=False)




  df_generated = df_generated.append(new_row, ignore_index=True)
  df_generated = df_generated.append(new_row, ignore_index=True)
  df_generated = df_generated.append(new_row, ignore_index=True)
  df_generated = df_generated.append(new_row, ignore_index=True)
  df_generated = df_generated.append(new_row, ignore_index=True)
  df_generated = df_generated.append(new_row, ignore_index=True)
  df_generated = df_generated.append(new_row, ignore_index=True)
  df_generated = df_generated.append(new_row, ignore_index=True)
  df_generated = df_generated.append(new_row, ignore_index=True)
  df_generated = df_generated.append(new_row, ignore_index=True)
  df_generated = df_generated.append(new_row, ignore_index=True)
  df_generated = df_generated.append(new_row, ignore_index=True)
  df_generated = df_generated.append(new_row, ignore_index=True)
  df_generated = df_generated.append(new_row, ignore_index=True)
  df_generated = df_generated.append(new_row, ignore_index=True)
  df_generated = df_gener