In [12]:
import pandas as pd
import numpy as np

# Seed for reproducibility
np.random.seed(0)

# Parameters
num_records = 1000
age_min, age_max = 1, 100
num_symptoms = 5
symptoms = ["Fever", "Cough", "Fatigue", "Shortness of Breath", "Loss of Taste or Smell"]
diseases = ["None", "Flu", "COVID-19", "Cold", "Allergy"]
vaccination_status = ["Not Vaccinated", "Partially Vaccinated", "Fully Vaccinated"]

# Generating Data
data = {
    "Person_ID": np.arange(1, num_records + 1),
    "Age": np.random.randint(age_min, age_max + 1, size=num_records),
    "Gender": np.random.choice(["Male", "Female", "Other"], size=num_records),
    "Disease": np.random.choice(diseases, size=num_records),
    "Vaccination_Status": np.random.choice(vaccination_status, size=num_records)
}

# Adding symptoms as binary features (1 for present, 0 for absent)
for symptom in symptoms:
    data[symptom] = np.random.randint(0, 2, size=num_records)

# Creating DataFrame
df = pd.DataFrame(data)
df.to_csv("original_data.csv", index=False)





Unnamed: 0,Person_ID,Age,Gender,Disease,Vaccination_Status,Fever,Cough,Fatigue,Shortness of Breath,Loss of Taste or Smell
0,1,45,Male,,Not Vaccinated,0,0,1,1,0
1,2,48,Other,Allergy,Partially Vaccinated,0,1,0,1,1
2,3,65,Female,COVID-19,Fully Vaccinated,1,0,0,1,0
3,4,68,Other,,Not Vaccinated,0,0,1,0,1
4,5,68,Other,Flu,Partially Vaccinated,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
195,196,84,Male,Cold,Not Vaccinated,0,1,1,1,1
196,197,34,Other,,Fully Vaccinated,1,1,0,1,1
197,198,33,Male,COVID-19,Partially Vaccinated,1,0,1,0,1
198,199,71,Other,COVID-19,Not Vaccinated,1,1,1,0,1


In [18]:
df

Unnamed: 0,Person_ID,Age,Gender,Disease,Vaccination_Status,Fever,Cough,Fatigue,Shortness of Breath,Loss of Taste or Smell
0,1,45,Male,,Not Vaccinated,0,0,1,1,0
1,2,48,Other,Allergy,Partially Vaccinated,0,1,0,1,1
2,3,65,Female,COVID-19,Fully Vaccinated,1,0,0,1,0
3,4,68,Other,,Not Vaccinated,0,0,1,0,1
4,5,68,Other,Flu,Partially Vaccinated,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
995,996,27,Male,,Not Vaccinated,0,1,1,1,0
996,997,49,Female,Cold,Not Vaccinated,0,1,1,0,1
997,998,72,Female,Allergy,Fully Vaccinated,0,1,1,0,0
998,999,55,Other,Cold,Not Vaccinated,1,0,0,0,1


In [14]:
import pandas as pd

# Load the mapping rules from the Excel file
mapping_df = pd.read_excel('mapping.xlsx')

# Preparing the mapping structure
# Fill NaN in 'Field' to make it easier to fill down and identify mappings that span multiple rows
mapping_df['Field'] = mapping_df['Field'].fillna(method='ffill')
# For New Field Name, where NaN appears, it should inherit the original Field name (indicating no change)
mapping_df['New Field'] = mapping_df['New Field'].fillna(method='ffill')

mapping_df

Unnamed: 0,Field,Description,Type,Range/Values,New Field,New Range/Values
0,Person_ID,Unique identifier for each person.,Integer,1 to 1000,id,
1,Age,Age of the person.,Integer,1 to 100 years,age_in_years,
2,Gender,Gender identity of the person.,Categorical,Male,sex,M
3,Gender,,,Female,sex,F
4,Gender,,,Other,sex,O
5,Disease,Diagnosed disease of the person.,Categorical,,type_of_disease,Well
6,Disease,,,Flu,type_of_disease,Respiratory
7,Disease,,,COVID-19,type_of_disease,Respiratory
8,Disease,,,Cold,type_of_disease,Respiratory
9,Disease,,,Allergy,type_of_disease,Allergy


In [15]:
import pandas as pd

def apply_data_mapping(original_data_path, mapping_path):
    """
    Applies mapping rules defined in an Excel file to a dataset contained in a CSV file.

    Parameters:
    - original_data_path: The file path to the CSV file containing the original data.
    - mapping_path: The file path to the Excel file containing the mapping rules.

    Returns:
    - rearranged_df: A DataFrame with the original and transformed columns, as per the mapping rules.
    """
    
    # Load the mapping rules from the Excel file
    mapping_df = pd.read_excel(mapping_path)

    # Fill NaN in 'Field' to identify mappings that span multiple rows
    mapping_df['Field'] = mapping_df['Field'].fillna(method='ffill')
    # For New Field Name, where NaN appears, it should inherit the first New Field name
    mapping_df['New Field'] =  mapping_df['New Field'].fillna(method='ffill')

    # Prepare the structure for value mappings
    value_mappings = {}
    for index, row in mapping_df.iterrows():
        field, new_field = row['Field'], row['New Field']
        original_value, new_value = row['Range/Values'], row['New Range/Values']
        if new_field not in value_mappings:
            value_mappings[new_field] = {'original_field': field, 'value_mapping': {}}
        if pd.notna(original_value) and pd.notna(new_value):
            value_mappings[new_field]['value_mapping'][original_value] = new_value

    # Load the original data
    original_data_df = pd.read_csv(original_data_path, keep_default_na=False)

    # Apply the mappings
    transformed_data_df = original_data_df.copy()
    for new_field, mapping in value_mappings.items():
        original_field = mapping['original_field']
        if original_field in transformed_data_df.columns:
            if mapping['value_mapping']:
                transformed_data_df[new_field] = transformed_data_df[original_field].map(mapping['value_mapping']).fillna(transformed_data_df[original_field])
            elif original_field != new_field:
                transformed_data_df[new_field] = transformed_data_df[original_field]

    # Rearrange columns to show original fields next to their new fields
    column_order = []
    for new_field, mapping in value_mappings.items():
        original_field = mapping['original_field']
        if original_field in transformed_data_df.columns and original_field != new_field:
            column_order.extend([original_field, new_field])
    for col in original_data_df.columns:
        if col not in column_order:
            column_order.append(col)

    rearranged_df = transformed_data_df[column_order]
    return rearranged_df
rearranged_df = apply_data_mapping('original_data.csv', 'mapping.xlsx')

In [17]:

rearranged_df.to_excel("mapped_data.xlsx") 
rearranged_df

Unnamed: 0,Person_ID,id,Age,age_in_years,Gender,sex,Disease,type_of_disease,Vaccination_Status,vax_status,Fever,fever,Cough,cough,Fatigue,fatigue,Shortness of Breath,s_o_b,Loss of Taste or Smell,l_t_s
0,1,1,45,45,Male,M,,,Not Vaccinated,Not Vaccinated,0,Absent,0,Absent,1,Present,1,Present,0,Absent
1,2,2,48,48,Other,O,Allergy,Allergy,Partially Vaccinated,Not Vaccinated,0,Absent,1,Present,0,Absent,1,Present,1,Present
2,3,3,65,65,Female,F,COVID-19,Respiratory,Fully Vaccinated,Fully Vaccinated,1,Present,0,Absent,0,Absent,1,Present,0,Absent
3,4,4,68,68,Other,O,,,Not Vaccinated,Not Vaccinated,0,Absent,0,Absent,1,Present,0,Absent,1,Present
4,5,5,68,68,Other,O,Flu,Respiratory,Partially Vaccinated,Not Vaccinated,0,Absent,0,Absent,0,Absent,1,Present,0,Absent
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,996,27,27,Male,M,,,Not Vaccinated,Not Vaccinated,0,Absent,1,Present,1,Present,1,Present,0,Absent
996,997,997,49,49,Female,F,Cold,Respiratory,Not Vaccinated,Not Vaccinated,0,Absent,1,Present,1,Present,0,Absent,1,Present
997,998,998,72,72,Female,F,Allergy,Allergy,Fully Vaccinated,Fully Vaccinated,0,Absent,1,Present,1,Present,0,Absent,0,Absent
998,999,999,55,55,Other,O,Cold,Respiratory,Not Vaccinated,Not Vaccinated,1,Present,0,Absent,0,Absent,0,Absent,1,Present
