# Data Cleaning & Preprocessing

## 1. Load Library

In [1]:
import pandas as pd

## 2. Load 2024 Dataset

In [None]:
# Relative path to the 2024 source CSV that the following cell loads with pd.read_csv.
file_name = '../final dataset/Final2024_df.csv'

In [None]:
# Load the CSV at `file_name` into the working DataFrame used by every later cell.
final_df = pd.read_csv(filepath_or_buffer=file_name)

## 3. Cleaning & Preprocessing

### 3.1 Apply filters to final df

In [3]:
print(f'shape of dataframe before filters {final_df.shape}')

# Apply filters
# Keep only reports whose source code is HP (Health Professional) or CSM (Consumer).
# na=False treats a missing rpsr_cod as "no match"; without it the NaN entries in the
# mask make boolean indexing raise. .copy() detaches the filtered rows so the column
# assignments below do not trigger SettingWithCopyWarning.
final_df = final_df[final_df['rpsr_cod'].str.contains('CSM|HP', na=False)].copy()
final_df['rpsr_cod'] = final_df['rpsr_cod'].str.strip()
final_df['pt'] = final_df['pt'].str.strip()

# Drop the 'Unnamed: 0' column if present (the name pandas gives an unnamed CSV column,
# typically a previously saved index). This must happen before de-duplication: while a
# per-row counter column is still in the frame, no two rows can compare equal.
final_df = final_df.drop(columns=['Unnamed: 0'], errors='ignore')
final_df = final_df.drop_duplicates()

print(f'shape of dataframe after filters {final_df.shape}')

shape of dataframe before filters (131, 51)
shape of dataframe after filters (121, 50)


### 3.2 Convert all ages to years

Convert ages recorded in days or months to years.

In [4]:
# Convert every sub-year age unit to years.
# Divisors are keyed by age_cod. MON and DY keep the factors used previously; WK and HR
# are the remaining sub-year unit codes, which would otherwise stay unconverted and be
# read as years by the later age-category step.
# NOTE(review): WK / HR / DEC are taken from the FAERS age_cod code list - confirm
# against the data dictionary for this extract. A code that never occurs is a no-op.
age_divisors = {'MON': 12, 'WK': 365 / 7, 'DY': 365, 'HR': 365 * 24}
for unit, divisor in age_divisors.items():
    in_unit = final_df['age_cod'] == unit
    final_df.loc[in_unit, 'age'] /= divisor
    # Relabel converted rows so the unit column matches the value and re-running is safe
    final_df.loc[in_unit, 'age_cod'] = 'YR'

# Decades are the only unit larger than a year, so they are scaled up rather than down
in_decades = final_df['age_cod'] == 'DEC'
final_df.loc[in_decades, 'age'] *= 10
final_df.loc[in_decades, 'age_cod'] = 'YR'

# Ensure the age values are rounded to 1 decimal place
final_df['age'] = final_df['age'].round(1)

Convert all weights from pounds (lbs) to kilograms (kg).

In [None]:
# Convert weights to kgs
# Divisors are keyed by wt_cod: 1 kg = 2.20462 lb and 1 kg = 1000 g.
# NOTE(review): 'GMS' and 'KG' are taken from the FAERS wt_cod code list - confirm
# against this extract. A code that never occurs is a no-op.
weight_divisors = {'LBS': 2.20462, 'GMS': 1000}
for unit, divisor in weight_divisors.items():
    in_unit = final_df['wt_cod'] == unit
    final_df.loc[in_unit, 'wt'] /= divisor
    # Relabel converted rows; otherwise re-running this cell divides the same rows again
    final_df.loc[in_unit, 'wt_cod'] = 'KG'

### 3.3 Drop NA and Duplicates

Drop rows with missing values (NAs) in the `age` column.

In [5]:
# Report the shape on either side of removing rows that have no age.
print(f'shape of dataframe {final_df.shape}')
# A list for `subset` is accepted by every pandas version; a bare string only by newer ones.
final_df = final_df.dropna(subset=['age'])
print(f'shape of dataframe after dropping NAs in age {final_df.shape}')

shape of dataframe (121, 50)
shape of dataframe after dropping NAs in age (95, 50)


Drop duplicates

In [6]:
# Report the shape on either side of removing fully identical rows.
shape_before = final_df.shape
print('shape of dataframe', shape_before)
final_df = final_df.drop_duplicates(keep='first')
print('shape of dataframe after dropping duplicates', final_df.shape)

shape of dataframe (95, 50)
shape of dataframe after dropping duplicates (95, 50)


### 3.4 Create and Add age categories
Create age categories based on age.

In [7]:
# Lower edge of each age band. With right=False every band is [lower, next lower),
# giving 0-17, 18-34, 35-54, 55-74 and 75-119 in completed years. The previous edges
# (17, 34, 54, 74) combined with right=False placed a 17-year-old in 'Young Adults'
# and shifted every other boundary age up a band in the same way.
# NOTE(review): assumes the intended bands are 0-17 / 18-34 / 35-54 / 55-74 / 75+ - confirm.
age_bins = [0, 18, 35, 55, 75, 120]
age_labels = ['Children & Adolescents', 'Young Adults', 'Adults', 'Seniors', 'Elderly']
final_df['age_category'] = pd.cut(final_df['age'], bins=age_bins, labels=age_labels, right=False)


### 3.5 Map and Add adverse events categories 

In [8]:
# Read the adverse-event category lookup table from disk.
pt_categories_path = '../final dataset/Drugs/pt categories.csv'
pt_categories = pd.read_csv(pt_categories_path)

Map each adverse event to a category.

In [None]:
# Build {category: [event, ...]} from the lookup table.
# Each 'Events' cell is a comma-separated string; every event is trimmed and lower-cased.
category_mapping = {
    category: [event.strip().lower() for event in events.split(',')]
    for category, events in zip(pt_categories['Category'], pt_categories['Events'])
}

# Define the categorisation function
def categorise_side_effect(side_effect, category_mapping):
    """Return the category whose event list contains ``side_effect``.

    Parameters
    ----------
    side_effect : str
        Event name to look up. It is trimmed and lower-cased before comparison.
        Any non-string value (for example NaN or None) is treated as unmatched.
    category_mapping : dict
        Maps a category name to a collection of event names. The comparison is an
        exact membership test, so the stored names need to be trimmed and
        lower-cased already.

    Returns
    -------
    The first category (in mapping order) that lists the event, or 'Other Event'
    when no category does.
    """
    # A missing value has no .strip(); return the fallback instead of raising.
    if not isinstance(side_effect, str):
        return 'Other Event'

    side_effect = side_effect.strip().lower()  # Clean and standardize the input
    for category, effects in category_mapping.items():
        if side_effect in effects:
            return category
    return 'Other Event'

# Label every row's 'pt' value with its category, passing the lookup as an extra argument.
final_df['pt_category'] = final_df['pt'].apply(categorise_side_effect, args=(category_mapping,))


## 4. Save the final dataframe

In [None]:
# save the dataframe
# index=False keeps the row index out of the file; a written index comes back as an
# 'Unnamed: 0' column the next time the CSV is read.
final_df.to_csv('../final dataset/Ventolin2024_df.csv', index=False)