# Data Cleaning
This notebook cleans and transforms the Crystal Report export 'UHMC Primary Surgery by Bariatric Surgeons' so that it matches the format expected by the REDCap database 'General Surgery Database'.

In [27]:
from datetime import datetime
from pathlib import Path

import pandas as pd

In [28]:
# Read the raw report export (CSV) from the current user's Downloads folder.
# Path.home() replaces the hard-coded /Users/<name>/ prefix, so the notebook
# runs unchanged for any user while resolving to the same file as before for
# the original author.
df = pd.read_csv(Path.home() / 'Downloads' / 'data.csv')

In [29]:
# Lookup tables: raw report value -> integer code written to the output.
# (The codes are presumably the REDCap choice codes -- confirm against the
# REDCap data dictionary.)

# Surgeon name as it appears in the report (upper case) -> surgeon code.
# Keys are looked up verbatim, so the irregular spacing around the commas
# (e.g. "BATES MD , ANDREW") must be kept exactly as written.
surgeon_map = {
    "AGARWALA DO, ASHISH": 10,
    "BATES MD , ANDREW": 6,
    "DOCIMO DO , SALVATORE": 3,
    "HUGHES MD, MELANY": 11,
    "LEE MD, EDMUND": 8,
    "POWERS MD, PHD, KINGA": 4,
    "PRYOR MD , AURORA": 1,
    "ROSENBLUTH MD, AMY": 5,
    "SCHNUR MD , JESSICA": 12,
    "SHMELEV MD, ARTEM": 9,
    "SPANIOLAS MD , KONSTANTINOS": 2,
    "TELEM , DANA": 7,
}

# Administrative sex label -> gender code.
gender_map = {
    "Male": 0,
    "Female": 1,
    "Other": 2,
}

In [30]:
# 1. Date of Surgery -> surgery_date
# Parse the raw column to datetimes, then render each value as YYYY-MM-DD text.
# NOTE(review): no explicit `format=` is passed, so pandas has to infer how to
# parse the raw values -- confirm the report's date layout and pin it if known.
df['surgery_date'] = (
    df['Date of Surgery']
    .pipe(pd.to_datetime)
    .dt.strftime('%Y-%m-%d')
)
print(df['surgery_date'].head())

  df['surgery_date'] = pd.to_datetime(df['Date of Surgery']).dt.strftime('%Y-%m-%d')


0    2015-01-02
1    2015-01-06
2    2015-01-06
3    2015-01-06
4    2015-01-06
Name: surgery_date, dtype: object


In [31]:
# 2. Surgeon -> surgeon
# Exact-match lookup in surgeon_map; any name without an entry (or a missing
# value) receives the placeholder code 999 so it is easy to spot in the counts.
df['surgeon'] = (
    df['Surgeon']
    .map(surgeon_map)
    .fillna(999)
    .astype(int)
)
print(df['surgeon'].value_counts())

surgeon
1     2439
2     1895
5     1181
4     1016
6      988
3      931
12     779
9      440
8      428
7      356
11     331
10       1
Name: count, dtype: int64


In [32]:
# 3. MRN -> mrn (8 digits, leading zeros)
# Convert to text first, then left-pad with zeros up to a width of 8.
# NOTE(review): assumes every row has an MRN -- a missing value would be turned
# into the text 'nan' by astype(str); verify against the raw data.
df['mrn'] = (
    df['MRN']
    .astype(str)
    .str.zfill(8)
)
print(df['mrn'].head())

0    30447708
1    30553527
2    00290031
3    00441107
4    00031673
Name: mrn, dtype: object


In [33]:
# 4. Patient Name -> first_name, last_name
# Split on the FIRST comma only: everything before it is the last name and
# everything after it is the first name. str.partition always yields three
# columns (before, separator, after), so:
#   * a name containing further commas keeps its full remainder in first_name
#     instead of silently dropping the text after the second comma;
#   * a name without any comma gets an empty first_name, with no need to
#     special-case the shape of the split result.
name_split = df['Patient Name'].str.partition(',')
df['last_name'] = name_split[0].str.strip().str.title()
df['first_name'] = name_split[2].str.strip().str.title()
print(df['first_name'].head())
print(df['last_name'].head())

0    Hrushikesh
1      Labertha
2      Lorraine
3       Shannon
4        Dawn M
Name: first_name, dtype: object
0         Bhatt
1    Harrington
2          Goot
3       Ziegler
4          Ryan
Name: last_name, dtype: object


In [34]:
# 5. ADMIN_SEX -> gender
# Translate the label through gender_map; labels that are missing or not in the
# map fall back to code 2, the same code gender_map assigns to 'Other'.
df['gender'] = (
    df['ADMIN_SEX']
    .map(gender_map)
    .fillna(2)
    .astype(int)
)
print(df['gender'].value_counts())

gender
1    6258
0    4520
2       7
Name: count, dtype: int64


In [35]:
# 6. CPT_1 to CPT_8 → cpt (comma-separated, skip blanks/NaN)
# Names of the eight source columns, in order: CPT_1 ... CPT_8.
cpt_cols = ['CPT_' + str(position) for position in range(1, 9)]

def clean_cpt(row):
    """Join the non-blank CPT codes of one record into a comma-separated string.

    Parameters
    ----------
    row : iterable
        The CPT values of a single record, in column order (a pandas Series
        when called through ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        The remaining codes joined by ',' in their original order, or ''
        when every value is missing or blank.
    """
    codes = []
    for value in row:
        if pd.isna(value):
            continue
        # A numeric column containing missing values is stored as float, so a
        # code such as 43235 arrives as 43235.0; render it without the '.0'.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        # Strip before keeping the value, so the stored code carries no
        # surrounding whitespace and whitespace-only cells are dropped.
        text = str(value).strip()
        if text:
            codes.append(text)
    return ','.join(codes)

# Run clean_cpt once per record (row-wise) over the CPT columns only.
df['cpt'] = df.loc[:, cpt_cols].apply(clean_cpt, axis='columns')
print(df['cpt'].head())

0                44970
1    44202,43235,49654
2          43281,43644
3                43644
4                43235
Name: cpt, dtype: object


In [36]:
# 7. Written Procedure -> written_proc
# Straight copy under the output column name; missing descriptions stay missing.
df['written_proc'] = df.loc[:, 'Written Procedure']
print(df['written_proc'].head())

0                           Laparoscopic appendectomy.
1    Laparoscopic small bowel resection and reanast...
2                                                  NaN
3    Laparoscopic revision of sleeve gastrectomy an...
4                                                  NaN
Name: written_proc, dtype: object


In [37]:
# 8. Select and order columns for output
# Only the derived columns are exported, in exactly this order.
output_cols = [
    'mrn',
    'last_name',
    'first_name',
    'gender',
    'surgery_date',
    'surgeon',
    'cpt',
    'written_proc'
]
df_cleaned = df[output_cols]
# Write next to the input file in the current user's Downloads folder.
# Path.home() replaces the hard-coded /Users/<name>/ prefix; for the original
# author it resolves to the same location as before.
# NOTE: the file contains patient identifiers (mrn, names) -- handle accordingly.
df_cleaned.to_csv(Path.home() / 'Downloads' / 'cleaned_data.csv', index=False)