## Importing necessary libraries

In [1]:
import pandas as pd
import re

## Loading the dataset and displaying the first few rows

In [2]:
df = pd.read_csv('../data/combined/doctors_combined_data.csv')
df.head()

Unnamed: 0,Doctor Name,Education,Speciality,Experience,Chamber,Location,Concentration
0,Asst. Prof. Dr. Nurun Nahar Mohua,"MBBS,D-CARD,PHD",Cardiologist,16.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"Bacterial Endocarditis,Cardiac Ablation,Cardia..."
1,Dr. Rezaul Alam Khan,"MBBS,CCD,MD (Medicine)",Medicine Specialist,23.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"Adolescent Medicine,Aesthetic Medicine,Cardiac..."
2,Dr. Ummy Salma Munni,"MSc,MPhil,PhD (Nutrition & Food Science),INFS ...",Nutritionist,14.0,BRB Hospitals Limited,Dhaka-1205,"Diabetes Management,Diet Management,Food Aller..."
3,Dr. Mostafa Kamal Rouf,"MBBS,BCS (Health),FCPS (Medicine),PGT (Neurology)",Medicine Specialist,23.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"Aesthetic Medicine,Alzheimer's Disease,Arthrit..."
4,Asst. Prof. Dr. Akhlak Ahmed,"MBBS (Dhaka),BCS (Health) FCPS (Medicine),MACP...",Medicine Specialist,19.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"Adolescent Medicine,Aesthetic Medicine,Cardiac..."


## Displaying DataFrame information

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6520 entries, 0 to 6519
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Doctor Name    6520 non-null   object 
 1   Education      6520 non-null   object 
 2   Speciality     6520 non-null   object 
 3   Experience     6341 non-null   float64
 4   Chamber        6520 non-null   object 
 5   Location       6341 non-null   object 
 6   Concentration  6520 non-null   object 
dtypes: float64(1), object(6)
memory usage: 356.7+ KB


## Data Cleaning & Processing

### Drop Duplicates rows & rows of the Missing Values

In [4]:
#Drop Duplicates rows & rows of the Missing Values
df.drop_duplicates(inplace=True)  # Remove duplicate rows
df.reset_index(drop=True, inplace=True)  # Reset the index after dropping duplicates
df.dropna(inplace=True)  # Remove rows with any missing values
df.reset_index(drop=True, inplace=True)  # Reset the index after dropping rows with missing
print(df.shape) # Display the shape of the DataFrame after processing


(6047, 7)


### Split the Education & Concentration Field

In [5]:
#Split the Education & Concentration Field
df['Education'] = df['Education'].str.split(',')
df["Concentration"] = df["Concentration"].str.split(',')
print("After Splitting 'Education' and 'Concentration':")
df.head()

After Splitting 'Education' and 'Concentration':


Unnamed: 0,Doctor Name,Education,Speciality,Experience,Chamber,Location,Concentration
0,Asst. Prof. Dr. Nurun Nahar Mohua,"[MBBS, D-CARD, PHD]",Cardiologist,16.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"[Bacterial Endocarditis, Cardiac Ablation, Car..."
1,Dr. Rezaul Alam Khan,"[MBBS, CCD, MD (Medicine)]",Medicine Specialist,23.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"[Adolescent Medicine, Aesthetic Medicine, Card..."
2,Dr. Ummy Salma Munni,"[MSc, MPhil, PhD (Nutrition & Food Science), I...",Nutritionist,14.0,BRB Hospitals Limited,Dhaka-1205,"[Diabetes Management, Diet Management, Food Al..."
3,Dr. Mostafa Kamal Rouf,"[MBBS, BCS (Health), FCPS (Medicine), PGT (Neu...",Medicine Specialist,23.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"[Aesthetic Medicine, Alzheimer's Disease, Arth..."
4,Asst. Prof. Dr. Akhlak Ahmed,"[MBBS (Dhaka), BCS (Health) FCPS (Medicine), M...",Medicine Specialist,19.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"[Adolescent Medicine, Aesthetic Medicine, Card..."


### Change the Case of Education & Concentration Field

In [6]:
#Change the Case of Education & Concentration field to Upper and Title respectively
df["Education"] = df["Education"].apply(lambda x: [item.upper() for item in x])
df["Concentration"] = df["Concentration"].apply(lambda x: [item.title() for item in x])
df.head()

Unnamed: 0,Doctor Name,Education,Speciality,Experience,Chamber,Location,Concentration
0,Asst. Prof. Dr. Nurun Nahar Mohua,"[MBBS, D-CARD, PHD]",Cardiologist,16.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"[Bacterial Endocarditis, Cardiac Ablation, Car..."
1,Dr. Rezaul Alam Khan,"[MBBS, CCD, MD (MEDICINE)]",Medicine Specialist,23.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"[Adolescent Medicine, Aesthetic Medicine, Card..."
2,Dr. Ummy Salma Munni,"[MSC, MPHIL, PHD (NUTRITION & FOOD SCIENCE), I...",Nutritionist,14.0,BRB Hospitals Limited,Dhaka-1205,"[Diabetes Management, Diet Management, Food Al..."
3,Dr. Mostafa Kamal Rouf,"[MBBS, BCS (HEALTH), FCPS (MEDICINE), PGT (NEU...",Medicine Specialist,23.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"[Aesthetic Medicine, Alzheimer'S Disease, Arth..."
4,Asst. Prof. Dr. Akhlak Ahmed,"[MBBS (DHAKA), BCS (HEALTH) FCPS (MEDICINE), M...",Medicine Specialist,19.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"[Adolescent Medicine, Aesthetic Medicine, Card..."


In [7]:
# Flatten the list of lists and then get unique values of the education field
education_list_flat = [item for sublist in df['Education'] for item in sublist]
print("Unique Education Entries after changing the case:")
print(f'No. of unique education: {len(pd.Series(education_list_flat).unique().tolist())}')
print(pd.Series(education_list_flat).unique().tolist())

print("--------------------------------------------------------")

#Flatten the list of lists and then get unique values of the concentration field
concentration_list_flat = [item for sublist in df['Concentration'] for item in sublist]
print("Unique Concentration Entries after changing the case:")
print(f'No. of unique concentration: {len(pd.Series(concentration_list_flat).unique().tolist())}')
print(pd.Series(concentration_list_flat).unique().tolist())

Unique Education Entries after changing the case:
No. of unique education: 2817
['MBBS', 'D-CARD', 'PHD', 'CCD', 'MD (MEDICINE)', 'MSC', 'MPHIL', 'PHD (NUTRITION & FOOD SCIENCE)', 'INFS (DU)', 'BCS (HEALTH)', 'FCPS (MEDICINE)', 'PGT (NEUROLOGY)', 'MBBS (DHAKA)', 'BCS (HEALTH) FCPS (MEDICINE)', 'MACP (USA)', 'MPH', 'MD (INTERNAL MEDICINE)', 'FACP (USA)', 'MD (RESPIRATORY MEDICINE)', 'MD (CARDIOLOGY)', 'MPH (DHAKA)', 'MS (OBS & GYNAE)', 'FCPS (GENERAL SURGERY)', 'MS (PLASTIC SURGERY)', 'MCPS', 'FELLOW IN RHEUMATOLOGY', 'FRCP (UK)', 'FCPS (OBGYN)', 'FCPS(OBS&GYNAE)', 'FRCP (GLASGOW)', 'FCPS (HEMATOLOGY)', 'MS (UROLOGY)', 'MSC (UK)', 'FCPS', 'DO', 'MBBS (DMC)', 'MD (NEUROLOGY)', 'BDS', 'FCPS (OMFS)', 'FHN (SURGERY)', 'FTMJ', 'DCH(GLASGOW)', 'MRCP(UK)', 'FCPS (PEDIATRICS)', 'FRCP', 'FRCPCH', 'MRCP (UK)', 'FRCP (LONDON)', 'MD (ENDOCRINOLOGY)', 'FCPS (CARDIOLOGY)', 'FCPS (SURGERY)', 'MCPS (MEDICINE)', 'MD (CHEST)', 'MD (GASTRO)', 'MACP', 'M.MED (UK)', 'BCS (HEALTH) FCPS (MEDICINE) FCPS (NEURO

### Remove Empty String and Entries starting with '(' on Education Field

In [8]:
# Remove empty strings and entries starting with '(' from 'Education' lists
df['Education'] = df['Education'].apply(lambda education_list: [edu for edu in education_list if edu != '' and not edu.startswith('(')])

# Flatten the list of lists and then get unique values
education_list_flat = [item for sublist in df['Education'] for item in sublist]
print("Unique Education Entries after removing entries starting with '(':")
print(f'No. of unique education: {len(pd.Series(education_list_flat).unique().tolist())}')
print(pd.Series(education_list_flat).unique().tolist())

Unique Education Entries after removing entries starting with '(':
No. of unique education: 2789
['MBBS', 'D-CARD', 'PHD', 'CCD', 'MD (MEDICINE)', 'MSC', 'MPHIL', 'PHD (NUTRITION & FOOD SCIENCE)', 'INFS (DU)', 'BCS (HEALTH)', 'FCPS (MEDICINE)', 'PGT (NEUROLOGY)', 'MBBS (DHAKA)', 'BCS (HEALTH) FCPS (MEDICINE)', 'MACP (USA)', 'MPH', 'MD (INTERNAL MEDICINE)', 'FACP (USA)', 'MD (RESPIRATORY MEDICINE)', 'MD (CARDIOLOGY)', 'MPH (DHAKA)', 'MS (OBS & GYNAE)', 'FCPS (GENERAL SURGERY)', 'MS (PLASTIC SURGERY)', 'MCPS', 'FELLOW IN RHEUMATOLOGY', 'FRCP (UK)', 'FCPS (OBGYN)', 'FCPS(OBS&GYNAE)', 'FRCP (GLASGOW)', 'FCPS (HEMATOLOGY)', 'MS (UROLOGY)', 'MSC (UK)', 'FCPS', 'DO', 'MBBS (DMC)', 'MD (NEUROLOGY)', 'BDS', 'FCPS (OMFS)', 'FHN (SURGERY)', 'FTMJ', 'DCH(GLASGOW)', 'MRCP(UK)', 'FCPS (PEDIATRICS)', 'FRCP', 'FRCPCH', 'MRCP (UK)', 'FRCP (LONDON)', 'MD (ENDOCRINOLOGY)', 'FCPS (CARDIOLOGY)', 'FCPS (SURGERY)', 'MCPS (MEDICINE)', 'MD (CHEST)', 'MD (GASTRO)', 'MACP', 'M.MED (UK)', 'BCS (HEALTH) FCPS (MEDI

### Standardizing Education Entries using Regular Expressions

In [9]:
#Substitute education using regular expression
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^AO-BASIC.*', 'AO-BASIC', edu) for edu in education_list])

df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^B.SC.*', 'BSC', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^BCS.*', 'BCS', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^BDS.*', 'BDS', edu) for edu in education_list])


df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^CCD.*', 'CCD', edu) for edu in education_list])

df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^DCP.*', 'DCP', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^DCH.*', 'DCH', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^DDV.*', 'DDV', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^DEM.*', 'DEM', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^D-CARD.*', 'D-CARD', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^D- CARD.*', 'D-CARD', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^DENT.*', 'DENT', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^DGO.*', 'DGO', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^DM.*', 'DM', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^D ORTHO.*', 'D-ORTHO', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^D - ORTHO.*', 'D-ORTHO', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^D-ORTHO.*', 'D-ORTHO', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^DLO.*', 'DLO', edu) for edu in education_list])


df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^FCPS.*', 'FCPS', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^FCCP.*', 'FCCP', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^FRCP.*', 'FRCP', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^FRCS.*', 'FRCS', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^FRCPCH.*', 'FRCPCH', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^FACP.*', 'FACP', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^FHN.*', 'FHN', edu) for edu in education_list])


df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^MBBS.*', 'MBBS', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^M.B.B.S.*', 'MBBS', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^MCPS.*', 'MCPS', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^MD.*', 'MD', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^MRCP.*', 'MRCP', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^MPH.*', 'MPH', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^MACP.*', 'MACP', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^M.MED.*', 'M.MED', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^MS.*', 'MS', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^M.S.*', 'MS', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^M.PHIL.*', 'M.PHIL', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^M. PHIL.*', 'M.PHIL', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^M.SC.*', 'M.SC', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^MACE.*', 'MACE', edu) for edu in education_list])

df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^PGT.*', 'PGT', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^PHD.*', 'PHD', edu) for edu in education_list])
df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^PGPN.*', 'PGPN', edu) for edu in education_list])

df['Education'] = df['Education'].apply(lambda education_list: [re.sub('^INFS.*', 'INFS', edu) for edu in education_list])


education_list = [item for sublist in df['Education'] for item in sublist]
edList = pd.Series(education_list).unique().tolist()

print("Unique Education Entries after cleaning:", len(edList))
education_list_flat = [item for sublist in df['Education'] for item in sublist]
print("Unique Education Entries after cleaning:", edList)

Unique Education Entries after cleaning: 1082
Unique Education Entries after cleaning: ['MBBS', 'D-CARD', 'PHD', 'CCD', 'MD', 'MS', 'MPH', 'INFS', 'BCS', 'FCPS', 'PGT', 'MACP', 'FACP', 'MCPS', 'FELLOW IN RHEUMATOLOGY', 'FRCP', 'DO', 'BDS', 'FHN', 'FTMJ', 'DCH', 'MRCP', 'M.MED', 'DTM', ' JAPAN)', 'FICS', 'M.PHIL', 'AO-BASIC', 'DDV', 'DEM', 'MACE', 'ADVANCED VATS TRAINING (INDIA)', 'FACC (USA)', 'DA (DMC)', 'DGO', 'FRCOG (LONDON)', 'MRCOG', 'FACE (AMERICA)', 'DM', 'FACS (USA)', 'DTCD', 'FCCP', 'MRCS', 'DLO', 'MACG (USA)', 'PGPN', 'D-ORTHO', 'DTCD (CHEST)', 'FELLOWSHIP IN IVA (DUBAI)', 'DIPLOMA IN CLINICAL RESEARCH (CANADA)', 'FELLOW COLORECTAL SURGERY (NUH', ' SINGAPORE)', 'FICO (LONDON)', 'UK', 'SINGAPORE', 'THAILAND & INDIA', 'CMU', 'FCCS (CCM-USA)', 'FMAS (INDIA)', 'FRCS', ' UK)', 'FICD', ' MALAYSIA)', 'FCGP (MEDICINE)', 'E.O.C. (GAINI & OBS)', 'DCM', 'FACS (AMERICA)', 'CMU (ULTRA)', 'FSCAI (USA)', 'FMD', 'FCCS (SG)', 'DNB (CARDIOLOGY)', 'DVD', 'FESC', 'MBBC', ' NICVD)', 'DO (DU)', 'D

### Adding a unique ID for each doctor

In [10]:
# Adding a unique ID for each doctor
df['Doctor ID'] = df.index + 1

# Reorder the columns to move 'Doctor ID' to the first position
cols = df.columns.tolist()
cols.insert(0, cols.pop(cols.index('Doctor ID')))  # Move 'Doctor ID' to the first column
df = df[cols]  # Reorder the DataFrame columns
print("Before Splitting 'Education' and 'Concentration':")
# print(df.head())
df.head()

Before Splitting 'Education' and 'Concentration':


Unnamed: 0,Doctor ID,Doctor Name,Education,Speciality,Experience,Chamber,Location,Concentration
0,1,Asst. Prof. Dr. Nurun Nahar Mohua,"[MBBS, D-CARD, PHD]",Cardiologist,16.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"[Bacterial Endocarditis, Cardiac Ablation, Car..."
1,2,Dr. Rezaul Alam Khan,"[MBBS, CCD, MD]",Medicine Specialist,23.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"[Adolescent Medicine, Aesthetic Medicine, Card..."
2,3,Dr. Ummy Salma Munni,"[MS, MPH, PHD, INFS]",Nutritionist,14.0,BRB Hospitals Limited,Dhaka-1205,"[Diabetes Management, Diet Management, Food Al..."
3,4,Dr. Mostafa Kamal Rouf,"[MBBS, BCS, FCPS, PGT]",Medicine Specialist,23.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"[Aesthetic Medicine, Alzheimer'S Disease, Arth..."
4,5,Asst. Prof. Dr. Akhlak Ahmed,"[MBBS, BCS, MACP]",Medicine Specialist,19.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"[Adolescent Medicine, Aesthetic Medicine, Card..."


### One-Hot Encoding Top Education and Concentration

In [11]:
# Count the occurrences of each degree across all lists
education_list = [item for sublist in df['Education'] for item in sublist]
top_10_education = pd.Series(education_list).value_counts().nlargest(10).index.tolist()

# Create new columns for the top 10 education degrees and one-hot encode them
for edu in top_10_education:
    df[edu] = df['Education'].apply(lambda x: 1 if edu in x else 0)

#Count the occurences of each concentration accrouss all lists
concentration_list = [item for sublist in df['Concentration'] for item in sublist]
top_10_concentration = pd.Series(concentration_list).value_counts().nlargest(10).index.tolist()

# Create new columns for the top 20 concentration and one-hot encode them
for conc in top_10_concentration:
    df[conc] = df['Concentration'].apply(lambda x: 1 if conc in x else 0)

print("DataFrame with one-hot encoded top 10 education degrees & Concentration:")
df.head()

DataFrame with one-hot encoded top 10 education degrees & Concentration:


Unnamed: 0,Doctor ID,Doctor Name,Education,Speciality,Experience,Chamber,Location,Concentration,MBBS,FCPS,...,Gynae Problems,Cardiac Medicine,General Medicine,Aesthetic Medicine,Adolescent Medicine,Infectious Diseases,Geriatric Medicine,Polycystic Ovary Syndrome (Pcos),Hormone Dirtubances,Health Checkup (Pediatric)
0,1,Asst. Prof. Dr. Nurun Nahar Mohua,"[MBBS, D-CARD, PHD]",Cardiologist,16.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"[Bacterial Endocarditis, Cardiac Ablation, Car...",1,0,...,0,1,0,0,0,0,0,0,0,0
1,2,Dr. Rezaul Alam Khan,"[MBBS, CCD, MD]",Medicine Specialist,23.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"[Adolescent Medicine, Aesthetic Medicine, Card...",1,0,...,0,1,1,1,1,0,1,0,0,0
2,3,Dr. Ummy Salma Munni,"[MS, MPH, PHD, INFS]",Nutritionist,14.0,BRB Hospitals Limited,Dhaka-1205,"[Diabetes Management, Diet Management, Food Al...",0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Dr. Mostafa Kamal Rouf,"[MBBS, BCS, FCPS, PGT]",Medicine Specialist,23.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"[Aesthetic Medicine, Alzheimer'S Disease, Arth...",1,1,...,0,0,1,1,1,1,1,0,0,0
4,5,Asst. Prof. Dr. Akhlak Ahmed,"[MBBS, BCS, MACP]",Medicine Specialist,19.0,Aalok Healthcare Ltd. | Mirpur 10,Dhaka-1216,"[Adolescent Medicine, Aesthetic Medicine, Card...",1,0,...,0,1,1,1,1,0,1,0,0,0


### Cleaning and processing Location field

In [12]:
df['Location'] = df['Location'].str.upper().str.strip()  # Normalize location strings

In [13]:
#see the unique locations
loc = df['Location'].tolist()
print(pd.Series(loc).unique().tolist())

['DHAKA-1216', 'DHAKA-1205', 'DHAKA-1217', 'DHAKA-1209', 'DHAKA-1230', 'DHAKA-1219', 'DHAKA-1204', 'BOGRA-5800', 'DHAKA-1207', 'DHAKA-1215', 'DHAKA-1212', 'CHITTAGONG-4000', 'CHITTAGONG-4337', 'CHITTAGONG-4203', 'CHITTAGONG-4212', 'BARISAL-8200', 'RAJSHAHI-6201', 'JESSORE-7400', "COX'S BAZAR-4700", 'DHAKA-1340', 'COMILLA-3500', 'DHAKA-1236', '1207', 'DHAKA-1360', 'DHAKA-1362', 'DHAKA-1229', 'DHAKA-1000', '1216', 'DHAKA-1213', 'DHAKA-1361', 'DHAKA-1206', 'DHAKA-1310', 'DHAKA-1450', 'DHAKA-1211', 'DHAKA-1100', 'RAJSHAHI-6000', 'CHITTAGONG-4202', 'NARAYANGANJ-1400', 'KUSHTIA-7032', 'DHAKA-1214', 'DHAKA-1711', 'NARAYANGANJ-1213', 'DHAKA-1840', 'DHAKA-1203', 'DHAKA-1430', 'GAZIPUR-1702']


In [14]:
df['Location'] = df['Location'].str.replace(r'^1207$', 'DHAKA-1207', regex=True)  # Change '1207' to 'DHAKA-1207'
df['Location'] = df['Location'].str.replace('^1216$', 'DHAKA-1216', regex= True)  # Change '1205' to 'DHAKA-1205'

In [15]:
#split the location get the city 
df['Location'] = df['Location'].str.split('-').str[0]  # Extract the city part before the hyphen
df['Location'] = df['Location'].str.title().str.strip()  # Normalize city strings

In [16]:
df['Location'] = df['Location'].str.replace("^Cox'S Bazar$", "Cox's Bazar", regex= True) # Change the location 'Cox'S Bazar' to 'Cox's Bazar'

### Cleaning the specialty column

In [17]:
df['Speciality'] = df['Speciality'].str.upper().str.strip()  # Normalize speciality strings
print("Unique Speciality Entries after normalization:")
print(pd.Series(df['Speciality']).unique().tolist())


Unique Speciality Entries after normalization:
['CARDIOLOGIST', 'MEDICINE SPECIALIST', 'NUTRITIONIST', 'INTERNAL MEDICINE', 'RESPIRATORY SPECIALIST', 'GYNECOLOGISTS', 'GENERAL SURGEON\nPLASTIC SURGEON', 'RHEUMATOLOGIST\nMEDICINE SPECIALIST', 'GYNECOLOGIST & OBSTETRICIAN', 'INTERNAL MEDICINE SPECIALIST', 'HEMATOLOGIST', 'UROLOGIST', 'DERMATOLOGIST', 'OPHTHALMOLOGIST', 'NEUROLOGIST', 'MAXILLOFACIAL AND DENTAL SURGEON', 'PEDIATRICIAN\nPEDIATRIC CARDIOLOGIST', 'ENDOCRINOLOGIST', 'GASTROENTEROLOGIST', 'PLASTIC SURGEON', 'OTOLARYNGOLOGISTS (ENT)', 'ONCOLOGIST', 'PEDIATRICIAN', 'ORTHOPEDIC SURGEON', 'GENERAL SURGEON', 'MEDICINE SPECIALIST\nENDOCRINOLOGIST', 'GENERAL SURGEON\nTHORACIC SURGEON', 'NEUROMEDICINE SPECIALIST', 'PSYCHIATRIST', 'NEPHROLOGIST', 'GASTROENTEROLOGIST\nMEDICINE SPECIALIST', 'INTERNAL MEDICINE SPECIALIST\nRESPIRATORY SPECIALIST', 'THORACIC SURGEON', 'CARDIAC SURGEON', 'ANESTHESIOLOGIST', 'GYNECOLOGIST & OBSTETRICIAN\nINFERTILITY SPECIALIST', 'GYNECOLOGIC ONCOLOGIST', 'RADI

In [18]:
#split the Speciality field
df['Speciality'] = df['Speciality'].str.split('\n')
#explode the Speciality field
df = df.explode('Speciality')

### Changing the experience year floating to integer

In [19]:
#change experience year floating to integer
df['Experience'] = df['Experience'].astype(int)  # Convert 'Experience Year' to integer type

## Exporting the Cleaned Data

In [20]:
df.to_csv('../data/cleaned/doctors_cleaned_data.csv', index=False)