Set up and Environment

In [2]:
# Set up and Envirionment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Data Loading

In [3]:
df = pd.read_csv("CIS9650_Project 01_healthrate.ratemd.ny.csv")

  df = pd.read_csv("CIS9650_Project 01_healthrate.ratemd.ny.csv")


In [6]:

df.shape
# row, column

(143791, 1611)

In [7]:
df.columns.tolist()

['_id',
 'slug',
 'created',
 'modified',
 'id',
 'facet_url',
 'full_name',
 'full_name_specialty',
 'location.id',
 'location.category',
 'location.created',
 'location.modified',
 'location.longitude',
 'location.latitude',
 'location.slug',
 'location.city.id',
 'location.city.province_name',
 'location.city.province_slug',
 'location.city.country_name',
 'location.city.country_slug',
 'location.city.cover_images.253x83',
 'location.city.cover_images.autoxauto',
 'location.city.modified',
 'location.city.created',
 'location.city.deleted',
 'location.city.slug',
 'location.city.name',
 'location.city.cover_image',
 'location.city.province',
 'location.name',
 'location.address',
 'location.suite',
 'location.postal_code',
 'location.city_id',
 'location.phone_number',
 'location.website',
 'location.image',
 'location.images.autoxauto',
 'location.images.77x77',
 'location.images.100x100',
 'location.images.165x165',
 'location.images.70x70',
 'location.rating.cleanliness',
 'locat

Data Preparation (Cleaning the Data)

The raw data set from the CSV file contained 1,611 columns and 143,791 rows. That in itself is too much to analyze, and also contained fields such as images, phone numbers, etc... Field that would be irrelivant to our analysis, and research question. So this calls for some data cleaning!

* We selected a focused subset of 9 relevant variables that describe each doctorâ€™s name, specialty, city, and core rating metrics.

* We renamed the columns to simpler names. Column names were written such as "location.city.name", "rating.average" so we changed them to "city" and "rating_average"

* Rating fields were stored as mixed text/numeric types so we converted the rating columns to numeric using pd.to_numeric(df_clean[col], errors="coerce")

* We filtered out zero rating doctors to make sure we only analyze doctors who have actual patient reviews in order to produce valid insights

* After filtering, we reset the index for smooth readability

In [10]:
# ---------- DATA CLEANING ----------
# Select usable columns
cols_to_keep = [
    "_id", "full_name", "specialty_name", "location.city.name",
    "rating.average", "rating.helpfulness", "rating.punctuality",
    "rating.staff", "rating.count"
]

df_clean = df[cols_to_keep].copy()

# Rename columns to simpler names
df_clean = df_clean.rename(columns={
    "location.city.name": "city",
    "rating.average": "rating_avg",
    "rating.helpfulness": "rating_help",
    "rating.punctuality": "rating_punctuality",
    "rating.staff": "rating_staff",
    "rating.count": "rating_count"
})

# Convert numeric columns to proper numeric types
numeric_cols = [
    "rating_avg", "rating_help", "rating_punctuality",
    "rating_staff", "rating_count"
]

for col in numeric_cols:
    df_clean[col] = pd.to_numeric(df_clean[col], errors="coerce")

# Drop rows with no rating information
df_clean = df_clean.dropna(subset=["rating_avg"])

# Before having this the rating avg and rating_help was 0's
df_clean = df_clean[df_clean["rating_count"] > 0]
df_clean = df_clean[df_clean["rating_avg"] > 0]

# Reset index
df_clean = df_clean.reset_index(drop=True)

df_clean.head() # this is to see the clean data


Unnamed: 0,_id,full_name,specialty_name,city,rating_avg,rating_help,rating_punctuality,rating_staff,rating_count
0,647e89e399dca372f92a0d7c,Dr. Janan S. Sayyed,Chiropractor,New York City,4.989221,4.993367,4.985075,4.99005,603
1,647e89e499dca372f92a0d91,Dr. Shirin Peters,Internist / Geriatrician,New York City,5.0,5.0,5.0,5.0,48
2,647e89e499dca372f92a0e14,Dr. Lev L. Barats,Internist / Geriatrician,Slingerlands,4.0,3.8,4.1,3.8,10
3,647e89e499dca372f92a0ef0,Dr. Harold N. Bornstein,Internist / Geriatrician,New York City,4.0,4.2,3.8,4.2,10
4,647e89e499dca372f92a0ef2,Dr. Romelle J. Maloney,Gynecologist (OBGYN),New Rochelle,4.0,3.9,4.1,3.8,10


In [11]:
df.head(50) # this is unclean data

Unnamed: 0,_id,slug,created,modified,id,facet_url,full_name,full_name_specialty,location.id,location.category,...,doctor_location_hours[8].location.url,doctor_location_hours[0].location.geocode_address,doctor_location_hours[1].location.geocode_address,doctor_location_hours[2].location.geocode_address,doctor_location_hours[3].location.geocode_address,doctor_location_hours[4].location.geocode_address,doctor_location_hours[5].location.geocode_address,doctor_location_hours[6].location.geocode_address,doctor_location_hours[7].location.geocode_address,doctor_location_hours[8].location.geocode_address
0,647e89e399dca372f92a0d7c,3183031/Dr-Janan+S.-Sayyed-NEW+YORK-NY.html,2014-06-13T11:39:19.399618-04:00,2023-04-16T10:17:11.091386-04:00,870323,/best-doctors/?specialty=chiropractor,Dr. Janan S. Sayyed,Dr. Janan S. Sayyed,870221,clinic,...,,,,,,,,,,
1,647e89e499dca372f92a0d91,3345163/Dr-SHIRIN-PETERS-NEW+YORK-NY.html,2014-06-13T10:53:51.310523-04:00,2022-11-15T15:19:02.430196-05:00,819737,/best-doctors/?specialty=internist-geriatrician,Dr. Shirin Peters,Dr. Shirin Peters,819635,clinic,...,,,,,,,,,,
2,647e89e499dca372f92a0db9,3081147/Dr-THOMAS+J.-MULHERN-New+York-NY.html,2014-06-13T12:33:17.025883-04:00,2022-03-08T14:16:12.357691-05:00,939574,/best-doctors/?specialty=psychologist,Thomas J. Mulhern,Thomas J. Mulhern,939475,clinic,...,,,,,,,,,,
3,647e89e499dca372f92a0dba,3081144/Dr-ZINA-RUTKIN-GREAT+NECK-NY.html,2014-06-13T12:33:17.131851-04:00,2022-03-08T14:16:12.357691-05:00,939577,/best-doctors/?specialty=psychologist,Zina Rutkin,Zina Rutkin,939478,clinic,...,,,,,,,,,,
4,647e89e499dca372f92a0dbd,3081128/Dr-JOHN+S.-CAVALLARO-Brooklyn-NY.html,2014-06-13T12:33:17.670698-04:00,2022-03-08T14:16:12.357691-05:00,939591,/best-doctors/?specialty=dentist,Dr. John S. Cavallaro,Dr. John S. Cavallaro,939492,clinic,...,,,,,,,,,,
5,647e89e499dca372f92a0dc3,3081104/Dr-KEITH+P.-GUTZMANN-Howard+Beach-NY.html,2014-06-13T12:33:18.897055-04:00,2022-03-08T14:16:12.357691-05:00,939611,/best-doctors/?specialty=chiropractor,Dr. Keith P. Gutzmann,Dr. Keith P. Gutzmann,939512,clinic,...,,,,,,,,,,
6,647e89e499dca372f92a0de4,3080954/Dr-ARUNA-MISHRA-New+York-NY.html,2014-06-13T12:33:25.083951-04:00,2022-03-08T14:16:12.357691-05:00,939751,/best-doctors/?specialty=gynecologist-obgyn,Dr. Aruna Mishra,Dr. Aruna Mishra,939652,clinic,...,,,,,,,,,,
7,647e89e499dca372f92a0df1,3080889/Dr-JONATHAN+D.-PORTER-Syracuse-NY.html,2014-06-13T12:33:27.067186-04:00,2022-03-08T14:16:12.357691-05:00,939804,/best-doctors/?specialty=dentist,Dr. Jonathan D. Porter,Dr. Jonathan D. Porter,939705,clinic,...,,,,,,,,,,
8,647e89e499dca372f92a0dfe,3080843/Dr-ANDRZEJ-RIESS-New+York-NY.html,2014-06-13T12:33:29.173000-04:00,2022-03-08T14:16:12.357691-05:00,939847,/best-doctors/?specialty=gynecologist-obgyn,Dr. Andrzej Riess,Dr. Andrzej Riess,939748,clinic,...,,,,,,,,,,
9,647e89e499dca372f92a0e14,150786/Dr-Lev+L.-Barats-Slingerlands-NY.html,2014-06-14T13:55:24.008055-04:00,2022-03-08T14:16:12.357691-05:00,1968564,/best-doctors/?specialty=internist-geriatrician,Dr. Lev L. Barats,Dr. Lev L. Barats,1968465,clinic,...,,,,,,,,,,
