<a href="https://colab.research.google.com/github/cis9650group7-boop/Group-7_Project1_HealthRateAnalysis/blob/dev/term_project_group7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project 1: Analysis of Health Rate

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Author: CIS 9650 Group 7 (Anish Bijusoman, Ivana Sundararao, Qingrong Tan, Reem Hussein)
### Date: November 28th, 2025

## Executive Summary

## Table of Contents

1. Introduction
2. Problem Statement / Research Question
3. Data Description
4. Setup and Environment
5. Data Loading
6. Data Preparation
7. Model Planning
8. Model Building / Analysis
9. Discussion & Interpretation
10. Conclusion
11. References
12. Appendix

## Introduction

## Problem Statement / Research Question

## Data Description

## Setup and Environment

In [2]:
!pip install google-cloud-storage



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import json
import os
from google.cloud import storage
from io import BytesIO

## Data Loading

### Functions

In [None]:
from google.cloud import storage
import pandas as pd
from io import BytesIO
import os
import json
import requests
import re # Import regex module for parsing HTML

def list_public_bucket(bucket_name, course, project):
    """
    Print and return the blob names stored under a course/project prefix.

    The bucket is accessed through an anonymous client and the listing is
    restricted to the prefix "<course>/Project <project>/".

    Args:
        bucket_name: Name of the bucket to list.
        course: First component of the prefix.
        project: Value inserted after "Project " in the prefix.

    Returns:
        A list with the name of every blob returned by the listing, in the
        order the listing yields them.
    """
    folder_prefix = f"{course}/Project {project}/"

    anonymous_client = storage.Client.create_anonymous_client()
    blob_listing = anonymous_client.list_blobs(
        anonymous_client.bucket(bucket_name),
        prefix=folder_prefix,
    )

    print(f"Listing files in public bucket '{bucket_name}' under '{folder_prefix}':")

    # Echo each name as it arrives while collecting it for the caller.
    collected = []
    for entry in blob_listing:
        blob_name = entry.name
        print(blob_name)
        collected.append(blob_name)
    return collected

def gdrive_file_to_dataframe(file_id: str, file_type: str) -> pd.DataFrame:
    """
    Download a file from Google Drive by its ID and load it into a DataFrame.

    If Google Drive answers with its "Virus scan warning" HTML page instead of
    the file, the download form on that page is parsed and the request is
    repeated against the form's action URL with its hidden parameters.

    Args:
        file_id: Google Drive file ID.
        file_type: One of "csv", "xls", "xlsx", "parquet" or "json".

    Returns:
        The file contents as a pandas DataFrame. JSON lists become one row per
        element; JSON objects are flattened with ``pd.json_normalize``.

    Raises:
        ValueError: If ``file_type`` is not supported (raised before any
            network access), if the warning page has no download form, or if
            the JSON payload is neither a list nor an object.
        requests.HTTPError: If a download request returns a 4xx/5xx status.
    """
    # Local import: only needed to decode HTML entities on the warning page.
    import html

    # Fail fast so an unsupported type does not cost a full download first.
    if file_type not in ("csv", "xls", "xlsx", "parquet", "json"):
        raise ValueError(f"Unsupported file type: {file_type}")

    URL = "https://docs.google.com/uc?export=download"
    timeout_seconds = 60  # applied per connect/read so a stalled request cannot hang forever

    # The context manager releases the pooled connections when we are done.
    with requests.Session() as session:
        response = session.get(
            URL, params={'id': file_id}, stream=True, timeout=timeout_seconds
        )
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

        # Only decode the body as text when it can plausibly be the HTML
        # warning page; this avoids text-decoding large binary payloads.
        content_type = response.headers.get("Content-Type", "").lower()
        may_be_html = not content_type or "html" in content_type

        if may_be_html and "Google Drive - Virus scan warning" in response.text:
            print("Google Drive virus scan warning detected. Attempting to bypass...")
            # Extract the download action URL and hidden input parameters from the HTML form
            match = re.search(
                r'<form id="download-form" action="([^"]+)" method="get">',
                response.text,
            )
            if not match:
                raise ValueError("Could not find download form in virus warning page.")
            # Attribute values are HTML-escaped (e.g. "&amp;"); decode them
            # before using them as a URL / query parameters.
            download_action_url = html.unescape(match.group(1))

            hidden_inputs = re.findall(
                r'<input type="hidden" name="([^"]+)" value="([^"]*)">',
                response.text,
            )
            download_params = {
                name: html.unescape(value) for name, value in hidden_inputs
            }

            # Make the actual download request with the extracted parameters
            final_response = session.get(
                download_action_url,
                params=download_params,
                stream=True,
                timeout=timeout_seconds,
            )
            final_response.raise_for_status()
            payload = final_response.content
        else:
            # No virus warning, proceed with the initial response content
            payload = response.content

    data = BytesIO(payload)

    if file_type == "csv":
        return pd.read_csv(data, engine='python', on_bad_lines='warn')
    if file_type in ("xls", "xlsx"):
        return pd.read_excel(data)
    if file_type == "parquet":
        return pd.read_parquet(data)

    # Only "json" is left after the up-front validation.
    json_obj = json.loads(payload.decode("utf-8"))
    if isinstance(json_obj, list):
        return pd.DataFrame(json_obj)
    if isinstance(json_obj, dict):
        return pd.json_normalize(json_obj)
    raise ValueError("Unsupported JSON structure")

# Download the project dataset from Google Drive into `gdrive_df`.
# The file is assumed to be a CSV (based on the filename provided).
file_id_to_fetch = '1yJjVih68D_J4JON2LXMLGIh_FoeibwaG'
try:
    gdrive_df = gdrive_file_to_dataframe(file_id_to_fetch, 'csv')
    print("Successfully fetched and loaded data from Google Drive into 'gdrive_df'. Head of the DataFrame:")
    print(gdrive_df.head())
except Exception as exc:
    # Any failure (download or parsing) is reported instead of raised.
    print(f"Error fetching or loading data from Google Drive: {exc}")




## Data Preparation

In [None]:
# Work on a copy: later cells rename and drop columns on `df`, and a plain
# alias would silently apply the in-place renames to the raw `gdrive_df` too.
df = gdrive_df.copy()

print("Number of rows", len(df))
print("Number of columns", len(df.columns))

In [None]:
# Show the dtype of every column.
# Lift pandas' display limits first so the listing is not truncated.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

print("The datatypes available are:")
print(df.dtypes)

In [None]:
# Give all columns a uniform naming scheme: lower-case, no underscores.
pd.options.display.max_columns = None
df.columns = (
    df.columns
    .str.lower()
    .str.replace('_', '', regex=False)
)

list_cols = df.columns.tolist()
print("The columns are:")
print(list_cols)

These columns need not be changed: 'id', 'slug', 'created', 'modified', 'id', 'faceturl', 'fullname', 'fullnamespecialty'

# Method 1: Feature engineering

In [None]:
# Select usable columns (written with their raw, pre-normalisation names).
cols_to_keep = ["_id", "full_name", "specialty_name", "location.city.name","rating.average", "rating.helpfulness", "rating.punctuality","rating.staff", "rating.count"]


def _locate_column(frame, wanted):
    """Return the position of column `wanted` in `frame`.

    An exact label match wins. Otherwise the first column whose lower-cased,
    underscore-free label equals the normalised `wanted` is used, so the
    lookup works both before and after the column-normalisation cell has run
    (which turns e.g. "full_name" into "fullname"). Working with positions
    also keeps the selection unambiguous if normalisation produced duplicate
    labels.

    Raises:
        KeyError: If no column matches.
    """
    labels = list(frame.columns)
    if wanted in labels:
        return labels.index(wanted)
    target = wanted.lower().replace("_", "")
    for position, label in enumerate(labels):
        if str(label).lower().replace("_", "") == target:
            return position
    raise KeyError(f"Column {wanted!r} not found in the DataFrame")


df_clean = df.iloc[:, [_locate_column(df, name) for name in cols_to_keep]].copy()
# Restore the raw names so the result looks the same however `df` was labelled.
df_clean.columns = cols_to_keep

# Rename columns to simpler names
df_clean = df_clean.rename(columns={
    "location.city.name": "city",
    "rating.average": "rating_avg",
    "rating.helpfulness": "rating_help",
    "rating.punctuality": "rating_punctuality",
    "rating.staff": "rating_staff",
    "rating.count": "rating_count"
})

# Convert numeric columns to proper numeric types; unparseable values become NaN
numeric_cols = [
    "rating_avg", "rating_help", "rating_punctuality",
    "rating_staff", "rating_count"
]

for col in numeric_cols:
    df_clean[col] = pd.to_numeric(df_clean[col], errors="coerce")

# Drop rows with no rating information
df_clean = df_clean.dropna(subset=["rating_avg"])

# Keep only rows that actually received ratings; without this filter the
# unrated rows show up with rating_avg / rating_help equal to 0.
has_ratings = (df_clean["rating_count"] > 0) & (df_clean["rating_avg"] > 0)
df_clean = df_clean[has_ratings]

# Reset index
df_clean = df_clean.reset_index(drop=True)

df_clean.head() # this is to see the clean data

# Method 2: Feature engineering

Most columns share prefixes such as:

- `location.*`
- `images.*`
- `rating.*`
- `doctorlocations[n].*`
- `doctorlocationhours[n].*`

We can group these automatically.

In [None]:
import pandas as pd

def group_columns_by_prefix(df, sep='.'):
    """
    Bucket the column names of `df` by the text before the first `sep`.

    Args:
        df: Object exposing a `columns` iterable of string labels.
        sep: Separator that ends the prefix (default ".").

    Returns:
        A dict mapping each prefix to the list of full column names that
        start with it, preserving column order. A column that does not
        contain `sep` is its own prefix.
    """
    by_prefix = {}
    for column_name in df.columns:
        head, _, _ = column_name.partition(sep)
        if head in by_prefix:
            by_prefix[head].append(column_name)
        else:
            by_prefix[head] = [column_name]
    return by_prefix

# Summarise how many columns fall under each prefix.
groups = group_columns_by_prefix(df)
for g in groups:
    cols = groups[g]
    print(g, len(cols))


We will collapse the deeply nested, repeating blocks.

In [None]:
import re

# Matches "doctor_locations[<n>].<field>" and also "doctorlocations[<n>].<field>".
# The underscore is optional because the column-normalisation cell strips
# underscores from every label, after which the original spelling never matches.
pattern = re.compile(r"doctor_?locations\[(\d+)\]\.(.+)")

# grouped[<n>][<field>] -> full column name
grouped = {}

for col in df.columns:
    m = pattern.match(col)
    if m:
        idx = int(m.group(1))
        field = m.group(2)
        grouped.setdefault(idx, {})[field] = col

# Number of fields found for each location index.
print({k: len(v) for k, v in grouped.items()})


In [None]:
# Build the nested objects: copy every per-location column into its own frame.
# Both spellings of the prefix are accepted because the column-normalisation
# cell removes underscores ("doctor_locations[" -> "doctorlocations[").
dl_prefixes = ("doctor_locations[", "doctorlocations[")
dl_cols = [c for c in df.columns if c.startswith(dl_prefixes)]
doctor_df = df[dl_cols].copy()


In [None]:
# Drop the original wide columns. Both spellings of the prefix are matched
# because the column-normalisation cell strips underscores from the labels.
df = df.drop(
    columns=[
        c for c in df.columns
        if c.startswith(("doctor_locations[", "doctorlocations["))
    ]
)

Removing Unnecessary Metadata Columns

In [None]:
# Drop columns whose name contains created/modified/slug/deleted/image
# or ends in "url".
cols_to_drop = df.columns[
    df.columns.str.contains("created|modified|slug|deleted|image|url$", regex=True)
]
df = df.drop(cols_to_drop, axis=1)


## Model Planning

## Model Building / Analysis

## Discussion and Results

## Conclusion

## References

1. Barber, David. Bayesian Reasoning and Machine Learning. Cambridge University Press, 2012.
2. Aste, Tomaso, Paola Cerchiello, and Roberta Scaramozzino. "Information-Theoretic Causality Detection between Financial and Sentiment Data." Entropy, vol. 24, no. 6, 2022, pp. 1–18. DOI:10.3390/e24060774.
3. Metz, Cade. "Microsoft Puts OpenAI’s Sam Altman in Charge of New Advanced AI Research Team."
The New York Times, 20 Nov. 2023, www.nytimes.com/2023/11/20/technology/openai-microsoft-altman.html

## Appendix