In [None]:
# Mount Google Drive at /content/drive so the CSV files stored there can be read
# by the cells below (Colab-only: relies on the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=5271b4ad9ff8d1f39f5fca5cf7d1f1c4c3bf7cf1ec5e98797f9aee79e6012b91
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
# necessary imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count

In [None]:
# Create the SparkSession for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("HealthCareAnalysis")
    .getOrCreate()
)

In [None]:
# Load both CSV files from the mounted Drive as Spark DataFrames.
# Both reads share the same options: first row is the header, column types are inferred.
_csv_read_options = {"header": True, "inferSchema": True}
demographics = spark.read.csv('/content/drive/MyDrive/2017_census.csv', **_csv_read_options)
heart_disease = spark.read.csv('/content/drive/MyDrive/heart_2022_no_nans.csv', **_csv_read_options)

In [None]:
# Shape of each DataFrame: rows come from a Spark count() action,
# columns from the length of the column list.
demographics_rows, demographics_columns = demographics.count(), len(demographics.columns)
print('Rows for demographics: ', demographics_rows)
print('Columns for demographics: ', demographics_columns)
print('\n')

heartdisease_rows, heartdisease_columns = heart_disease.count(), len(heart_disease.columns)
print('Rows for Heart Disease data: ', heartdisease_rows)
print('Columns for Heart Disease data: ', heartdisease_columns)

Rows for demographics:  74001
Columns for demographics:  37


Rows for Heart Disease data:  246022
Columns for Heart Disease data:  40


In [None]:
# Print the column names, inferred types and nullability of the demographics data
demographics.printSchema()

root
 |-- TractId: long (nullable = true)
 |-- State: string (nullable = true)
 |-- County: string (nullable = true)
 |-- TotalPop: integer (nullable = true)
 |-- Men: integer (nullable = true)
 |-- Women: integer (nullable = true)
 |-- Hispanic: double (nullable = true)
 |-- White: double (nullable = true)
 |-- Black: double (nullable = true)
 |-- Native: double (nullable = true)
 |-- Asian: double (nullable = true)
 |-- Pacific: double (nullable = true)
 |-- VotingAgeCitizen: integer (nullable = true)
 |-- Income: double (nullable = true)
 |-- IncomeErr: double (nullable = true)
 |-- IncomePerCap: double (nullable = true)
 |-- IncomePerCapErr: double (nullable = true)
 |-- Poverty: double (nullable = true)
 |-- ChildPoverty: double (nullable = true)
 |-- Professional: double (nullable = true)
 |-- Service: double (nullable = true)
 |-- Office: double (nullable = true)
 |-- Construction: double (nullable = true)
 |-- Production: double (nullable = true)
 |-- Drive: double (nullable = 

In [None]:
# Print the column names, inferred types and nullability of the heart disease data
heart_disease.printSchema()

root
 |-- State: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- GeneralHealth: string (nullable = true)
 |-- PhysicalHealthDays: integer (nullable = true)
 |-- MentalHealthDays: integer (nullable = true)
 |-- LastCheckupTime: string (nullable = true)
 |-- PhysicalActivities: string (nullable = true)
 |-- SleepHours: integer (nullable = true)
 |-- RemovedTeeth: string (nullable = true)
 |-- HadHeartAttack: string (nullable = true)
 |-- HadAngina: string (nullable = true)
 |-- HadStroke: string (nullable = true)
 |-- HadAsthma: string (nullable = true)
 |-- HadSkinCancer: string (nullable = true)
 |-- HadCOPD: string (nullable = true)
 |-- HadDepressiveDisorder: string (nullable = true)
 |-- HadKidneyDisease: string (nullable = true)
 |-- HadArthritis: string (nullable = true)
 |-- HadDiabetes: string (nullable = true)
 |-- DeafOrHardOfHearing: string (nullable = true)
 |-- BlindOrVisionDifficulty: string (nullable = true)
 |-- DifficultyConcentrating: string (nullable 

In [None]:
# Expand the abbreviated demographics column names into full words.
# withColumnRenamed is applied once per (old name -> new name) pair, in this order.
column_renames = {
    "TotalPop": "TotalPopulation",
    "IncomeErr": "IncomeError",
    "IncomePerCap": "IncomePerCapita",
    "IncomePerCapErr": "IncomePerCapitaError",
    "OtherTransp": "OtherTransportation",
}
for short_name, full_name in column_renames.items():
    demographics = demographics.withColumnRenamed(short_name, full_name)

In [None]:
# Calculate the percentage of missing (null) values in each column of the demographics data.
# The heart-disease file is the "no_nans" export, so it is not checked here.

# Count the nulls of every column in one Spark job instead of one job per column.
# The counted expression must be a non-null marker (1): counting the column's own value
# inside the isNull branch counts only NULLs, which count() ignores, so it is always 0.
null_counts = demographics.select(
    [count(when(col(c).isNull(), 1)).alias(c) for c in demographics.columns]
).first().asDict()

# demographics_rows is the row count computed in the shape cell above; guard against an
# empty DataFrame so the division cannot fail.
missing_percentage = {
    name: (nulls / demographics_rows) * 100 if demographics_rows else 0.0
    for name, nulls in null_counts.items()
}

# print the missing value percentage for each column
print("Missing value percentage for each column:")
for name, percentage in missing_percentage.items():
    print(f"{name}: {percentage:.2f}%")

TypeError: 'str' object is not callable

In [None]:
# Fill missing values in the numeric demographics columns with the column mean.
# `demographics` is a Spark DataFrame, so the pandas-only calls (select_dtypes,
# isnull().any(), fillna(inplace=True)) do not exist on it; use the Spark API instead.

# Spark reports each column type as a short string in DataFrame.dtypes.
numeric_spark_types = {'tinyint', 'smallint', 'int', 'bigint', 'float', 'double'}
numerical_columns = [
    name for name, spark_type in demographics.dtypes
    if spark_type in numeric_spark_types
]
# NOTE(review): this also selects identifier-like numeric columns such as TractId;
# drop them from numerical_columns if a mean-imputed id is not wanted.

if numerical_columns:
    # One aggregation job computes every mean; avg() skips nulls.
    mean_row = demographics.selectExpr(
        *[f"avg(`{name}`) AS `{name}`" for name in numerical_columns]
    ).first()
    # A column that is entirely null has no mean; leave it untouched.
    mean_values = {
        name: mean_row[name]
        for name in numerical_columns
        if mean_row[name] is not None
    }
    if mean_values:
        # Spark DataFrames are immutable, so fillna returns a new DataFrame.
        # NOTE(review): the replacement is presumably cast to each column's type, so
        # integer columns would receive a truncated mean - confirm if that matters.
        demographics = demographics.fillna(mean_values)

### Insights:
1. High-risk groups: which demographic factors correlate with a higher prevalence of heart disease? Target: preventive healthcare initiatives for high-risk populations.
2. Identify socio-economic factors that might influence access to healthcare or healthy lifestyles, potentially leading to higher heart disease risk.
3. Investigate how factors like physical activity, sleep patterns, smoking status, and alcohol consumption relate to heart disease risk within different demographic groups.
4. Analyze if there are variations in risk factors based on geographic location (CensusTract data).
5. Use machine learning algorithms to develop a model that predicts heart disease risk based on a combination of demographic and health data. This model can be used for early detection and intervention.
6. Analyze healthcare resource utilization (hospitals, specialists) in areas with high heart disease risk. This can help optimize resource allocation and improve preventative care accessibility.
7. Conduct geospatial analysis to map the distribution of heart diseases and demographic characteristics at a regional or local level. Identify areas with higher disease burden and demographic disparities, which can inform targeted interventions and resource allocation.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Read the CSV file into a pandas DataFrame.
# Use the same mounted-Drive path the Spark load of this file uses; a bare relative
# file name only resolves if the CSV was separately copied into the working directory.
df = pd.read_csv('/content/drive/MyDrive/heart_2022_no_nans.csv')

In [None]:
# Preview the first rows of the pandas DataFrame
df.head()

In [None]:
# Summarise each column's dtype and non-null count (df.info() prints its own report)
print("\nColumn Data Types:")
df.info()

In [None]:
# Number of null entries in each column; the Series is shown as the cell's output
print("Missing values per column:")
df.isnull().sum()

In [None]:
# Function for setting style and size of plot
def set_size_style(width, height, style=None):
    """Open a new matplotlib figure of the given size, optionally with a seaborn style.

    Args:
        width: Figure width, passed to ``figsize``.
        height: Figure height, passed to ``figsize``.
        style: Optional seaborn style name (e.g. ``'whitegrid'``). When ``None``
            the current style is left unchanged.

    Returns:
        None. The new figure becomes matplotlib's current figure.
    """
    # Apply the style before creating the figure so that figure-level style settings
    # are in effect when the figure is built, not only for the axes drawn afterwards.
    if style is not None:
        sns.set_style(style)
    plt.figure(figsize=(width, height))

# Function for customizing the plot
def customize_plot(plot, title:str, xlabel:str,  ylabel:str, title_font:int, label_font:int):
    """Set a bold title and bold axis labels on ``plot``.

    Args:
        plot: Object exposing ``set_title``, ``set_xlabel`` and ``set_ylabel``.
        title: Text for the title.
        xlabel: Text for the x-axis label.
        ylabel: Text for the y-axis label.
        title_font: Font size used for the title.
        label_font: Font size used for both axis labels.
    """
    plot.set_title(title, fontsize=title_font, weight='bold')
    # Both axis labels share the same font size and weight.
    for apply_label, text in ((plot.set_xlabel, xlabel), (plot.set_ylabel, ylabel)):
        apply_label(text, fontsize=label_font, weight='bold')

In [None]:
# (rows, columns) of the pandas DataFrame
df.shape

In [None]:
# Convert categorical columns to 'category' type.
# The columns are derived from the data instead of a hard-coded list: the previous
# names (HeartDisease, Smoking, GenHealth, ...) do not match the schema printed for
# heart_2022_no_nans.csv above (HadHeartAttack, GeneralHealth, ...), so indexing by
# them raises KeyError.
# NOTE(review): assumes every text column in this file is a categorical answer - confirm.
categorical_columns = [
    column for column in df.columns
    if pd.api.types.is_string_dtype(df[column])
]

# Columns already converted are no longer string-typed, so re-running the cell is a no-op.
df = df.astype({column: 'category' for column in categorical_columns})

In [None]:
# Horizontal bar chart of the number of missing values per column of df,
# drawn on a new 8x10 figure with the 'whitegrid' seaborn style.
set_size_style(8,10,'whitegrid')
df.isna().sum().plot(kind='barh')