In [None]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import os

import findspark

# Location of the Spark installation. SPARK_HOME, when set and non-empty, takes
# precedence so the notebook is not tied to one machine's directory layout;
# otherwise fall back to the path this notebook was originally written against.
findspark.init(os.environ.get('SPARK_HOME') or '/home/ubuntu/spark-3.2.1-bin-hadoop2.7')

# These imports are deliberately placed after findspark.init().
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('DBAS-Step2-Data Understanding').getOrCreate()

# Turn on the Arrow setting for Spark <-> pandas conversions.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# pandas-on-Spark API, available under the alias `pds`.
import pyspark.pandas as pds

print(pyspark.__version__)


In [None]:
## Read the raw CSV into a Spark DataFrame.
# header=True: take the column names from the first row.
# inferSchema=True: let Spark infer each column's type.
spk_df = spark.read.csv(
    "Data/heart_failure_dataset_raw.csv",
    header=True,
    inferSchema=True,
)
spk_df.printSchema()

# Wrap the Spark DataFrame as a pandas-on-Spark DataFrame so the pandas-style
# API can be used in the rest of the notebook.
spkpd_df = spk_df.to_pandas_on_spark()

# Last expression of the cell: rendered as the cell output.
spkpd_df.head()


In [None]:
# Summary statistics.
# describe() is computed once and reused below instead of being re-run on the
# same frame for the transposed view.
spkpd_df_desc = spkpd_df.describe()
print(spkpd_df_desc)
print("--------------------------------------------")

# Same statistics, one row per column, rounded to 2 decimals.
spkpd_df_desc_transposed_rounded = spkpd_df_desc.transpose().round(2)

print(spkpd_df_desc_transposed_rounded)
print("--------------------------------------------")

# Column data types.
print(spkpd_df.dtypes)

In [None]:
# Summary statistics for 'age' on its own.
# A bare expression is only rendered when it is the last statement of a cell,
# so this intermediate result is printed explicitly.
print(spkpd_df['age'].describe())
print("--------------------------------------------")

# Summary statistics for the continuous fields.
continuous_cols = ['age', 'creatinine_phosphokinase',
                   'ejection_fraction', 'platelets',
                   'serum_creatinine', 'serum_sodium', 'time']
spkpd_df_desc_con = spkpd_df[continuous_cols].describe()

# One row per field, rounded to 2 decimals (last expression: rendered as the cell output).
spkpd_df_desc_con.transpose().round(2)

In [None]:
# Cast the listed columns to bool, printing the dtypes before and after.
separator = "--------------------------------------------"
bool_columns = ['anaemia', 'high_blood_pressure', 'smoking',
                'diabetes', 'DEATH_EVENT', 'sex']

print(spkpd_df.dtypes)
print(separator)

spkpd_df = spkpd_df.astype({name: bool for name in bool_columns})

print(spkpd_df.dtypes)
print(separator)

# Value counts of each Boolean field, one Series printed per field.
for bool_col in ['DEATH_EVENT', 'sex', 'smoking', 'anaemia',
                 'high_blood_pressure', 'diabetes']:
    print(spkpd_df[bool_col].value_counts())

In [None]:
### ---2.3 Explore data
# Bar chart of the value counts of each listed field.
import matplotlib.pyplot as plt
from pyspark.pandas.config import set_option

# Select matplotlib as the pandas-on-Spark plotting backend.
set_option("plotting.backend", "matplotlib")

variables = ['DEATH_EVENT', 'smoking', 'anaemia', 'high_blood_pressure', 'diabetes']

for var in variables:
    # A fresh figure/axes pair per field so the charts do not overlap.
    fig, ax = plt.subplots()
    value_counts = spkpd_df[var].value_counts()
    value_counts.plot.bar(ax=ax, color='skyblue')
    ax.set(xlabel=var, ylabel='Count', title=f'Distribution of {var}')
    plt.show()
    


In [None]:
# Distribution of sex
#
# value_counts() orders its rows by frequency, not by value, while the tick
# labels below are assigned by bar position. Sorting by index first pins the
# bar order (False, then True), so the labels cannot end up on the wrong bar
# when the True class happens to be the more frequent one.

fig, ax = plt.subplots()
sex_counts = spkpd_df['sex'].value_counts().sort_index()
sex_counts.plot.bar(ax=ax)
ax.set_xlabel('Sex')
ax.set_ylabel('Count')
ax.set_title('Distribution of Sex')
# Position 0 is False/0 and position 1 is True/1 after sort_index().
# Assumes 0 = female and 1 = male, as the original positional labels imply --
# TODO confirm against the dataset's data dictionary.
plt.xticks(ticks=[0, 1], labels=['Female', 'Male'])
plt.show()

In [None]:
# Histograms (10 bins each) of the 7 continuous fields.

import seaborn as sns

variables = [
    'age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets',
    'serum_creatinine', 'serum_sodium', 'time',
]

# Materialise the pandas-on-Spark frame as a plain pandas DataFrame for seaborn.
pandas_df = spkpd_df.to_pandas()

for var in variables:
    display_name = var.capitalize()
    plt.figure()  # new figure per field so the histograms do not overlap
    sns.histplot(pandas_df[var], bins=10, edgecolor='black', color='skyblue')
    plt.xlabel(display_name)
    plt.ylabel('Count')
    plt.title(f'Distribution of {display_name}')
    plt.show()


In [None]:
### ---2.4 Verify the data quality
## Verify the class balance of DEATH_EVENT

# value_counts() orders its rows by frequency, but the tick labels and bar
# colours below are assigned by position. sort_index() pins the order to
# False, then True, so labels and colours stay on the right bars whichever
# class is more frequent.
death_event_counts = spkpd_df['DEATH_EVENT'].value_counts().sort_index()
total = len(spkpd_df)

# Plot the bar chart
ax = death_event_counts.plot(kind='bar', color=['skyblue', 'salmon'], figsize=(8, 6))

# Annotate each bar with its share of all rows and its count.
for p in ax.patches:
    # Bar heights may come back as floats; show the count as a whole number.
    count = int(round(p.get_height()))
    ax.text(p.get_x() + p.get_width() / 2.,
            count + 3,  # small vertical offset so the text sits above the bar
            '{:1.2f}% ({})'.format(100 * count / total, count),
            ha="center")

# Set title and labels.
# Position 0 is False and position 1 is True after sort_index().
# Assumes False = survived and True = died, as the original labels imply --
# TODO confirm against the dataset's data dictionary.
plt.xticks(ticks=[0, 1], labels=['Survival', 'Died'], rotation=0)
plt.title('Distribution of DEATH_EVENT')
plt.xlabel('DEATH_EVENT')
plt.ylabel('Count')
plt.tight_layout()

# Display the plot
plt.show()



In [None]:
## Check for outliers
# Finer-grained histograms (50 bins) of the listed fields to help spot outliers.

import seaborn as sns

fields = [
    'creatinine_phosphokinase',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
]

# Materialise the pandas-on-Spark frame as a plain pandas DataFrame for seaborn.
pandas_df = spkpd_df.to_pandas()

for field in fields:
    display_name = field.capitalize()
    plt.figure()  # new figure per field so the histograms do not overlap
    sns.histplot(pandas_df[field], bins=50)
    plt.xlabel(display_name)
    plt.ylabel('Count')
    plt.title(f'Distribution of {display_name}')
    plt.show()
    

In [None]:
# Stop the Spark session created in the first cell.
# Cells that use `spark` or `spkpd_df` should not be run after this one
# without first re-running the setup cell.
spark.stop()