In [121]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [137]:
# Read the survey CSV into `df`, the DataFrame that the analysis cells filter and plot.
path = 'survey_data_renamed.csv'
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Keep only the rows where 'stroke' is 1, i.e. respondents who have had a stroke.
df_stroke = df[df['stroke'] == 1]

# Count stroke cases for each sex code. The plotting cells in this notebook label
# DHH_SEX == 1 as 'Male' and DHH_SEX == 2 as 'Female', so the counts are pinned to that
# order explicitly. Counting by value (instead of indexing into np.unique's output)
# means a code with no rows yields 0 rather than shifting positions or raising IndexError.
sex_codes = [1, 2]
gender_dist = np.array(sex_codes)
gender_count = np.array([(df_stroke['DHH_SEX'] == code).sum() for code in sex_codes])

# Show the sex codes next to their stroke-case counts.
print(gender_dist, gender_count)

# These figures are each sex's share of the stroke group (cases of that sex / all stroke
# cases). They are NOT the stroke rate within each sex, so the wording says so explicitly.
stroke_total = gender_count.sum()
if stroke_total:
    print(f'{gender_count[0]/stroke_total*100:.2f} percent of stroke cases are men')
    print(f'{gender_count[1]/stroke_total*100:.2f} percent of stroke cases are women')
else:
    # Nothing to divide by: avoid printing nan percentages.
    print('No stroke cases with DHH_SEX equal to 1 or 2 were found')


In [None]:
# Bar chart of each sex's share of all stroke cases.
#
# The counts are derived from `df` in this cell instead of being read from `gender_count`:
# that name is reassigned further down the notebook to the NON-stroke counts, so relying
# on it would plot the wrong group whenever the cells are run out of order.
stroke_sex = df.loc[df['stroke'] == 1, 'DHH_SEX']
male_cases = int((stroke_sex == 1).sum())    # DHH_SEX 1 is labelled 'Male' below
female_cases = int((stroke_sex == 2).sum())  # DHH_SEX 2 is labelled 'Female' below

# Share of stroke cases per sex; fall back to 0.0 when there are no stroke cases at all.
total_count = male_cases + female_cases
percentage_male = (male_cases / total_count) * 100 if total_count else 0.0
percentage_female = (female_cases / total_count) * 100 if total_count else 0.0

# Create a bar chart
plt.figure(figsize=(8, 6))
plt.bar(['Male', 'Female'], [percentage_male, percentage_female], color=['blue', 'red'])
plt.title('Stroke Distribution by Sex')
plt.xlabel('Sex')
plt.ylabel('Percentage of Stroke Cases')
plt.ylim(0, 100)

# Write each percentage slightly above its bar (offset of 2 percentage points).
for position, pct in enumerate([percentage_male, percentage_female]):
    plt.text(position, pct + 2, f'{pct:.2f}%', ha='center', va='bottom', fontsize=12, fontweight='bold')

# Save the image as a PNG before showing it.
plt.savefig('stroke_distribution_by_sex_percent.png')

# Show the plot
plt.show()

In [None]:
# Keep only the rows where 'stroke' is 0, i.e. respondents who have not had a stroke.
# NOTE: this cell deliberately keeps the original variable names, so from here on
# `df_stroke`, `gender_dist` and `gender_count` describe the NON-stroke group.
df_stroke = df[df['stroke'] == 0]

# Count non-stroke respondents for each sex code (1 is plotted as 'Male' and 2 as 'Female'
# elsewhere in this notebook). Counting by value keeps the order fixed and gives 0 for a
# missing code instead of an IndexError from positional indexing.
sex_codes = [1, 2]
gender_dist = np.array(sex_codes)
gender_count = np.array([(df_stroke['DHH_SEX'] == code).sum() for code in sex_codes])

# Show the sex codes next to their non-stroke counts.
print(gender_dist, gender_count)

# These figures are each sex's share of the non-stroke group, not the share of each sex
# that is stroke-free. Formatted to two decimals to match the stroke-group printout.
no_stroke_total = gender_count.sum()
if no_stroke_total:
    print(f'{gender_count[0]/no_stroke_total*100:.2f} percent of respondents without a stroke are men')
    print(f'{gender_count[1]/no_stroke_total*100:.2f} percent of respondents without a stroke are women')
else:
    # Nothing to divide by: avoid printing nan percentages.
    print('No non-stroke respondents with DHH_SEX equal to 1 or 2 were found')

In [None]:
# Split the respondents by stroke status.
df_stroke_yes = df[df['stroke'] == 1]
df_stroke_no = df[df['stroke'] == 0]

# Row counts per DHH_SEX code (1 and 2) inside each stroke group.
gender_labels = [1, 2]
stroke_yes_counts = [int((df_stroke_yes['DHH_SEX'] == code).sum()) for code in gender_labels]
stroke_no_counts = [int((df_stroke_no['DHH_SEX'] == code).sum()) for code in gender_labels]

# One x position per sex; the two bars of a group sit half a bar width either side of it.
bar_width = 0.35
x = np.arange(len(gender_labels))
half_width = bar_width / 2

# Grouped bar chart: stroke vs. no-stroke counts for each sex.
plt.figure(figsize=(8, 6))
bars1 = plt.bar(x - half_width, stroke_yes_counts, bar_width, label='Stroke Yes')
bars2 = plt.bar(x + half_width, stroke_no_counts, bar_width, label='Stroke No')

# Axis labels, title, tick names and legend.
plt.xlabel('Sex')
plt.ylabel('Count')
plt.title('Distribution of Stroke Cases by Sex')
plt.xticks(x, ['Male', 'Female'])
plt.legend()

# Annotate every bar with its integer count, centred on top of the bar.
for bar in [*bars1, *bars2]:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, str(int(height)), ha='center', va='bottom')

# Write the figure to disk, then display it.
plt.savefig('stroke_distribution_by_dataset.png')
plt.show()


In [None]:
# Split the respondents by stroke status.
df_stroke_yes = df[df['stroke'] == 1]
df_stroke_no = df[df['stroke'] == 0]

# DHH_SEX codes in plotting order (1 -> 'Male', 2 -> 'Female'). These are the same codes
# the filters below use, so the list now drives the computation instead of being a
# placeholder that is only used for its length.
gender_labels = [1, 2]

# Denominators: all respondents of each sex, regardless of stroke status.
total_male = len(df[df['DHH_SEX'] == 1])
total_female = len(df[df['DHH_SEX'] == 2])
sex_totals = [total_male, total_female]

# Within-sex percentages: rows of that sex in the stroke / no-stroke group divided by all
# rows of that sex. An empty sex group reports 0.0 instead of raising ZeroDivisionError.
stroke_yes_percent = [
    int((df_stroke_yes['DHH_SEX'] == code).sum()) / total * 100 if total else 0.0
    for code, total in zip(gender_labels, sex_totals)
]
stroke_no_percent = [
    int((df_stroke_no['DHH_SEX'] == code).sum()) / total * 100 if total else 0.0
    for code, total in zip(gender_labels, sex_totals)
]

# Set the width of the bars
bar_width = 0.35

# One x position per sex; each group's two bars are offset half a bar width from it.
x = np.arange(len(gender_labels))

# Create grouped bar chart
plt.figure(figsize=(8, 6))
bars1 = plt.bar(x - bar_width/2, stroke_yes_percent, bar_width, label='Stroke Yes')
bars2 = plt.bar(x + bar_width/2, stroke_no_percent, bar_width, label='Stroke No')

# Add labels, title, and legend
plt.xlabel('Sex')
plt.ylabel('Percentage')
plt.title('Distribution of Stroke Cases by Sex (in Percentage)')
plt.xticks(x, ['Male', 'Female'])  # Set x-axis labels
plt.legend()

# Add percentage labels on top of the bars
for bar1, bar2 in zip(bars1, bars2):
    plt.text(bar1.get_x() + bar1.get_width()/2, bar1.get_height(), f'{bar1.get_height():.2f}%', ha='center', va='bottom')
    plt.text(bar2.get_x() + bar2.get_width()/2, bar2.get_height(), f'{bar2.get_height():.2f}%', ha='center', va='bottom')

# Save the image as a PNG before showing it.
plt.savefig('stroke_distribution_by_dataset_percent.png')

# Show the plot
plt.show()


In [138]:
# Names of the race columns analysed below; the later cells select rows with `df[name] == 1`.
race_set = [
    'White',
    'Chinese',
    'South Asian',
    'Black',
    'Filipino',
    'Latin America',
    'South-East Asian',
    'Arab',
    'West Asian',
    'Japanese',
    'Korean',
]

# Names of the ethnicity columns analysed below, used the same way as `race_set`.
ethnicity_set = [
    'Canadian',
    'French',
    'English',
    'German',
    'Scottish',
    'Irish',
    'Italian',
    'Ukrainian',
    'Dutch',
    'Chinese (Ethnic)',
    'Jewish',
    'Polish',
    'Portuguese',
    'South Asian (Ethnic)',
    'Norwegian',
    'Welsh',
    'Swedish',
    'Other',
    'Metis',
    'Inuit',
]

In [None]:
# Print, for every race column, the percentage of that group's rows that are stroke cases.
for rc in race_set:
    # Rows whose indicator column for the current race is set to 1.
    df_race = df[df[rc] == 1]

    # Count stroke cases by value. Indexing position 1 of np.unique's counts is only
    # correct when both 0 and 1 occur in the group: if every row were a stroke case the
    # counts array would have a single entry and the group would be reported as 0%
    # instead of 100%.
    n_stroke = int((df_race['stroke'] == 1).sum())

    # Stroke cases over all rows in the group, as a percentage rounded to two decimals.
    # With no stroke cases (which includes an empty group) report a plain 0 and skip the
    # division entirely.
    stroke_pct = round(n_stroke / len(df_race) * 100, 2) if n_stroke else 0
    print(f'{rc}:{stroke_pct} % have strokes')


In [None]:
# Parallel lists: race name and the stroke percentage within that race group.
# `race_names` is kept in `race_set` order (no sorting) so it stays aligned with any other
# per-race list built by iterating `race_set`.
race_names = []
stroke_percentages = []

for rc in race_set:
    # Rows whose indicator column for the current race is set to 1.
    df_race = df[df[rc] == 1.]

    # Count stroke cases by value rather than by position in np.unique's output, which
    # mislabels a group where only one stroke value occurs. This also removes the need
    # for a bare `except:` that would hide unrelated errors such as a missing column.
    n_stroke = int((df_race['stroke'] == 1).sum())

    # Percentage of the group's rows that are stroke cases; 0.0 when there are none
    # (which also covers an empty group, so no division by zero).
    stroke_percentage = round(n_stroke / len(df_race) * 100, 2) if n_stroke else 0.0

    race_names.append(rc)  # Append race name
    stroke_percentages.append(stroke_percentage)  # Append stroke percentage

# Horizontal bar chart of the stroke percentage per race.
plt.figure(figsize=(12, 6))
bars = plt.barh(race_names, stroke_percentages, color='lightcoral')
plt.xlabel('Stroke Percentage (%)')
plt.ylabel('Race')
plt.title('Stroke Percentage in Individual Race')
plt.gca().invert_yaxis()  # Put the first entry of race_set at the top (bars are not sorted by value)

# Write each percentage just past the right-hand end of its bar.
for bar, percentage in zip(bars, stroke_percentages):
    plt.text(bar.get_width(), bar.get_y() + bar.get_height()/2, f'{percentage}%', ha='left', va='center')

# Save the image as a PNG before showing it.
plt.savefig('stroke_distribution_in_race_percent.png')

plt.show()


In [None]:
# Total number of stroke rows in the dataset (the denominator for every bar).
total_strokes = len(df[df['stroke'] == 1])

race_percentages = []

# For each race column: stroke rows flagged with that race as a percentage of all stroke rows.
for rc in race_set:
    df_race = df[df[rc] == 1]
    strokes_in_race = len(df_race[df_race['stroke'] == 1])
    # Guard the division so a dataset without stroke rows plots zeros instead of crashing.
    percentage = (strokes_in_race / total_strokes) * 100 if total_strokes else 0.0
    race_percentages.append(percentage)

# Plot against `race_set` directly: it is the list iterated above, so labels and values
# are guaranteed to line up without depending on `race_names` built in another cell.
plt.figure(figsize=(12, 6))
plt.bar(race_set, race_percentages, color='skyblue')
plt.title('Percentage of Stroke Incidents by Race')
plt.xlabel('Race')
plt.ylabel('Percentage (%)')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better visibility

# Display the percentage values just above the bars
for i, percentage in enumerate(race_percentages):
    plt.text(i, percentage + 1, f'{percentage:.2f}%', ha='center', va='bottom', fontsize=10)

# Apply the layout adjustment BEFORE saving so the PNG gets the same spacing for the
# rotated tick labels as the figure shown in the notebook.
plt.tight_layout()
plt.savefig('stroke_distribution_by_race_percent.png')
plt.show()


In [None]:
# Print, for every ethnicity column, the percentage of that group's rows that are stroke cases.
for et in ethnicity_set:
    # Rows whose indicator column for the current ethnicity is set to 1.
    df_ethnicity = df[df[et] == 1.]

    # Count stroke cases by value. Indexing position 1 of np.unique's counts is only
    # correct when both 0 and 1 occur in the group, and the bare `except:` that covered
    # the failure case would also have hidden unrelated errors (e.g. a missing column).
    n_stroke = int((df_ethnicity['stroke'] == 1).sum())

    # Stroke cases over all rows in the group, as a percentage rounded to two decimals.
    # With no stroke cases (which includes an empty group) report a plain 0 and skip the
    # division entirely.
    stroke_pct = round(n_stroke / len(df_ethnicity) * 100, 2) if n_stroke else 0
    print(f'{et}:{stroke_pct} % have strokes')

In [None]:
# Parallel lists: ethnicity name and the stroke percentage within that ethnicity group.
# `ethnicity_names` is kept in `ethnicity_set` order (no sorting) so it stays aligned with
# any other per-ethnicity list built by iterating `ethnicity_set`.
ethnicity_names = []
stroke_percentages_ethnicity = []

for ec in ethnicity_set:
    # Rows whose indicator column for the current ethnicity is set to 1.
    df_ethnicity = df[df[ec] == 1.]

    # Count stroke cases by value rather than by position in np.unique's output, which
    # mislabels a group where only one stroke value occurs. This also removes the need
    # for a bare `except:` that would hide unrelated errors such as a missing column.
    n_stroke = int((df_ethnicity['stroke'] == 1).sum())

    # Percentage of the group's rows that are stroke cases; 0.0 when there are none
    # (which also covers an empty group, so no division by zero).
    stroke_percentage_ethnicity = round(n_stroke / len(df_ethnicity) * 100, 2) if n_stroke else 0.0

    ethnicity_names.append(ec)  # Append ethnicity name
    stroke_percentages_ethnicity.append(stroke_percentage_ethnicity)  # Append stroke percentage

# Horizontal bar chart of the stroke percentage per ethnicity.
plt.figure(figsize=(12, 6))
bars_ethnicity = plt.barh(ethnicity_names, stroke_percentages_ethnicity, color='lightcoral')
plt.xlabel('Stroke Percentage (%)')
plt.ylabel('Ethnicity')
plt.title('Stroke Percentage in Individual Ethnicity')
plt.gca().invert_yaxis()  # Put the first entry of ethnicity_set at the top (bars are not sorted by value)

# Write each percentage just past the right-hand end of its bar.
for bar_ethnicity, percentage_ethnicity in zip(bars_ethnicity, stroke_percentages_ethnicity):
    plt.text(bar_ethnicity.get_width(), bar_ethnicity.get_y() + bar_ethnicity.get_height()/2, f'{percentage_ethnicity}%', ha='left', va='center')

# Save the image as a PNG before showing it.
plt.savefig('stroke_distribution_in_ethnicity_percent.png')

plt.show()


In [None]:
# Total number of stroke rows in the dataset (the denominator for every bar).
total_strokes = len(df[df['stroke'] == 1])

ethnicity_percentages = []

# For each ethnicity column: stroke rows flagged with that ethnicity as a percentage of
# all stroke rows.
for eth in ethnicity_set:
    df_ethnicity = df[df[eth] == 1]
    strokes_in_ethnicity = len(df_ethnicity[df_ethnicity['stroke'] == 1])
    # Guard the division so a dataset without stroke rows plots zeros instead of crashing.
    percentage = (strokes_in_ethnicity / total_strokes) * 100 if total_strokes else 0.0
    ethnicity_percentages.append(percentage)

# Plot against `ethnicity_set` directly: it is the list iterated above, so labels and
# values are guaranteed to line up without depending on `ethnicity_names` built in
# another cell.
plt.figure(figsize=(12, 7))
plt.bar(ethnicity_set, ethnicity_percentages, color='skyblue')
plt.title('Percentage of Stroke Incidents by Ethnicity')
plt.xlabel('Ethnicity')
plt.ylabel('Percentage (%)')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better visibility

# Display the percentage values just above the bars
for i, percentage in enumerate(ethnicity_percentages):
    plt.text(i, percentage + 1, f'{percentage:.2f}%', ha='center', va='bottom', fontsize=10)

# Apply the layout adjustment BEFORE saving so the PNG gets the same spacing for the
# rotated tick labels as the figure shown in the notebook.
plt.tight_layout()
plt.savefig('stroke_distribution_by_ethnicity_percent.png')
plt.show()