# Statistical Analysis Notebook

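This notebook performs a statistical analysis using Chi-Square and Mann-Whitney U tests. The dataset (`brfss2020.csv`) is preprocessed and imputed before the tests are performed. After cleaning, the notebook produces Q-Q plots and a correlation heatmap, then runs Chi-Square tests of association and Mann-Whitney U tests against the `Heart_Attack` variable.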

In [None]:
pip install seaborn

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as pl

In [None]:
df = pd.read_csv("brfss2020.csv")

In [None]:
df.shape

In [None]:
# Keep only the survey columns used in this analysis; the cells below rename
# most of them to readable names.
df_selected = df.loc[:, [
    'SEXVAR',
    'CHECKUP1',
    'EXERANY2',
    'CVDINFR4',
    'DIABETE4',
    'INSULIN1',
    'BLDSUGAR',
    'DOCTDIAB',
    'CHKHEMO3',
    'DIABEDU',
    '_RFBMI5',
    '_RFSMOK3',
]]
df_selected.head()

In [None]:
# Number of missing values in each column.
df_selected.isna().sum()

In [None]:
# Per column: True if the column contains at least one missing value, False otherwise.
df_selected.isna().any()

In [None]:
df_selected.count()

In [None]:
# Give SEXVAR a readable name, then show its distribution as a bar chart and as counts.
df_selected = df_selected.rename(columns={"SEXVAR": "Gender"})

sns.countplot(x="Gender", data=df_selected)
df_selected["Gender"].value_counts(sort=False)

In [None]:
df_selected = df_selected.rename(columns={"CHECKUP1": "Checkup"})

# Plot and tabulate the raw codes first, so the 7/9 responses that are nulled
# below are still visible in this cell's output.
sns.countplot(x="Checkup", data=df_selected)
checkup_counts_raw = df_selected["Checkup"].value_counts(sort=False)

# Values of 7 or 9 are equivalent to null for this feature.
df_selected.loc[df_selected["Checkup"].isin([7.0, 9.0]), "Checkup"] = np.nan

# Jupyter renders only a cell's final expression; the counts used to sit
# mid-cell and were silently discarded.
checkup_counts_raw



In [None]:
df_selected = df_selected.rename(columns={"EXERANY2": "Exercise"})

# Plot and tabulate the raw codes first, so the 7/9 responses that are nulled
# below are still visible in this cell's output.
sns.countplot(x="Exercise", data=df_selected)
exercise_counts_raw = df_selected["Exercise"].value_counts(sort=False)

# Values of 7 or 9 are equivalent to null for this feature.
df_selected.loc[df_selected["Exercise"].isin([7.0, 9.0]), "Exercise"] = np.nan

# Jupyter renders only a cell's final expression; the counts used to sit
# mid-cell and were silently discarded.
exercise_counts_raw


In [None]:
df_selected = df_selected.rename(columns={"CVDINFR4": "Heart_Attack"})

# Plot and tabulate the raw codes first, so the 7/9 responses that are nulled
# below are still visible in this cell's output.
sns.countplot(x="Heart_Attack", data=df_selected)
heart_attack_counts_raw = df_selected["Heart_Attack"].value_counts(sort=False)

# Codes 7 and 9 are replaced with NaN, as for the other features in this notebook.
df_selected.loc[df_selected["Heart_Attack"].isin([7.0, 9.0]), "Heart_Attack"] = np.nan

# Jupyter renders only a cell's final expression; the counts used to sit
# mid-cell and were silently discarded.
heart_attack_counts_raw


In [None]:
df_selected = df_selected.rename(columns={"DIABETE4": "Diabetes"})

# Values other than 1 are not part of our population, so keep only the rows
# coded 1. Filtering on "== 1" (instead of dropping codes 9, 7, 3, 4 and 2 one
# by one) also removes rows whose Diabetes value is missing or is any other
# unlisted code; those rows previously stayed in the frame and were later
# filled with the most frequent value by the imputer.
# .copy() gives an independent frame for the in-place edits in later cells.
df_selected = df_selected[df_selected["Diabetes"] == 1].copy()

sns.countplot(x="Diabetes", data=df_selected)
df_selected["Diabetes"].value_counts(sort=False)

In [None]:
df_selected = df_selected.rename(columns={"INSULIN1": "Insulin"})

# Plot and tabulate the raw codes first, so the 7/9 responses that are nulled
# below are still visible in this cell's output.
sns.countplot(x="Insulin", data=df_selected)
insulin_counts_raw = df_selected["Insulin"].value_counts(sort=False)

# Codes 7 and 9 are replaced with NaN, as for the other features in this notebook.
df_selected.loc[df_selected["Insulin"].isin([7.0, 9.0]), "Insulin"] = np.nan

# Jupyter renders only a cell's final expression; the counts used to sit
# mid-cell and were silently discarded.
insulin_counts_raw


In [None]:
df_selected = df_selected.rename(columns={"DIABEDU": "Managing_Diabetes"})

# Plot and tabulate the raw codes first, so the 7/9 responses that are nulled
# below are still visible in this cell's output.
sns.countplot(x="Managing_Diabetes", data=df_selected)
managing_diabetes_counts_raw = df_selected["Managing_Diabetes"].value_counts(sort=False)

# 7 and 9 are equivalent to null for this feature.
df_selected.loc[
    df_selected["Managing_Diabetes"].isin([7.0, 9.0]), "Managing_Diabetes"
] = np.nan

# Jupyter renders only a cell's final expression; the counts used to sit
# mid-cell and were silently discarded.
managing_diabetes_counts_raw


In [None]:
df_selected = df_selected.rename(columns={"_RFBMI5": "Overweight_or_Obese_Calculated"})

# Plot and tabulate the raw codes first, so the 9 responses that are nulled
# below are still visible in this cell's output.
sns.countplot(x="Overweight_or_Obese_Calculated", data=df_selected)
overweight_counts_raw = df_selected["Overweight_or_Obese_Calculated"].value_counts(sort=False)

# 9 is equivalent to null for this feature.
df_selected.loc[
    df_selected["Overweight_or_Obese_Calculated"] == 9.0,
    "Overweight_or_Obese_Calculated",
] = np.nan

# Jupyter renders only a cell's final expression; the counts used to sit
# mid-cell and were silently discarded.
overweight_counts_raw


In [None]:
df_selected = df_selected.rename(columns={"_RFSMOK3": "Current_Smokers"})

# Plot and tabulate the raw codes first, so the 9 responses that are nulled
# below are still visible in this cell's output.
sns.countplot(x="Current_Smokers", data=df_selected)
current_smokers_counts_raw = df_selected["Current_Smokers"].value_counts(sort=False)

# 9 is equivalent to null for this feature.
df_selected.loc[df_selected["Current_Smokers"] == 9.0, "Current_Smokers"] = np.nan

# Jupyter renders only a cell's final expression; the counts used to sit
# mid-cell and were silently discarded.
current_smokers_counts_raw


In [None]:
df_selected.info()
df_selected.head()

In [None]:
import pandas as pd

# process_csv() and every later cell address these three columns by their
# readable names, but no earlier cell renames them, so on a fresh kernel the
# exported file would still carry the raw survey names and process_csv() would
# fail with KeyError: 'Dly_Sugar_Check'. The mapping follows the column order
# shared by the raw selection and the later readable-name selection.
# rename() ignores keys that are absent, so this is a no-op if the columns
# have already been renamed.
df_selected = df_selected.rename(columns={
    "BLDSUGAR": "Dly_Sugar_Check",
    "DOCTDIAB": "Diabetes_Checkup",
    "CHKHEMO3": "Glycosylated_Hemoglobin",
})

# Persist the cleaned frame (without the index) for the CSV normalisation step.
df_selected.to_csv('cleanedm.csv', index=False)


In [None]:
import csv

# Hundreds digit of a coded Dly_Sugar_Check value -> length in days of the
# reporting period it denotes (1xx per day, 2xx per week, 3xx per month,
# 4xx per year).
_SUGAR_CHECK_PERIOD_DAYS = {1: 1, 2: 7, 3: 30, 4: 365}


def _parse_code(value):
    """Return ``value`` as a float, or ``None`` if it is blank or not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def _sugar_checks_per_day(value):
    """Convert one coded ``Dly_Sugar_Check`` cell into checks per day.

    The hundreds digit selects the reporting period and the remaining two
    digits are the number of checks in that period; 888 means "none".
    Returns the per-day rate as a string rounded to 5 decimal places.
    Blank, non-numeric, or otherwise unrecognised values are returned unchanged.
    """
    code = _parse_code(value)
    if code is None:
        return value
    if code == 888:
        return '0.0'
    period, count = divmod(code, 100)
    days = _SUGAR_CHECK_PERIOD_DAYS.get(period)
    if days is None:
        # NOTE(review): any other code (e.g. 777 or 999) is passed through
        # untouched, as before - confirm whether such codes should instead be
        # treated as missing.
        return value
    return str(round(count / days, 5))


def process_csv(input_file, output_file):
    """Normalise the coded frequency columns of ``input_file`` into ``output_file``.

    * ``Dly_Sugar_Check`` is converted from its period-coded form to a number
      of checks per day, so all values are on the same scale.
    * ``Diabetes_Checkup`` and ``Glycosylated_Hemoglobin`` use 88 for "none";
      that code is replaced with ``0.0``.

    All other columns are copied through unchanged. Raises ``KeyError`` if a
    data row lacks one of the three columns above.

    Fixes relative to the previous implementation:
    * The division by 7 / 30 / 365 was nested inside the "leading zero" check,
      so weekly, monthly and yearly codes with a two-digit count (e.g. 214)
      were written out as a raw count instead of a per-day rate.
    * The daily branch matched any value starting with "1", whatever its length.
    * The ``len == 5`` guards skipped codes written without a trailing ".0".
    * The header was read from the reader after its file had been closed.
    """
    # newline='' is what the csv module documents for files handed to it.
    with open(input_file, 'r', newline='') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        rows = list(csv_reader)
        # Capture the header while the file is still open; an empty input
        # yields no header at all.
        fieldnames = csv_reader.fieldnames or []

    for row in rows:
        row['Dly_Sugar_Check'] = _sugar_checks_per_day(row['Dly_Sugar_Check'])
        # 88 corresponds with "none", so store it as zero.
        for column in ('Diabetes_Checkup', 'Glycosylated_Hemoglobin'):
            if _parse_code(row[column]) == 88:
                row[column] = '0.0'

    with open(output_file, 'w', newline='') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(rows)

# Normalise the exported frame: read cleanedm.csv and write outputm.csv.
input_file, output_file = 'cleanedm.csv', 'outputm.csv'
process_csv(input_file=input_file, output_file=output_file)

In [None]:
df = pd.read_csv("outputm.csv")

In [None]:
df_selected = df[['Gender', 'Checkup', 'Exercise', 'Heart_Attack', 'Diabetes', 'Insulin', 'Dly_Sugar_Check', 'Diabetes_Checkup', 'Glycosylated_Hemoglobin', 'Managing_Diabetes', 'Overweight_or_Obese_Calculated', 'Current_Smokers']]
df_selected.head()

In [None]:
df_selected.isnull().sum()

In [None]:
df_selected.isnull().any()

In [None]:
df_selected.count()

In [None]:
pip install --upgrade scikit-learn

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Load the normalised data. The path is relative, matching where the earlier
# cells wrote and re-read outputm.csv; the previous absolute "/content/..."
# path only resolved when the working directory happened to be /content.
df = pd.read_csv("outputm.csv")
print(len(df.columns))

# Fill every missing value with its column's most frequent value
# (other available strategies: mean, median, constant).
imputer = SimpleImputer(strategy='most_frequent')

# fit_transform is assigned to df_new directly; the empty placeholder
# DataFrame that used to be created first was dead code.
df_new = imputer.fit_transform(df)

# df_imputed (not df) holds the imputed values, labelled with the original
# column names.
# NOTE(review): this assumes the imputer returns one output column per input
# column - confirm no column in outputm.csv is entirely empty.
df_imputed = pd.DataFrame(df_new, columns=df.columns)

In [None]:
df_imputed.isnull().any()

In [None]:
df_imputed.head()

In [None]:
df_imputed.count()

In [None]:
df_selected=df_imputed

In [None]:
df_selected.describe()

In [None]:
# Gender after imputation: bar chart plus the matching counts.
sns.countplot(x="Gender", data=df_selected)
df_selected["Gender"].value_counts(sort=False)

In [None]:

# Checkup after imputation: bar chart plus the matching counts.
sns.countplot(data=df_selected, x="Checkup")
df_selected["Checkup"].value_counts(sort=False)

In [None]:
# Exercise after imputation: bar chart plus the matching counts.
sns.countplot(data=df_selected, x="Exercise")
df_selected["Exercise"].value_counts(sort=False)

In [None]:
# Heart_Attack after imputation: bar chart plus the matching counts.
sns.countplot(data=df_selected, x="Heart_Attack")
df_selected["Heart_Attack"].value_counts(sort=False)

In [None]:
# Diabetes after imputation: bar chart plus the matching counts.
sns.countplot(data=df_selected, x="Diabetes")
df_selected["Diabetes"].value_counts(sort=False)

In [None]:
# Insulin after imputation: bar chart plus the matching counts.
sns.countplot(data=df_selected, x="Insulin")
df_selected["Insulin"].value_counts(sort=False)

In [None]:
# Managing_Diabetes after imputation: bar chart plus the matching counts.
sns.countplot(data=df_selected, x="Managing_Diabetes")
df_selected["Managing_Diabetes"].value_counts(sort=False)

In [None]:
# Overweight_or_Obese_Calculated after imputation: bar chart plus the matching counts.
# Both lines now read df_selected, like the sibling cells; the counts used to
# come from df_imputed, which matches the plot only while df_selected is an
# alias of it.
sns.countplot(x="Overweight_or_Obese_Calculated", data=df_selected)
df_selected["Overweight_or_Obese_Calculated"].value_counts(sort=False)


In [None]:
# Current_Smokers after imputation: bar chart plus the matching counts.
sns.countplot(data=df_selected, x="Current_Smokers")
df_selected["Current_Smokers"].value_counts(sort=False)

In [None]:
df_selected.count()

In [None]:
df_selected.info()
df_selected.head()

In [None]:
df_imputed.to_csv('data_cleanm.csv', index=False)

In [None]:
from matplotlib import pyplot as plt

In [None]:
pip install statsmodels

In [None]:
import statsmodels.api as sm
import pylab as py
from scipy.special import ndtri
import scipy.stats as stats

In [None]:
temmp = df_imputed

In [None]:
# Normal Q-Q plot of the standardised, pooled values of the two columns.
# NOTE(review): this flattens the Heart_Attack codes and the Diabetes_Checkup
# values into one sample before standardising, so the plot describes that
# mixture rather than either variable on its own - confirm this is the
# intended normality check.
combined_data = temmp[['Heart_Attack', 'Diabetes_Checkup']].to_numpy().ravel()
z = (combined_data - combined_data.mean()) / combined_data.std()

fig, ax = plt.subplots(figsize=(8, 8))
stats.probplot(z, dist="norm", plot=ax)
ax.set_title('Q-Q Plot: Heart Attack vs Diabetes Checkup')
plt.show()

In [None]:
# Normal Q-Q plot of the standardised, pooled values of the two columns.
# NOTE(review): this flattens the Heart_Attack codes and the
# Glycosylated_Hemoglobin values into one sample before standardising, so the
# plot describes that mixture rather than either variable on its own - confirm
# this is the intended normality check.
combined_data = temmp[['Heart_Attack', 'Glycosylated_Hemoglobin']].to_numpy().ravel()
z = (combined_data - combined_data.mean()) / combined_data.std()

fig, ax = plt.subplots(figsize=(8, 8))
stats.probplot(z, dist="norm", plot=ax)
ax.set_title('Q-Q Plot: Heart Attack vs Glycosylated Hemoglobin')
plt.show()

In [None]:
# Normal Q-Q plot of the standardised, pooled values of the two columns.
# NOTE(review): this flattens the Heart_Attack codes and the Dly_Sugar_Check
# values into one sample before standardising, so the plot describes that
# mixture rather than either variable on its own - confirm this is the
# intended normality check.
combined_data = temmp[['Heart_Attack', 'Dly_Sugar_Check']].to_numpy().ravel()
z = (combined_data - combined_data.mean()) / combined_data.std()

fig, ax = plt.subplots(figsize=(8, 8))
stats.probplot(z, dist="norm", plot=ax)
ax.set_title('Q-Q Plot: Heart Attack vs Daily Sugar Check')
plt.show()

In [None]:
# Correlation analysis: pairwise correlations between Heart_Attack and the
# other variables, drawn as an annotated heatmap.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

dep_var = 'Heart_Attack'
ind_vars = [
    'Gender',
    'Checkup',
    'Exercise',
    'Insulin',
    'Dly_Sugar_Check',
    'Diabetes_Checkup',
    'Glycosylated_Hemoglobin',
    'Overweight_or_Obese_Calculated',
    'Current_Smokers',
]

# Dependent variable first, so it occupies the first row and column of the matrix.
corr = df_imputed[[dep_var, *ind_vars]].corr()

plt.figure(figsize=(12, 9))

heatmap_options = dict(
    cmap='coolwarm',
    annot=True,              # write each coefficient inside its cell
    fmt=".2f",
    annot_kws={"size": 14},  # font size of the in-cell annotations
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
heatmap = sns.heatmap(corr, **heatmap_options)

# Tilt the x labels so they do not overlap; keep the y labels horizontal.
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)

heatmap.set_title('Correlation Heatmap with Heart_Attack')
plt.tight_layout()
plt.show()

In [None]:
# Chi-Square test between two categorical variables: Heart_Attack and Managing_Diabetes.
#
# Null Hypothesis (H0): There is no statistically significant association between
#   experiencing a heart attack and participation in classes for diabetes management.
# Alternative Hypothesis (H1): There is a statistically significant association between
#   experiencing a heart attack and participation in classes for diabetes management.
import scipy.stats

# Contingency table of observed counts, then the test on that table.
cont = pd.crosstab(index=temmp["Heart_Attack"], columns=temmp["Managing_Diabetes"])
chi2_stat, p_value, dof, expected = scipy.stats.chi2_contingency(cont)

print(
    f"Chi-Square Statistic: {chi2_stat}",
    f"P-Value: {p_value}",
    f"Degrees of Freedom: {dof}",
    sep="\n",
)

# Decision at the 0.05 significance level.
print(
    "Conclusion: As p-value is greater than 0.05, we fail to reject the null hypothesis."
    if p_value > 0.05
    else "Conclusion: As p-value is less than or equal to 0.05, we reject the null hypothesis."
)


In [None]:
# Chi-Square test between two categorical variables: Heart_Attack and Exercise.
#
# Null Hypothesis (H0): There is no statistically significant association between
#   experiencing a heart attack and engaging in exercise.
# Alternative Hypothesis (H1): There is a statistically significant association between
#   experiencing a heart attack and engaging in exercise.

# Contingency table of observed counts, then the test on that table.
cont = pd.crosstab(index=temmp["Heart_Attack"], columns=temmp["Exercise"])
chi2_stat, p_value, dof, expected = scipy.stats.chi2_contingency(cont)

print(
    f"Chi-Square Statistic: {chi2_stat}",
    f"P-Value: {p_value}",
    f"Degrees of Freedom: {dof}",
    sep="\n",
)

# Decision at the 0.05 significance level.
print(
    "Conclusion: As p-value is greater than 0.05, we fail to reject the null hypothesis."
    if p_value > 0.05
    else "Conclusion: As p-value is less than or equal to 0.05, we reject the null hypothesis."
)

# Result noted by the author: the p-value is < 0.05, hence we reject the null hypothesis.
# There is a statistically significant association between experiencing a heart attack
# and engaging in exercise.

In [None]:
# Chi-Square test between two categorical variables: Heart_Attack and Insulin.
#
# Null Hypothesis (H0): There is no statistically significant association between
#   experiencing a heart attack and the use of insulin.
# Alternative Hypothesis (H1): There is a statistically significant association between
#   experiencing a heart attack and the use of insulin.

# Contingency table of observed counts, then the test on that table.
cont = pd.crosstab(index=temmp["Heart_Attack"], columns=temmp["Insulin"])
chi2_stat, p_value, dof, expected = scipy.stats.chi2_contingency(cont)

print(
    f"Chi-Square Statistic: {chi2_stat}",
    f"P-Value: {p_value}",
    f"Degrees of Freedom: {dof}",
    sep="\n",
)

# Decision at the 0.05 significance level.
print(
    "Conclusion: As p-value is greater than 0.05, we fail to reject the null hypothesis."
    if p_value > 0.05
    else "Conclusion: As p-value is less than or equal to 0.05, we reject the null hypothesis."
)

# Result noted by the author: the p-value is less than 0.05, so we reject the null
# hypothesis. Hence, there is a statistically significant association between
# experiencing a heart attack and the use of insulin.

In [None]:
# Chi-Square test between two categorical variables: Heart_Attack and
# Overweight_or_Obese_Calculated.
#
# Null Hypothesis (H0): There is no statistically significant association between
#   experiencing a heart attack and being overweight.
# Alternative Hypothesis (H1): There is a statistically significant association between
#   experiencing a heart attack and being overweight.

# Contingency table of observed counts, then the test on that table.
cont = pd.crosstab(
    index=temmp["Heart_Attack"],
    columns=temmp["Overweight_or_Obese_Calculated"],
)
chi2_stat, p_value, dof, expected = scipy.stats.chi2_contingency(cont)

print(
    f"Chi-Square Statistic: {chi2_stat}",
    f"P-Value: {p_value}",
    f"Degrees of Freedom: {dof}",
    sep="\n",
)

# Decision at the 0.05 significance level.
print(
    "Conclusion: As p-value is greater than 0.05, we fail to reject the null hypothesis."
    if p_value > 0.05
    else "Conclusion: As p-value is less than or equal to 0.05, we reject the null hypothesis."
)

# Result noted by the author: the p-value is greater than 0.05, therefore we fail to
# reject the null hypothesis. Hence, there is no statistically significant association
# between the two.

In [None]:
# Chi-Square test between two categorical variables: Heart_Attack and Current_Smokers.
#
# Null Hypothesis (H0): There is no statistically significant association between
#   experiencing a heart attack and being a smoker.
# Alternative Hypothesis (H1): There is a statistically significant association between
#   experiencing a heart attack and being a smoker.

# Contingency table of observed counts, then the test on that table.
cont = pd.crosstab(index=df_imputed["Heart_Attack"], columns=df_imputed["Current_Smokers"])
chi2_stat, p_value, dof, expected = scipy.stats.chi2_contingency(cont)

print(
    f"Chi-Square Statistic: {chi2_stat}",
    f"P-Value: {p_value}",
    f"Degrees of Freedom: {dof}",
    sep="\n",
)

# Decision at the 0.05 significance level.
print(
    "Conclusion: As p-value is greater than 0.05, we fail to reject the null hypothesis."
    if p_value > 0.05
    else "Conclusion: As p-value is less than or equal to 0.05, we reject the null hypothesis."
)

In [None]:
# Chi-Square test between two categorical variables: Heart_Attack and Gender.

# Contingency table of observed counts, then the test on that table.
cont = pd.crosstab(index=df_imputed["Heart_Attack"], columns=df_imputed["Gender"])
chi2_stat, p_value, dof, expected = scipy.stats.chi2_contingency(cont)

print(
    f"Chi-Square Statistic: {chi2_stat}",
    f"P-Value: {p_value}",
    f"Degrees of Freedom: {dof}",
    sep="\n",
)

# Decision at the 0.05 significance level.
print(
    "Conclusion: As p-value is greater than 0.05, we fail to reject the null hypothesis."
    if p_value > 0.05
    else "Conclusion: As p-value is less than or equal to 0.05, we reject the null hypothesis."
)

In [None]:
# Chi-Square test between two categorical variables: Heart_Attack and Checkup.

# Contingency table of observed counts, then the test on that table.
cont = pd.crosstab(index=df_imputed["Heart_Attack"], columns=df_imputed["Checkup"])
chi2_stat, p_value, dof, expected = scipy.stats.chi2_contingency(cont)

print(
    f"Chi-Square Statistic: {chi2_stat}",
    f"P-Value: {p_value}",
    f"Degrees of Freedom: {dof}",
    sep="\n",
)

# Decision at the 0.05 significance level.
print(
    "Conclusion: As p-value is greater than 0.05, we fail to reject the null hypothesis."
    if p_value > 0.05
    else "Conclusion: As p-value is less than or equal to 0.05, we reject the null hypothesis."
)

In [None]:
#Hence, we notice an association between Heart Attack and gender, exercise, use of insulin and smoking status.

In [None]:
#Null Hypothesis (H0): There is no statistically significant difference in the frequency of health check-ups between individuals who have had a heart attack and those who have not.
#Alternative Hypothesis (H1): There is a statistically significant difference in the frequency of health check-ups between individuals who have had a heart attack and those who have not.
from scipy.stats import mannwhitneyu

# The hypotheses compare ONE variable (Diabetes_Checkup) between the two
# Heart_Attack groups, so the two samples are that column split by group.
# Passing the Heart_Attack column itself as one of the samples, as this cell
# used to, compares the group codes against the check-up values instead.
checkup_groups = [
    values for _key, values in df_imputed.groupby("Heart_Attack")["Diabetes_Checkup"]
]
if len(checkup_groups) != 2:
    raise ValueError(
        f"Expected exactly 2 Heart_Attack groups, found {len(checkup_groups)}"
    )

# Two-sided, to match the "difference" wording of H1.
stat, p = mannwhitneyu(*checkup_groups, alternative="two-sided")
print(f"Stat: {stat}")
print(f"P-Value: {p}")

if p > 0.05:
    print('Conclusion: As p-value is greater than 0.05, we fail to reject the null hypothesis.')
else:
    print('Conclusion: As p-value is less than or equal to 0.05, we reject the null hypothesis')

In [None]:
#Null Hypothesis (H0): There is no statistically significant difference in the frequency of glucose checks between individuals who have had a heart attack and those who have not.
#Alternative Hypothesis (H1): There is a statistically significant difference in the frequency of glucose checks between individuals who have had a heart attack and those who have not.

# The hypotheses compare ONE variable (Dly_Sugar_Check) between the two
# Heart_Attack groups, so the two samples are that column split by group.
# Passing the Heart_Attack column itself as one of the samples, as this cell
# used to, compares the group codes against the glucose-check values instead.
sugar_check_groups = [
    values for _key, values in df_imputed.groupby("Heart_Attack")["Dly_Sugar_Check"]
]
if len(sugar_check_groups) != 2:
    raise ValueError(
        f"Expected exactly 2 Heart_Attack groups, found {len(sugar_check_groups)}"
    )

# Two-sided, to match the "difference" wording of H1.
stat, p = mannwhitneyu(*sugar_check_groups, alternative="two-sided")
print(f"Stat: {stat}")
print(f"P-Value: {p}")

if p > 0.05:
    print('Conclusion: As p-value is greater than 0.05, we fail to reject the null hypothesis.')
else:
    print('Conclusion: As p-value is less than or equal to 0.05, we reject the null hypothesis')

In [None]:
#Null Hypothesis (H0): There is no statistically significant difference in the frequency of glycosylated hemoglobin (HbA1c) checks between individuals who have had a heart attack and those who have not.
#Alternative Hypothesis (H1): There is a statistically significant difference in the frequency of glycosylated hemoglobin (HbA1c) checks between individuals who have had a heart attack and those who have not.

# The hypotheses compare ONE variable (Glycosylated_Hemoglobin) between the
# two Heart_Attack groups, so the two samples are that column split by group.
# Passing the Heart_Attack column itself as one of the samples, as this cell
# used to, compares the group codes against the HbA1c-check values instead.
hba1c_groups = [
    values for _key, values in df_imputed.groupby("Heart_Attack")["Glycosylated_Hemoglobin"]
]
if len(hba1c_groups) != 2:
    raise ValueError(
        f"Expected exactly 2 Heart_Attack groups, found {len(hba1c_groups)}"
    )

# Two-sided, to match the "difference" wording of H1.
stat, p = mannwhitneyu(*hba1c_groups, alternative="two-sided")
print(f"Stat: {stat}")
print(f"P-Value: {p}")

if p > 0.05:
    print('Conclusion: As p-value is greater than 0.05, we fail to reject the null hypothesis.')
else:
    print('Conclusion: As p-value is less than or equal to 0.05, we reject the null hypothesis')