In [None]:
#Univariate Analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('medical_clean.csv')

# Univariate summary of the categorical 'Marital' column: describe(), distinct
# values, absolute counts and the percentage share of each category.
# The counts are computed once and reused for the printout and for the chart.
marital_counts = df['Marital'].value_counts()

print('++++==== Begin ====++++\n')
print('++++==== Description of Marriage ====++++\n')
print(df['Marital'].describe())
print('++++==== Unique Values in Marriage ====++++\n')
print(df['Marital'].unique())
print('++++==== Counts of Values in Marriage ====++++\n')
print(marital_counts)
print('++++==== Percentages of Values in Marriage ====++++\n')
print(np.round(marital_counts / df['Marital'].count() * 100, 2))
print('++++==== End ====++++\n')

#https://matplotlib.org/stable/gallery/pie_and_polar_charts/pie_features.html
# Wedge sizes and wedge labels must come from the same Series: value_counts()
# is ordered by frequency whereas unique() is ordered by first appearance, so
# pairing the two can attach a category name to the wrong wedge.
fig, ax = plt.subplots()
ax.pie(marital_counts, labels=marital_counts.index, autopct='%1.1f%%')
ax.set_title('Marital Status Distribution')
plt.show()

# Univariate summary of the categorical 'Initial_admin' column: describe(),
# distinct values, absolute counts and the percentage share of each category.
# The counts are computed once and reused for the printout and for the chart.
admin_counts = df['Initial_admin'].value_counts()

print('++++==== Begin ====++++\n')
print('++++==== Description of Initial_admin ====++++\n')
print(df['Initial_admin'].describe())
print('++++==== Unique Values in Initial_admin ====++++\n')
print(df['Initial_admin'].unique())
print('++++==== Counts of Values in Initial_admin ====++++\n')
print(admin_counts)
print('++++==== Percentages of Values in Initial_admin ====++++\n')
print(np.round(admin_counts / df['Initial_admin'].count() * 100, 2))
print('++++==== End ====++++\n')

#https://matplotlib.org/stable/gallery/lines_bars_and_markers/bar_colors.html
# Bar positions and bar heights are both taken from the value_counts() Series.
# value_counts() is ordered by frequency whereas unique() is ordered by first
# appearance, so mixing them can put a count above the wrong category name.
# No `label=` is passed: no legend is drawn, and the previous value was the
# bound method `unique` (never called), which is not a valid label.
fig, ax = plt.subplots()
ax.bar(admin_counts.index, admin_counts.values)
ax.set_xlabel('Initial Admission Type')
ax.set_ylabel('Occurrences')
ax.set_title('Admission Types by Occurrence')
plt.show()

def summarize_numeric(frame, column):
    """Print summary statistics for a numeric column and draw its histogram.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table that contains ``column``.
    column : str
        Name of the column to summarise. It is also used in the printed
        section headers, as the plot title and as the x-axis label.

    Returns
    -------
    None
        Everything is written to stdout / rendered as a figure.
    """
    values = frame[column]

    print('++++==== Begin ====++++\n')
    print(f'++++==== Description of {column} ====++++\n')
    print(values.describe())
    print(f'++++==== Median of {column} ====++++\n')
    # The replacement field holds no quote characters: reusing the enclosing
    # quote inside an f-string is a SyntaxError before Python 3.12.
    print(f'Median: {values.median()}')
    print('++++==== End ====++++\n')

    fig, ax = plt.subplots()
    ax.hist(values)
    ax.set_title(column)
    ax.set_xlabel(column)
    ax.set_ylabel('Occurrences')
    plt.show()


# Same report for each continuous variable, in the original order.
summarize_numeric(df, 'Income')
summarize_numeric(df, 'TotalCharge')

In [None]:
#Bivariate Analysis
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data set from medical_clean.csv. The path is relative, so the file
# has to be present in the working directory the notebook runs from.
df = pd.read_csv('medical_clean.csv')

def calculate_iqr(series):
    """Return the interquartile range of ``series``.

    The result is the 75th-percentile value minus the 25th-percentile value,
    each obtained from ``series.quantile``.
    """
    lower_quartile, upper_quartile = (series.quantile(q) for q in (0.25, 0.75))
    return upper_quartile - lower_quartile

# Box plot of Additional_charges for each Gender category.
sns.boxplot(x='Gender', y='Additional_charges', data=df).set_title(
    'Relationship of Additional Charges by Gender'
)
plt.show()

# Group by Gender and compute summary statistics of Additional_charges:
# mean, median, maximum, minimum and interquartile range (via calculate_iqr).
gender = (
    df.groupby('Gender')
    .agg(
        Mean=pd.NamedAgg(column='Additional_charges', aggfunc='mean'),
        Median=pd.NamedAgg(column='Additional_charges', aggfunc='median'),
        Max=pd.NamedAgg(column='Additional_charges', aggfunc='max'),
        Min=pd.NamedAgg(column='Additional_charges', aggfunc='min'),
        IQR=pd.NamedAgg(column='Additional_charges', aggfunc=calculate_iqr),
    )
    .reset_index()
)

print(gender)

# Box plot of TotalCharge for each Asthma category.
sns.boxplot(x='Asthma', y='TotalCharge', data=df).set_title(
    'Relationship of TotalCharge by Asthma'
)
plt.show()

# Group by Asthma and compute summary statistics of TotalCharge:
# mean, median, maximum, minimum and interquartile range (via calculate_iqr).
asthma = (
    df.groupby('Asthma')
    .agg(
        Mean=pd.NamedAgg(column='TotalCharge', aggfunc='mean'),
        Median=pd.NamedAgg(column='TotalCharge', aggfunc='median'),
        Max=pd.NamedAgg(column='TotalCharge', aggfunc='max'),
        Min=pd.NamedAgg(column='TotalCharge', aggfunc='min'),
        IQR=pd.NamedAgg(column='TotalCharge', aggfunc=calculate_iqr),
    )
    .reset_index()
)

print(asthma)

In [94]:
#Chi-square test
import pandas as pd
import scipy.stats as stats

# Load the data set used for the test of independence.
df = pd.read_csv('medical_clean.csv')

# Significance threshold the p-value is compared against.
alpha = 0.05

# Cross-tabulate ReAdmis (rows) against Area (columns) and show the table.
contingency_table = pd.crosstab(index=df['ReAdmis'], columns=df['Area'])
print(contingency_table)

# chi2_contingency returns the test statistic, the p-value, the degrees of
# freedom and the table of expected frequencies, in that order.
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

# Report the statistic, the p-value and the verdict at the chosen alpha.
print("Chi-square:", chi2)
print("P-value:", p)
if p < alpha:
    print("The finding is significant")
else:
    print("The finding is not significant")

Area     Rural  Suburban  Urban
ReAdmis                        
No        2150      2106   2075
Yes       1219      1222   1228
Chi-square: 0.7133125620168337
P-value: 0.7000130641731285
The finding is not significant
