In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Uncomment below to view more rows
# pd.set_option('display.max_rows', 500)

In [2]:
# Read the mixed male-female stroke dataset
sample_df = pd.read_csv('./datasets/data_q4_2.csv')
sample_df

Unnamed: 0,gender,age
0,Male,67.0
1,Female,61.0
2,Male,80.0
3,Female,49.0
4,Female,79.0
...,...,...
244,Male,57.0
245,Female,14.0
246,Female,75.0
247,Male,71.0


In [3]:
# Calculate CDF at every 'age' point on both Male and Female datasets
def get_cdf_list(df):
    cdf_list = []
    num_samples = df.shape[0]

    cumulative_pr = 0
    for index, row in df.iterrows():
        cumulative_pr += 1 / num_samples
        cdf_list.append(cumulative_pr)
    
    return cdf_list

In [4]:
# Split the mixed dataset into 2 sets (Male and Female). Sort them in ascending order of age
female_df = sample_df.loc[sample_df['gender'] == 'Female'].sort_values('age').reset_index(drop=True)
male_df = sample_df.loc[sample_df['gender'] == 'Male'].sort_values('age').reset_index(drop=True)
# Add a new column denoting the CDF at each point in the male and female datasets
female_df['female_eCDF'] = get_cdf_list(female_df)
male_df['male_eCDF'] = get_cdf_list(male_df)

In [5]:
%matplotlib

Using matplotlib backend: MacOSX


In [6]:
# Find distinct datapoints and their corresponding CDFs at each of the points in both male and female sets
male_distinct_df = male_df.drop_duplicates(subset='age', keep="last").reset_index(drop=True)
female_distinct_df = female_df.drop_duplicates(subset='age', keep="last").reset_index(drop=True)

if male_df.shape[0] > female_df.shape[0]:
    x_age = female_distinct_df['age'].to_numpy()
else:
    x_age = male_distinct_df['age'].to_numpy()

KS_Test_cols = ['x', 'F_cap_male_left', 'F_cap_male_right', 'F_cap_female_left', 'F_cap_female_right', 'right_diff_abs', 'left_diff_abs']
row_list = []

for x in x_age:
    # Find CDF to the left of point x in the sorted male dataset
    male_left_cdf = male_distinct_df.loc[male_distinct_df['age'] < x, 'male_eCDF']
    F_cap_male_left = 0.0 if male_left_cdf.empty else male_left_cdf.max()
    # Find CDF to the right of point x in the sorted male dataset
    male_right_cdf = male_distinct_df.loc[male_distinct_df['age'] >= x, 'male_eCDF']
    F_cap_male_right = 0.0 if male_right_cdf.empty else male_right_cdf.min()
    # Find CDF to the left of point x in the sorted female dataset
    female_left_cdf = female_distinct_df.loc[female_distinct_df['age'] < x, 'female_eCDF']
    F_cap_female_left = 0.0 if female_left_cdf.empty else female_left_cdf.max()
    # Find CDF to the left of point x in the sorted female dataset
    female_right_cdf = female_distinct_df.loc[female_distinct_df['age'] >= x, 'female_eCDF']
    F_cap_female_right = 0.0 if female_right_cdf.empty else female_right_cdf.min()
    # Find absolute difference between left CDFs of male and female datasets
    left_diff_abs = round(abs(F_cap_male_left - F_cap_female_left), 4)
    # Find absolute difference between right CDFs of male and female datasets
    right_diff_abs = round(abs(F_cap_male_right - F_cap_female_right), 4)
    
    # Build each row to be appended to the KS Test Table 
    row = [x, F_cap_male_left, F_cap_male_right, F_cap_female_left, F_cap_female_right, left_diff_abs, right_diff_abs]
    row_dict = dict(zip(KS_Test_cols, row))
    row_list.append(row_dict)

# Build KS Test Table (represented as a dataframe)
KS_Test_df = pd.DataFrame(row_list, columns=KS_Test_cols)
KS_Test_df

Unnamed: 0,x,F_cap_male_left,F_cap_male_right,F_cap_female_left,F_cap_female_right,right_diff_abs,left_diff_abs
0,42.0,0.0,0.009259,0.056738,0.070922,0.0567,0.0617
1,43.0,0.009259,0.018519,0.056738,0.070922,0.0475,0.0524
2,45.0,0.018519,0.027778,0.056738,0.070922,0.0382,0.0431
3,47.0,0.027778,0.037037,0.078014,0.085106,0.0502,0.0481
4,48.0,0.037037,0.046296,0.078014,0.085106,0.041,0.0388
5,49.0,0.046296,0.055556,0.085106,0.099291,0.0388,0.0437
6,51.0,0.055556,0.064815,0.120567,0.141844,0.065,0.077
7,54.0,0.064815,0.092593,0.170213,0.191489,0.1054,0.0989
8,56.0,0.092593,0.101852,0.205674,0.219858,0.1131,0.118
9,57.0,0.101852,0.157407,0.219858,0.255319,0.118,0.0979


In [7]:
# Calculate KS statistic
x_points = []
d_right = KS_Test_df.iloc[KS_Test_df['right_diff_abs'].idxmax(axis=1)][['x', 'right_diff_abs']]
d_left = KS_Test_df.iloc[KS_Test_df['left_diff_abs'].idxmax(axis=1)][['x', 'left_diff_abs']]
if d_right['right_diff_abs'] == d_left['left_diff_abs']:
    print("KS Statistic is {0} at age points {1} and {2}".format(d_right['right_diff_abs'], d_left['x'], d_right['x']))
    x_points.append(d_right['x'])
    x_points.append(d_left['x'])
elif d_right['right_diff_abs'] > d_left['left_diff_abs']:
    print("KS Statistic is {0} at {1}".format(d_right['right_diff_abs'], d_right['x']))
    x_points.append(d_right['x'])
else:
    print("KS Statistic is {0} at {1}".format(d_left['left_diff_abs'], d_left['x']))
    x_points.append(d_left['x'])

KS Statistic is 0.118 at age points 56.0 and 57.0


In [8]:
# Reject/Accept Null Hypothesis based on calculated KS Statistic d and given threshold=0.05
d = max(d_right['right_diff_abs'], d_left['left_diff_abs'])
critical_value = 0.05

if d > critical_value:
    print("Rejected Null Hypothesis: We reject the hypothesis that female patients get a stroke at the same age as male patients, as KS Statistic d={0} exceeds threshold {1}".format(d, critical_value))
else:
    print("Failed to reject Null Hypothesis: We accept the hypothesis that female patients get a stroke at the same age as male patients, as KS Statistic d={0} does not exceed threshold {1}".format(d, critical_value))

Rejected Null Hypothesis: We reject the hypothesis that female patients get a stroke at the same age as male patients, as KS Statistic d=0.118 exceeds threshold 0.05


In [9]:
# Plot KS Test eCDF
plt.figure('KS Test eCDF', figsize=(20,8))
plt.step(male_df['age'].to_numpy(), male_df['male_eCDF'], where='post', lw = 1.5, label='Male eCDF')
plt.step(female_df['age'].to_numpy(), female_df['female_eCDF'], where='post', lw = 1.5, label='Female eCDF')
for x in x_points:
    plt.axvline(x, linestyle="dashed", lw=1)

plt.xlabel('Age')
plt.ylabel('eCDF')
plt.legend(loc='upper left')
plt.grid()
plt.show()