In [1]:
import numpy as np 
import pandas as pd 


import os
# List every file available under the Kaggle input directory, one full path per line.
_input_paths = (
    os.path.join(_root, _name)
    for _root, _, _names in os.walk('/kaggle/input')
    for _name in _names
)
for _path in _input_paths:
    print(_path)

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Load both Titanic splits. The 'Survived' column is removed from the training
# split (presumably so its columns line up with the test split -- confirm).
train = pd.read_csv('/kaggle/input/titanic/train.csv')
train = train.drop(columns=['Survived'])

test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [3]:
# Stack both splits and shuffle all rows.
# - frac=1 shuffles every row without hard-coding the row count (a fixed n larger
#   than the frame raises ValueError when sampling without replacement).
# - random_state pins the shuffle so Restart & Run All reproduces the same order.
# Note: the original row labels of each split are kept, so index labels repeat.
final_df = pd.concat([train, test]).sample(frac=1, random_state=42)

In [4]:
# Goal: check whether the mean age of male passengers is significantly higher than
# the mean age of female passengers. The claim under test is that the two means
# are roughly the same.
#
# Build the two "population" age series, discarding rows with a missing Age.
_is_male = final_df['Sex'] == 'male'
_is_female = final_df['Sex'] == 'female'

pop_male = final_df.loc[_is_male, 'Age'].dropna()
pop_female = final_df.loc[_is_female, 'Age'].dropna()

In [5]:
pop_female

239    48.0
609    40.0
781    17.0
397    48.0
608    22.0
       ... 
205     2.0
530     2.0
8      27.0
437    24.0
251    29.0
Name: Age, Length: 388, dtype: float64

In [6]:
# Draw one sample of 25 ages from each population for the hypothesis test.
# The draws are seeded so the Shapiro, Levene and t-test results below are
# reproducible across kernel restarts; distinct seeds keep the two draws from
# selecting the same row positions in both series.
sample_male = pop_male.sample(25, random_state=42)
sample_female = pop_female.sample(25, random_state=43)

# Significance level used for the tests.
alpha = 0.05

In [7]:
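# H0: the mean age of male passengers equals the mean age of female passengers (mu_male = mu_female)
# H1: the mean age of male passengers is greater than that of female passengers (mu_male > mu_female)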

In [8]:
# Reference value: the mean age over all male passengers with a known age.
# Used to check whether the male mean really is similar to, or noticeably higher
# than, the female mean, independently of the sample-based test.
pop_male.mean()

30.58522796352584

In [9]:
# Reference value: the mean age over all female passengers with a known age,
# for comparison with the male population mean.
pop_female.mean()

28.68708762886598

In [10]:
from scipy.stats import shapiro

# The t-test assumes each group is approximately normally distributed, so check
# both samples with the Shapiro-Wilk test first. A p-value above alpha means
# normality is not rejected for that sample.
shapiro_male = shapiro(sample_male)
shapiro_female = shapiro(sample_female)

# Labels name the groups actually tested (male / female passenger ages).
print("Shapiro-Wilk test for male passengers:", shapiro_male)
print("Shapiro-Wilk test for female passengers:", shapiro_female)

Shapiro-Wilk test for desktop users: ShapiroResult(statistic=0.9326766729354858, pvalue=0.10021330416202545)
Shapiro-Wilk test for mobile users: ShapiroResult(statistic=0.9545333981513977, pvalue=0.31635603308677673)


In [11]:
from scipy.stats import levene

# The standard independent two-sample t-test assumes both samples have equal
# variance. Levene's test checks that: a p-value greater than alpha (0.05) means
# the equal-variance assumption is not rejected.
levene_test = levene(sample_male, sample_female, center='median')
print(levene_test)

LeveneResult(statistic=0.7369684966268486, pvalue=0.39490040600273624)


In [12]:
import scipy.stats as stats

# Independent two-sample t-test: pass both sample arrays and scipy computes the
# test statistic and the p-value. With default arguments the returned p-value is
# two-sided; `p_value` keeps that two-sided value.
t_statistic, p_value = stats.ttest_ind(sample_male, sample_female)

# H1 is directional (male mean > female mean), so report a one-sided p-value.
# Halving the two-sided p-value is only valid when the statistic points in the
# direction of H1 (t > 0); otherwise the one-sided p-value is 1 - p/2.
one_sided_p_value = p_value / 2 if t_statistic > 0 else 1 - p_value / 2

print("t-statistic:", t_statistic)
print("p-value:", one_sided_p_value)

t-statistic: 0.13388154347325673
p-value: 0.44702800602194875


In [13]:
# Significance level for the decision.
alpha = 0.05

# `p_value` from ttest_ind is two-sided, but H1 is one-sided (male mean > female
# mean). Convert it before comparing with alpha: p/2 when the statistic points in
# the direction of H1 (t > 0), otherwise 1 - p/2. Comparing the two-sided value
# directly would make the test too conservative and could reject H0 for an effect
# in the wrong direction.
decision_p_value = p_value / 2 if t_statistic > 0 else 1 - p_value / 2

if decision_p_value < alpha:
    print("Reject the null hypothesis.")
else:
    print("Fail to reject the null hypothesis.")

Fail to reject the null hypothesis.
