In [2]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.stats import ttest_ind
import scipy as sp
import scipy.stats as stats
import math
from statsmodels.formula.api import ols
import statsmodels.api as sm
from statsmodels.stats import weightstats as stests




In [24]:
pd.set_option('display.max_columns', None)
master = pd.read_csv('master_names_clean.csv')


# Light Condition vs. Age of Driver

Is there is a relationship between light condition and age of driver?

***Null Hypothesis***: there is not a statistically significant difference between light conditions and age of driver.

***Alternative Hypothesis***: there is a statistically significant difference between accident severity and age of driver

We start by determining the sample size requirements and test for normality

We will use a Welch Test

In [4]:
light22 = master[['light_conditions','age_driver']]
morelight = light22.groupby('light_conditions').agg({'age_driver': ['mean']})
morelight.reset_index(inplace=True)

 Calculating the standardized effect size, which is the difference between the two means divided by the standard deviation. 

In [5]:
light22.groupby('light_conditions').describe()

Unnamed: 0_level_0,age_driver,age_driver,age_driver,age_driver,age_driver,age_driver,age_driver,age_driver
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
light_conditions,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,50111.0,42.924508,16.902676,10.0,29.0,41.0,54.0,101.0
1,17790.0,38.306633,15.406243,13.0,26.0,35.0,49.0,97.0


In [6]:
#difference in means
42.924508-38.306633

4.617875000000005

In [7]:
#dividing the difference by standard deviation 
4.617875000000005/16.902676

0.27320378146040336

In [8]:
import statsmodels.stats.power as smp
smp.ttest_power(0.26, nobs=17790.0, alpha=0.025, alternative='two-sided')

1.0

Test for normality

In [9]:
from scipy import stats

stats.shapiro(master['age_driver'])



(0.9565568566322327, 0.0)

In [10]:
from scipy import stats

stats.shapiro(master['light_conditions'])

(0.548467218875885, 0.0)

In [11]:
from statsmodels.stats.power import TTestIndPower
effect_size = 0.26
alpha = 0.025 # significance level
power = 1.0
alternative= 'two-sided'
power_analysis = TTestIndPower()
sample_size = power_analysis.solve_power(effect_size = effect_size, 
                                         power = power, 
                                         alpha = alpha,
                                      alternative = alternative)
sample_size

5000.0

In [12]:


light = morelight.loc[0]
dark = morelight.loc[1]

statistic, pvalue = ttest_ind(light ,dark, equal_var= False,)
alpha = 0.025

print("Pvalue:", float(pvalue)) 
print("Tstat:", float(statistic)) 
if pvalue < alpha:
    print("Null hypothesis rejected, there is a statistical significance between light conditions and age of driver ")
else:

    print("Fail to reject null hypothesis there is no statistical significance between light conditions and age of driver ")

Pvalue: 0.955161705684588
Tstat: 0.06361557918807238
Fail to reject null hypothesis there is no statistical significance between light conditions and age of driver 


# Accident Severity vs Light Conditon

Is there is a relationship between accident severity and light condition of the accident?

***Null Hypothesis***: there is not a statistically significant difference accident severity and light condition of the accident .

***Alternative Hypothesis***: there is a statistically significant difference between accident severity and light condition of the accident.

In [14]:
contingency_table=pd.crosstab(master['accident_severity'],master['light_conditions'])
print('contingency_table :-\n',contingency_table)

contingency_table :-
 light_conditions       0      1
accident_severity              
1                    419    201
2                   8649   3460
3                  41043  14129


In [15]:
Observed_Values = contingency_table.values 
print("Observed Values :-\n",Observed_Values)

Observed Values :-
 [[  419   201]
 [ 8649  3460]
 [41043 14129]]


In [16]:
b=stats.chi2_contingency(contingency_table)
Expected_Values = b[3]
print("Expected Values :-\n",Expected_Values)

Expected Values :-
 [[  457.56056612   162.43943388]
 [ 8936.45305666  3172.54694334]
 [40716.98637723 14455.01362277]]


In [17]:
no_of_rows=len(contingency_table.iloc[0:3,0])
no_of_columns=len(contingency_table.iloc[0,0:2])
df =(no_of_rows-1)*(no_of_columns-1)
print("Degree of Freedom:-",df)
alpha = 0.05

Degree of Freedom:- 2


In [18]:
from scipy.stats import chi2
chi_square=sum([(o-e)**2./e for o,e in zip(Observed_Values,Expected_Values)])
chi_square_statistic=chi_square[0]+chi_square[1]
print("chi-square statistic:-",chi_square_statistic)

chi-square statistic:- 57.65787222602076


In [19]:

critical_value=chi2.ppf(q=1-alpha,df=df)
print('critical_value:',critical_value)
p_value=1-chi2.cdf(x=chi_square_statistic,df=df)
print('p-value:',p_value)

critical_value: 5.991464547107979
p-value: 3.0186964039558006e-13


In [22]:
print('Significance level: ',alpha)
print('Degree of Freedom: ',df)
print('chi-square statistic:',chi_square_statistic)
print('critical_value:',critical_value)
print('p-value:',p_value)

Significance level:  0.05
Degree of Freedom:  2
chi-square statistic: 57.65787222602076
critical_value: 5.991464547107979
p-value: 3.0186964039558006e-13


In [23]:
if chi_square_statistic>=critical_value:
    print("Reject null hypothesis, there is a relationship between accident severity and light condition of the accident")
else:
    print("Accept null hypothesis, there is no relationship between accident severity and light condition of the accident")
    
if p_value<=alpha:
    print("Reject null hypothesis, there is a relationship between accident severity and light condition of the accident")
else:
    print("Accept null hypothesis, there is no relationship between accident severity and light condition of the accident")

Reject null hypothesis, there is a relationship between accident severity and light condition of the accident
Reject null hypothesis, there is a relationship between accident severity and light condition of the accident
