# Statistical Tests in Python

# One Sample T Test

In [82]:
# Does the average age of Gujarati voters differ from that of the population?

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import math

# Fix the global NumPy seed so both Poisson draws below are repeatable.
np.random.seed(6)

# Simulated ages: a Poisson(30) count shifted up by 18, drawn first for the
# whole population (150,000 values) and then for a 30-value Gujarat sample.
population_ages = stats.poisson.rvs(mu=30, loc=18, size=150_000)
gujarat_ages = stats.poisson.rvs(mu=30, loc=18, size=30)

# Population mean, shown as the cell output.
population_ages.mean()

48.00456

In [79]:
# Mean age of the 30-value Gujarat sample.
gujarat_ages.mean()

46.36666666666667

In [83]:
# One-sample t-test: compare the Gujarat sample mean against the population mean.
stats.ttest_1samp(a=gujarat_ages,popmean=population_ages.mean())

Ttest_1sampResult(statistic=-1.0181540233717032, pvalue=0.31702417787117476)

In [84]:
# The p-value (0.317) is above 0.05, so we fail to reject the null hypothesis: there is no significant difference in the mean

# Two Sample T-Test

In [85]:
# Such a test tells us whether two data samples have different means

In [86]:
# Re-seed so the Maharashtra sample is repeatable on its own.
np.random.seed(12)

# 30 simulated ages: a Poisson(33) count shifted up by 18.
maharashtra_ages = stats.poisson.rvs(mu=33, loc=18, size=30)

# Sample mean, shown as the cell output.
maharashtra_ages.mean()

50.266666666666666

In [87]:
# Display the raw Maharashtra sample.
maharashtra_ages

array([44, 47, 60, 65, 47, 52, 56, 56, 44, 48, 50, 47, 55, 49, 50, 50, 50,
       41, 53, 47, 39, 50, 52, 43, 53, 51, 59, 53, 51, 46])

In [88]:
# Maharashtra sample mean (repeats the value shown when the sample was created).
maharashtra_ages.mean()

50.266666666666666

In [89]:
# Gujarat sample mean.
# NOTE(review): the saved outputs for this expression disagree between cells
# (46.37 earlier vs 47.1 here) — rerun the notebook top to bottom to confirm.
gujarat_ages.mean()

47.1

In [90]:
# Independent two-sample t-test comparing the Gujarat and Maharashtra sample means.
stats.ttest_ind(a=gujarat_ages,b=maharashtra_ages)

Ttest_indResult(statistic=-2.331755951544322, pvalue=0.023208212240042377)

In [91]:
# Since the p value is less than 0.05 we reject the null hypothesis: the two samples have different means

# Paired T-test

In [92]:
# When you want to compare two sets of measurements taken on the same group (e.g. before and after), use a paired T-test

In [93]:
# Seed once so the simulated weights are repeatable.
np.random.seed(11)

# Baseline weights: normal with mean 250 and standard deviation 30.
before = stats.norm.rvs(loc=250, scale=30, size=100)

# Follow-up weights: baseline plus a normal shift (mean -1.25, sd 5).
after = before + stats.norm.rvs(loc=-1.25, scale=5, size=100)

# One row per pair of measurements; weight_change is after minus before.
weight_df = pd.DataFrame(
    {
        "weight_before": before,
        "weight_after": after,
        "weight_change": after - before,
    }
)

In [None]:
# First rows of the before/after weight table.
weight_df.head()

In [None]:
# Summary statistics for each weight column.
weight_df.describe()

In [None]:
# Mean of the "before" weights.
before.mean()

In [None]:
# Mean of the "after" weights.
after.mean()

In [94]:
# Paired t-test on the before/after measurements.
stats.ttest_rel(a=before,b=after)

Ttest_relResult(statistic=2.5720175998568284, pvalue=0.011596444318439857)

In [96]:
# Since the p value is less than 0.05 we will reject the null hypothesis, there is a significant change in the samples

# One Sample Z Test

In [43]:
from statsmodels.stats.weightstats import ztest
import numpy as np

In [48]:
# Seed the global NumPy generator so this cell draws the same sample every
# time it runs, regardless of which cells consumed random numbers before it.
np.random.seed(0)

# 100 simulated measurements: normal with mean 3.4 and standard deviation 0.1.
data = np.random.normal(loc=3.4, scale=0.1, size=100)

# Sample mean, shown as the cell output.
data.mean()

3.3943383593564493

In [49]:
# Hypothesised mean that the sample is tested against.
singleValue = 3.3

In [47]:
# One-sample z-test of the sample mean against singleValue.
# The result is a (statistic, p-value) pair.
testResult = ztest(x1=data, value=singleValue)
testResult

(9.599303291062952, 8.048661830521805e-22)

In [50]:
# The second element of the z-test result is the p-value.
pValue = testResult[1]
print("p-value is: " + str(pValue))
print("")  # trailing blank line after the report

p-value is: 8.048661830521805e-22



# Two Sample Z Test (for proportions)

In [64]:
import numpy as np
from statsmodels.stats.proportion import proportions_ztest
# Counts of interest and sample sizes for the two groups being compared.
count = np.array([5, 12])
nobs = np.array([83, 99])

# Two-sample z-test on the two proportions count/nobs.
stat, pval = proportions_ztest(count=count, nobs=nobs)
print('P Value is {0:0.3f}'.format(pval))

P Value is 0.159


# One Way Anova

In [33]:
import pandas as pd
import scipy.stats as stats
import researchpy as rp
import statsmodels.api as sm
from statsmodels.formula.api import ols
    
import matplotlib.pyplot as plt

# Load the difficile data set from GitHub.
df = pd.read_csv("https://raw.githubusercontent.com/Opensourcefordatascience/Data-sets/master/difficile.csv")

# Drop the 'person' column; reassign rather than mutating in place.
df = df.drop('person', axis=1)

# Recode dose from numeric codes to labels. The result is assigned back to the
# column: calling replace(..., inplace=True) on df['dose'] is a chained
# in-place mutation, which does not update df under pandas Copy-on-Write.
df['dose'] = df['dose'].replace({1: 'placebo', 2: 'low', 3: 'high'})

# Getting summary statistics for libido across all dose groups.
rp.summary_cont(df['libido'])





Unnamed: 0,Variable,N,Mean,SD,SE,95% Conf.,Interval
0,libido,15.0,3.466667,1.76743,0.456349,2.487896,4.445437


In [34]:
# Preview the recoded data (dose labels and libido values).
df.head()

Unnamed: 0,dose,libido
0,placebo,3
1,placebo,2
2,placebo,1
3,placebo,1
4,placebo,4


In [35]:
# Summary statistics of libido within each dose group.
rp.summary_cont(df['libido'].groupby(df['dose']))





Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
dose,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
high,5,5.0,1.581139,0.707107,3.450484,6.549516
low,5,3.2,1.30384,0.583095,1.922236,4.477764
placebo,5,2.2,1.30384,0.583095,0.922236,3.477764


In [36]:
# One-way ANOVA of libido across the three dose groups.
libido_by_dose = [
    df['libido'][df['dose'] == level]
    for level in ('high', 'low', 'placebo')
]
stats.f_oneway(*libido_by_dose)

F_onewayResult(statistic=5.11864406779661, pvalue=0.024694289538222603)

In [31]:
# The F-statistic= 5.119 and the p-value= 0.025 which is indicating that there is an overall significant effect of 
# medication on libido.

# Two Way Anova

In [26]:
import numpy as np
import pandas as pd
import scipy

import statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Load the tooth-growth data set (columns used below: len, supp, dose).
# No `df` is created here: the slice was never used, and rebinding `df` would
# overwrite the frame the one-way ANOVA cells depend on.
data = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/tooth_growth_csv')

# Model with both main effects plus their interaction.
formula = 'len ~ C(supp) + C(dose) + C(supp):C(dose)'
model = ols(formula, data=data).fit()

# ANOVA table with typ=2, obtained through the public sm.stats entry point
# instead of reaching into the statsmodels.stats.anova submodule, which is
# never imported explicitly in this notebook.
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

                      sum_sq    df          F        PR(>F)
C(supp)           205.350000   1.0  15.571979  2.311828e-04
C(dose)          2426.434333   2.0  91.999965  4.046291e-18
C(supp):C(dose)   108.319000   2.0   4.106991  2.186027e-02
Residual          712.106000  54.0        NaN           NaN


In [28]:
# Preview the tooth-growth data.
data.head()

Unnamed: 0,len,supp,dose
0,4.2,VC,0.5
1,11.5,VC,0.5
2,7.3,VC,0.5
3,5.8,VC,0.5
4,6.4,VC,0.5


In [29]:
# In a Two-Way ANOVA, there are two variables to consider. The question is whether our variable in question
# (tooth length len) is related to the two other variables supp and dose by the equation: len=supp+dose+supp×dose

# Chi Square Test

In [68]:
from scipy.stats import chi2_contingency

In [66]:
import numpy as np
# Six rows of four observed counts each.
a1, a2, a3 = [6, 4, 5, 10], [8, 5, 3, 3], [5, 4, 8, 4]
a4, a5, a6 = [4, 11, 7, 13], [5, 8, 7, 6], [7, 3, 5, 9]

# Stack the rows into a single 6x4 table.
dice = np.vstack((a1, a2, a3, a4, a5, a6))

In [67]:
# Display the observed table.
dice

array([[ 6,  4,  5, 10],
       [ 8,  5,  3,  3],
       [ 5,  4,  8,  4],
       [ 4, 11,  7, 13],
       [ 5,  8,  7,  6],
       [ 7,  3,  5,  9]])

In [69]:
# Chi-square test on the table; the output is
# (statistic, p-value, degrees of freedom, expected counts).
chi2_contingency(dice)

(16.490612061288754,
 0.35021521809742745,
 15,
 array([[ 5.83333333,  5.83333333,  5.83333333,  7.5       ],
        [ 4.43333333,  4.43333333,  4.43333333,  5.7       ],
        [ 4.9       ,  4.9       ,  4.9       ,  6.3       ],
        [ 8.16666667,  8.16666667,  8.16666667, 10.5       ],
        [ 6.06666667,  6.06666667,  6.06666667,  7.8       ],
        [ 5.6       ,  5.6       ,  5.6       ,  7.2       ]]))

In [76]:
# Run the chi-square test on the observed table and unpack the statistic,
# p-value, degrees of freedom and expected frequencies. Uses the
# chi2_contingency name imported in this section.
chi2_stat, p_val, dof, ex = chi2_contingency(dice)
print("===Chi2 Stat===")
print(chi2_stat)
print("\n")
print("===Degrees of Freedom===")
print(dof)
print("\n")
print("===P-Value===")
print(p_val)
print("\n")
# `ex` holds the expected frequencies, not the observed contingency table,
# so the heading says so.
print("===Expected Frequencies===")
print(ex)

===Chi2 Stat===
16.490612061288754


===Degrees of Freedom===
15


===P-Value===
0.35021521809742745


===Contingency Table===
[[ 5.83333333  5.83333333  5.83333333  7.5       ]
 [ 4.43333333  4.43333333  4.43333333  5.7       ]
 [ 4.9         4.9         4.9         6.3       ]
 [ 8.16666667  8.16666667  8.16666667 10.5       ]
 [ 6.06666667  6.06666667  6.06666667  7.8       ]
 [ 5.6         5.6         5.6         7.2       ]]


In [77]:
# Since the p value (0.35) is above the 0.05 threshold we fail to reject the null hypothesis