In [3]:
# Problem Statement 1:
# Is gender independent of education level? A random sample of 395 people were surveyed and each person was asked to report 
# the highest education level they obtained. The data that resulted from the survey is summarized in the following table:
#           High School  Bachelors  Masters  Ph.D.  Total
# Female             60         54       46     41    201
# Male               40         44       53     57    194
# Total             100         98       99     98    395
# Question: Are gender and education level dependent at 5% level of significance? In other words, given the data collected above,
# is there a relationship between the gender of an individual and the level of education that they have obtained?

import pandas as pd
import numpy as np
from scipy import stats

# Chi-square test of independence between gender and highest education level.
# H0: gender and education level are independent.
alpha = 0.05

# Observed counts. Rows are the education levels in the order
# High School, Bachelors, Masters, Ph.D.; columns are the two genders.
df=pd.DataFrame({'Female':[60,54,46,41],'Male':[40,44,53,57]})

# Marginal totals are derived from the observed table rather than typed in by
# hand, so they cannot drift out of sync with the counts above.
tot_Fem = int(df['Female'].sum())
tot_Mal = int(df['Male'].sum())
tot = df.sum(axis=1).to_numpy()
N = int(tot.sum())

# Expected count under independence = (column total * row total) / grand total
Exp_val_F=(tot_Fem*tot)/N
Exp_val_M=(tot_Mal*tot)/N
print('Expected Values of the Female and Male Population is:\n', Exp_val_F,'\n', Exp_val_M,'\n')

# chisquare value
# Sum of (Original Value - Expected Value)**2 / Expected Value
# Degree of Freedom = (number of columns-1)* (number of rows-1) = (2-1)*(4-1) = 1* 3 = 3
chisq_dof = (df.shape[1]-1)*(df.shape[0]-1)
Chisq_F = np.sum((df['Female']-Exp_val_F)**2/Exp_val_F)
Chisq_M = np.sum((df['Male']-Exp_val_M)**2/Exp_val_M)
chisq_T=(Chisq_F + Chisq_M)
# Critical value: the point with upper-tail probability `alpha` (the
# significance level defined above is used here instead of a repeated literal).
chisq_D=stats.chi2.isf(q=alpha, df=chisq_dof)
print('Chisquare Value of the distribution is       : ', chisq_T)
print('P Value of the calculated chisquare value is : ', stats.chi2.sf(chisq_T, chisq_dof))
print('Chisquare value for degree of freedom 3 is   : ', chisq_D)

# Decision rule: reject the Null Hypothesis when the calculated chisquare value
# exceeds the critical value, otherwise fail to reject it.
# For this data the calculated value (about 8.006) is greater than the critical
# value (about 7.815), so the Null Hypothesis is rejected.
if chisq_T > chisq_D:
    print('\nChisquare Value is greater than the chisquare of degree of freedom 3. So reject the Null Hypothesis and \
there is a relationship between the gender and the level of eductaion')
else:
    print('Chisquare Value is less than the chisquare of degree of freedom 3. So fail to reject the Null Hypothesis and \
there is no relationship between the gender and the level of eductaion')

Expected Values of the Female and Male Population is:
 [50.88607595 49.86835443 50.37721519 49.86835443] 
 [49.11392405 48.13164557 48.62278481 48.13164557] 

Chisquare Value of the distribution is       :  8.006066246262538
P Value of the calculated chisquare value is :  0.045886500891747214
Chisquare value for degree of freedom 3 is   :  7.814727903251178

Chisquare Value is greater than the chisquare of degree of freedom 3. So reject the Null Hypothesis and there is a relationship between the gender and the level of eductaion


In [9]:
# Using the following data, perform a oneway analysis of variance using α=.05. Write up the results in APA format. 
# [Group1: 51, 45, 33, 45, 67] [Group2: 23, 43, 23, 43, 45] [Group3: 56, 76, 74, 87, 56] 

def _deviation_table(label, values):
    """Return a frame holding `values` plus their mean, deviations and squared deviations.

    Columns: `label` (the raw observations), 'mean' (the group mean repeated on
    every row), 'deviations' (observation - mean) and 'sq deviations'.
    """
    table = pd.DataFrame({label: values})
    table['mean'] = table[label].mean()
    table['deviations'] = table[label] - table['mean']
    table['sq deviations'] = table['deviations']**2
    return table

# Number of observations per group - n = 5 (every group has the same size)
alpha = 0.05
N = 5

# Get the values for the groups - mean, deviations and square of deviations
df1 = _deviation_table('Group1', [51,45,33,45,67])
df2 = _deviation_table('Group2', [23,43,23,43,45])
df3 = _deviation_table('Group3', [56,76,74,87,56])
n_groups = 3

# Print the values for the groups - mean, deviations and square of deviations
print(' Dataframe for Group1\n', '-'*20, '\n', df1,'\n')
print(' Dataframe for Group2\n', '-'*20, '\n', df2,'\n')
print(' Dataframe for Group3\n', '-'*20, '\n', df3,'\n')

# Get the Sum of the squares of the groups 
SS_Group1 = df1['sq deviations'].sum()
SS_Group2 = df2['sq deviations'].sum()
SS_Group3 = df3['sq deviations'].sum()

# Get the Variance  of the groups 
Var1 = SS_Group1/(N-1)
Var2 = SS_Group2/(N-1)
Var3 = SS_Group3/(N-1)

# Get the mean square error and Sum of Squares of the groups.
# (The printed label says "Mean Standard Error"; the quantity is the mean
# square within groups, i.e. the average of the group variances.)
MS_Err = (Var1+Var2+Var3)/n_groups
print('Mean Standard Error is      : ', MS_Err)
# Error degrees of freedom = total observations - number of groups = 15 - 3
dof = n_groups*N - n_groups
print('Degree of freedom, Error is : ', dof)
SS_Err = MS_Err * dof
print('Sum of Squares of Error is  : ', SS_Err)

# Get the mean, deviations and square of deviations of the group mean
df4 = pd.DataFrame({'Group_mean': [df1['mean'].iloc[0], df2['mean'].iloc[0], df3['mean'].iloc[0]]})
df4['grand_mean'] = df4['Group_mean'].mean()
df4['deviations'] = df4['Group_mean'] - df4['grand_mean']
df4['sq deviations'] = df4['deviations']**2

print('\n Dataframe for group mean\n', '-'*24, '\n', df4,'\n')

# Get the mean square and Sum of Squares between the group means
dof_groups = (n_groups-1)
print('Degree of freedom of groups is : ', dof_groups)
Var_means = df4['sq deviations'].sum()/dof_groups
print('Variance of means is           : ', Var_means)
# Scale the variance of the means by the per-group sample size
MS_Between = Var_means * N
print('Mean Standard Between is       : ', MS_Between)
SS_group = MS_Between * dof_groups
print('Sum of Squares group is        : ', SS_group)

# Get the F Value
F = MS_Between / MS_Err
print('F Value is                     : ' , F)

# Upper-tail probability of F under the F(dof_groups, dof) distribution, so the
# "p < alpha" statement in the write-up is checked rather than assumed.
p_value = stats.f.sf(F, dof_groups, dof)
print('P Value is                     : ', p_value)

SS_Total = SS_Err + SS_group
print('Sum of Squares Total is        : ', SS_Total)

effect_size = SS_group / SS_Total
print('Effect Size is                 : ', effect_size)

print('\n Anova Table\n', '-' * 11)
# Rows are group / error / total, so the df column must be listed in that same
# order: between-groups df, error df, and their sum for the total.
df5 = pd.DataFrame({'source':['group', 'error', 'total'], 'SS':[SS_group,SS_Err,SS_Total], 
                    'df':[dof_groups, dof, dof_groups + dof], 'F': [F, ' ', ' ']})
print (df5)

print('\n APA writeup\n', '-' * 11)
# Report the relation between p and alpha that the data actually supports.
relation = '<' if p_value < alpha else '>='
print('F({},{}) is : '.format(dof_groups, dof), round(F,2) ,', p {} '.format(relation), alpha, 'Effect Size is : ', round(effect_size,2) ) 

 Dataframe for Group1
 -------------------- 
    Group1  mean  deviations  sq deviations
0      51  48.2         2.8           7.84
1      45  48.2        -3.2          10.24
2      33  48.2       -15.2         231.04
3      45  48.2        -3.2          10.24
4      67  48.2        18.8         353.44 

 Dataframe for Group2
 -------------------- 
    Group2  mean  deviations  sq deviations
0      23  35.4       -12.4         153.76
1      43  35.4         7.6          57.76
2      23  35.4       -12.4         153.76
3      43  35.4         7.6          57.76
4      45  35.4         9.6          92.16 

 Dataframe for Group3
 -------------------- 
    Group3  mean  deviations  sq deviations
0      56  69.8       -13.8         190.44
1      76  69.8         6.2          38.44
2      74  69.8         4.2          17.64
3      87  69.8        17.2         295.84
4      56  69.8       -13.8         190.44 

Mean Standard Error is      :  155.06666666666666
Degree of freedom, Error is :  1

In [11]:
# F test (ratio of sample variances) for the two series
# 10, 20, 30, 40, 50 and 5, 10, 15, 20, 25.

def _print_summary(rows):
    """Print each (label, value) pair on its own line; the last one is followed by a blank line."""
    *leading, (last_label, last_value) = rows
    for label, value in leading:
        print(label, value)
    print(last_label, last_value, '\n')

# Both samples, with their sample (ddof=1) standard deviation and variance
N1 = np.array([10,20,30,40,50])
N2 = np.array([5,10,15,20,25])

Sd_N1, Var_N1 = N1.std(ddof=1), N1.var(ddof=1)
Sd_N2, Var_N2 = N2.std(ddof=1), N2.var(ddof=1)

# Summary of the first series
_print_summary([
    ('First Array is                       : ', N1),
    ('Mean of First Array is               : ', N1.mean()),
    ('Standard Deviation of First Array is : ', Sd_N1),
    ('Variance of First Array is           : ', Var_N1),
])

# Summary of the second series
_print_summary([
    ('Second Array is                       : ', N2),
    ('Mean of Second Array is               : ', N2.mean()),
    ('Standard Deviation of Second Array is : ', Sd_N2),
    ('Variance of Second Array is           : ', Var_N2),
])


# F value: variance of the first series divided by variance of the second
F_Test = Var_N1 / Var_N2 
print('F Value of the series is              : ', F_Test)

First Array is                       :  [10 20 30 40 50]
Mean of First Array is               :  30.0
Standard Deviation of First Array is :  15.811388300841896
Variance of First Array is           :  250.0 

Second Array is                       :  [ 5 10 15 20 25]
Mean of Second Array is               :  15.0
Standard Deviation of Second Array is :  7.905694150420948
Variance of Second Array is           :  62.5 

F Value of the series is              :  4.0
