In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
import researchpy as rp
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Load the fitted failure-rate table into the notebook-wide frame `df`.
# NOTE(review): the 'unicode_escape' codec also interprets backslash sequences
# found in the data; confirm the file's real encoding (e.g. latin-1 / cp1252).
df = pd.read_csv(filepath_or_buffer='fittedFailureRate.csv', encoding='unicode_escape')
# Preview the first five rows.
df.head(n=5)

  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


Unnamed: 0.1,Unnamed: 0,Webpage,Page_Type,Institution,Funding,Tax_type,Region,Type,CU,CSU,CS_Department,CCCS,Elements,Errors,FR,FR_fitted
0,0,https://www.adams.edu/,A,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,698,1,0.001433,0.900164
1,1,https://www.adams.edu/academics/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,729,1,0.001372,0.891194
2,2,https://www.adams.edu/academics/graduate/couns...,D,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,507,1,0.001972,0.969114
3,3,https://www.adams.edu/catalog/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,512,1,0.001953,0.966918
4,4,https://www.adams.edu/faculty-staff/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,645,1,0.00155,0.916706


In [34]:
"""
Cohen's d is the effect size used when comparing two group means.
Effect size (d) = (M1 - M2) / pooled standard deviation
Final column: % of the control group that falls below the mean of the experimental group
"""

# Reference rows for reading Cohen's d: (label, d threshold, % of control below
# the experimental-group mean).
cohens_rows = [
    ('None', 0.0, 50),
    ('Small', 0.2, 58),
    ('Medium', 0.5, 69),
    ('Large', 0.8, 79),
    ('Considerable', 1.4, 92),
]

# Transpose the rows into the column-oriented dict that `cd` has always held.
size_labels, d_thresholds, pct_below = (list(column) for column in zip(*cohens_rows))
cd = {
    'Relative size': size_labels,
    'Effect size': d_thresholds,
    '% below': pct_below,
}

df_cohens = pd.DataFrame(cd)
print(df_cohens)

  Relative size  Effect size  % below
0          None          0.0       50
1         Small          0.2       58
2        Medium          0.5       69
3         Large          0.8       79
4  Considerable          1.4       92


In [15]:
# Mean of the FR column within each Region.
df.groupby('Region')['FR'].agg('mean')

Region
Metro            0.032524
North Central    0.019011
Northeast        0.010455
Northwest        0.018777
Online           0.014684
Pikes Peak       0.011349
Southeast        0.006828
Southwest        0.004244
West Central     0.028983
Name: FR, dtype: float64

In [16]:
# Sample standard deviation of the FR column within each Region.
df.groupby('Region')['FR'].agg('std')

Region
Metro            0.031849
North Central    0.024046
Northeast        0.004513
Northwest        0.010162
Online           0.019421
Pikes Peak       0.009422
Southeast        0.007874
Southwest        0.003999
West Central     0.033075
Name: FR, dtype: float64

In [17]:
# Sample variance of the FR column within each Region.
df.groupby('Region')['FR'].agg('var')

Region
Metro            0.001014
North Central    0.000578
Northeast        0.000020
Northwest        0.000103
Online           0.000377
Pikes Peak       0.000089
Southeast        0.000062
Southwest        0.000016
West Central     0.001094
Name: FR, dtype: float64

In [18]:
# Number of non-missing FR observations within each Region.
df.groupby('Region')['FR'].agg('count')

Region
Metro            310
North Central     40
Northeast         10
Northwest         20
Online            20
Pikes Peak        60
Southeast         20
Southwest         30
West Central      30
Name: FR, dtype: int64

In [2]:
# Tukey HSD over every pair of regions, family-wise error rate 0.05.
# Note: this test runs on FR_fitted, whereas the per-region descriptive
# cells summarise the FR column.
tukey_by_region = pairwise_tukeyhsd(
    endog=df['FR_fitted'],
    groups=df['Region'],
    alpha=0.05,
)
print(tukey_by_region)

       Multiple Comparison of Means - Tukey HSD, FWER=0.05        
    group1        group2    meandiff p-adj   lower   upper  reject
------------------------------------------------------------------
        Metro North Central  -0.1835 0.2421 -0.4135  0.0466  False
        Metro     Northeast  -0.2891 0.5097  -0.729  0.1509  False
        Metro     Northwest  -0.0954    0.9 -0.4114  0.2205  False
        Metro        Online  -0.4435  0.001 -0.7594 -0.1275   True
        Metro    Pikes Peak   -0.368  0.001 -0.5611 -0.1748   True
        Metro     Southeast  -0.4903  0.001 -0.8062 -0.1743   True
        Metro     Southwest  -0.6088  0.001 -0.8707  -0.347   True
        Metro  West Central  -0.1148    0.9 -0.3767   0.147  False
North Central     Northeast  -0.1056    0.9 -0.5898  0.3786  False
North Central     Northwest    0.088    0.9  -0.287  0.4631  False
North Central        Online    -0.26 0.4372  -0.635   0.115  False
North Central    Pikes Peak  -0.1845 0.5039  -0.464   0.095  F

In [19]:
# Independent-samples t-test on FR_fitted: Metro rows vs. rows from every other region.
summary, results = rp.ttest(
    group1=df.loc[df['Region'] == 'Metro', 'FR_fitted'],
    group1_name="Metro",
    group2=df.loc[df['Region'] != 'Metro', 'FR_fitted'],
    group2_name="All others",
)
print(summary)
print(results)

# Significant difference in mean FR_fitted (two-sided p < 0.001)
# Cohen's d = 0.72 -> above the medium (0.5) threshold, just under large (0.8)

     Variable      N      Mean        SD        SE  95% Conf.  Interval
0       Metro  310.0  1.694469  0.476017  0.027036   1.641271  1.747666
1  All others  230.0  1.370120  0.413630  0.027274   1.316380  1.423860
2    combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                   Independent t-test   results
0  Difference (Metro - All others) =     0.3243
1               Degrees of freedom =   538.0000
2                                t =     8.2727
3            Two side test p value =     0.0000
4           Difference < 0 p value =     1.0000
5           Difference > 0 p value =     0.0000
6                        Cohen's d =     0.7199
7                        Hedge's g =     0.7189
8                   Glass's delta1 =     0.6814
9                 Point-Biserial r =     0.3359


  groups = group1.append(group2, ignore_index= True)


In [20]:
# Cross-check with SciPy: Metro (first sample) vs. all other regions (second sample).
stats.ttest_ind(
    df.loc[df['Region'] == 'Metro', 'FR_fitted'],
    df.loc[df['Region'] != 'Metro', 'FR_fitted'],
)

Ttest_indResult(statistic=8.272682855077814, pvalue=1.0351723857982042e-15)

In [21]:
# Independent-samples t-test on FR_fitted: North Central rows vs. rows from every other region.
summary, results = rp.ttest(
    group1=df.loc[df['Region'] == 'North Central', 'FR_fitted'],
    group1_name="North Central",
    group2=df.loc[df['Region'] != 'North Central', 'FR_fitted'],
    group2_name="All others",
)
print(summary)
print(results)

# No significant difference in mean FR_fitted (two-sided p = 0.53)
# |Cohen's d| = 0.10 -> below the small (0.2) threshold

        Variable      N      Mean        SD        SE  95% Conf.  Interval
0  North Central   40.0  1.511005  0.357225  0.056482   1.396759  1.625251
1     All others  500.0  1.559946  0.486329  0.021749   1.517214  1.602677
2       combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                           Independent t-test   results
0  Difference (North Central - All others) =    -0.0489
1                       Degrees of freedom =   538.0000
2                                        t =    -0.6229
3                    Two side test p value =     0.5336
4                   Difference < 0 p value =     0.2668
5                   Difference > 0 p value =     0.7332
6                                Cohen's d =    -0.1024
7                                Hedge's g =    -0.1022
8                           Glass's delta1 =    -0.1370
9                         Point-Biserial r =    -0.0268


  groups = group1.append(group2, ignore_index= True)


In [22]:
# Cross-check with SciPy: North Central (first sample) vs. all other regions (second sample).
stats.ttest_ind(
    df.loc[df['Region'] == 'North Central', 'FR_fitted'],
    df.loc[df['Region'] != 'North Central', 'FR_fitted'],
)

Ttest_indResult(statistic=-0.6229166136835794, pvalue=0.5336032720586124)

In [24]:
# Independent-samples t-test on FR_fitted: Northeast rows vs. rows from every other region.
summary, results = rp.ttest(
    group1=df.loc[df['Region'] == 'Northeast', 'FR_fitted'],
    group1_name="Northeast",
    group2=df.loc[df['Region'] != 'Northeast', 'FR_fitted'],
    group2_name="All others",
)
print(summary)
print(results)

# No significant difference in mean FR_fitted (two-sided p = 0.31)
# Cohen's d = -0.32 -> between small (0.2) and medium (0.5) in magnitude, but only 10 Northeast rows
# NOTE(review): group sizes (10 vs 530) and SDs (0.154 vs 0.481) are very unequal;
# consider Welch's test (presumably equal_variances=False in rp.ttest) - TODO confirm

     Variable      N      Mean        SD        SE  95% Conf.  Interval
0   Northeast   10.0  1.405402  0.154004  0.048700   1.295235  1.515570
1  All others  530.0  1.559168  0.481494  0.020915   1.518082  1.600254
2    combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                       Independent t-test   results
0  Difference (Northeast - All others) =    -0.1538
1                   Degrees of freedom =   538.0000
2                                    t =    -1.0081
3                Two side test p value =     0.3139
4               Difference < 0 p value =     0.1569
5               Difference > 0 p value =     0.8431
6                            Cohen's d =    -0.3218
7                            Hedge's g =    -0.3213
8                       Glass's delta1 =    -0.9985
9                     Point-Biserial r =    -0.0434


  groups = group1.append(group2, ignore_index= True)


In [25]:
# Cross-check with SciPy: Northeast (first sample) vs. all other regions (second sample).
stats.ttest_ind(
    df.loc[df['Region'] == 'Northeast', 'FR_fitted'],
    df.loc[df['Region'] != 'Northeast', 'FR_fitted'],
)

Ttest_indResult(statistic=-1.008080560012808, pvalue=0.31386895667449)

In [28]:
# Independent-samples t-test on FR_fitted: Northwest rows vs. rows from every other region.
summary, results = rp.ttest(
    group1=df.loc[df['Region'] == 'Northwest', 'FR_fitted'],
    group1_name="Northwest",
    group2=df.loc[df['Region'] != 'Northwest', 'FR_fitted'],
    group2_name="All others",
)
print(summary)
print(results)

# No significant difference in mean FR_fitted (two-sided p = 0.68)
# Cohen's d = 0.09 -> below the small (0.2) threshold

     Variable      N      Mean        SD        SE  95% Conf.  Interval
0   Northwest   20.0  1.599048  0.205574  0.045968   1.502836  1.695259
1  All others  520.0  1.554677  0.485326  0.021283   1.512866  1.596488
2    combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                       Independent t-test   results
0  Difference (Northwest - All others) =     0.0444
1                   Degrees of freedom =   538.0000
2                                    t =     0.4072
3                Two side test p value =     0.6841
4               Difference < 0 p value =     0.6580
5               Difference > 0 p value =     0.3420
6                            Cohen's d =     0.0928
7                            Hedge's g =     0.0926
8                       Glass's delta1 =     0.2158
9                     Point-Biserial r =     0.0176


  groups = group1.append(group2, ignore_index= True)


In [29]:
# Cross-check with SciPy: Northwest (first sample) vs. all other regions (second sample).
stats.ttest_ind(
    df.loc[df['Region'] == 'Northwest', 'FR_fitted'],
    df.loc[df['Region'] != 'Northwest', 'FR_fitted'],
)

Ttest_indResult(statistic=0.4071620582738536, pvalue=0.6840509112559108)

In [32]:
# Independent-samples t-test on FR_fitted: rows from every other region vs. Online rows.
# Note: "All others" is passed as group1 here, so the difference, t and d carry
# the opposite sign to the SciPy check for Online, which passes the region first.
summary, results = rp.ttest(
    group1=df.loc[df['Region'] != 'Online', 'FR_fitted'],
    group1_name="All others",
    group2=df.loc[df['Region'] == 'Online', 'FR_fitted'],
    group2_name="Online",
)
print(summary)
print(results)

# Significant difference in mean FR_fitted (two-sided p = 0.0035)
# Cohen's d = 0.67 -> above the medium (0.5) threshold

     Variable      N      Mean        SD        SE  95% Conf.  Interval
0  All others  520.0  1.568063  0.469701  0.020598   1.527598  1.608528
1      Online   20.0  1.251015  0.591601  0.132286   0.974137  1.527892
2    combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                    Independent t-test   results
0  Difference (All others - Online) =     0.3170
1                Degrees of freedom =   538.0000
2                                 t =     2.9321
3             Two side test p value =     0.0035
4            Difference < 0 p value =     0.9982
5            Difference > 0 p value =     0.0018
6                         Cohen's d =     0.6681
7                         Hedge's g =     0.6672
8                    Glass's delta1 =     0.6750
9                  Point-Biserial r =     0.1254


  groups = group1.append(group2, ignore_index= True)


In [31]:
# Cross-check with SciPy: Online (first sample) vs. all other regions (second sample).
stats.ttest_ind(
    df.loc[df['Region'] == 'Online', 'FR_fitted'],
    df.loc[df['Region'] != 'Online', 'FR_fitted'],
)

Ttest_indResult(statistic=-2.9320585066701654, pvalue=0.003510541366259111)

In [33]:
# Independent-samples t-test on FR_fitted: rows from every other region vs. Pikes Peak rows.
# Note: "All others" is passed as group1 here, so the difference, t and d carry
# the opposite sign to the SciPy check for Pikes Peak, which passes the region first.
summary, results = rp.ttest(
    group1=df.loc[df['Region'] != 'Pikes Peak', 'FR_fitted'],
    group1_name="All others",
    group2=df.loc[df['Region'] == 'Pikes Peak', 'FR_fitted'],
    group2_name="Pikes Peak",
)
print(summary)
print(results)

# Significant difference in mean FR_fitted (two-sided p = 0.0001)
# Cohen's d = 0.55 -> roughly a medium (0.5) effect

     Variable      N      Mean        SD        SE  95% Conf.  Interval
0  All others  480.0  1.585050  0.479348  0.021879   1.542059  1.628041
1  Pikes Peak   60.0  1.326485  0.400412  0.051693   1.223048  1.429923
2    combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                        Independent t-test   results
0  Difference (All others - Pikes Peak) =     0.2586
1                    Degrees of freedom =   538.0000
2                                     t =     4.0062
3                 Two side test p value =     0.0001
4                Difference < 0 p value =     1.0000
5                Difference > 0 p value =     0.0000
6                             Cohen's d =     0.5486
7                             Hedge's g =     0.5478
8                        Glass's delta1 =     0.5394
9                      Point-Biserial r =     0.1702


  groups = group1.append(group2, ignore_index= True)


In [35]:
# Cross-check with SciPy: Pikes Peak (first sample) vs. all other regions (second sample).
stats.ttest_ind(
    df.loc[df['Region'] == 'Pikes Peak', 'FR_fitted'],
    df.loc[df['Region'] != 'Pikes Peak', 'FR_fitted'],
)

Ttest_indResult(statistic=-4.006233761844781, pvalue=7.038367542354269e-05)

In [36]:
# Independent-samples t-test on FR_fitted: rows from every other region vs. Southeast rows.
# Note: "All others" is passed as group1 here, so the difference, t and d carry
# the opposite sign to the SciPy check for Southeast, which passes the region first.
summary, results = rp.ttest(
    group1=df.loc[df['Region'] != 'Southeast', 'FR_fitted'],
    group1_name="All others",
    group2=df.loc[df['Region'] == 'Southeast', 'FR_fitted'],
    group2_name="Southeast",
)
print(summary)
print(results)

# Significant difference in mean FR_fitted (two-sided p = 0.0007)
# Cohen's d = 0.77 -> above the medium (0.5) threshold, just under large (0.8)

     Variable      N      Mean        SD        SE  95% Conf.  Interval
0  All others  520.0  1.569863  0.479453  0.021025   1.528557  1.611168
1   Southeast   20.0  1.204215  0.252026  0.056355   1.086263  1.322167
2    combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                       Independent t-test   results
0  Difference (All others - Southeast) =     0.3656
1                   Degrees of freedom =   538.0000
2                                    t =     3.3905
3                Two side test p value =     0.0007
4               Difference < 0 p value =     0.9996
5               Difference > 0 p value =     0.0004
6                            Cohen's d =     0.7726
7                            Hedge's g =     0.7715
8                       Glass's delta1 =     0.7626
9                     Point-Biserial r =     0.1446


  groups = group1.append(group2, ignore_index= True)


In [37]:
# Cross-check with SciPy: Southeast (first sample) vs. all other regions (second sample).
stats.ttest_ind(
    df.loc[df['Region'] == 'Southeast', 'FR_fitted'],
    df.loc[df['Region'] != 'Southeast', 'FR_fitted'],
)

Ttest_indResult(statistic=-3.3904587160526427, pvalue=0.0007489838714412557)

In [38]:
# Independent-samples t-test on FR_fitted: rows from every other region vs. Southwest rows.
# Note: "All others" is passed as group1 here, so the difference, t and d carry
# the opposite sign to the SciPy check for Southwest, which passes the region first.
summary, results = rp.ttest(
    group1=df.loc[df['Region'] != 'Southwest', 'FR_fitted'],
    group1_name="All others",
    group2=df.loc[df['Region'] == 'Southwest', 'FR_fitted'],
    group2_name="Southwest",
)
print(summary)
print(results)

# Significant difference in mean FR_fitted (two-sided p < 0.001)
# Cohen's d = 1.07 -> above the large (0.8) threshold, below considerable (1.4)

     Variable      N      Mean        SD        SE  95% Conf.  Interval
0  All others  510.0  1.584007  0.474500  0.021011   1.542728  1.625287
1   Southwest   30.0  1.085640  0.223471  0.040800   1.002195  1.169086
2    combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                       Independent t-test   results
0  Difference (All others - Southwest) =     0.4984
1                   Degrees of freedom =   538.0000
2                                    t =     5.7117
3                Two side test p value =     0.0000
4               Difference < 0 p value =     1.0000
5               Difference > 0 p value =     0.0000
6                            Cohen's d =     1.0730
7                            Hedge's g =     1.0715
8                       Glass's delta1 =     1.0503
9                     Point-Biserial r =     0.2391


  groups = group1.append(group2, ignore_index= True)


In [39]:
# Cross-check with SciPy: Southwest (first sample) vs. all other regions (second sample).
stats.ttest_ind(
    df.loc[df['Region'] == 'Southwest', 'FR_fitted'],
    df.loc[df['Region'] != 'Southwest', 'FR_fitted'],
)

Ttest_indResult(statistic=-5.711720100306712, pvalue=1.85225307522225e-08)