In [3]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
import researchpy as rp
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Load the per-page failure-rate table from CSV.
# NOTE(review): the 'unicode_escape' codec is kept exactly as written; it was
# presumably chosen to get past a decoding error — confirm it is intended.
df = pd.read_csv('fittedFailureRate.csv', encoding='unicode_escape')
# Combined group label: Funding and Type concatenated with no separator
# (e.g. 'public' + 'University' -> 'publicUniversity').
df = df.assign(FundType=df['Funding'] + df['Type'])
df.head()

  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


Unnamed: 0.1,Unnamed: 0,Webpage,Page_Type,Institution,Funding,Tax_type,Region,Type,CU,CSU,CS_Department,CCCS,Elements,Errors,FR,FR_fitted,FundType
0,0,https://www.adams.edu/,A,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,698,1,0.001433,0.900164,publicUniversity
1,1,https://www.adams.edu/academics/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,729,1,0.001372,0.891194,publicUniversity
2,2,https://www.adams.edu/academics/graduate/couns...,D,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,507,1,0.001972,0.969114,publicUniversity
3,3,https://www.adams.edu/catalog/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,512,1,0.001953,0.966918,publicUniversity
4,4,https://www.adams.edu/faculty-staff/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,645,1,0.00155,0.916706,publicUniversity


In [4]:
# Mean raw failure rate (FR) for each Funding x Type combination.
df.groupby(['Funding', 'Type'])['FR'].agg('mean')

Funding  Type             
private  College              0.042552
         University           0.028821
         VoTech               0.034175
public   College              0.013433
         Community College    0.014357
         University           0.018881
         VoTech               0.049524
Name: FR, dtype: float64

In [5]:
# Standard deviation of the raw failure rate (FR) for each Funding x Type combination.
df.groupby(['Funding', 'Type'])['FR'].agg('std')

Funding  Type             
private  College              0.044601
         University           0.029254
         VoTech               0.029855
public   College              0.011237
         Community College    0.015461
         University           0.024467
         VoTech               0.032664
Name: FR, dtype: float64

In [6]:
# Variance of the raw failure rate (FR) for each Funding x Type combination.
df.groupby(['Funding', 'Type'])['FR'].agg('var')

Funding  Type             
private  College              0.001989
         University           0.000856
         VoTech               0.000891
public   College              0.000126
         Community College    0.000239
         University           0.000599
         VoTech               0.001067
Name: FR, dtype: float64

In [7]:
# Number of non-null FR observations in each Funding x Type combination.
df.groupby(['Funding', 'Type'])['FR'].agg('count')

Funding  Type             
private  College               40
         University           130
         VoTech                50
public   College               10
         Community College    140
         University           140
         VoTech                30
Name: FR, dtype: int64

In [8]:
# Tukey HSD pairwise comparison of the fitted failure rate across all
# FundType groups, with alpha = 0.05.
tukey_result = pairwise_tukeyhsd(
    endog=df['FR_fitted'],
    groups=df['FundType'],
    alpha=0.05,
)
print(tukey_result)

                 Multiple Comparison of Means - Tukey HSD, FWER=0.05                  
         group1                  group2         meandiff p-adj   lower   upper  reject
--------------------------------------------------------------------------------------
         privateCollege       privateUniversity  -0.1178 0.7527 -0.3601  0.1244  False
         privateCollege           privateVoTech   0.0689    0.9 -0.2152  0.3531  False
         privateCollege           publicCollege  -0.2673 0.6177 -0.7409  0.2064  False
         privateCollege publicCommunity College  -0.2902 0.0069 -0.5304   -0.05   True
         privateCollege        publicUniversity  -0.3299 0.0011   -0.57 -0.0897   True
         privateCollege            publicVoTech   0.1881 0.5883 -0.1355  0.5116  False
      privateUniversity           privateVoTech   0.1868 0.1689 -0.0362  0.4097  False
      privateUniversity           publicCollege  -0.1495    0.9 -0.5891  0.2902  False
      privateUniversity publicCommunity Col

In [9]:
# One-vs-rest t-test on FR_fitted: private colleges against every other row.
# NOTE(review): rp.ttest is left at its defaults; if group spreads/sizes differ
# a lot, consider whether an unequal-variance (Welch) test is more appropriate.
is_private_college = df['FundType'] == 'privateCollege'
summary, results = rp.ttest(
    group1=df.loc[is_private_college, 'FR_fitted'],
    group1_name="Private colleges",
    group2=df.loc[~is_private_college, 'FR_fitted'],
    group2_name="All others",
)
print(summary, results, sep='\n')

           Variable      N      Mean        SD        SE  95% Conf.  Interval
0  Private colleges   40.0  1.733559  0.613235  0.096961   1.537437  1.929681
1        All others  500.0  1.542141  0.463203  0.020715   1.501442  1.582841
2          combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                              Independent t-test   results
0  Difference (Private colleges - All others) =     0.1914
1                          Degrees of freedom =   538.0000
2                                           t =     2.4490
3                       Two side test p value =     0.0146
4                      Difference < 0 p value =     0.9927
5                      Difference > 0 p value =     0.0073
6                                   Cohen's d =     0.4024
7                                   Hedge's g =     0.4019
8                              Glass's delta1 =     0.3121
9                            Point-Biserial r =     0.1050


  groups = group1.append(group2, ignore_index= True)


In [11]:
# SciPy independent two-sample t-test (default arguments) on FR_fitted:
# private colleges against every other row.
private_college_mask = df['FundType'] == 'privateCollege'
stats.ttest_ind(
    df.loc[private_college_mask, 'FR_fitted'],
    df.loc[~private_college_mask, 'FR_fitted'],
)

Ttest_indResult(statistic=2.449018481891657, pvalue=0.01464258546306276)

In [12]:
# One-vs-rest t-test on FR_fitted: public colleges against every other row.
# NOTE(review): rp.ttest is left at its defaults; if group spreads/sizes differ
# a lot, consider whether an unequal-variance (Welch) test is more appropriate.
is_public_college = df['FundType'] == 'publicCollege'
summary, results = rp.ttest(
    group1=df.loc[is_public_college, 'FR_fitted'],
    group1_name="Public colleges",
    group2=df.loc[~is_public_college, 'FR_fitted'],
    group2_name="All others",
)
print(summary, results, sep='\n')

          Variable      N      Mean        SD        SE  95% Conf.  Interval
0  Public colleges   10.0  1.466271  0.205799  0.065079   1.319051  1.613491
1       All others  530.0  1.558019  0.481458  0.020913   1.516936  1.599102
2         combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                             Independent t-test   results
0  Difference (Public colleges - All others) =    -0.0917
1                         Degrees of freedom =   538.0000
2                                          t =    -0.6011
3                      Two side test p value =     0.5480
4                     Difference < 0 p value =     0.2740
5                     Difference > 0 p value =     0.7260
6                                  Cohen's d =    -0.1919
7                                  Hedge's g =    -0.1916
8                             Glass's delta1 =    -0.4458
9                           Point-Biserial r =    -0.0259


  groups = group1.append(group2, ignore_index= True)


In [13]:
# One-vs-rest t-test on FR_fitted: private universities against every other row.
# rp.ttest is called with its default options.
is_private_university = df['FundType'] == 'privateUniversity'
summary, results = rp.ttest(
    group1=df.loc[is_private_university, 'FR_fitted'],
    group1_name="Private universities",
    group2=df.loc[~is_private_university, 'FR_fitted'],
    group2_name="All others",
)
print(summary, results, sep='\n')

               Variable      N      Mean        SD        SE  95% Conf.  \
0  Private universities  130.0  1.615722  0.523212  0.045889   1.524930   
1            All others  410.0  1.537485  0.461657  0.022800   1.492666   
2              combined  540.0  1.556320  0.477872  0.020564   1.515924   

   Interval  
0  1.706515  
1  1.582304  
2  1.596716  
                                  Independent t-test   results
0  Difference (Private universities - All others) =     0.0782
1                              Degrees of freedom =   538.0000
2                                               t =     1.6290
3                           Two side test p value =     0.1039
4                          Difference < 0 p value =     0.9481
5                          Difference > 0 p value =     0.0519
6                                       Cohen's d =     0.1640
7                                       Hedge's g =     0.1637
8                                  Glass's delta1 =     0.1495
9            

  groups = group1.append(group2, ignore_index= True)


In [14]:
# One-vs-rest t-test on FR_fitted: public universities against every other row.
# rp.ttest is called with its default options.
is_public_university = df['FundType'] == 'publicUniversity'
summary, results = rp.ttest(
    group1=df.loc[is_public_university, 'FR_fitted'],
    group1_name="Public universities",
    group2=df.loc[~is_public_university, 'FR_fitted'],
    group2_name="All others",
)
print(summary, results, sep='\n')

              Variable      N      Mean        SD        SE  95% Conf.  \
0  Public universities  140.0  1.403696  0.507745  0.042912   1.318851   
1           All others  400.0  1.609739  0.455679  0.022784   1.564947   
2             combined  540.0  1.556320  0.477872  0.020564   1.515924   

   Interval  
0  1.488541  
1  1.654530  
2  1.596716  
                                 Independent t-test   results
0  Difference (Public universities - All others) =    -0.2060
1                             Degrees of freedom =   538.0000
2                                              t =    -4.4673
3                          Two side test p value =     0.0000
4                         Difference < 0 p value =     0.0000
5                         Difference > 0 p value =     1.0000
6                                      Cohen's d =    -0.4387
7                                      Hedge's g =    -0.4381
8                                 Glass's delta1 =    -0.4058
9                          

  groups = group1.append(group2, ignore_index= True)


In [15]:
# SciPy independent two-sample t-test (default arguments) on FR_fitted:
# public universities against every other row.
public_university_mask = df['FundType'] == 'publicUniversity'
stats.ttest_ind(
    df.loc[public_university_mask, 'FR_fitted'],
    df.loc[~public_university_mask, 'FR_fitted'],
)

Ttest_indResult(statistic=-4.46734452431102, pvalue=9.65865585660817e-06)

In [16]:
# One-vs-rest t-test on FR_fitted: private VoTech schools against every other row.
# rp.ttest is called with its default options.
is_private_votech = df['FundType'] == 'privateVoTech'
summary, results = rp.ttest(
    group1=df.loc[is_private_votech, 'FR_fitted'],
    group1_name="Private VoTechs",
    group2=df.loc[~is_private_votech, 'FR_fitted'],
    group2_name="All others",
)
print(summary, results, sep='\n')

          Variable      N      Mean        SD        SE  95% Conf.  Interval
0  Private VoTechs   50.0  1.802506  0.311993  0.044122   1.713839  1.891174
1       All others  490.0  1.531199  0.484900  0.021906   1.488159  1.574240
2         combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                             Independent t-test   results
0  Difference (Private VoTechs - All others) =     0.2713
1                         Degrees of freedom =   538.0000
2                                          t =     3.8735
3                      Two side test p value =     0.0001
4                     Difference < 0 p value =     0.9999
5                     Difference > 0 p value =     0.0001
6                                  Cohen's d =     0.5751
7                                  Hedge's g =     0.5743
8                             Glass's delta1 =     0.8696
9                           Point-Biserial r =     0.1647


  groups = group1.append(group2, ignore_index= True)


In [17]:
# SciPy independent two-sample t-test (default arguments) on FR_fitted:
# private VoTech schools against every other row.
private_votech_mask = df['FundType'] == 'privateVoTech'
stats.ttest_ind(
    df.loc[private_votech_mask, 'FR_fitted'],
    df.loc[~private_votech_mask, 'FR_fitted'],
)

Ttest_indResult(statistic=3.8735117689948066, pvalue=0.0001204922746138038)

In [18]:
# One-vs-rest t-test on FR_fitted: public VoTech schools against every other row.
# rp.ttest is called with its default options.
is_public_votech = df['FundType'] == 'publicVoTech'
summary, results = rp.ttest(
    group1=df.loc[is_public_votech, 'FR_fitted'],
    group1_name="Public VoTechs",
    group2=df.loc[~is_public_votech, 'FR_fitted'],
    group2_name="All others",
)
print(summary, results, sep='\n')

         Variable      N      Mean        SD        SE  95% Conf.  Interval
0  Public VoTechs   30.0  1.921630  0.467168  0.085293   1.747187  2.096074
1      All others  510.0  1.534831  0.470169  0.020819   1.493929  1.575734
2        combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                            Independent t-test   results
0  Difference (Public VoTechs - All others) =     0.3868
1                        Degrees of freedom =   538.0000
2                                         t =     4.3806
3                     Two side test p value =     0.0000
4                    Difference < 0 p value =     1.0000
5                    Difference > 0 p value =     0.0000
6                                 Cohen's d =     0.8230
7                                 Hedge's g =     0.8218
8                            Glass's delta1 =     0.8280
9                          Point-Biserial r =     0.1856


  groups = group1.append(group2, ignore_index= True)


In [19]:
# SciPy independent two-sample t-test (default arguments) on FR_fitted:
# public VoTech schools against every other row.
public_votech_mask = df['FundType'] == 'publicVoTech'
stats.ttest_ind(
    df.loc[public_votech_mask, 'FR_fitted'],
    df.loc[~public_votech_mask, 'FR_fitted'],
)

Ttest_indResult(statistic=4.380554253224218, pvalue=1.4234096089934076e-05)

In [20]:
# One-vs-rest t-test on FR_fitted: community colleges against every other row.
# The label has no separator between 'public' and 'Community College' because
# FundType is a plain concatenation of Funding and Type.
# NOTE(review): rp.ttest is left at its defaults; if group spreads/sizes differ
# a lot, consider whether an unequal-variance (Welch) test is more appropriate.
is_community_college = df['FundType'] == 'publicCommunity College'
summary, results = rp.ttest(
    group1=df.loc[is_community_college, 'FR_fitted'],
    group1_name="Community Colleges",
    group2=df.loc[~is_community_college, 'FR_fitted'],
    group2_name="All others",
)
print(summary, results, sep='\n')

             Variable      N      Mean        SD        SE  95% Conf.  \
0  Community Colleges  140.0  1.443374  0.292775  0.024744   1.394451   
1          All others  400.0  1.595851  0.522096  0.026105   1.544531   
2            combined  540.0  1.556320  0.477872  0.020564   1.515924   

   Interval  
0  1.492297  
1  1.647172  
2  1.596716  
                                Independent t-test   results
0  Difference (Community Colleges - All others) =    -0.1525
1                            Degrees of freedom =   538.0000
2                                             t =    -3.2786
3                         Two side test p value =     0.0011
4                        Difference < 0 p value =     0.0006
5                        Difference > 0 p value =     0.9994
6                                     Cohen's d =    -0.3219
7                                     Hedge's g =    -0.3215
8                                Glass's delta1 =    -0.5208
9                              Point-Bise

  groups = group1.append(group2, ignore_index= True)


In [21]:
# SciPy independent two-sample t-test (default arguments) on FR_fitted:
# community colleges against every other row.
community_college_mask = df['FundType'] == 'publicCommunity College'
stats.ttest_ind(
    df.loc[community_college_mask, 'FR_fitted'],
    df.loc[~community_college_mask, 'FR_fitted'],
)

Ttest_indResult(statistic=-3.278565061641866, pvalue=0.0011107848158906771)