In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
import researchpy as rp
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

import seaborn as sns

# Load the fitted failure-rate table and attach a combined grouping label
# (tax status concatenated with institution type, e.g. "non-profitUniversity").
df = (
    pd.read_csv('fittedFailureRate.csv', encoding='unicode_escape')
    .assign(TaxType=lambda frame: frame['Tax_type'] + frame['Type'])
)
df.head()

  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


Unnamed: 0.1,Unnamed: 0,Webpage,Page_Type,Institution,Funding,Tax_type,Region,Type,CU,CSU,CS_Department,CCCS,Elements,Errors,FR,FR_fitted,TaxType
0,0,https://www.adams.edu/,A,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,698,1,0.001433,0.900164,non-profitUniversity
1,1,https://www.adams.edu/academics/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,729,1,0.001372,0.891194,non-profitUniversity
2,2,https://www.adams.edu/academics/graduate/couns...,D,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,507,1,0.001972,0.969114,non-profitUniversity
3,3,https://www.adams.edu/catalog/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,512,1,0.001953,0.966918,non-profitUniversity
4,4,https://www.adams.edu/faculty-staff/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,645,1,0.00155,0.916706,non-profitUniversity


In [3]:
# Mean of the FR column for every (Tax_type, Type) combination.
df.groupby(by=['Tax_type', 'Type'])['FR'].agg('mean')

Tax_type    Type             
non-profit  College              0.028544
            Community College    0.014357
            University           0.022785
            VoTech               0.049524
profit      College              0.042185
            University           0.027550
            VoTech               0.034175
Name: FR, dtype: float64

In [4]:
# Standard deviation of the FR column for every (Tax_type, Type) combination.
df.groupby(by=['Tax_type', 'Type'])['FR'].agg('std')

Tax_type    Type             
non-profit  College              0.046960
            Community College    0.015461
            University           0.027242
            VoTech               0.032664
profit      College              0.037769
            University           0.027413
            VoTech               0.029855
Name: FR, dtype: float64

In [5]:
# Variance of the FR column for every (Tax_type, Type) combination.
df.groupby(by=['Tax_type', 'Type'])['FR'].agg('var')

Tax_type    Type             
non-profit  College              0.002205
            Community College    0.000239
            University           0.000742
            VoTech               0.001067
profit      College              0.001426
            University           0.000751
            VoTech               0.000891
Name: FR, dtype: float64

In [6]:
# Number of non-null FR observations in every (Tax_type, Type) combination.
df.groupby(by=['Tax_type', 'Type'])['FR'].agg('count')

Tax_type    Type             
non-profit  College               20
            Community College    140
            University           220
            VoTech                30
profit      College               30
            University            50
            VoTech                50
Name: FR, dtype: int64

In [7]:
# Tukey HSD pairwise comparison of FR_fitted means across the combined
# TaxType groups, with the family-wise error rate set to 0.05.
print(
    pairwise_tukeyhsd(
        endog=df['FR_fitted'],
        groups=df['TaxType'],
        alpha=0.05,
    )
)

                     Multiple Comparison of Means - Tukey HSD, FWER=0.05                     
           group1                      group2           meandiff p-adj   lower  upper  reject
---------------------------------------------------------------------------------------------
          non-profitCollege non-profitCommunity College  -0.2099  0.471 -0.5339 0.1141  False
          non-profitCollege        non-profitUniversity  -0.1767 0.6283 -0.4932 0.1399  False
          non-profitCollege            non-profitVoTech   0.2683 0.3988 -0.1229 0.6596  False
          non-profitCollege               profitCollege   0.0447    0.9 -0.3465 0.4359  False
          non-profitCollege            profitUniversity  -0.0191    0.9 -0.3777 0.3395  False
          non-profitCollege                profitVoTech   0.1492 0.8746 -0.2094 0.5078  False
non-profitCommunity College        non-profitUniversity   0.0332    0.9 -0.1133 0.1798  False
non-profitCommunity College            non-profitVoTech   0.

In [8]:
# One-vs-rest independent t-test on FR_fitted:
# non-profit colleges against every other TaxType group.
summary, results = rp.ttest(
    group1=df.loc[df['TaxType'] == 'non-profitCollege', 'FR_fitted'],
    group1_name="Non-profit Colleges",
    group2=df.loc[df['TaxType'] != 'non-profitCollege', 'FR_fitted'],
    group2_name="All others",
)
print(summary)
print(results)

              Variable      N      Mean        SD        SE  95% Conf.  \
0  Non-profit Colleges   20.0  1.653281  0.366627  0.081980   1.481695   
1           All others  520.0  1.552591  0.481524  0.021116   1.511107   
2             combined  540.0  1.556320  0.477872  0.020564   1.515924   

   Interval  
0  1.824868  
1  1.594075  
2  1.596716  
                                 Independent t-test   results
0  Difference (Non-profit Colleges - All others) =     0.1007
1                             Degrees of freedom =   538.0000
2                                              t =     0.9246
3                          Two side test p value =     0.3556
4                         Difference < 0 p value =     0.8222
5                         Difference > 0 p value =     0.1778
6                                      Cohen's d =     0.2107
7                                      Hedge's g =     0.2104
8                                 Glass's delta1 =     0.2746
9                          

  groups = group1.append(group2, ignore_index= True)


In [10]:
# One-vs-rest independent t-test on FR_fitted:
# for-profit colleges against every other TaxType group.
summary, results = rp.ttest(
    group1=df.loc[df['TaxType'] == 'profitCollege', 'FR_fitted'],
    group1_name="Profit Colleges",
    group2=df.loc[df['TaxType'] != 'profitCollege', 'FR_fitted'],
    group2_name="All others",
)
print(summary)
print(results)

          Variable      N      Mean        SD        SE  95% Conf.  Interval
0  Profit Colleges   30.0  1.697981  0.670591  0.122433   1.447578  1.948384
1       All others  510.0  1.547987  0.463625  0.020530   1.507654  1.588321
2         combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                             Independent t-test   results
0  Difference (Profit Colleges - All others) =     0.1500
1                         Degrees of freedom =   538.0000
2                                          t =     1.6735
3                      Two side test p value =     0.0948
4                     Difference < 0 p value =     0.9526
5                     Difference > 0 p value =     0.0474
6                                  Cohen's d =     0.3144
7                                  Hedge's g =     0.3140
8                             Glass's delta1 =     0.2237
9                           Point-Biserial r =     0.0720


  groups = group1.append(group2, ignore_index= True)


In [11]:
# One-vs-rest independent t-test on FR_fitted:
# non-profit universities against every other TaxType group.
summary, results = rp.ttest(
    group1=df.loc[df['TaxType'] == 'non-profitUniversity', 'FR_fitted'],
    group1_name="Non-profit Universities",
    group2=df.loc[df['TaxType'] != 'non-profitUniversity', 'FR_fitted'],
    group2_name="All others",
)
print(summary)
print(results)

                  Variable      N      Mean        SD        SE  95% Conf.  \
0  Non-profit Universities  220.0  1.476597  0.541820  0.036530   1.404603   
1               All others  320.0  1.611130  0.420612  0.023513   1.564870   
2                 combined  540.0  1.556320  0.477872  0.020564   1.515924   

   Interval  
0  1.548592  
1  1.657390  
2  1.596716  
                                  Independent t-test   results
0  Difference (Non-profit Universities - All othe...   -0.1345
1                              Degrees of freedom =   538.0000
2                                               t =    -3.2427
3                           Two side test p value =     0.0013
4                          Difference < 0 p value =     0.0006
5                          Difference > 0 p value =     0.9994
6                                       Cohen's d =    -0.2840
7                                       Hedge's g =    -0.2836
8                                  Glass's delta1 =    -0.2483
9

  groups = group1.append(group2, ignore_index= True)


In [12]:
# Same non-profit-university vs. rest comparison through SciPy, which
# reports the t statistic and p-value at full precision.
stats.ttest_ind(
    df.loc[df['TaxType'] == 'non-profitUniversity', 'FR_fitted'],
    df.loc[df['TaxType'] != 'non-profitUniversity', 'FR_fitted'],
)

Ttest_indResult(statistic=-3.2426876444392034, pvalue=0.0012574750402337272)

In [13]:
# One-vs-rest independent t-test on FR_fitted:
# for-profit universities against every other TaxType group.
summary, results = rp.ttest(
    group1=df.loc[df['TaxType'] == 'profitUniversity', 'FR_fitted'],
    group1_name="Profit Universities",
    group2=df.loc[df['TaxType'] != 'profitUniversity', 'FR_fitted'],
    group2_name="All others",
)
print(summary)
print(results)

              Variable      N      Mean        SD        SE  95% Conf.  \
0  Profit Universities   50.0  1.634199  0.425596  0.060188   1.513246   
1           All others  490.0  1.548373  0.482574  0.021800   1.505539   
2             combined  540.0  1.556320  0.477872  0.020564   1.515924   

   Interval  
0  1.755152  
1  1.591208  
2  1.596716  
                                 Independent t-test   results
0  Difference (Profit Universities - All others) =     0.0858
1                             Degrees of freedom =   538.0000
2                                              t =     1.2103
3                          Two side test p value =     0.2267
4                         Difference < 0 p value =     0.8866
5                         Difference > 0 p value =     0.1134
6                                      Cohen's d =     0.1797
7                                      Hedge's g =     0.1794
8                                 Glass's delta1 =     0.2017
9                          

  groups = group1.append(group2, ignore_index= True)


In [14]:
# One-vs-rest independent t-test on FR_fitted:
# non-profit VoTechs against every other TaxType group.
summary, results = rp.ttest(
    group1=df.loc[df['TaxType'] == 'non-profitVoTech', 'FR_fitted'],
    group1_name="Non-profit VoTechs",
    group2=df.loc[df['TaxType'] != 'non-profitVoTech', 'FR_fitted'],
    group2_name="All others",
)
print(summary)
print(results)

             Variable      N      Mean        SD        SE  95% Conf.  \
0  Non-profit VoTechs   30.0  1.921630  0.467168  0.085293   1.747187   
1          All others  510.0  1.534831  0.470169  0.020819   1.493929   
2            combined  540.0  1.556320  0.477872  0.020564   1.515924   

   Interval  
0  2.096074  
1  1.575734  
2  1.596716  
                                Independent t-test   results
0  Difference (Non-profit VoTechs - All others) =     0.3868
1                            Degrees of freedom =   538.0000
2                                             t =     4.3806
3                         Two side test p value =     0.0000
4                        Difference < 0 p value =     1.0000
5                        Difference > 0 p value =     0.0000
6                                     Cohen's d =     0.8230
7                                     Hedge's g =     0.8218
8                                Glass's delta1 =     0.8280
9                              Point-Bise

  groups = group1.append(group2, ignore_index= True)


In [16]:
# Same non-profit-VoTech vs. rest comparison through SciPy, which
# reports the t statistic and p-value at full precision.
stats.ttest_ind(
    df.loc[df['TaxType'] == 'non-profitVoTech', 'FR_fitted'],
    df.loc[df['TaxType'] != 'non-profitVoTech', 'FR_fitted'],
)

Ttest_indResult(statistic=4.380554253224218, pvalue=1.4234096089934076e-05)

In [15]:
# One-vs-rest independent t-test on FR_fitted:
# for-profit VoTechs against every other TaxType group.
summary, results = rp.ttest(
    group1=df.loc[df['TaxType'] == 'profitVoTech', 'FR_fitted'],
    group1_name="Profit VoTechs",
    group2=df.loc[df['TaxType'] != 'profitVoTech', 'FR_fitted'],
    group2_name="All others",
)
print(summary)
print(results)

         Variable      N      Mean        SD        SE  95% Conf.  Interval
0  Profit VoTechs   50.0  1.802506  0.311993  0.044122   1.713839  1.891174
1      All others  490.0  1.531199  0.484900  0.021906   1.488159  1.574240
2        combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                            Independent t-test   results
0  Difference (Profit VoTechs - All others) =     0.2713
1                        Degrees of freedom =   538.0000
2                                         t =     3.8735
3                     Two side test p value =     0.0001
4                    Difference < 0 p value =     0.9999
5                    Difference > 0 p value =     0.0001
6                                 Cohen's d =     0.5751
7                                 Hedge's g =     0.5743
8                            Glass's delta1 =     0.8696
9                          Point-Biserial r =     0.1647


  groups = group1.append(group2, ignore_index= True)


In [17]:
# Same for-profit-VoTech vs. rest comparison through SciPy, which
# reports the t statistic and p-value at full precision.
stats.ttest_ind(
    df.loc[df['TaxType'] == 'profitVoTech', 'FR_fitted'],
    df.loc[df['TaxType'] != 'profitVoTech', 'FR_fitted'],
)

Ttest_indResult(statistic=3.8735117689948066, pvalue=0.0001204922746138038)

In [18]:
# One-vs-rest independent t-test on FR_fitted:
# community colleges against every other TaxType group.
summary, results = rp.ttest(
    group1=df.loc[df['TaxType'] == 'non-profitCommunity College', 'FR_fitted'],
    group1_name="Community Colleges",
    group2=df.loc[df['TaxType'] != 'non-profitCommunity College', 'FR_fitted'],
    group2_name="All others",
)
print(summary)
print(results)

             Variable      N      Mean        SD        SE  95% Conf.  \
0  Community Colleges  140.0  1.443374  0.292775  0.024744   1.394451   
1          All others  400.0  1.595851  0.522096  0.026105   1.544531   
2            combined  540.0  1.556320  0.477872  0.020564   1.515924   

   Interval  
0  1.492297  
1  1.647172  
2  1.596716  
                                Independent t-test   results
0  Difference (Community Colleges - All others) =    -0.1525
1                            Degrees of freedom =   538.0000
2                                             t =    -3.2786
3                         Two side test p value =     0.0011
4                        Difference < 0 p value =     0.0006
5                        Difference > 0 p value =     0.9994
6                                     Cohen's d =    -0.3219
7                                     Hedge's g =    -0.3215
8                                Glass's delta1 =    -0.5208
9                              Point-Bise

  groups = group1.append(group2, ignore_index= True)


In [19]:
# Same community-college vs. rest comparison through SciPy, which
# reports the t statistic and p-value at full precision.
stats.ttest_ind(
    df.loc[df['TaxType'] == 'non-profitCommunity College', 'FR_fitted'],
    df.loc[df['TaxType'] != 'non-profitCommunity College', 'FR_fitted'],
)

Ttest_indResult(statistic=-3.278565061641866, pvalue=0.0011107848158906771)