In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
import researchpy as rp
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Load the fitted failure-rate table from disk and preview its first rows
df = pd.read_csv(
    'fittedFailureRate.csv',
    encoding='unicode_escape',
)
df.head()

  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


Unnamed: 0.1,Unnamed: 0,Webpage,Page_Type,Institution,Funding,Tax_type,Region,Type,CU,CSU,CS_Department,CCCS,Elements,Errors,FR,FR_fitted
0,0,https://www.adams.edu/,A,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,698,1,0.001433,0.900164
1,1,https://www.adams.edu/academics/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,729,1,0.001372,0.891194
2,2,https://www.adams.edu/academics/graduate/couns...,D,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,507,1,0.001972,0.969114
3,3,https://www.adams.edu/catalog/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,512,1,0.001953,0.966918
4,4,https://www.adams.edu/faculty-staff/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,645,1,0.00155,0.916706


In [5]:
# Mean of the FR column within each institution Type
(
    df
    .groupby(by='Type')['FR']
    .mean()
)

Type
College              0.036728
Community College    0.014357
University           0.023667
VoTech               0.039931
Name: FR, dtype: float64

In [6]:
# Sample standard deviation of the FR column within each institution Type
(
    df
    .groupby(by='Type')['FR']
    .std()
)

Type
College              0.041772
Community College    0.015461
University           0.027286
VoTech               0.031629
Name: FR, dtype: float64

In [7]:
# Sample variance of the FR column within each institution Type
(
    df
    .groupby(by='Type')['FR']
    .var()
)

Type
College              0.001745
Community College    0.000239
University           0.000745
VoTech               0.001000
Name: FR, dtype: float64

In [8]:
# Number of non-null FR observations for each institution Type
(
    df
    .groupby(by='Type')['FR']
    .count()
)

Type
College               50
Community College    140
University           270
VoTech                80
Name: FR, dtype: int64

In [2]:
# One-way ANOVA: fit FR_fitted against institution Type (as a categorical factor)
# and display the type-1 ANOVA table for the fitted model.
model = ols(
    "FR_fitted ~ C(Type)",
    data=df,
).fit()
sm.stats.anova_lm(model, typ=1)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(Type),3.0,10.00948,3.336493,15.815344,7.228634e-10
Residual,536.0,113.077555,0.210966,,


In [3]:
# Tukey HSD pairwise comparison of mean FR_fitted between institution Types
print(
    pairwise_tukeyhsd(
        endog=df['FR_fitted'],
        groups=df['Type'],
        alpha=0.05,
    )
)

           Multiple Comparison of Means - Tukey HSD, FWER=0.05            
      group1            group2      meandiff p-adj   lower   upper  reject
--------------------------------------------------------------------------
          College Community College  -0.2367   0.01 -0.4317 -0.0417   True
          College        University  -0.1743 0.0668 -0.3566  0.0079  False
          College            VoTech   0.1671 0.1826 -0.0463  0.3805  False
Community College        University   0.0624 0.5533 -0.0609  0.1857  False
Community College            VoTech   0.4038  0.001  0.2379  0.5697   True
       University            VoTech   0.3414  0.001  0.1907  0.4921   True
--------------------------------------------------------------------------


In [3]:
# Independent two-sample t-test on FR_fitted: universities vs. every other institution type.
# Prints the per-group descriptive summary followed by the test statistics / effect sizes.
summary, results = rp.ttest(
    group1=df.loc[df['Type'] != 'University', 'FR_fitted'],
    group1_name="All others",
    group2=df.loc[df['Type'] == 'University', 'FR_fitted'],
    group2_name="Universities",
)
print(summary)
print(results)
# Slight correlation, universities are less likely to have errors than all other institution types
# < small effect

       Variable      N      Mean        SD        SE  95% Conf.  Interval
0    All others  270.0  1.606858  0.420341  0.025581   1.556493  1.657223
1  Universities  270.0  1.505783  0.525127  0.031958   1.442863  1.568703
2      combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                          Independent t-test   results
0  Difference (All others - Universities) =     0.1011
1                      Degrees of freedom =   538.0000
2                                       t =     2.4691
3                   Two side test p value =     0.0139
4                  Difference < 0 p value =     0.9931
5                  Difference > 0 p value =     0.0069
6                               Cohen's d =     0.2125
7                               Hedge's g =     0.2122
8                          Glass's delta1 =     0.2405
9                        Point-Biserial r =     0.1059


  groups = group1.append(group2, ignore_index= True)


In [4]:
# Cross-check with SciPy: same two groups (all others vs. universities), same order
stats.ttest_ind(
    df.loc[df['Type'] != 'University', 'FR_fitted'],
    df.loc[df['Type'] == 'University', 'FR_fitted'],
)

Ttest_indResult(statistic=2.46912486180508, pvalue=0.013853889880621603)

In [10]:
# Independent two-sample t-test on FR_fitted: colleges vs. every other institution type.
# Prints the per-group descriptive summary followed by the test statistics / effect sizes.
summary, results = rp.ttest(
    group1=df.loc[df['Type'] == 'College', 'FR_fitted'],
    group1_name="Colleges",
    group2=df.loc[df['Type'] != 'College', 'FR_fitted'],
    group2_name="All others",
)
print(summary)
print(results)
# No correlation

     Variable      N      Mean        SD        SE  95% Conf.  Interval
0    Colleges   50.0  1.680101  0.564583  0.079844   1.519649  1.840554
1  All others  490.0  1.543690  0.466953  0.021095   1.502242  1.585137
2    combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                      Independent t-test   results
0  Difference (Colleges - All others) =     0.1364
1                  Degrees of freedom =   538.0000
2                                   t =     1.9276
3               Two side test p value =     0.0544
4              Difference < 0 p value =     0.9728
5              Difference > 0 p value =     0.0272
6                           Cohen's d =     0.2862
7                           Hedge's g =     0.2858
8                      Glass's delta1 =     0.2416
9                    Point-Biserial r =     0.0828


  groups = group1.append(group2, ignore_index= True)


In [11]:
# Cross-check with SciPy: same two groups (colleges vs. all others), same order
stats.ttest_ind(
    df.loc[df['Type'] == 'College', 'FR_fitted'],
    df.loc[df['Type'] != 'College', 'FR_fitted'],
)

Ttest_indResult(statistic=1.9276002500374345, pvalue=0.05443096551306783)

In [13]:
# Independent two-sample t-test on FR_fitted: community colleges vs. every other institution type.
# Prints the per-group descriptive summary followed by the test statistics / effect sizes.
summary, results = rp.ttest(
    group1=df.loc[df['Type'] != 'Community College', 'FR_fitted'],
    group1_name="All others",
    group2=df.loc[df['Type'] == 'Community College', 'FR_fitted'],
    group2_name="Community Colleges",
)
print(summary)
print(results)
# Correlation
# > small effect

             Variable      N      Mean        SD        SE  95% Conf.  \
0          All others  400.0  1.595851  0.522096  0.026105   1.544531   
1  Community Colleges  140.0  1.443374  0.292775  0.024744   1.394451   
2            combined  540.0  1.556320  0.477872  0.020564   1.515924   

   Interval  
0  1.647172  
1  1.492297  
2  1.596716  
                                Independent t-test   results
0  Difference (All others - Community Colleges) =     0.1525
1                            Degrees of freedom =   538.0000
2                                             t =     3.2786
3                         Two side test p value =     0.0011
4                        Difference < 0 p value =     0.9994
5                        Difference > 0 p value =     0.0006
6                                     Cohen's d =     0.3219
7                                     Hedge's g =     0.3215
8                                Glass's delta1 =     0.2920
9                              Point-Bise

  groups = group1.append(group2, ignore_index= True)


In [17]:
# Cross-check with SciPy: same two groups (all others vs. community colleges), same order
stats.ttest_ind(
    df.loc[df['Type'] != 'Community College', 'FR_fitted'],
    df.loc[df['Type'] == 'Community College', 'FR_fitted'],
)

Ttest_indResult(statistic=3.278565061641866, pvalue=0.0011107848158906771)

In [15]:
# Independent two-sample t-test on FR_fitted: VoTech institutions vs. every other institution type.
# Prints the per-group descriptive summary followed by the test statistics / effect sizes.
summary, results = rp.ttest(
    group1=df.loc[df['Type'] == 'VoTech', 'FR_fitted'],
    group1_name="VoTechs",
    group2=df.loc[df['Type'] != 'VoTech', 'FR_fitted'],
    group2_name="All others",
)
print(summary)
print(results)
# Correlation
# ~ large effect

     Variable      N      Mean        SD        SE  95% Conf.  Interval
0     VoTechs   80.0  1.847178  0.379287  0.042406   1.762771  1.931584
1  All others  460.0  1.505736  0.475494  0.022170   1.462169  1.549304
2    combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                     Independent t-test   results
0  Difference (VoTechs - All others) =     0.3414
1                 Degrees of freedom =   538.0000
2                                  t =     6.0928
3              Two side test p value =     0.0000
4             Difference < 0 p value =     1.0000
5             Difference > 0 p value =     0.0000
6                          Cohen's d =     0.7381
7                          Hedge's g =     0.7370
8                     Glass's delta1 =     0.9002
9                   Point-Biserial r =     0.2541


  groups = group1.append(group2, ignore_index= True)


In [16]:
# Cross-check with SciPy: same two groups (VoTechs vs. all others), same order
stats.ttest_ind(
    df.loc[df['Type'] == 'VoTech', 'FR_fitted'],
    df.loc[df['Type'] != 'VoTech', 'FR_fitted'],
)

Ttest_indResult(statistic=6.0927997212221765, pvalue=2.1154441993096776e-09)