In [81]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
import researchpy as rp
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Load the failure-rate table and add a combined Funding+Region label
# (plain string concatenation, e.g. 'publicSouthwest') as a single grouping key.
df = (
    pd.read_csv('fittedFailureRate.csv', encoding='unicode_escape')
      .assign(FundRegion=lambda frame: frame['Funding'] + frame['Region'])
)
df.head()

Unnamed: 0.1,Unnamed: 0,Webpage,Page_Type,Institution,Funding,Tax_type,Region,Type,CU,CSU,CS_Department,CCCS,Elements,Errors,FR,FR_fitted,FundRegion
0,0,https://www.adams.edu/,A,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,698,1,0.001433,0.900164,publicSouthwest
1,1,https://www.adams.edu/academics/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,729,1,0.001372,0.891194,publicSouthwest
2,2,https://www.adams.edu/academics/graduate/couns...,D,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,507,1,0.001972,0.969114,publicSouthwest
3,3,https://www.adams.edu/catalog/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,512,1,0.001953,0.966918,publicSouthwest
4,4,https://www.adams.edu/faculty-staff/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,645,1,0.00155,0.916706,publicSouthwest


In [76]:
# Mean raw failure rate (FR) for each Funding x Region combination.
df.groupby(['Funding', 'Region'])['FR'].agg('mean')

Funding  Region       
private  Metro            0.035815
         Online           0.009687
         Pikes Peak       0.012795
public   Metro            0.027314
         North Central    0.019011
         Northeast        0.010455
         Northwest        0.018777
         Online           0.019680
         Pikes Peak       0.010626
         Southeast        0.006828
         Southwest        0.004244
         West Central     0.028983
Name: FR, dtype: float64

In [77]:
# Sample standard deviation of FR for each Funding x Region combination.
df.groupby(['Funding', 'Region'])['FR'].agg('std')

Funding  Region       
private  Metro            0.033675
         Online           0.023627
         Pikes Peak       0.012959
public   Metro            0.028074
         North Central    0.024046
         Northeast        0.004513
         Northwest        0.010162
         Online           0.013510
         Pikes Peak       0.007132
         Southeast        0.007874
         Southwest        0.003999
         West Central     0.033075
Name: FR, dtype: float64

In [78]:
# Sample variance of FR for each Funding x Region combination.
df.groupby(['Funding', 'Region'])['FR'].agg('var')

Funding  Region       
private  Metro            0.001134
         Online           0.000558
         Pikes Peak       0.000168
public   Metro            0.000788
         North Central    0.000578
         Northeast        0.000020
         Northwest        0.000103
         Online           0.000183
         Pikes Peak       0.000051
         Southeast        0.000062
         Southwest        0.000016
         West Central     0.001094
Name: FR, dtype: float64

In [79]:
# Number of non-null FR observations in each Funding x Region combination.
df.groupby(['Funding', 'Region'])['FR'].agg('count')

Funding  Region       
private  Metro            190
         Online            10
         Pikes Peak        20
public   Metro            120
         North Central     40
         Northeast         10
         Northwest         20
         Online            10
         Pikes Peak        40
         Southeast         20
         Southwest         30
         West Central      30
Name: FR, dtype: int64

In [82]:
# Tukey HSD pairwise comparison of FR_fitted across the combined
# Funding+Region groups, at a 5% family-wise error rate.
print(
    pairwise_tukeyhsd(
        endog=df['FR_fitted'],
        groups=df['FundRegion'],
        alpha=0.05,
    ).summary()
)

             Multiple Comparison of Means - Tukey HSD, FWER=0.05              
       group1              group2       meandiff p-adj   lower   upper  reject
------------------------------------------------------------------------------
       privateMetro       privateOnline  -0.6309  0.001 -1.0955 -0.1663   True
       privateMetro   privatePikes Peak  -0.4964  0.001 -0.8331 -0.1598   True
       privateMetro         publicMetro  -0.1523 0.1138 -0.3192  0.0147  False
       privateMetro publicNorth Central  -0.2424 0.0653 -0.4915  0.0067  False
       privateMetro     publicNortheast   -0.348 0.3706 -0.8126  0.1166  False
       privateMetro     publicNorthwest  -0.1544    0.9  -0.491  0.1823  False
       privateMetro        publicOnline  -0.3739 0.2593 -0.8385  0.0907  False
       privateMetro    publicPikes Peak  -0.3922  0.001 -0.6413  -0.143   True
       privateMetro     publicSoutheast  -0.5492  0.001 -0.8858 -0.2125   True
       privateMetro     publicSouthwest  -0.6678  0.

In [69]:
# Independent two-sample t-test (researchpy) on FR_fitted:
# private universities vs. public universities.
is_university = df['Type'] == 'University'
private_univ_fr = df.loc[(df['Funding'] == 'private') & is_university, 'FR_fitted']
public_univ_fr = df.loc[(df['Funding'] == 'public') & is_university, 'FR_fitted']

# group2 is selected with Funding == 'public', so it is labelled as the
# public-university sample in the printed tables.
summary, results = rp.ttest(group1=private_univ_fr, group1_name="Private universities",
                            group2=public_univ_fr, group2_name="Public universities")
print(summary)
print(results)

               Variable      N      Mean        SD        SE  95% Conf.  \
0  Private universities  130.0  1.615722  0.523212  0.045889   1.524930   
1    Private all others  140.0  1.403696  0.507745  0.042912   1.318851   
2              combined  270.0  1.505783  0.525127  0.031958   1.442863   

   Interval  
0  1.706515  
1  1.488541  
2  1.568703  
                                  Independent t-test   results
0  Difference (Private universities - Private all...    0.2120
1                              Degrees of freedom =   268.0000
2                                               t =     3.3785
3                           Two side test p value =     0.0008
4                          Difference < 0 p value =     0.9996
5                          Difference > 0 p value =     0.0004
6                                       Cohen's d =     0.4115
7                                       Hedge's g =     0.4104
8                                  Glass's delta1 =     0.4052
9            

  groups = group1.append(group2, ignore_index= True)


In [70]:
# SciPy independent two-sample t-test (default settings) on FR_fitted:
# private universities vs. public universities.
stats.ttest_ind(
    df.loc[(df['Funding'] == 'private') & (df['Type'] == 'University'), 'FR_fitted'],
    df.loc[(df['Funding'] == 'public') & (df['Type'] == 'University'), 'FR_fitted'],
)

Ttest_indResult(statistic=3.378532509460123, pvalue=0.0008370034613914972)