In [15]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
import researchpy as rp
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Read in csv
df = pd.read_csv('fittedFailureRate.csv', encoding='unicode_escape')
df.head()

Unnamed: 0.1,Unnamed: 0,Webpage,Page_Type,Institution,Funding,Tax_type,Region,Type,CU,CSU,CS_Department,CCCS,Elements,Errors,FR,FR_fitted
0,0,https://www.adams.edu/,A,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,698,1,0.001433,0.900164
1,1,https://www.adams.edu/academics/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,729,1,0.001372,0.891194
2,2,https://www.adams.edu/academics/graduate/couns...,D,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,507,1,0.001972,0.969114
3,3,https://www.adams.edu/catalog/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,512,1,0.001953,0.966918
4,4,https://www.adams.edu/faculty-staff/,C,Adams State University,public,non-profit,Southwest,University,no,no,yes,no,645,1,0.00155,0.916706


In [3]:
df.groupby('Page_Type')['FR'].mean()

Page_Type
A    0.023727
B    0.018563
C    0.026805
D    0.020625
Name: FR, dtype: float64

In [4]:
df.groupby('Page_Type')['FR'].std()

Page_Type
A    0.021291
B    0.028758
C    0.030352
D    0.024858
Name: FR, dtype: float64

In [5]:
df.groupby('Page_Type')['FR'].var()

Page_Type
A    0.000453
B    0.000827
C    0.000921
D    0.000618
Name: FR, dtype: float64

In [6]:
df.groupby('Page_Type')['FR'].count()

Page_Type
A     54
B     27
C    353
D    106
Name: FR, dtype: int64

In [16]:
print(pairwise_tukeyhsd(endog=df['FR_fitted'], groups=df['Page_Type'], alpha=0.05))

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     A      B    -0.17 0.4267 -0.4578 0.1179  False
     A      C   0.0251    0.9 -0.1534 0.2036  False
     A      D  -0.1331 0.3354 -0.3373 0.0711  False
     B      C   0.1951 0.1674 -0.0488  0.439  False
     B      D   0.0369    0.9 -0.2264 0.3001  False
     C      D  -0.1582 0.0143 -0.2935 -0.023   True
---------------------------------------------------


In [7]:
summary, results = rp.ttest(group1=df['FR_fitted'][df['Page_Type'] == 'A'], group1_name="Home Page",
                            group2=df['FR_fitted'][df['Page_Type'] != 'A'], group2_name="All others")
print(summary)
print(results)

# No correlation

     Variable      N      Mean        SD        SE  95% Conf.  Interval
0   Home Page   54.0  1.574534  0.444771  0.060526   1.453135  1.695934
1  All others  486.0  1.554296  0.481798  0.021855   1.511355  1.597238
2    combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                       Independent t-test   results
0  Difference (Home Page - All others) =     0.0202
1                   Degrees of freedom =   538.0000
2                                    t =     0.2950
3                Two side test p value =     0.7681
4               Difference < 0 p value =     0.6159
5               Difference > 0 p value =     0.3841
6                            Cohen's d =     0.0423
7                            Hedge's g =     0.0423
8                       Glass's delta1 =     0.0455
9                     Point-Biserial r =     0.0127


  groups = group1.append(group2, ignore_index= True)


In [8]:
stats.ttest_ind(df['FR_fitted'][df['Page_Type'] == 'A'],
                df['FR_fitted'][df['Page_Type'] != 'A'])

Ttest_indResult(statistic=0.294988824630058, pvalue=0.7681162151607215)

In [9]:
summary, results = rp.ttest(group1=df['FR_fitted'][df['Page_Type'] == 'B'], group1_name="Subdomains",
                            group2=df['FR_fitted'][df['Page_Type'] != 'B'], group2_name="All others")
print(summary)
print(results)

# No correlation

     Variable      N      Mean        SD        SE  95% Conf.  Interval
0  Subdomains   27.0  1.404565  0.499758  0.096178   1.206867  1.602262
1  All others  513.0  1.564307  0.475860  0.021010   1.523031  1.605583
2    combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                        Independent t-test   results
0  Difference (Subdomains - All others) =    -0.1597
1                    Degrees of freedom =   538.0000
2                                     t =    -1.6959
3                 Two side test p value =     0.0905
4                Difference < 0 p value =     0.0452
5                Difference > 0 p value =     0.9548
6                             Cohen's d =    -0.3349
7                             Hedge's g =    -0.3344
8                        Glass's delta1 =    -0.3196
9                      Point-Biserial r =    -0.0729


  groups = group1.append(group2, ignore_index= True)


In [10]:
stats.ttest_ind(df['FR_fitted'][df['Page_Type'] == 'B'],
                df['FR_fitted'][df['Page_Type'] != 'B'])

Ttest_indResult(statistic=-1.6959254464732028, pvalue=0.09047852339254574)

In [11]:
summary, results = rp.ttest(group1=df['FR_fitted'][df['Page_Type'] == 'C'], group1_name="One Directory Down",
                            group2=df['FR_fitted'][df['Page_Type'] != 'C'], group2_name="All others")
print(summary)
print(results)

# Correlation
# > small effect

             Variable      N      Mean        SD        SE  95% Conf.  \
0  One Directory Down  353.0  1.599645  0.462232  0.024602   1.551259   
1          All others  187.0  1.474536  0.497119  0.036353   1.402819   
2            combined  540.0  1.556320  0.477872  0.020564   1.515924   

   Interval  
0  1.648031  
1  1.546253  
2  1.596716  
                                Independent t-test   results
0  Difference (One Directory Down - All others) =     0.1251
1                            Degrees of freedom =   538.0000
2                                             t =     2.9147
3                         Two side test p value =     0.0037
4                        Difference < 0 p value =     0.9981
5                        Difference > 0 p value =     0.0019
6                                     Cohen's d =     0.2636
7                                     Hedge's g =     0.2633
8                                Glass's delta1 =     0.2707
9                              Point-Bise

  groups = group1.append(group2, ignore_index= True)


In [12]:
stats.ttest_ind(df['FR_fitted'][df['Page_Type'] == 'C'],
                df['FR_fitted'][df['Page_Type'] != 'C'])

Ttest_indResult(statistic=2.914652126588466, pvalue=0.003709108742225079)

In [13]:
summary, results = rp.ttest(group1=df['FR_fitted'][df['Page_Type'] != 'D'], group1_name="All others",
                            group2=df['FR_fitted'][df['Page_Type'] == 'D'], group2_name="Two+ Directories")
print(summary)
print(results)

# Correlation 
# < medium effect

           Variable      N      Mean        SD        SE  95% Conf.  Interval
0        All others  434.0  1.584384  0.463809  0.022264   1.540626  1.628142
1  Two+ Directories  106.0  1.441416  0.518230  0.050335   1.341611  1.541221
2          combined  540.0  1.556320  0.477872  0.020564   1.515924  1.596716
                              Independent t-test   results
0  Difference (All others - Two+ Directories) =     0.1430
1                          Degrees of freedom =   538.0000
2                                           t =     2.7786
3                       Two side test p value =     0.0057
4                      Difference < 0 p value =     0.9972
5                      Difference > 0 p value =     0.0028
6                                   Cohen's d =     0.3010
7                                   Hedge's g =     0.3006
8                              Glass's delta1 =     0.3082
9                            Point-Biserial r =     0.1189


  groups = group1.append(group2, ignore_index= True)


In [14]:
stats.ttest_ind(df['FR_fitted'][df['Page_Type'] == 'D'],
                df['FR_fitted'][df['Page_Type'] != 'D'])

Ttest_indResult(statistic=-2.7785535247873545, pvalue=0.005650929497092892)