# Statistical testing

In [1]:
import pandas as pd
import numpy as np

import matplotlib as plt
import matplotlib.pyplot as plt

import seaborn as sns

%matplotlib inline

df = pd.read_csv('processed_data/metrics_iterations_with_zones.csv')
df.columns

Index(['project', 'sprint.name', 'issues', 'velocity', 'work_capacity',
       'sprint.startDate', 'sprint.endDate', 'storypoints', 'sprint_length',
       'focus_factor', 'fields.assignee.name', 'old_devs', 'old_devs_abs',
       'new_devs_abs', 'turnover_abs', 'current_devs_abs', 'turnover',
       'new_devs', 'sprint.nbr', 'TSI', 'TSI_inv', 'individual_velocity',
       'sprint.key', 'sigma_velocity', 'sigma_focusfactor', 'zone_velocity',
       'zone_focusfactor'],
      dtype='object')

# First assumption. 

There is homogeneity of variances. This means that the population variances in each group are equal. If you use SPSS Statistics, Levene's Test for Homogeneity of Variances is included in the output when you run a one-way ANOVA in SPSS Statistics (see our One-way ANOVA using SPSS Statistics guide).

In [2]:
import scipy.stats as stats
import researchpy as rp
import statsmodels.api as sm
from statsmodels.formula.api import ols

# 1 assumption. There is homogeneity of variances. This means that the population variances in each group are equal. If you use SPSS Statistics, Levene's Test for Homogeneity of Variances is included in the output when you run a one-way ANOVA in SPSS Statistics (see our One-way ANOVA using SPSS Statistics guide).

table = pd.DataFrame()
for var in ['zone_velocity', 'zone_focusfactor']:
        for factor in ['turnover', 'new_devs', 'TSI_inv', 'sprint_length']:
            
            f = stats.levene(df[df[var] == 'D+'][factor],
                           df[df[var] == 'D-'][factor],
                           df[df[var] == 'N'][factor])
            #print(var, factor, f)

            if f[1] > 0.1:
                #print('Levene’s test for homogeneity of variance is not significant (at p>.1) which indicates that the groups have not equal variances.')
                txt = 'Different variances'
            else:
                #print('Levene’s test for homogeneity of variance is significant (at p<.1) which indicates that the groups have equal variances.')
                txt = 'Equal variances'
                
            d = { 'factor' : var,
            'dependent_var' : factor, 
            'statistic' : f[0],
            'p' : f[1],
            'r' : txt}
            
            table = table.append(d, ignore_index=True)
            
table

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


Unnamed: 0,dependent_var,factor,p,r,statistic
0,turnover,zone_velocity,0.031,Equal variances,3.50537
1,new_devs,zone_velocity,0.416763,Different variances,0.877236
2,TSI_inv,zone_velocity,0.551361,Different variances,0.596289
3,sprint_length,zone_velocity,0.067445,Equal variances,2.715464
4,turnover,zone_focusfactor,0.001635,Equal variances,6.524585
5,new_devs,zone_focusfactor,0.016595,Equal variances,4.142707
6,TSI_inv,zone_focusfactor,0.432044,Different variances,0.841065
7,sprint_length,zone_focusfactor,0.063542,Equal variances,2.775924


In [3]:
dr = { 'turnover' : 'Turnover (leavers)',
  'new_devs' : 'Turnover (newcomers)',
  'TSI_inv'  : '$TSI^{-1}$',
  'sprint_length' : 'Iteration length',
  'zone_velocity' : 'Zones of velocity',
  'zone_focusfactor' : 'Zones of focus factor'}

In [4]:
print(table[['factor', 'dependent_var', 'statistic','p', 'r']].replace(d).round(3).to_latex(index=False))

\begin{tabular}{llrrl}
\toprule
           factor &  dependent\_var &  statistic &      p &                    r \\
\midrule
    zone\_velocity &       turnover &      3.505 &  0.031 &      Equal variances \\
    zone\_velocity &       new\_devs &      0.877 &  0.417 &  Different variances \\
    zone\_velocity &        TSI\_inv &      0.596 &  0.551 &  Different variances \\
    zone\_velocity &  sprint\_length &      2.715 &  0.067 &      Equal variances \\
 zone\_focusfactor &       turnover &      6.525 &  0.002 &      Equal variances \\
 zone\_focusfactor &       new\_devs &      4.143 &  0.017 &      Equal variances \\
 zone\_focusfactor &        TSI\_inv &      0.841 &  0.432 &  Different variances \\
 zone\_focusfactor &  sprint\_length &      2.776 &  0.064 &      Equal variances \\
\bottomrule
\end{tabular}



# Second assumption

Normality

In [5]:
# normality check
from scipy.stats import shapiro

table = pd.DataFrame()
for var in ['zone_velocity', 'zone_focusfactor']:
    for ztype in ['D+', 'D-', 'N']:
        for factor in ['turnover', 'new_devs', 'TSI_inv', 'sprint_length']:
            
            m = df[df[var] == ztype][factor]
            stat, p = shapiro(m)

            #print('%s (n=%.0f): Statistics=%.3f, p=%.3f' % (ztype, len(m), stat, p))

            # interpret
            alpha = 0.01
            if p > alpha:
                #print('Sample looks Gaussian (fail to reject H0)')
                txt = 'Normal'
            else:
                #print('Sample does not look Gaussian (reject H0)');
                txt = 'Not normal'
                
            d = { 'factor' : var,
            'dependent_var' : factor, 
            'value' : ztype,
            'statistic' : stat,
            'p' : p,
            'Interpretation' : txt}
            
            table = table.append(d, ignore_index=True)
            
table

Unnamed: 0,Interpretation,dependent_var,factor,p,statistic,value
0,Not normal,turnover,zone_velocity,0.0001048801,0.598829,D+
1,Not normal,new_devs,zone_velocity,0.009716143,0.801389,D+
2,Normal,TSI_inv,zone_velocity,0.110637,0.887874,D+
3,Not normal,sprint_length,zone_velocity,0.001811171,0.733598,D+
4,Normal,turnover,zone_velocity,0.2841533,0.936985,D-
5,Normal,new_devs,zone_velocity,0.1166605,0.913949,D-
6,Normal,TSI_inv,zone_velocity,0.2476251,0.933363,D-
7,Not normal,sprint_length,zone_velocity,4.438121e-06,0.560682,D-
8,Not normal,turnover,zone_velocity,9.198986000000001e-18,0.854334,N
9,Not normal,new_devs,zone_velocity,7.560587999999999e-19,0.835677,N


# Kruskal-Wallis

In [6]:
table = pd.DataFrame()
for var in ['zone_velocity', 'zone_focusfactor']:
        for factor in ['turnover', 'new_devs', 'TSI_inv', 'sprint_length']:
            
            # The Kruskal-Wallis H-test tests the null hypothesis that the population median of all of the groups are equal. 
            stat, p = stats.kruskal(df[df[var] == 'D+'][factor],
                           df[df[var] == 'D-'][factor],
                           df[df[var] == 'N'][factor])
            
            d = { 'factor' : var,
            'dependent_var' : factor, 
            'statistic' : stat,
            'p' : p,
            'Interpretation' : 'Different medians' if p < .01 else 'Equal medians' }
            
            table = table.append(d, ignore_index=True)
            
table

Unnamed: 0,Interpretation,dependent_var,factor,p,statistic
0,Different medians,turnover,zone_velocity,0.002666,11.854326
1,Equal medians,new_devs,zone_velocity,0.044151,6.240288
2,Equal medians,TSI_inv,zone_velocity,0.010766,9.062716
3,Different medians,sprint_length,zone_velocity,0.007544,9.77404
4,Equal medians,turnover,zone_focusfactor,0.528432,1.275681
5,Equal medians,new_devs,zone_focusfactor,0.118259,4.269755
6,Equal medians,TSI_inv,zone_focusfactor,0.501539,1.380147
7,Equal medians,sprint_length,zone_focusfactor,0.10165,4.572433


# Post hoc tests

In [None]:
import scipy.stats as ss
import statsmodels.api as sa
import scikit_posthocs as sp

tables = []
for var in ['zone_velocity', 'zone_focusfactor']:
        for factor in ['turnover', 'new_devs', 'TSI_inv', 'sprint_length']:
            data = [df.loc[ids, factor].values for ids in df.groupby(var).groups.values()]
            
            H, p = ss.kruskal(*data)
            #print(var, factor)
            #print(H,p)

            dunn = sp.posthoc_dunn(df, val_col=factor, group_col=var, p_adjust = 'bonferroni')
            
            # keep the lower triangle of the matrix
            keep = np.triu(np.ones(dunn.shape)).astype('bool').reshape(dunn.size)
            
            
            table = dunn.stack()[~keep].reset_index()
            
            table.loc[:, 'var'] = var
            table.loc[:, 'factor'] = factor
            
            tables.append(table)
            #print(table)
            
table = pd.concat(tables, axis=0)

In [None]:
table.columns= ['group1', 'group2', 'p', 'var', 'factor']
table[['var', 'factor', 'group1', 'group2', 'p']].replace(dr).round(3)

Pairwise comparisons show that we may reject the null hypothesis (p < 0.01) for each pair of species and conclude that all groups (zones) differ in their variables (i.e. turnover, tsi, or iteration length).