# Recap: Power Analysis

Say that we do a one-sample test for $H_0:\mu=0$, assuming $\sigma=1$. What sample size is needed for the confidence interval (at a chosen confidence level, e.g. 95%) to have length 0.3?

Compare with the calculator: https://www.statskingdom.com/sample_size_t_z.html or `statsmodels`.

# Recap: Mann–Whitney Test

The U-statistic counts the pairs (one observation from each sample) for which a chosen inequality (smaller or bigger) holds.

Suitably normalized, it can be approximated by a normal distribution, which is the basis of the test.

Let's look at the diabetes data shared in https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html

In [None]:
import numpy as np
import pandas as pd
from IPython.display import display

import itertools

males = np.array([19, 22, 16, 29, 24])
females = np.array([20, 11, 17, 12])

# Brute-force check: count the (male, female) pairs in which the male is
# strictly older, i.e. the female is the younger one of the pair.
print("# Pairs with F younger:", sum(m > f for m, f in itertools.product(males, females)))

# Long format: one row per person, holding the age and the group label.
data = pd.concat([
    pd.DataFrame({'age': males, 'sex': 'M'}),
    pd.DataFrame({'age': females, 'sex': 'F'}),
])

# Rank all ages jointly. Tied ages share their average rank, so the result
# does not depend on row order (method="first" would break ties by position).
# With average ranks a tied pair contributes 1/2 to each group's U, whereas
# the strict ">" count above contributes 0; the sample here has no ties.
data["rank"] = data["age"].rank(method="average")
display(data)

# U statistic of each group: U_g = R_g - n_g * (n_g + 1) / 2, where R_g is the
# group's rank sum and n_g its size. U_g is the number of cross-group pairs in
# which the member of group g has the larger value.
by_sex = data.groupby("sex")
rank_sums = by_sex["rank"].sum()
group_sizes = by_sex.size()
comparison = rank_sums - group_sizes * (group_sizes + 1) / 2
display(comparison)

print("# Pairs with F younger:",comparison.loc["M"] )
print("# Pairs with M younger:",comparison.loc["F"] )

# Pairs with F younger: 17


Unnamed: 0,age,sex,rank
0,19,M,5.0
1,22,M,7.0
2,16,M,3.0
3,29,M,9.0
4,24,M,8.0
0,20,F,6.0
1,11,F,1.0
2,17,F,4.0
3,12,F,2.0


sex
F     3.0
M    17.0
dtype: float64

# Pairs with F younger: 17.0
# Pairs with M younger: 3.0


# Multiple Testing: Fund Managers

We will evaluate if fund managers beat the market :-)

Credits to https://islp.readthedocs.io/en/latest/labs/Ch13-multiple-lab.html

In [None]:
# Install the ISLP package into the kernel's environment; %pip (rather than
# !pip) targets the environment of the running kernel.
%pip install ISLP --quiet
from ISLP import load_data

# Load the 'Fund' dataset and preview its first rows. The preview below shows
# one column per manager (Manager1 ... Manager2000); presumably each row is one
# period's return -- confirm against the ISLP dataset documentation.
Fund = load_data('Fund')
Fund.head()

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.2/349.2 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m522.0/522.0 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m801.6/801.6 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m841.5/841.5 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.2/94.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m522.2/522.2 kB[0m [31m20.

Unnamed: 0,Manager1,Manager2,Manager3,Manager4,Manager5,Manager6,Manager7,Manager8,Manager9,Manager10,...,Manager1991,Manager1992,Manager1993,Manager1994,Manager1995,Manager1996,Manager1997,Manager1998,Manager1999,Manager2000
0,-3.341992,-4.167469,9.389223,8.41722,0.997863,7.191473,-10.767592,4.072425,1.575264,-0.798505,...,-2.948706,10.350706,-2.855337,-4.431786,0.739544,0.198044,1.752188,-1.53471,-3.359419,6.585654
1,3.759627,12.525254,3.403366,0.143944,-7.222227,0.067747,-10.737053,-1.138185,-7.166604,4.778522,...,24.00315,-1.966606,-1.609109,1.405325,4.717175,1.540359,-12.218233,-0.073008,-8.547683,-2.382629
2,12.970091,-2.581061,-0.824734,6.584604,17.050241,1.85713,3.196942,-7.981362,-1.214148,2.33825,...,-2.926914,6.420147,8.946921,3.449013,1.009957,1.481369,14.203314,0.005562,-5.105035,2.292429
3,-4.87463,7.981743,-4.026743,-4.731946,0.503276,0.740187,-28.96941,4.683751,-0.56884,-4.000547,...,-3.112208,3.173581,-6.017109,-1.984873,1.022525,-2.261927,19.34597,-1.048299,-0.016154,1.196832
4,2.019279,-5.370236,-4.854669,10.594432,-6.891574,9.877838,1.430033,9.840311,5.311455,18.365094,...,7.173653,-9.157211,7.643125,-1.022339,-1.325865,2.848785,-6.642081,2.488612,0.03206,-7.510032


In [None]:
from scipy.stats import ttest_1samp
from statsmodels.stats.multitest import multipletests as mult_test

def my_test(x):
    """Run a one-sample t-test of H0: mean(x) = 0 on a single column."""
    return ttest_1samp(x, popmean=0)


# One test per column of Fund; after transposing, each row holds the
# (statistic, p-value) pair of one column.
results = Fund.apply(my_test).T
results.columns = ['T', 'pval']

In [None]:
#@title Controlling Family-Wise Error

method = 'holm' # @param ["bonferroni", "holm"]
n_first_managers = 9 # @param {type:"integer"}

# Only the first n_first_managers p-values form the family being corrected.
family_pvals = results['pval'][:n_first_managers]

# The first two outputs of the correction are the rejection mask and the
# adjusted p-values. `bonf` keeps its name even when `method` is 'holm'.
reject, bonf = mult_test(family_pvals, method=method)[:2]

rejected_names = list(family_pvals.index[reject])
print(f"Effects found {reject.sum()} {rejected_names}")

Effects found 6 ['Manager1', 'Manager3', 'Manager6', 'Manager7', 'Manager8', 'Manager9']


In [None]:
#@title Controlling False Discovery Rate

import math

fdr = 0.10 # @param {type:"number"}

pvals = results['pval']
# Second output of the correction: the adjusted p-values (q-values) for the
# "fdr_bh" method, computed over all p-values in `results`.
fund_qvalues = mult_test(pvals, method = "fdr_bh")[1]
n_effects = (fund_qvalues <= fdr).sum()

# fdr * n_effects bounds the expected number of false discoveries among the
# reported effects. Round before taking the ceiling: a float product such as
# 0.07 * 100 == 7.000000000000001 would otherwise be pushed up to 8.
n_false_discoveries = math.ceil(round(float(fdr * n_effects), 9))
print(f"Found {n_effects} effects out of which {n_false_discoveries} are expected false-discoveries")

Found 146 effects out of which 15 are expected false-discoveries
