# Calculations of the Effect Size (ES) for each microarray study 

###  Using Hedges' g value, an adjusted Cohen's d  value

$$  {Enrichment} = \bar{X_2}-\bar{X_1}$$

Let Group 1 be 5hS Hippocampus Expression values and Group 2 be 5hSD Hippocampus Expression values 

(S mean - SD mean) **(Logged values, so minus gives ratio)** 

$$  {Pooled\ Standard\  Deviation} = \sqrt\frac{(n_1-1)S_1^2 +(n_2-1)S_2^2}{(n_2 +n_2) -2}  $$  

$$  {Cohen's\ d\ value} = \frac{Enrichment}{Pooled\ Standard\ Deviation} $$

$$  {Correction\ Factor (J\ Factor)} = 1- \frac{3}{4df-1} $$

$$  {Hedges'\ g\ value} = Cohen's\ d\ \text{x}\ J\ $$

$$  {Variance\ in\ d (V_d)} = \frac{n_1- +n_2}{n_1 n_2} + \frac{d^2}{2(n_1 +n_2)}  $$

$$  {Variance\ in\ g (V_g)} = J^2\  \text{x}\ V_d  $$

$$  {Standard\ Error\ in\ g (SE_g)} = \sqrt{V_g}  $$

## Setup working environment and import data

In [1]:
import pandas as pd # Dataframes and file IO
import numpy as np # numerical calculations
%cd /Users/Ella1/Desktop/data sets 430AV2


/Users/Ella1/Desktop/data sets 430AV2


In [2]:
prefix = '430AV2_Hipp_'   # define a prefix to add to column names (making indexing easier later)

In [3]:
# import the data file to a data frame 'df'
df=pd.read_table('DATASET-GSE33302.txt', delimiter='\t',  index_col=0) #,nrows=500)  
df.shape

(45101, 38)

In [4]:
# remove probes that are know to cross-hybridise to more than one target
df =df[~df.index.str.contains('_x_|_s_')]    #   important reverse selector ~ 
df.shape

(40569, 38)

## Look at column names and then setup filters for grouping columns into S and SD groups

In [5]:
df.columns

Index(['Symbol', 'Definition', 'Ensembl_id', 'Entrez_id', 'Unigene_id',
       'GO-Process', 'GO-Function', 'GO-Component', 'Pathway_info',
       'Putative microRNA binding sites', 'Select Cellular Compartments',
       'Select Protein Classes', 'GSM824682_hipp_S.CEL',
       'GSM824683_hipp_S.CEL', 'GSM824684_hipp_S.CEL', 'GSM824685_hipp_S.CEL',
       'GSM824686_hipp_S.CEL', 'GSM824687_hipp_S.CEL', 'GSM824688_hipp_S.CEL',
       'GSM824689_hipp_S.CEL', 'GSM824690_hipp_S.CEL', 'avg-hipp_S',
       'log_fold-hipp_S_vs_hipp_SD', 'fold-hipp_S_vs_hipp_SD',
       'rawp-hipp_S_vs_hipp_SD', 'adjp-hipp_S_vs_hipp_SD',
       'GSM824691_hipp_SD.CEL', 'GSM824692_hipp_SD.CEL',
       'GSM824693_hipp_SD.CEL', 'GSM824694_hipp_SD.CEL',
       'GSM824695_hipp_SD.CEL', 'GSM824696_hipp_SD.CEL',
       'GSM824697_hipp_SD.CEL', 'GSM824698_hipp_SD.CEL', 'avg-hipp_SD',
       'ANOVA-rawp', 'ANOVA-adjp', 'largest fold'],
      dtype='object')

In [6]:
# define regular expressions for sleep (S) and sleep dep (SD) filters 
s_filt ='S.CEL'
sd_filt ='SD.CEL'

In [7]:
df_s=df.filter(regex= s_filt)
df_s.head()

Unnamed: 0_level_0,GSM824682_hipp_S.CEL,GSM824683_hipp_S.CEL,GSM824684_hipp_S.CEL,GSM824685_hipp_S.CEL,GSM824686_hipp_S.CEL,GSM824687_hipp_S.CEL,GSM824688_hipp_S.CEL,GSM824689_hipp_S.CEL,GSM824690_hipp_S.CEL
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1427138_at,7.42344,7.31787,7.113,7.27596,7.26964,7.24375,7.19931,7.02947,6.907
1425600_a_at,8.30681,8.31521,8.3731,8.10452,8.26169,8.18391,8.27704,8.15593,8.19991
1457168_at,4.78153,4.67758,4.70337,4.70163,4.99808,4.854,4.73304,4.78021,4.62219
1450135_at,6.13959,5.95893,6.16749,6.16749,6.16749,6.06839,6.11952,6.01828,6.23965
1424014_at,8.03189,8.06762,8.05188,8.10017,7.77316,8.05824,7.99341,8.06648,7.9118


In [8]:
df_sd=df.filter(regex= sd_filt)
df_sd.head()

Unnamed: 0_level_0,GSM824691_hipp_SD.CEL,GSM824692_hipp_SD.CEL,GSM824693_hipp_SD.CEL,GSM824694_hipp_SD.CEL,GSM824695_hipp_SD.CEL,GSM824696_hipp_SD.CEL,GSM824697_hipp_SD.CEL,GSM824698_hipp_SD.CEL
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1427138_at,7.29151,7.00668,6.87297,7.02529,6.93543,7.03978,6.99617,7.09737
1425600_a_at,8.24145,8.21345,8.11736,8.15133,7.82216,8.20343,8.14613,8.27861
1457168_at,4.78778,4.85565,4.68517,4.63958,4.61377,4.73304,4.91612,4.73304
1450135_at,5.94411,6.23357,6.44916,6.0053,6.32475,6.20132,6.25601,6.1291
1424014_at,8.09898,7.85999,8.13373,8.24425,8.14164,8.16822,7.9938,7.91334


## Calculations 

In [9]:
# Enrichment

df[prefix+'Enrich'] = df.filter(regex=sd_filt).mean(axis=1) - df.filter(regex=s_filt).mean(axis=1)

In [10]:
df[prefix+'Enrich'].head()

Probesets
1427138_at     -0.164566
1425600_a_at   -0.095273
1457168_at     -0.015773
1450135_at      0.076601
1424014_at      0.063172
Name: 430AV2_Hipp_Enrich, dtype: float64

In [11]:
# Calculating Pooled StDev
Scount = df.filter(regex=s_filt).count(axis=1)
SDcount = df.filter(regex=sd_filt).count(axis=1)

StdevS = (Scount-1) * df.filter(regex=s_filt).var(axis=1)
StdevSD = (SDcount-1) * df.filter(regex=sd_filt).var(axis=1)

df[prefix+'poolStDev'] = np.sqrt((StdevS+StdevSD)/(Scount+ SDcount-2))

In [12]:
# Calculating Cohen's d
df[prefix+'Cohens_d'] = df[prefix+'Enrich'] / df[prefix+'poolStDev']

In [13]:
#df[prefix+'poolStDev'].head()
df[prefix+'Cohens_d'] .head()

Probesets
1427138_at     -1.150200
1425600_a_at   -0.824930
1457168_at     -0.146011
1450135_at      0.593085
1424014_at      0.534449
Name: 430AV2_Hipp_Cohens_d, dtype: float64

In [14]:
# Calculating J value (Correction factor)

df[prefix+'J'] = 1-(3/(4*(Scount+SDcount-1)))                              


In [15]:
# Calculating Hedge's g

df[prefix+'Hedges_g'] = df[prefix+'Cohens_d'] * df[prefix+'J']

In [16]:
#df[prefix+'J'].head()
df[prefix+'Hedges_g'] .head()

Probesets
1427138_at     -1.096285
1425600_a_at   -0.786262
1457168_at     -0.139167
1450135_at      0.565284
1424014_at      0.509397
Name: 430AV2_Hipp_Hedges_g, dtype: float64

In [17]:
# Calculating Var_d
Scount = df.filter(regex=s_filt).count(axis=1)
SDcount = df.filter(regex=sd_filt).count(axis=1)

Ftop1 = Scount + SDcount
Ftop2 = Scount * SDcount
Fbottom1 = np.square(df[prefix+'Cohens_d']) 
Fbottom2 =  2*(Scount + SDcount)


df[prefix+'Var_d'] = (Ftop1/Ftop2) + (Fbottom1 /Fbottom2)

In [18]:
#check output
df[prefix+'Var_d'].head()

Probesets
1427138_at      0.275022
1425600_a_at    0.256126
1457168_at      0.236738
1450135_at      0.246457
1424014_at      0.244512
Name: 430AV2_Hipp_Var_d, dtype: float64

In [19]:
df[prefix+'Var_g'] = df[prefix+'Var_d'] * np.square(df[prefix+'J'])

In [20]:
# Calculating SEg
df[prefix+'SEg'] = np.sqrt(df[prefix+'Var_g'])

In [21]:
df.sort_values(by= '430AV2_Hipp_Hedges_g', ascending=False, inplace=True)
df

Unnamed: 0_level_0,Symbol,Definition,Ensembl_id,Entrez_id,Unigene_id,GO-Process,GO-Function,GO-Component,Pathway_info,Putative microRNA binding sites,...,ANOVA-adjp,largest fold,430AV2_Hipp_Enrich,430AV2_Hipp_poolStDev,430AV2_Hipp_Cohens_d,430AV2_Hipp_J,430AV2_Hipp_Hedges_g,430AV2_Hipp_Var_d,430AV2_Hipp_Var_g,430AV2_Hipp_SEg
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1419070_at,Cys1,cystin 1,ENSMUSG00000062563,12879,,,,microtubule basal body // cell projection // c...,,"mmu-miR-125a-3p(miRanda), mmu-miR-133a(miRanda...",...,0.000071,0.363031,0.363031,0.057862,6.274115,0.953125,5.980016,1.393891,1.266277,1.125290
1436387_at,C330006P03Rik,homer homolog 1 (Drosophila) [Source:MGI Symbo...,ENSMUSG00000007617,320588,,,,,,"mmu-let-7a(RNAhybrid|miRanda), mmu-let-7b(RNAh...",...,0.000450,1.054253,1.054253,0.217862,4.839085,0.953125,4.612253,0.924839,0.840167,0.916606
1442051_at,Gm20634,predicted gene 20634 [Source:MGI Symbol;Acc:MG...,ENSMUSG00000070392|ENSMUSG00000093769,,,,,,,"mmu-miR-144(miRanda), mmu-miR-29b(miRanda), mm...",...,0.000450,0.550515,0.550515,0.114861,4.792865,0.953125,4.568199,0.911745,0.828272,0.910095
1448478_at,Med20,mediator complex subunit 20,ENSMUSG00000073387|ENSMUSG00000092558,56771,,"transcription // regulation of transcription, ...",protein binding,nucleus,,"mmu-let-7a(miRanda), mmu-let-7b(miRanda), mmu-...",...,0.000709,0.335231,0.335231,0.076076,4.406540,0.953125,4.199984,0.807217,0.733314,0.856338
1441075_at,Nostrin,nitric oxide synthase trafficker [Source:MGI S...,ENSMUSG00000034738,329416,,"negative regulation of transcription, DNA-depe...",protein binding // DNA binding // transcriptio...,cytoplasmic vesicle // cytoplasm // nucleus //...,,"mmu-miR-122a(RNAhybrid|miRanda), mmu-miR-128a(...",...,0.000709,0.410271,0.410271,0.094111,4.359426,0.953125,4.155078,0.795070,0.722279,0.849870
1435030_at,Upf2,UPF2 regulator of nonsense transcripts homolog...,ENSMUSG00000043241,326622,,response to unfolded protein // RNA metabolic ...,protein binding // binding,cytoplasm // perinuclear region of cytoplasm,,"mmu-miR-103(miRanda), mmu-miR-103a(TargetScan)...",...,0.001311,0.310130,0.310130,0.077010,4.027160,0.953125,3.838387,0.713112,0.647824,0.804875
1418322_at,Crem,cAMP responsive element modulator,ENSMUSG00000063889,12916,,transcription // spermatogenesis // regulation...,transcription factor activity // DNA binding /...,transcription factor complex // nucleus,Selenium metabolism/Selenoproteins:WP108(WikiP...,"mmu-let-7a(RNAhybrid|miRanda), mmu-let-7d(RNAh...",...,0.001311,0.404178,0.404178,0.101865,3.967772,0.953125,3.781783,0.699147,0.635138,0.796955
1435935_at,,RIKEN cDNA 2410131K14 gene [Source:MGI Symbol;...,ENSMUSG00000032840,,,,,,,"mmu-miR-134(miRanda), mmu-miR-138(miRanda), mm...",...,0.001563,0.376201,0.376201,0.098763,3.809115,0.953125,3.630563,0.662857,0.602171,0.775997
1416064_a_at,Hspa5,heat shock protein 5,ENSMUSG00000026864,14828,,response to stress // ER overload response,protein binding // nucleotide binding // ATP b...,endoplasmic reticulum lumen // melanosome // e...,MAPK signaling pathway:WP493(WikiPathways),"mmu-miR-1192(TargetScan|miRanda), mmu-miR-1193...",...,0.001760,0.480943,0.480943,0.128501,3.742723,0.953125,3.567283,0.648110,0.588774,0.767316
1418937_at,Dio2,"deiodinase, iodothyronine, type II",ENSMUSG00000007682,13371,,hormone biosynthetic process // thyroid hormon...,thyroxine 5'-deiodinase activity // oxidoreduc...,integral to membrane // membrane,Selenium metabolism/Selenoproteins:WP108(WikiP...,"mmu-miR-1192(miRanda), mmu-miR-122(miRanda), m...",...,0.001760,0.330522,0.330522,0.088501,3.734692,0.953125,3.559628,0.646344,0.587170,0.766270


In [22]:
df.columns

Index(['Symbol', 'Definition', 'Ensembl_id', 'Entrez_id', 'Unigene_id',
       'GO-Process', 'GO-Function', 'GO-Component', 'Pathway_info',
       'Putative microRNA binding sites', 'Select Cellular Compartments',
       'Select Protein Classes', 'GSM824682_hipp_S.CEL',
       'GSM824683_hipp_S.CEL', 'GSM824684_hipp_S.CEL', 'GSM824685_hipp_S.CEL',
       'GSM824686_hipp_S.CEL', 'GSM824687_hipp_S.CEL', 'GSM824688_hipp_S.CEL',
       'GSM824689_hipp_S.CEL', 'GSM824690_hipp_S.CEL', 'avg-hipp_S',
       'log_fold-hipp_S_vs_hipp_SD', 'fold-hipp_S_vs_hipp_SD',
       'rawp-hipp_S_vs_hipp_SD', 'adjp-hipp_S_vs_hipp_SD',
       'GSM824691_hipp_SD.CEL', 'GSM824692_hipp_SD.CEL',
       'GSM824693_hipp_SD.CEL', 'GSM824694_hipp_SD.CEL',
       'GSM824695_hipp_SD.CEL', 'GSM824696_hipp_SD.CEL',
       'GSM824697_hipp_SD.CEL', 'GSM824698_hipp_SD.CEL', 'avg-hipp_SD',
       'ANOVA-rawp', 'ANOVA-adjp', 'largest fold', '430AV2_Hipp_Enrich',
       '430AV2_Hipp_poolStDev', '430AV2_Hipp_Cohens_d', '430AV2_

### Import key file from BioMart and index probesets to MGI gene symbols

In [23]:
dfX=pd.read_table('../FHS project/Sleep notebook Copy/BioMart_Ensmbl_index/mart_export72_430v2430Av2.txt',index_col=[3])
 
dfX.pop('Affy mouse430 2 probeset') # remove 430V2 probeset info (not needed for 430AV2 indexing)
dfX.head(5)

Unnamed: 0_level_0,Ensembl Gene ID,Description,MGI symbol
Affy mouse430a 2 probeset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1417126_a_at,ENSMUSG00000039221,ribosomal protein L22 like 1 [Source:MGI Symbo...,Rpl22l1
,ENSMUSG00000095611,predicted gene 10597 [Source:MGI Symbol;Acc:MG...,Gm10597
1417730_at,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1
1417730_at,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1
,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1


In [24]:
df_Join = df.join(dfX, how='left', sort=True)
df_FINAL1 = df_Join.groupby('MGI symbol').mean()
df_FINAL1[df_FINAL1.index.duplicated()==True]   # checking that no duplicate entries exist in the dataframe

Unnamed: 0_level_0,GSM824682_hipp_S.CEL,GSM824683_hipp_S.CEL,GSM824684_hipp_S.CEL,GSM824685_hipp_S.CEL,GSM824686_hipp_S.CEL,GSM824687_hipp_S.CEL,GSM824688_hipp_S.CEL,GSM824689_hipp_S.CEL,GSM824690_hipp_S.CEL,avg-hipp_S,...,ANOVA-adjp,largest fold,430AV2_Hipp_Enrich,430AV2_Hipp_poolStDev,430AV2_Hipp_Cohens_d,430AV2_Hipp_J,430AV2_Hipp_Hedges_g,430AV2_Hipp_Var_d,430AV2_Hipp_Var_g,430AV2_Hipp_SEg
MGI symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


### Columns from the list above can then easily be picked to produce files for use later. Examples below given:
 #### df3 = average S and SD expression for the platform and the log-fold changes
 #### df4 = Hedges g  values and associated variance for Meta-analysis (after indexing)

In [25]:
# df3 = df_FINAL1.loc[:,[u'avg-SD', u'avg-S', u'log_fold-S_vs_SD']]
# df3.columns =[prefix+'avg-SD', prefix+'avg-S', prefix+'log_fold-S_vs_SD']
# df3.to_csv('input_files/430AV2_SymbolExpression_forIndex.csv')

In [26]:
df4 = df_FINAL1.loc[:,[u'430AV2_Hipp_Enrich',u'430AV2_Hipp_Hedges_g', u'430AV2_Hipp_Var_g', u'430AV2_Hipp_SEg']]
df4.to_csv('../FHS project/Sleep notebook Copy/IPython_notebooks/input_files/430AV2_Hipp_SymbolforIndexHedges.csv')

In [27]:
df4.head(10)  # check final ouput

Unnamed: 0_level_0,430AV2_Hipp_Enrich,430AV2_Hipp_Hedges_g,430AV2_Hipp_Var_g,430AV2_Hipp_SEg
MGI symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0610005C13Rik,0.08549,0.758034,0.231395,0.481035
0610008F07Rik,0.043071,0.476664,0.221177,0.470295
0610009B22Rik,0.130839,0.955337,0.241338,0.491261
0610009D07Rik,-0.010994,-0.09233,0.215067,0.463753
0610009O20Rik,-0.046699,-0.527294,0.222672,0.471881
0610010K14Rik,-0.16217,-1.070819,0.24822,0.498216
0610012G03Rik,0.020352,0.047639,0.254883,0.503944
0610031J06Rik,0.017434,0.215709,0.215863,0.464611
0610037L13Rik,-0.03702,-0.460674,0.220736,0.469826
0610040J01Rik,-0.037361,-0.442641,0.220257,0.469316
