# Calculations of the Effect Size (ES) for each microarray study 

###  Using Hedges' g value, an adjusted Cohen's d  value

$$  {Enrichment} = \bar{X_2}-\bar{X_1}$$

Let Group 1 be 3h Sleeping Hypothalamus Expression values and Group 2 be 3h SD Hypothalamus Expression values 

(S mean - SD mean) **(Logged values, so minus gives ratio)** 

$$  {Pooled\ Standard\  Deviation} = \sqrt\frac{(n_1-1)S_1^2 +(n_2-1)S_2^2}{(n_2 +n_2) -2}  $$  

$$  {Cohen's\ d\ value} = \frac{Enrichment}{Pooled\ Standard\ Deviation} $$

$$  {Correction\ Factor (J\ Factor)} = 1- \frac{3}{4df-1} $$

$$  {Hedges'\ g\ value} = Cohen's\ d\ \text{x}\ J\ $$

$$  {Variance\ in\ d (V_d)} = \frac{n_1- +n_2}{n_1 n_2} + \frac{d^2}{2(n_1 +n_2)}  $$

$$  {Variance\ in\ g (V_g)} = J^2\  \text{x}\ V_d  $$

$$  {Standard\ Error\ in\ g (SE_g)} = \sqrt{V_g}  $$

## Setup working environment and import data

In [1]:
import pandas as pd # Dataframes and file IO
import numpy as np # numerical calculations
%cd /Users/Ella1/Desktop/data sets 430AV2


/Users/Ella1/Desktop/data sets 430AV2


In [2]:
prefix = '430AV2_HypoT_3h_'   # define a prefix to add to column names (making indexing easier later)

In [3]:
# import the data file to a data frame 'df'
df=pd.read_table('DATASET-GSE6514.txt', delimiter='\t',  index_col=0) #,nrows=500)  
df.shape

(45101, 137)

In [4]:
# remove probes that are know to cross-hybridise to more than one target
df =df[~df.index.str.contains('_x_|_s_')]    #   important reverse selector ~ 
df.shape

(40569, 137)

## Look at column names and then setup filters for grouping columns into S and SD groups

In [5]:
df.columns

Index(['Symbol', 'Definition', 'Ensembl_id', 'Entrez_id', 'Unigene_id',
       'GO-Process', 'GO-Function', 'GO-Component', 'Pathway_info',
       'Putative microRNA binding sites',
       ...
       'adjp-HypoT_12hS_vs_HypoT_12hSD', 'GSM149636_HypoT_12hSD.CEL',
       'GSM149637_HypoT_12hSD.CEL', 'GSM149648_HypoT_12hSD.CEL',
       'GSM149649_HypoT_12hSD.CEL', 'GSM149650_HypoT_12hSD.CEL',
       'avg-HypoT_12hSD', 'ANOVA-rawp', 'ANOVA-adjp', 'largest fold'],
      dtype='object', length=137)

In [6]:
# define regular expressions for sleep (S) and sleep dep (SD) filters 
s_filt ='HypoT_3hS.CEL'
sd_filt ='HypoT_3hSD.CEL'

In [7]:
df_s=df.filter(regex= s_filt)
df_s.head()

Unnamed: 0_level_0,GSM149601_HypoT_3hS.CEL,GSM149602_HypoT_3hS.CEL,GSM149603_HypoT_3hS.CEL,GSM149604_HypoT_3hS.CEL,GSM149605_HypoT_3hS.CEL
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1427138_at,6.66009,6.83341,7.13931,7.00522,6.61334
1425600_a_at,7.50817,7.86638,7.18785,6.85622,6.25142
1457168_at,5.67372,5.78951,5.69763,5.43336,5.47351
1450135_at,6.04249,5.89659,5.51381,5.55955,5.38584
1424014_at,7.50105,7.57047,8.44307,8.37,8.26825


In [8]:
df_sd=df.filter(regex= sd_filt)
df_sd.head()

Unnamed: 0_level_0,GSM149606_HypoT_3hSD.CEL,GSM149607_HypoT_3hSD.CEL,GSM149608_HypoT_3hSD.CEL,GSM149609_HypoT_3hSD.CEL,GSM149610_HypoT_3hSD.CEL
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1427138_at,6.66695,6.5274,6.30358,6.73212,6.46545
1425600_a_at,7.48415,7.02924,6.85651,7.4512,6.76965
1457168_at,5.83585,5.72586,5.87868,5.70601,5.49476
1450135_at,6.24634,5.85096,5.64197,5.71443,5.71497
1424014_at,7.62093,7.51731,7.98004,8.13447,8.42482


## Calculations 

In [9]:
# Enrichment

df[prefix+'Enrich'] = df.filter(regex=sd_filt).mean(axis=1) - df.filter(regex=s_filt).mean(axis=1)

In [10]:
df[prefix+'Enrich'].head()

Probesets
1427138_at     -0.311174
1425600_a_at   -0.015858
1457168_at      0.114686
1450135_at      0.154078
1424014_at     -0.095054
Name: 430AV2_HypoT_3h_Enrich, dtype: float64

In [11]:
# Calculating Pooled StDev
Scount = df.filter(regex=s_filt).count(axis=1)
SDcount = df.filter(regex=sd_filt).count(axis=1)

StdevS = (Scount-1) * df.filter(regex=s_filt).var(axis=1)
StdevSD = (SDcount-1) * df.filter(regex=sd_filt).var(axis=1)

df[prefix+'poolStDev'] = np.sqrt((StdevS+StdevSD)/(Scount+ SDcount-2))

In [12]:
# Calculating Cohen's d
df[prefix+'Cohens_d'] = df[prefix+'Enrich'] / df[prefix+'poolStDev']

In [13]:
#df[prefix+'poolStDev'].head()
df[prefix+'Cohens_d'] .head()

Probesets
1427138_at     -1.568729
1425600_a_at   -0.031891
1457168_at      0.758371
1450135_at      0.591613
1424014_at     -0.228138
Name: 430AV2_HypoT_3h_Cohens_d, dtype: float64

In [14]:
# Calculating J value (Correction factor)

df[prefix+'J'] = 1-(3/(4*(Scount+SDcount-1)))                              


In [15]:
# Calculating Hedge's g

df[prefix+'Hedges_g'] = df[prefix+'Cohens_d'] * df[prefix+'J']

In [16]:
#df[prefix+'J'].head()
df[prefix+'Hedges_g'] .head()

Probesets
1427138_at     -1.438002
1425600_a_at   -0.029234
1457168_at      0.695173
1450135_at      0.542312
1424014_at     -0.209126
Name: 430AV2_HypoT_3h_Hedges_g, dtype: float64

In [17]:
# Calculating Var_d
Scount = df.filter(regex=s_filt).count(axis=1)
SDcount = df.filter(regex=sd_filt).count(axis=1)

Ftop1 = Scount + SDcount
Ftop2 = Scount * SDcount
Fbottom1 = np.square(df[prefix+'Cohens_d']) 
Fbottom2 =  2*(Scount + SDcount)


df[prefix+'Var_d'] = (Ftop1/Ftop2) + (Fbottom1 /Fbottom2)

In [18]:
#check output
df[prefix+'Var_d'].head()

Probesets
1427138_at      0.523046
1425600_a_at    0.400051
1457168_at      0.428756
1450135_at      0.417500
1424014_at      0.402602
Name: 430AV2_HypoT_3h_Var_d, dtype: float64

In [19]:
df[prefix+'Var_g'] = df[prefix+'Var_d'] * np.square(df[prefix+'J'])

In [20]:
# Calculating SEg
df[prefix+'SEg'] = np.sqrt(df[prefix+'Var_g'])

In [21]:
df.sort_values(by= '430AV2_HypoT_3h_Hedges_g', ascending=False, inplace=True)
df

Unnamed: 0_level_0,Symbol,Definition,Ensembl_id,Entrez_id,Unigene_id,GO-Process,GO-Function,GO-Component,Pathway_info,Putative microRNA binding sites,...,ANOVA-adjp,largest fold,430AV2_HypoT_3h_Enrich,430AV2_HypoT_3h_poolStDev,430AV2_HypoT_3h_Cohens_d,430AV2_HypoT_3h_J,430AV2_HypoT_3h_Hedges_g,430AV2_HypoT_3h_Var_d,430AV2_HypoT_3h_Var_g,430AV2_HypoT_3h_SEg
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1452318_a_at,Hspa1b,heat shock protein 1B,ENSMUSG00000090877,15511,,response to stress // anti-apoptosis // negati...,protein binding // nucleotide binding // ATP b...,mitochondrion // intracellular // mitochondria...,,"mmu-miR-128(miRanda), mmu-miR-130a(miRanda), m...",...,1.889247e-22,2.890986,2.661550,0.412698,6.449140,0.916667,5.911712,2.479571,2.083528,1.443443
1418206_at,Sdf2l1,stromal cell-derived factor 2-like 1,ENSMUSG00000022769,64136,,,,endoplasmic reticulum lumen // endoplasmic ret...,,"mmu-miR-124(TargetScan|miRanda), mmu-miR-124ab...",...,2.581619e-10,1.534774,1.206610,0.194548,6.202107,0.916667,5.685265,2.323307,1.952223,1.397220
1417879_at,Nenf,neuron derived neurotrophic factor,ENSMUSG00000037499,66208,,positive regulation of MAPKKK cascade,heme binding // transition metal ion binding /...,extracellular region // extracellular space,,mmu-miR-543(miRanda),...,6.909720e-11,1.032284,0.674126,0.121586,5.544441,0.916667,5.082404,1.937041,1.627653,1.275795
1438025_at,Mtrf1l,mitochondrial translational release factor 1-l...,ENSMUSG00000019774,108853,,translation // translational termination,"translation release factor activity, codon spe...",mitochondrion // cytoplasm,,"mmu-miR-128a(RNAhybrid|miRanda), mmu-miR-128b(...",...,3.293908e-13,0.939048,0.523472,0.100420,5.212810,0.916667,4.778409,1.758669,1.477771,1.215636
1427126_at,Hspa1b,heat shock protein 1B,ENSMUSG00000090877,15511,,response to stress // anti-apoptosis // negati...,protein binding // nucleotide binding // ATP b...,mitochondrion // intracellular // mitochondria...,,"mmu-miR-128(miRanda), mmu-miR-130a(miRanda), m...",...,1.119386e-22,3.350738,3.015936,0.579512,5.204269,0.916667,4.770580,1.754221,1.474033,1.214098
1450843_a_at,Serpinh1,"serine (or cysteine) peptidase inhibitor, clad...",ENSMUSG00000070436,12406,,collagen biosynthetic process // response to s...,protein binding // serine-type endopeptidase i...,cytoplasm // endoplasmic reticulum lumen // en...,Endochondral Ossification:WP1270(WikiPathways),"mmu-miR-1(miRanda), mmu-miR-1192(miRanda), mmu...",...,1.888225e-12,1.393626,0.817790,0.161726,5.056644,0.916667,4.635257,1.678482,1.410392,1.187599
1452388_at,Hspa1a,heat shock protein 1A,ENSMUSG00000091971,193740,,DNA repair // response to stress // response t...,nucleotide binding // ATP binding,mitochondrion // cytoplasmic part,Apoptosis Modulation by HSP70:WP166(WikiPathwa...,"mmu-miR-130a(miRanda), mmu-miR-130b(miRanda), ...",...,6.941324e-16,1.832462,1.669184,0.333279,5.008364,0.916667,4.591000,1.654185,1.389975,1.178972
1429169_at,Rbm3,RNA binding motif protein 3 [Source:MGI Symbol...,ENSMUSG00000031167,19652,,translation // response to cold // gene silenc...,RNA binding // nucleic acid binding // nucleot...,,GenMAPP-mRNA_processing_binding_Reactome // mR...,"mmu-miR-1(miRanda), mmu-miR-1192(miRanda), mmu...",...,1.205980e-15,1.316408,0.965742,0.196323,4.919149,0.916667,4.509220,1.609901,1.352764,1.163084
1429165_at,3110001I22Rik,RIKEN cDNA 3110001I22 gene [Source:MGI Symbol;...,ENSMUSG00000079737,66598,,,,,,"mmu-miR-136(miRanda), mmu-miR-154(miRanda), mm...",...,6.526406e-05,0.474822,0.345740,0.071816,4.814269,0.916667,4.413080,1.558859,1.309875,1.144498
1428052_a_at,Zmym1,"zinc finger, MYM domain containing 1",ENSMUSG00000043872,68310,,,protein dimerization activity // zinc ion binding,nucleus,,"mmu-miR-103(miRanda), mmu-miR-107(miRanda), mm...",...,1.020090e-11,1.599828,1.034188,0.215704,4.794472,0.916667,4.394932,1.549348,1.301883,1.141001


In [22]:
df.columns

Index(['Symbol', 'Definition', 'Ensembl_id', 'Entrez_id', 'Unigene_id',
       'GO-Process', 'GO-Function', 'GO-Component', 'Pathway_info',
       'Putative microRNA binding sites',
       ...
       'ANOVA-adjp', 'largest fold', '430AV2_HypoT_3h_Enrich',
       '430AV2_HypoT_3h_poolStDev', '430AV2_HypoT_3h_Cohens_d',
       '430AV2_HypoT_3h_J', '430AV2_HypoT_3h_Hedges_g',
       '430AV2_HypoT_3h_Var_d', '430AV2_HypoT_3h_Var_g',
       '430AV2_HypoT_3h_SEg'],
      dtype='object', length=145)

### Import key file from BioMart and index probesets to MGI gene symbols

In [23]:
dfX=pd.read_table('../FHS project/Sleep notebook Copy/BioMart_Ensmbl_index/mart_export72_430v2430Av2.txt',index_col=[3])
 
dfX.pop('Affy mouse430 2 probeset') # remove 430V2 probeset info (not needed for 430AV2 indexing)
dfX.head(5)

Unnamed: 0_level_0,Ensembl Gene ID,Description,MGI symbol
Affy mouse430a 2 probeset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1417126_a_at,ENSMUSG00000039221,ribosomal protein L22 like 1 [Source:MGI Symbo...,Rpl22l1
,ENSMUSG00000095611,predicted gene 10597 [Source:MGI Symbol;Acc:MG...,Gm10597
1417730_at,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1
1417730_at,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1
,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1


In [24]:
df_Join = df.join(dfX, how='left', sort=True)
df_FINAL1 = df_Join.groupby('MGI symbol').mean()
df_FINAL1[df_FINAL1.index.duplicated()==True]   # checking that no duplicate entries exist in the dataframe

Unnamed: 0_level_0,GSM149516_CerCx_3hS.CEL,GSM149517_CerCx_3hS.CEL,GSM149518_CerCx_3hS.CEL,GSM149519_CerCx_3hS.CEL,GSM149520_CerCx_3hS.CEL,avg-CerCx_3hS,log_fold-CerCx_3hS_vs_CerCx_3hSD,fold-CerCx_3hS_vs_CerCx_3hSD,rawp-CerCx_3hS_vs_CerCx_3hSD,adjp-CerCx_3hS_vs_CerCx_3hSD,...,ANOVA-adjp,largest fold,430AV2_HypoT_3h_Enrich,430AV2_HypoT_3h_poolStDev,430AV2_HypoT_3h_Cohens_d,430AV2_HypoT_3h_J,430AV2_HypoT_3h_Hedges_g,430AV2_HypoT_3h_Var_d,430AV2_HypoT_3h_Var_g,430AV2_HypoT_3h_SEg
MGI symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


### Columns from the list above can then easily be picked to produce files for use later. Examples below given:
 #### df3 = average S and SD expression for the platform and the log-fold changes
 #### df4 = Hedges g  values and associated variance for Meta-analysis (after indexing)

In [25]:
# df3 = df_FINAL1.loc[:,[u'avg-SD', u'avg-S', u'log_fold-S_vs_SD']]
# df3.columns =[prefix+'avg-SD', prefix+'avg-S', prefix+'log_fold-S_vs_SD']
# df3.to_csv('input_files/430AV2_SymbolExpression_forIndex.csv')

In [26]:
df4 = df_FINAL1.loc[:,[u'430AV2_HypoT_3h_Enrich',u'430AV2_HypoT_3h_Hedges_g', u'430AV2_HypoT_3h_Var_g', u'430AV2_HypoT_3h_SEg']]
df4.to_csv('../FHS project/Sleep notebook Copy/IPython_notebooks/input_files/430AV2_HypoT_3h_SymbolforIndexHedges.csv')

In [27]:
df4.head(10)  # check final ouput

Unnamed: 0_level_0,430AV2_HypoT_3h_Enrich,430AV2_HypoT_3h_Hedges_g,430AV2_HypoT_3h_Var_g,430AV2_HypoT_3h_SEg
MGI symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0610005C13Rik,0.08493,0.382248,0.343417,0.586018
0610008F07Rik,0.118778,0.664416,0.358184,0.598484
0610009B22Rik,0.102728,0.650049,0.357239,0.597695
0610009D07Rik,0.177883,0.313462,0.341083,0.584022
0610009O20Rik,-0.003892,-0.028528,0.336152,0.579786
0610010K14Rik,-0.10549,-0.326858,0.341453,0.58434
0610012G03Rik,0.484069,2.023565,0.600572,0.757936
0610031J06Rik,-0.174324,-0.817487,0.369525,0.607886
0610037L13Rik,-0.061218,-0.601995,0.354231,0.595173
0610040J01Rik,0.156828,1.57728,0.460502,0.678603
