# Calculations of the Effect Size (ES) for each microarray study 

###  Using Hedges' g value, an adjusted Cohen's d  value

$$  {Enrichment} = \bar{X_2}-\bar{X_1}$$

Let Group 1 be 9h Sleeping Cerebral Cortex Expression values and Group 2 be 9h SD Cerebral Cortex Expression values 

(S mean - SD mean) **(Logged values, so minus gives ratio)** 

$$  {Pooled\ Standard\  Deviation} = \sqrt\frac{(n_1-1)S_1^2 +(n_2-1)S_2^2}{(n_2 +n_2) -2}  $$  

$$  {Cohen's\ d\ value} = \frac{Enrichment}{Pooled\ Standard\ Deviation} $$

$$  {Correction\ Factor (J\ Factor)} = 1- \frac{3}{4df-1} $$

$$  {Hedges'\ g\ value} = Cohen's\ d\ \text{x}\ J\ $$

$$  {Variance\ in\ d (V_d)} = \frac{n_1- +n_2}{n_1 n_2} + \frac{d^2}{2(n_1 +n_2)}  $$

$$  {Variance\ in\ g (V_g)} = J^2\  \text{x}\ V_d  $$

$$  {Standard\ Error\ in\ g (SE_g)} = \sqrt{V_g}  $$

## Setup working environment and import data

In [1]:
import pandas as pd # Dataframes and file IO
import numpy as np # numerical calculations
%cd /Users/Ella1/Desktop/data sets 430AV2


/Users/Ella1/Desktop/data sets 430AV2


In [2]:
prefix = '430AV2_CerCx_9h_'   # define a prefix to add to column names (making indexing easier later)

In [3]:
# import the data file to a data frame 'df'
df=pd.read_table('DATASET-GSE6514.txt', delimiter='\t',  index_col=0) #,nrows=500)  
df.shape

(45101, 137)

In [4]:
# remove probes that are know to cross-hybridise to more than one target
df =df[~df.index.str.contains('_x_|_s_')]    #   important reverse selector ~ 
df.shape

(40569, 137)

## Look at column names and then setup filters for grouping columns into S and SD groups

In [5]:
df.columns

Index(['Symbol', 'Definition', 'Ensembl_id', 'Entrez_id', 'Unigene_id',
       'GO-Process', 'GO-Function', 'GO-Component', 'Pathway_info',
       'Putative microRNA binding sites',
       ...
       'adjp-HypoT_12hS_vs_HypoT_12hSD', 'GSM149636_HypoT_12hSD.CEL',
       'GSM149637_HypoT_12hSD.CEL', 'GSM149648_HypoT_12hSD.CEL',
       'GSM149649_HypoT_12hSD.CEL', 'GSM149650_HypoT_12hSD.CEL',
       'avg-HypoT_12hSD', 'ANOVA-rawp', 'ANOVA-adjp', 'largest fold'],
      dtype='object', length=137)

In [6]:
# define regular expressions for sleep (S) and sleep dep (SD) filters 
s_filt ='CerCx_9hS.CEL'
sd_filt ='CerCx_9hSD.CEL'

In [7]:
df_s=df.filter(regex= s_filt)
df_s.head()

Unnamed: 0_level_0,GSM149550_CerCx_9hS.CEL,GSM149551_CerCx_9hS.CEL,GSM149552_CerCx_9hS.CEL,GSM149553_CerCx_9hS.CEL,GSM149554_CerCx_9hS.CEL
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1427138_at,6.91574,7.07094,7.01137,6.41678,6.66879
1425600_a_at,8.1501,8.27029,8.20993,7.98064,8.52652
1457168_at,5.948,5.80216,6.19872,6.04116,5.85585
1450135_at,5.7997,5.80361,5.66705,5.85742,5.34424
1424014_at,7.78964,8.19007,8.04329,6.64489,8.37859


In [8]:
df_sd=df.filter(regex= sd_filt)
df_sd.head()

Unnamed: 0_level_0,GSM149555_CerCx_9hSD.CEL,GSM149556_CerCx_9hSD.CEL,GSM149557_CerCx_9hSD.CEL,GSM149558_CerCx_9hSD.CEL,GSM149559_CerCx_9hSD.CEL
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1427138_at,6.84873,6.84183,6.79983,6.8581,6.7676
1425600_a_at,7.90713,8.11831,8.28288,8.37783,8.27117
1457168_at,5.88686,5.95763,5.90359,5.74488,6.11804
1450135_at,5.69519,5.95478,5.97657,5.61398,6.07124
1424014_at,7.73108,7.66511,7.75096,8.33666,7.90043


## Calculations 

In [9]:
# Enrichment

df[prefix+'Enrich'] = df.filter(regex=sd_filt).mean(axis=1) - df.filter(regex=s_filt).mean(axis=1)

In [10]:
df[prefix+'Enrich'].head()

Probesets
1427138_at      0.006494
1425600_a_at   -0.036032
1457168_at     -0.046978
1450135_at      0.167948
1424014_at      0.067552
Name: 430AV2_CerCx_9h_Enrich, dtype: float64

In [11]:
# Calculating Pooled StDev
Scount = df.filter(regex=s_filt).count(axis=1)
SDcount = df.filter(regex=sd_filt).count(axis=1)

StdevS = (Scount-1) * df.filter(regex=s_filt).var(axis=1)
StdevSD = (SDcount-1) * df.filter(regex=sd_filt).var(axis=1)

df[prefix+'poolStDev'] = np.sqrt((StdevS+StdevSD)/(Scount+ SDcount-2))

In [12]:
# Calculating Cohen's d
df[prefix+'Cohens_d'] = df[prefix+'Enrich'] / df[prefix+'poolStDev']

In [13]:
#df[prefix+'poolStDev'].head()
df[prefix+'Cohens_d'] .head()

Probesets
1427138_at      0.033534
1425600_a_at   -0.187918
1457168_at     -0.320663
1450135_at      0.829683
1424014_at      0.129585
Name: 430AV2_CerCx_9h_Cohens_d, dtype: float64

In [14]:
# Calculating J value (Correction factor)

df[prefix+'J'] = 1-(3/(4*(Scount+SDcount-1)))                              


In [15]:
# Calculating Hedge's g

df[prefix+'Hedges_g'] = df[prefix+'Cohens_d'] * df[prefix+'J']

In [16]:
#df[prefix+'J'].head()
df[prefix+'Hedges_g'] .head()

Probesets
1427138_at      0.030740
1425600_a_at   -0.172258
1457168_at     -0.293941
1450135_at      0.760542
1424014_at      0.118787
Name: 430AV2_CerCx_9h_Hedges_g, dtype: float64

In [17]:
# Calculating Var_d
Scount = df.filter(regex=s_filt).count(axis=1)
SDcount = df.filter(regex=sd_filt).count(axis=1)

Ftop1 = Scount + SDcount
Ftop2 = Scount * SDcount
Fbottom1 = np.square(df[prefix+'Cohens_d']) 
Fbottom2 =  2*(Scount + SDcount)


df[prefix+'Var_d'] = (Ftop1/Ftop2) + (Fbottom1 /Fbottom2)

In [18]:
#check output
df[prefix+'Var_d'].head()

Probesets
1427138_at      0.400056
1425600_a_at    0.401766
1457168_at      0.405141
1450135_at      0.434419
1424014_at      0.400840
Name: 430AV2_CerCx_9h_Var_d, dtype: float64

In [19]:
df[prefix+'Var_g'] = df[prefix+'Var_d'] * np.square(df[prefix+'J'])

In [20]:
# Calculating SEg
df[prefix+'SEg'] = np.sqrt(df[prefix+'Var_g'])

In [21]:
df.sort_values(by= '430AV2_CerCx_9h_Hedges_g', ascending=False, inplace=True)
df

Unnamed: 0_level_0,Symbol,Definition,Ensembl_id,Entrez_id,Unigene_id,GO-Process,GO-Function,GO-Component,Pathway_info,Putative microRNA binding sites,...,ANOVA-adjp,largest fold,430AV2_CerCx_9h_Enrich,430AV2_CerCx_9h_poolStDev,430AV2_CerCx_9h_Cohens_d,430AV2_CerCx_9h_J,430AV2_CerCx_9h_Hedges_g,430AV2_CerCx_9h_Var_d,430AV2_CerCx_9h_Var_g,430AV2_CerCx_9h_SEg
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1458623_at,,tau tubulin kinase 1 [Source:MGI Symbol;Acc:MG...,ENSMUSG00000015599,,,,,,,"mmu-miR-10a(miRanda), mmu-miR-10b(miRanda), mm...",...,1.243915e-12,0.777824,0.201736,0.032797,6.150965,0.916667,5.638385,2.291719,1.925680,1.387689
1416041_at,Sgk1,serum/glucocorticoid regulated kinase,ENSMUSG00000019970,20393,,cellular sodium ion homeostasis // protein ami...,protein binding // protein kinase activity // ...,cytoplasm // nucleus // endoplasmic reticulum,Insulin Signaling:WP65(WikiPathways) // IL-6 s...,"mmu-let-7e(RNAhybrid|miRanda), mmu-miR-1(RNAhy...",...,5.599883e-20,2.261174,0.790862,0.148716,5.317921,0.916667,4.874761,1.814014,1.524276,1.234616
1433607_at,Cbln4,cerebellin 4 precursor protein [Source:MGI Sym...,ENSMUSG00000067578,228942,,,,extracellular region // synapse // cell junction,,"mmu-miR-106a(miRanda), mmu-miR-106b(miRanda), ...",...,2.154191e-16,1.676918,1.115736,0.227300,4.908646,0.916667,4.499592,1.604740,1.348427,1.161218
1444676_at,,,,,Mm.442250,,,,,,...,1.935955e-02,0.452134,0.376074,0.082896,4.536672,0.916667,4.158616,1.429070,1.200815,1.095817
1457690_at,Kalrn,"kalirin, RhoGEF kinase [Source:MGI Symbol;Acc:...",ENSMUSG00000061751,545156,,protein amino acid phosphorylation // regulati...,magnesium ion binding // transferase activity ...,cytoplasm // cytoskeleton // intracellular,,"mmu-let-7a(RNAhybrid|miRanda), mmu-let-7b(RNAh...",...,1.304247e-20,1.470298,0.517502,0.114397,4.523740,0.916667,4.146762,1.423211,1.195893,1.093569
1416976_at,Stam2,signal transducing adaptor molecule (SH3 domai...,ENSMUSG00000055371,56324,,cell surface receptor linked signal transducti...,transmembrane receptor activity // protein bin...,cytoplasm // endosome // early endosome membra...,IL-2 Signaling Pathway:WP450(WikiPathways) // ...,"mmu-miR-1(miRanda), mmu-miR-101(TargetScan), m...",...,2.433943e-05,0.461406,0.303102,0.067893,4.464378,0.916667,4.092346,1.396533,1.173476,1.083271
1436790_a_at,Sox11,SRY-box containing gene 11,ENSMUSG00000063632,20666,,"transcription // regulation of transcription, ...",RNA polymerase II transcription factor activit...,nucleus,,"mmu-miR-101(TargetScan), mmu-miR-101a(miRanda)...",...,4.628929e-04,1.391656,0.975186,0.223051,4.372026,0.916667,4.007690,1.355730,1.139190,1.067329
1417262_at,Ptgs2,Prostaglandin-endoperoxide synthase 2,ENSMUSG00000032487,19225,,keratinocyte differentiation // prostaglandin ...,"oxidoreductase activity, acting on single dono...",endoplasmic reticulum membrane // protein comp...,XPodNet - protein-protein interactions in the ...,"mmu-miR-101(TargetScan|pictar), mmu-miR-101a(m...",...,4.143042e-29,3.221362,1.056238,0.249727,4.229578,0.916667,3.877113,1.294467,1.087711,1.042934
1445669_at,Spry4,sprouty homolog 4 (Drosophila) [Source:MGI Sym...,ENSMUSG00000024427,24066,,negative regulation of MAP kinase activity // ...,protein binding,cytoplasm // membrane,,"mmu-miR-101a(miRanda), mmu-miR-101b(miRanda), ...",...,3.522330e-10,0.857748,0.448162,0.108308,4.137857,0.916667,3.793036,1.256093,1.055467,1.027359
1417394_at,Klf4,Kruppel-like factor 4 (gut),ENSMUSG00000003032,16600,,response to chemical stimulus // epidermal cel...,nucleic acid binding // transcription factor a...,intracellular // nucleus,PluriNetWork:WP1763(WikiPathways) // White fat...,"mmu-miR-1(RNAhybrid|miRanda|pictar), mmu-miR-1...",...,5.040406e-11,1.177718,0.582980,0.140934,4.136545,0.916667,3.791832,1.255550,1.055011,1.027137


In [22]:
df.columns

Index(['Symbol', 'Definition', 'Ensembl_id', 'Entrez_id', 'Unigene_id',
       'GO-Process', 'GO-Function', 'GO-Component', 'Pathway_info',
       'Putative microRNA binding sites',
       ...
       'ANOVA-adjp', 'largest fold', '430AV2_CerCx_9h_Enrich',
       '430AV2_CerCx_9h_poolStDev', '430AV2_CerCx_9h_Cohens_d',
       '430AV2_CerCx_9h_J', '430AV2_CerCx_9h_Hedges_g',
       '430AV2_CerCx_9h_Var_d', '430AV2_CerCx_9h_Var_g',
       '430AV2_CerCx_9h_SEg'],
      dtype='object', length=145)

### Import key file from BioMart and index probesets to MGI gene symbols

In [23]:
dfX=pd.read_table('../FHS project/Sleep notebook Copy/BioMart_Ensmbl_index/mart_export72_430v2430Av2.txt',index_col=[3])
 
dfX.pop('Affy mouse430 2 probeset') # remove 430V2 probeset info (not needed for 430AV2 indexing)
dfX.head(5)

Unnamed: 0_level_0,Ensembl Gene ID,Description,MGI symbol
Affy mouse430a 2 probeset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1417126_a_at,ENSMUSG00000039221,ribosomal protein L22 like 1 [Source:MGI Symbo...,Rpl22l1
,ENSMUSG00000095611,predicted gene 10597 [Source:MGI Symbol;Acc:MG...,Gm10597
1417730_at,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1
1417730_at,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1
,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1


In [24]:
df_Join = df.join(dfX, how='left', sort=True)
df_FINAL1 = df_Join.groupby('MGI symbol').mean()
df_FINAL1[df_FINAL1.index.duplicated()==True]   # checking that no duplicate entries exist in the dataframe

Unnamed: 0_level_0,GSM149516_CerCx_3hS.CEL,GSM149517_CerCx_3hS.CEL,GSM149518_CerCx_3hS.CEL,GSM149519_CerCx_3hS.CEL,GSM149520_CerCx_3hS.CEL,avg-CerCx_3hS,log_fold-CerCx_3hS_vs_CerCx_3hSD,fold-CerCx_3hS_vs_CerCx_3hSD,rawp-CerCx_3hS_vs_CerCx_3hSD,adjp-CerCx_3hS_vs_CerCx_3hSD,...,ANOVA-adjp,largest fold,430AV2_CerCx_9h_Enrich,430AV2_CerCx_9h_poolStDev,430AV2_CerCx_9h_Cohens_d,430AV2_CerCx_9h_J,430AV2_CerCx_9h_Hedges_g,430AV2_CerCx_9h_Var_d,430AV2_CerCx_9h_Var_g,430AV2_CerCx_9h_SEg
MGI symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


### Columns from the list above can then easily be picked to produce files for use later. Examples below given:
 #### df3 = average S and SD expression for the platform and the log-fold changes
 #### df4 = Hedges g  values and associated variance for Meta-analysis (after indexing)

In [25]:
# df3 = df_FINAL1.loc[:,[u'avg-SD', u'avg-S', u'log_fold-S_vs_SD']]
# df3.columns =[prefix+'avg-SD', prefix+'avg-S', prefix+'log_fold-S_vs_SD']
# df3.to_csv('input_files/430AV2_SymbolExpression_forIndex.csv')

In [26]:
df4 = df_FINAL1.loc[:,[u'430AV2_CerCx_9h_Enrich',u'430AV2_CerCx_9h_Hedges_g', u'430AV2_CerCx_9h_Var_g', u'430AV2_CerCx_9h_SEg']]
df4.to_csv('../FHS project/Sleep notebook Copy/IPython_notebooks/input_files/430AV2_CerCx_9h_SymbolforIndexHedges.csv')

In [27]:
df4.head(10)  # check final ouput

Unnamed: 0_level_0,430AV2_CerCx_9h_Enrich,430AV2_CerCx_9h_Hedges_g,430AV2_CerCx_9h_Var_g,430AV2_CerCx_9h_SEg
MGI symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0610005C13Rik,-0.014928,-0.071619,0.336368,0.579972
0610008F07Rik,-0.141728,-0.789664,0.36729,0.606044
0610009B22Rik,0.207008,0.502481,0.348735,0.590538
0610009D07Rik,0.352971,0.804696,0.36862,0.607131
0610009O20Rik,-0.177406,-1.440144,0.439812,0.663183
0610010K14Rik,-0.146722,-0.279251,0.34001,0.583104
0610012G03Rik,0.114483,0.700775,0.425762,0.646072
0610031J06Rik,-0.061704,-0.209619,0.338308,0.581643
0610037L13Rik,0.16104,0.457902,0.346595,0.588723
0610040J01Rik,-0.028618,-0.142315,0.337124,0.580624
