# Calculations of the Effect Size (ES) for each microarray study 

###  Using Hedges' g value, an adjusted Cohen's d  value

$$  {Enrichment} = \bar{X_2}-\bar{X_1}$$

Let Group 1 be 6hSleeping whole brain C57/BL6 strain (B6) Expression values and Group 2 be 6hSD whole brain C57/BL6 strain (B6) Expression values 

(S mean - SD mean) **(Logged values, so minus gives ratio)** 

$$  {Pooled\ Standard\  Deviation} = \sqrt\frac{(n_1-1)S_1^2 +(n_2-1)S_2^2}{(n_2 +n_2) -2}  $$  

$$  {Cohen's\ d\ value} = \frac{Enrichment}{Pooled\ Standard\ Deviation} $$

$$  {Correction\ Factor (J\ Factor)} = 1- \frac{3}{4df-1} $$

$$  {Hedges'\ g\ value} = Cohen's\ d\ \text{x}\ J\ $$

$$  {Variance\ in\ d (V_d)} = \frac{n_1- +n_2}{n_1 n_2} + \frac{d^2}{2(n_1 +n_2)}  $$

$$  {Variance\ in\ g (V_g)} = J^2\  \text{x}\ V_d  $$

$$  {Standard\ Error\ in\ g (SE_g)} = \sqrt{V_g}  $$

## Setup working environment and import data

In [28]:
import pandas as pd # Dataframes and file IO
import numpy as np # numerical calculations
%cd /Users/Ella1/Desktop/data sets 430AV2


/Users/Ella1/Desktop/data sets 430AV2


In [29]:
prefix = '430AV2_ZT6_B6_'   # define a prefix to add to column names (mB6ing indexing easier later)

In [30]:
# import the data file to a data frame 'df'
df=pd.read_table('DATASET-GSE9442.txt', delimiter='\t',  index_col=0) #,nrows=500)  
df.shape

(45101, 87)

In [31]:
# remove probes that are know to cross-hybridise to more than one target
df =df[~df.index.str.contains('_x_|_s_')]    #   important reverse selector ~ 
df.shape

(40569, 87)

## Look at column names and then setup filters for grouping columns into S and SD groups

In [32]:
df.columns

Index(['Symbol', 'Definition', 'Ensembl_id', 'Entrez_id', 'Unigene_id',
       'GO-Process', 'GO-Function', 'GO-Component', 'Pathway_info',
       'Putative microRNA binding sites', 'Select Cellular Compartments',
       'Select Protein Classes', 'GSM239868_AK_S_ZT6.CEL',
       'GSM239869_AK_S_ZT6.CEL', 'GSM239870_AK_S_ZT6.CEL', 'avg-AK_S_ZT6',
       'log_fold-AK_S_ZT6_vs_AK_SD_ZT6', 'fold-AK_S_ZT6_vs_AK_SD_ZT6',
       'rawp-AK_S_ZT6_vs_AK_SD_ZT6', 'adjp-AK_S_ZT6_vs_AK_SD_ZT6',
       'GSM239871_AK_S_ZT12.CEL', 'GSM239872_AK_S_ZT12.CEL',
       'GSM239873_AK_S_ZT12.CEL', 'avg-AK_S_ZT12',
       'log_fold-AK_S_ZT12_vs_AK_SD_ZT12', 'fold-AK_S_ZT12_vs_AK_SD_ZT12',
       'rawp-AK_S_ZT12_vs_AK_SD_ZT12', 'adjp-AK_S_ZT12_vs_AK_SD_ZT12',
       'GSM239880_AK_SD_ZT6.CEL', 'GSM239881_AK_SD_ZT6.CEL',
       'GSM239882_AK_SD_ZT6.CEL', 'avg-AK_SD_ZT6', 'GSM239883_AK_SD_ZT12.CEL',
       'GSM239884_AK_SD_ZT12.CEL', 'GSM239885_AK_SD_ZT12.CEL',
       'avg-AK_SD_ZT12', 'GSM239891_B6_S_ZT6.CEL',
  

In [33]:
# define regular expressions for sleep (S) and sleep dep (SD) filters 
s_filt ='B6_S_ZT6.CEL'
sd_filt ='B6_SD_ZT6.CEL'

In [34]:
df_s=df.filter(regex= s_filt)
df_s.head()

Unnamed: 0_level_0,GSM239891_B6_S_ZT6.CEL,GSM239893_B6_S_ZT6.CEL
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1
1427138_at,6.23366,6.25013
1425600_a_at,7.54536,7.24677
1457168_at,4.71756,4.38148
1450135_at,4.89215,5.05446
1424014_at,7.96183,8.10473


In [35]:
df_sd=df.filter(regex= sd_filt)
df_sd.head()

Unnamed: 0_level_0,GSM239903_B6_SD_ZT6.CEL,GSM239904_B6_SD_ZT6.CEL,GSM239905_B6_SD_ZT6.CEL
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1427138_at,6.02424,6.04533,6.47572
1425600_a_at,7.50737,7.62468,7.23338
1457168_at,4.46026,4.65893,4.72749
1450135_at,5.00071,5.38261,4.97833
1424014_at,8.16797,8.16044,8.19058


## Calculations 

In [36]:
# Enrichment

df[prefix+'Enrich'] = df.filter(regex=sd_filt).mean(axis=1) - df.filter(regex=s_filt).mean(axis=1)

In [37]:
df[prefix+'Enrich'].head()

Probesets
1427138_at     -0.060132
1425600_a_at    0.059078
1457168_at      0.066040
1450135_at      0.147245
1424014_at      0.139717
Name: 430AV2_ZT6_B6_Enrich, dtype: float64

In [38]:
# Calculating Pooled StDev
Scount = df.filter(regex=s_filt).count(axis=1)
SDcount = df.filter(regex=sd_filt).count(axis=1)

StdevS = (Scount-1) * df.filter(regex=s_filt).var(axis=1)
StdevSD = (SDcount-1) * df.filter(regex=sd_filt).var(axis=1)

df[prefix+'poolStDev'] = np.sqrt((StdevS+StdevSD)/(Scount+ SDcount-2))

In [39]:
# Calculating Cohen's d
df[prefix+'Cohens_d'] = df[prefix+'Enrich'] / df[prefix+'poolStDev']

In [40]:
#df[prefix+'poolStDev'].head()
df[prefix+'Cohens_d'] .head()

Probesets
1427138_at     -0.288892
1425600_a_at    0.289161
1457168_at      0.371108
1450135_at      0.747408
1424014_at      2.339214
Name: 430AV2_ZT6_B6_Cohens_d, dtype: float64

In [41]:
# Calculating J value (Correction factor)

df[prefix+'J'] = 1-(3/(4*(Scount+SDcount-1)))                              


In [42]:
# Calculating Hedge's g

df[prefix+'Hedges_g'] = df[prefix+'Cohens_d'] * df[prefix+'J']

In [43]:
#df[prefix+'J'].head()
df[prefix+'Hedges_g'] .head()

Probesets
1427138_at     -0.234725
1425600_a_at    0.234943
1457168_at      0.301526
1450135_at      0.607269
1424014_at      1.900611
Name: 430AV2_ZT6_B6_Hedges_g, dtype: float64

In [44]:
# Calculating Var_d
Scount = df.filter(regex=s_filt).count(axis=1)
SDcount = df.filter(regex=sd_filt).count(axis=1)

Ftop1 = Scount + SDcount
Ftop2 = Scount * SDcount
Fbottom1 = np.square(df[prefix+'Cohens_d']) 
Fbottom2 =  2*(Scount + SDcount)


df[prefix+'Var_d'] = (Ftop1/Ftop2) + (Fbottom1 /Fbottom2)

In [45]:
#check output
df[prefix+'Var_d'].head()

Probesets
1427138_at      0.841679
1425600_a_at    0.841695
1457168_at      0.847105
1450135_at      0.889195
1424014_at      1.380526
Name: 430AV2_ZT6_B6_Var_d, dtype: float64

In [46]:
df[prefix+'Var_g'] = df[prefix+'Var_d'] * np.square(df[prefix+'J'])

In [47]:
# Calculating SEg
df[prefix+'SEg'] = np.sqrt(df[prefix+'Var_g'])

In [48]:
df.sort_values(by= '430AV2_ZT6_B6_Hedges_g', ascending=False, inplace=True)
df

Unnamed: 0_level_0,Symbol,Definition,Ensembl_id,Entrez_id,Unigene_id,GO-Process,GO-Function,GO-Component,Pathway_info,Putative microRNA binding sites,...,ANOVA-adjp,largest fold,430AV2_ZT6_B6_Enrich,430AV2_ZT6_B6_poolStDev,430AV2_ZT6_B6_Cohens_d,430AV2_ZT6_B6_J,430AV2_ZT6_B6_Hedges_g,430AV2_ZT6_B6_Var_d,430AV2_ZT6_B6_Var_g,430AV2_ZT6_B6_SEg
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1419807_at,D4Ertd335e,"DNA segment, Chr 4, ERATO Doi 335, expressed",,52302,,,,,,,...,4.823408e-01,0.195850,0.103112,0.002307,44.699093,0.8125,36.318013,200.634222,132.449936,11.508690
1422287_at,Phxr2,per-hexamer repeat gene 2,ENSMUSG00000055108,18687,,,,,,,...,8.175388e-01,0.194020,0.127810,0.003969,32.202503,0.8125,26.164534,104.533454,69.008413,8.307130
1420418_at,Syt2,synaptotagmin II,ENSMUSG00000026452,20980,,transport,calcium ion binding // transporter activity //...,synapse // vesicular fraction // cytoplasm // ...,,"mmu-let-7(TargetScan), mmu-miR-125a-3p(TargetS...",...,1.059304e-02,0.848137,0.099268,0.003459,28.697415,0.8125,23.316650,83.187498,54.916747,7.410583
1460737_at,Igbp1,immunoglobulin (CD79A) binding protein 1,ENSMUSG00000031221,18518,,B cell activation // response to biotic stimul...,protein phosphatase type 2A regulator activity,cytoplasm,,"mmu-let-7f(miRanda), mmu-miR-1192(miRanda), mm...",...,2.093642e-01,0.207877,0.121710,0.004475,27.198371,0.8125,22.098676,74.808471,49.385280,7.027466
1453077_a_at,Snapc3,"small nuclear RNA activating complex, polypept...",ENSMUSG00000028483,77634,,"transcription // regulation of transcription, ...",DNA binding,nucleus,,"mmu-let-7b(RNAhybrid|miRanda), mmu-let-7c(RNAh...",...,1.538636e-07,0.829527,0.247660,0.011456,21.618915,0.8125,17.565369,47.571083,31.404348,5.603958
1444997_at,,,,,,,,,,,...,1.448792e-01,0.434707,0.211943,0.011023,19.228157,0.8125,15.622878,37.805535,24.957560,4.995754
1427154_at,Krt2,keratin 2,ENSMUSG00000064201,16681,,keratinization // keratinocyte migration // ke...,protein binding // structural molecule activity,intermediate filament // keratin filament,,"mmu-miR-125a-3p(miRanda), mmu-miR-144(miRanda)...",...,1.209205e-01,0.305073,0.128393,0.006973,18.413312,0.8125,14.960816,34.738340,22.932733,4.788813
1437609_at,Ube2u,ubiquitin-conjugating enzyme E2U (putative) [S...,ENSMUSG00000069733,381534,,post-translational protein modification // reg...,small conjugating protein ligase activity,,,"mmu-miR-125a-5p(miRanda), mmu-miR-125b-5p(miRa...",...,1.820230e-01,0.315233,0.169615,0.009275,18.288204,0.8125,14.859166,34.279174,22.629611,4.757059
1422943_a_at,Hspb1,heat shock protein 1,ENSMUSG00000004951|ENSMUSG00000078915,15507,,response to stress // response to heat,,intracellular // soluble fraction // contracti...,IL-3 Signaling Pathway:WP373(WikiPathways) // ...,"mmu-let-7a(RNAhybrid|miRanda), mmu-let-7b(RNAh...",...,2.403861e-07,1.399137,0.513740,0.032523,15.796378,0.8125,12.834557,25.785889,17.022716,4.125859
1436737_a_at,Sorbs1,sorbin and SH3 domain containing 1,ENSMUSG00000025006,20411,,focal adhesion formation // insulin receptor s...,protein binding // insulin receptor binding //...,cell-cell adherens junction // stress fiber //...,Insulin Signaling:WP65(WikiPathways) // PPAR s...,"mmu-miR-101a(miRanda), mmu-miR-125a-3p(miRanda...",...,2.809740e-05,0.398350,0.329637,0.021541,15.302923,0.8125,12.433625,24.251278,16.009633,4.001204


In [49]:
df.columns

Index(['Symbol', 'Definition', 'Ensembl_id', 'Entrez_id', 'Unigene_id',
       'GO-Process', 'GO-Function', 'GO-Component', 'Pathway_info',
       'Putative microRNA binding sites', 'Select Cellular Compartments',
       'Select Protein Classes', 'GSM239868_AK_S_ZT6.CEL',
       'GSM239869_AK_S_ZT6.CEL', 'GSM239870_AK_S_ZT6.CEL', 'avg-AK_S_ZT6',
       'log_fold-AK_S_ZT6_vs_AK_SD_ZT6', 'fold-AK_S_ZT6_vs_AK_SD_ZT6',
       'rawp-AK_S_ZT6_vs_AK_SD_ZT6', 'adjp-AK_S_ZT6_vs_AK_SD_ZT6',
       'GSM239871_AK_S_ZT12.CEL', 'GSM239872_AK_S_ZT12.CEL',
       'GSM239873_AK_S_ZT12.CEL', 'avg-AK_S_ZT12',
       'log_fold-AK_S_ZT12_vs_AK_SD_ZT12', 'fold-AK_S_ZT12_vs_AK_SD_ZT12',
       'rawp-AK_S_ZT12_vs_AK_SD_ZT12', 'adjp-AK_S_ZT12_vs_AK_SD_ZT12',
       'GSM239880_AK_SD_ZT6.CEL', 'GSM239881_AK_SD_ZT6.CEL',
       'GSM239882_AK_SD_ZT6.CEL', 'avg-AK_SD_ZT6', 'GSM239883_AK_SD_ZT12.CEL',
       'GSM239884_AK_SD_ZT12.CEL', 'GSM239885_AK_SD_ZT12.CEL',
       'avg-AK_SD_ZT12', 'GSM239891_B6_S_ZT6.CEL',
  

### Import key file from BioMart and index probesets to MGI gene symbols

In [50]:
dfX=pd.read_table('../FHS project/Sleep notebook Copy/BioMart_Ensmbl_index/mart_export72_430v2430Av2.txt',index_col=[3])
 
dfX.pop('Affy mouse430 2 probeset') # remove 430V2 probeset info (not needed for 430AV2 indexing)
dfX.head(5)

Unnamed: 0_level_0,Ensembl Gene ID,Description,MGI symbol
Affy mouse430a 2 probeset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1417126_a_at,ENSMUSG00000039221,ribosomal protein L22 like 1 [Source:MGI Symbo...,Rpl22l1
,ENSMUSG00000095611,predicted gene 10597 [Source:MGI Symbol;Acc:MG...,Gm10597
1417730_at,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1
1417730_at,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1
,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1


In [51]:
df_Join = df.join(dfX, how='left', sort=True)
df_FINAL1 = df_Join.groupby('MGI symbol').mean()
df_FINAL1[df_FINAL1.index.duplicated()==True]   # checking that no duplicate entries exist in the dataframe

Unnamed: 0_level_0,GSM239868_AK_S_ZT6.CEL,GSM239869_AK_S_ZT6.CEL,GSM239870_AK_S_ZT6.CEL,avg-AK_S_ZT6,log_fold-AK_S_ZT6_vs_AK_SD_ZT6,fold-AK_S_ZT6_vs_AK_SD_ZT6,rawp-AK_S_ZT6_vs_AK_SD_ZT6,adjp-AK_S_ZT6_vs_AK_SD_ZT6,GSM239871_AK_S_ZT12.CEL,GSM239872_AK_S_ZT12.CEL,...,ANOVA-adjp,largest fold,430AV2_ZT6_B6_Enrich,430AV2_ZT6_B6_poolStDev,430AV2_ZT6_B6_Cohens_d,430AV2_ZT6_B6_J,430AV2_ZT6_B6_Hedges_g,430AV2_ZT6_B6_Var_d,430AV2_ZT6_B6_Var_g,430AV2_ZT6_B6_SEg
MGI symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


### Columns from the list above can then easily be picked to produce files for use later. Examples below given:
 #### df3 = average S and SD expression for the platform and the log-fold changes
 #### df4 = Hedges g  values and associated variance for Meta-analysis (after indexing)

In [52]:
# df3 = df_FINAL1.loc[:,[u'avg-SD', u'avg-S', u'log_fold-S_vs_SD']]
# df3.columns =[prefix+'avg-SD', prefix+'avg-S', prefix+'log_fold-S_vs_SD']
# df3.to_csv('input_files/430AV2_SymbolExpression_forIndex.csv')

In [53]:
df4 = df_FINAL1.loc[:,[u'430AV2_ZT6_B6_Enrich',u'430AV2_ZT6_B6_Hedges_g', u'430AV2_ZT6_B6_Var_g', u'430AV2_ZT6_B6_SEg']]
df4.to_csv('../FHS project/Sleep notebook Copy/IPython_notebooks/input_files/430AV2_ZT6_B6_SymbolforIndexHedges.csv')

In [54]:
df4.head(10)  # check final ouput

Unnamed: 0_level_0,430AV2_ZT6_B6_Enrich,430AV2_ZT6_B6_Hedges_g,430AV2_ZT6_B6_Var_g,430AV2_ZT6_B6_SEg
MGI symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0610005C13Rik,0.150597,1.142943,0.680762,0.825083
0610008F07Rik,0.114313,0.892054,0.629706,0.79354
0610009B22Rik,0.062193,0.650009,0.592381,0.769663
0610009D07Rik,-0.027815,-0.332272,0.569284,0.754405
0610009O20Rik,-0.106042,-2.276908,1.068561,1.033712
0610010K14Rik,-0.204008,-1.638292,0.81853,0.904727
0610012G03Rik,0.034872,0.638659,0.775008,0.864882
0610031J06Rik,-0.053212,-0.294608,0.55881,0.747536
0610037L13Rik,-0.0619,-0.456488,0.570968,0.755624
0610040J01Rik,-0.052467,-0.365625,0.563498,0.750665
