# Calculations of the Effect Size (ES) for each microarray study 

###  Using Hedges' g value, an adjusted Cohen's d  value

$$  {Enrichment} = \bar{X_2}-\bar{X_1}$$

Let Group 1 be 6hSleeping whole brain AKR/J strain (AK) Expression values and Group 2 be 6hSD whole brain AKR/J strain (AK) Expression values 

(S mean - SD mean) **(Logged values, so minus gives ratio)** 

$$  {Pooled\ Standard\  Deviation} = \sqrt\frac{(n_1-1)S_1^2 +(n_2-1)S_2^2}{(n_2 +n_2) -2}  $$  

$$  {Cohen's\ d\ value} = \frac{Enrichment}{Pooled\ Standard\ Deviation} $$

$$  {Correction\ Factor (J\ Factor)} = 1- \frac{3}{4df-1} $$

$$  {Hedges'\ g\ value} = Cohen's\ d\ \text{x}\ J\ $$

$$  {Variance\ in\ d (V_d)} = \frac{n_1- +n_2}{n_1 n_2} + \frac{d^2}{2(n_1 +n_2)}  $$

$$  {Variance\ in\ g (V_g)} = J^2\  \text{x}\ V_d  $$

$$  {Standard\ Error\ in\ g (SE_g)} = \sqrt{V_g}  $$

## Setup working environment and import data

In [1]:
import pandas as pd # Dataframes and file IO
import numpy as np # numerical calculations
%cd /Users/Ella1/Desktop/data sets 430AV2


/Users/Ella1/Desktop/data sets 430AV2


In [2]:
prefix = '430AV2_WB_AK_'   # define a prefix to add to column names (making indexing easier later)

In [3]:
# import the data file to a data frame 'df'
df=pd.read_table('DATASET-GSE9441.txt', delimiter='\t',  index_col=0) #,nrows=500)  
df.shape

(45101, 51)

In [4]:
# remove probes that are know to cross-hybridise to more than one target
df =df[~df.index.str.contains('_x_|_s_')]    #   important reverse selector ~ 
df.shape

(40569, 51)

## Look at column names and then setup filters for grouping columns into S and SD groups

In [5]:
df.columns

Index(['Symbol', 'Definition', 'Ensembl_id', 'Entrez_id', 'Unigene_id',
       'GO-Process', 'GO-Function', 'GO-Component', 'Pathway_info',
       'Putative microRNA binding sites', 'Select Cellular Compartments',
       'Select Protein Classes', 'GSM239832_AK_S.CEL', 'GSM239833_AK_S.CEL',
       'GSM239834_AK_S.CEL', 'avg-AK_S', 'log_fold-AK_S_vs_AK_SD',
       'fold-AK_S_vs_AK_SD', 'rawp-AK_S_vs_AK_SD', 'adjp-AK_S_vs_AK_SD',
       'GSM239835_AK_SD.CEL', 'GSM239836_AK_SD.CEL', 'GSM239837_AK_SD.CEL',
       'avg-AK_SD', 'GSM239838_B6_S.CEL', 'GSM239839_B6_S.CEL',
       'GSM239840_B6_S.CEL', 'avg-B6_S', 'log_fold-B6_S_vs_B6_SD',
       'fold-B6_S_vs_B6_SD', 'rawp-B6_S_vs_B6_SD', 'adjp-B6_S_vs_B6_SD',
       'GSM239841_B6_SD.CEL', 'GSM239842_B6_SD.CEL', 'GSM239843_B6_SD.CEL',
       'avg-B6_SD', 'GSM239844_D2_S.CEL', 'GSM239845_D2_S.CEL',
       'GSM239846_D2_S.CEL', 'avg-D2_S', 'log_fold-D2_S_vs_D2_SD',
       'fold-D2_S_vs_D2_SD', 'rawp-D2_S_vs_D2_SD', 'adjp-D2_S_vs_D2_SD',
       'G

In [6]:
# define regular expressions for sleep (S) and sleep dep (SD) filters 
s_filt ='AK_S.CEL'
sd_filt ='AK_SD.CEL'

In [7]:
df_s=df.filter(regex= s_filt)
df_s.head()

Unnamed: 0_level_0,GSM239832_AK_S.CEL,GSM239833_AK_S.CEL,GSM239834_AK_S.CEL
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1427138_at,5.10971,5.03325,5.28008
1425600_a_at,6.69959,6.69525,6.73935
1457168_at,4.00329,4.47225,4.26492
1450135_at,4.02864,4.25284,4.33413
1424014_at,7.87437,7.91219,7.90768


In [8]:
df_sd=df.filter(regex= sd_filt)
df_sd.head()

Unnamed: 0_level_0,GSM239835_AK_SD.CEL,GSM239836_AK_SD.CEL,GSM239837_AK_SD.CEL
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1427138_at,4.92819,4.84399,5.22099
1425600_a_at,6.42856,6.6083,6.94887
1457168_at,4.20079,4.11243,4.48287
1450135_at,4.0613,3.93757,3.78229
1424014_at,7.97069,7.86162,7.92039


## Calculations 

In [9]:
# Enrichment

df[prefix+'Enrich'] = df.filter(regex=sd_filt).mean(axis=1) - df.filter(regex=s_filt).mean(axis=1)

In [10]:
df[prefix+'Enrich'].head()

Probesets
1427138_at     -0.143290
1425600_a_at   -0.049487
1457168_at      0.018543
1450135_at     -0.278150
1424014_at      0.019487
Name: 430AV2_WB_AK_Enrich, dtype: float64

In [11]:
# Calculating Pooled StDev
Scount = df.filter(regex=s_filt).count(axis=1)
SDcount = df.filter(regex=sd_filt).count(axis=1)

StdevS = (Scount-1) * df.filter(regex=s_filt).var(axis=1)
StdevSD = (SDcount-1) * df.filter(regex=sd_filt).var(axis=1)

df[prefix+'poolStDev'] = np.sqrt((StdevS+StdevSD)/(Scount+ SDcount-2))

In [12]:
# Calculating Cohen's d
df[prefix+'Cohens_d'] = df[prefix+'Enrich'] / df[prefix+'poolStDev']

In [13]:
#df[prefix+'poolStDev'].head()
df[prefix+'Cohens_d'] .head()

Probesets
1427138_at     -0.863094
1425600_a_at   -0.263714
1457168_at      0.086150
1450135_at     -1.863099
1424014_at      0.472152
Name: 430AV2_WB_AK_Cohens_d, dtype: float64

In [14]:
# Calculating J value (Correction factor)

df[prefix+'J'] = 1-(3/(4*(Scount+SDcount-1)))                              


In [15]:
# Calculating Hedge's g

df[prefix+'Hedges_g'] = df[prefix+'Cohens_d'] * df[prefix+'J']

In [16]:
#df[prefix+'J'].head()
df[prefix+'Hedges_g'] .head()

Probesets
1427138_at     -0.733630
1425600_a_at   -0.224157
1457168_at      0.073228
1450135_at     -1.583634
1424014_at      0.401330
Name: 430AV2_WB_AK_Hedges_g, dtype: float64

In [17]:
# Calculating Var_d
Scount = df.filter(regex=s_filt).count(axis=1)
SDcount = df.filter(regex=sd_filt).count(axis=1)

Ftop1 = Scount + SDcount
Ftop2 = Scount * SDcount
Fbottom1 = np.square(df[prefix+'Cohens_d']) 
Fbottom2 =  2*(Scount + SDcount)


df[prefix+'Var_d'] = (Ftop1/Ftop2) + (Fbottom1 /Fbottom2)

In [18]:
#check output
df[prefix+'Var_d'].head()

Probesets
1427138_at      0.728744
1425600_a_at    0.672462
1457168_at      0.667285
1450135_at      0.955928
1424014_at      0.685244
Name: 430AV2_WB_AK_Var_d, dtype: float64

In [19]:
df[prefix+'Var_g'] = df[prefix+'Var_d'] * np.square(df[prefix+'J'])

In [20]:
# Calculating SEg
df[prefix+'SEg'] = np.sqrt(df[prefix+'Var_g'])

In [21]:
df.sort_values(by= '430AV2_WB_AK_Hedges_g', ascending=False, inplace=True)
df

Unnamed: 0_level_0,Symbol,Definition,Ensembl_id,Entrez_id,Unigene_id,GO-Process,GO-Function,GO-Component,Pathway_info,Putative microRNA binding sites,...,ANOVA-adjp,largest fold,430AV2_WB_AK_Enrich,430AV2_WB_AK_poolStDev,430AV2_WB_AK_Cohens_d,430AV2_WB_AK_J,430AV2_WB_AK_Hedges_g,430AV2_WB_AK_Var_d,430AV2_WB_AK_Var_g,430AV2_WB_AK_SEg
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1440227_at,Slc5a3,solute carrier family 5 (inositol transporters...,ENSMUSG00000089774,53881,,peripheral nervous system development // ion t...,structural constituent of ribosome // transpor...,ribonucleoprotein complex // ribosome // intra...,,"mmu-miR-1(miRanda), mmu-miR-124(TargetScan), m...",...,1.662375e-02,0.748667,0.748667,0.023203,32.265268,0.85,27.425478,87.420625,63.161401,7.947415
1438040_a_at,Hsp90b1,"heat shock protein 90kDa beta (Grp94), member 1",ENSMUSG00000020048,22027,,response to stress // protein folding,calcium ion binding // low-density lipoprotein...,melanosome // endoplasmic reticulum lumen // p...,Focal Adhesion-PI3K-Akt-mTOR-signaling pathway...,"mmu-miR-1(miRanda|pictar), mmu-miR-122(miRanda...",...,3.049687e-02,0.875340,0.738660,0.023851,30.970395,0.85,26.324835,80.597111,58.231413,7.630951
1458636_at,Rnf219,ring finger protein 219 [Source:MGI Symbol;Acc...,ENSMUSG00000022120,72486,,,protein binding // metal ion binding // zinc i...,,,"mmu-miR-101a(miRanda), mmu-miR-101b(miRanda), ...",...,7.238425e-02,0.426777,0.306273,0.014971,20.457577,0.85,17.388940,35.542703,25.679603,5.067505
1434228_at,Ppm2c,pyruvate dehyrogenase phosphatase catalytic su...,ENSMUSG00000049225,381511,,protein amino acid dephosphorylation,calcium ion binding // magnesium ion binding /...,mitochondrion // protein serine/threonine phos...,TCA Cycle:WP434(WikiPathways),"mmu-miR-101a(miRanda), mmu-miR-101b(miRanda), ...",...,1.240288e-02,0.672233,0.620687,0.038663,16.053949,0.85,13.645856,22.144106,15.999116,3.999890
1417185_at,Ly6a,"lymphocyte antigen 6 complex, locus A",ENSMUSG00000075602,110454,,,,anchored to membrane // intracellular // exter...,,"mmu-miR-138(miRanda), mmu-miR-15a(miRanda), mm...",...,1.000640e-02,0.764543,0.372260,0.023690,15.713971,0.85,13.356876,21.244074,15.348844,3.917760
1417606_a_at,Calr,calreticulin,ENSMUSG00000003814|ENSMUSG00000081731,12317,,cortical actin cytoskeleton organization // se...,calcium ion binding // sugar binding // zinc i...,extracellular space // MHC class I peptide loa...,Calcium Regulation in the Cardiac Cell:WP553(W...,"mmu-let-7b(RNAhybrid|miRanda), mmu-let-7c(RNAh...",...,1.205508e-05,0.765267,0.765267,0.054333,14.084687,0.85,11.971984,17.198201,12.425700,3.525011
1450768_at,Dlg1 /// LOC100047603,"discs, large homolog 1 (Drosophila)",ENSMUSG00000022770,100047603|13383,,positive regulation of multicellular organism ...,protein binding // phosphatase binding // prot...,endoplasmic reticulum membrane // postsynaptic...,Wnt Signaling Pathway NetPath:WP539(WikiPathways),"mmu-let-7b(RNAhybrid|miRanda), mmu-let-7f(RNAh...",...,4.321327e-01,0.514687,0.251230,0.018025,13.937977,0.85,11.847281,16.855601,12.178172,3.489724
1416064_a_at,Hspa5,heat shock protein 5,ENSMUSG00000026864,14828,,response to stress // ER overload response,protein binding // nucleotide binding // ATP b...,endoplasmic reticulum lumen // melanosome // e...,MAPK signaling pathway:WP493(WikiPathways),"mmu-miR-1192(TargetScan|miRanda), mmu-miR-1193...",...,1.191694e-07,1.365547,1.365547,0.104844,13.024603,0.85,11.070912,14.803356,10.695425,3.270386
1435246_at,Paqr8,progestin and adipoQ receptor family member VI...,ENSMUSG00000025931,74229,,,steroid binding // receptor activity // lipid ...,integral to membrane // membrane,,"mmu-miR-124(TargetScan|miRanda), mmu-miR-124ab...",...,2.974382e-03,0.944110,0.269030,0.022424,11.997636,0.85,10.197991,12.661939,9.148251,3.024608
1420886_a_at,Xbp1,X-box binding protein 1,ENSMUSG00000020484,22433,,"transcription // regulation of transcription, ...",transcription factor activity // DNA binding /...,nucleus,Insulin Signaling:WP65(WikiPathways),"mmu-miR-101a(RNAhybrid|miRanda), mmu-miR-101b(...",...,1.282237e-04,1.084927,1.084927,0.090951,11.928637,0.85,10.139342,12.524366,9.048854,3.008131


In [22]:
df.columns

Index(['Symbol', 'Definition', 'Ensembl_id', 'Entrez_id', 'Unigene_id',
       'GO-Process', 'GO-Function', 'GO-Component', 'Pathway_info',
       'Putative microRNA binding sites', 'Select Cellular Compartments',
       'Select Protein Classes', 'GSM239832_AK_S.CEL', 'GSM239833_AK_S.CEL',
       'GSM239834_AK_S.CEL', 'avg-AK_S', 'log_fold-AK_S_vs_AK_SD',
       'fold-AK_S_vs_AK_SD', 'rawp-AK_S_vs_AK_SD', 'adjp-AK_S_vs_AK_SD',
       'GSM239835_AK_SD.CEL', 'GSM239836_AK_SD.CEL', 'GSM239837_AK_SD.CEL',
       'avg-AK_SD', 'GSM239838_B6_S.CEL', 'GSM239839_B6_S.CEL',
       'GSM239840_B6_S.CEL', 'avg-B6_S', 'log_fold-B6_S_vs_B6_SD',
       'fold-B6_S_vs_B6_SD', 'rawp-B6_S_vs_B6_SD', 'adjp-B6_S_vs_B6_SD',
       'GSM239841_B6_SD.CEL', 'GSM239842_B6_SD.CEL', 'GSM239843_B6_SD.CEL',
       'avg-B6_SD', 'GSM239844_D2_S.CEL', 'GSM239845_D2_S.CEL',
       'GSM239846_D2_S.CEL', 'avg-D2_S', 'log_fold-D2_S_vs_D2_SD',
       'fold-D2_S_vs_D2_SD', 'rawp-D2_S_vs_D2_SD', 'adjp-D2_S_vs_D2_SD',
       'G

### Import key file from BioMart and index probesets to MGI gene symbols

In [23]:
dfX=pd.read_table('../FHS project/Sleep notebook Copy/BioMart_Ensmbl_index/mart_export72_430v2430Av2.txt',index_col=[3])
 
dfX.pop('Affy mouse430 2 probeset') # remove 430V2 probeset info (not needed for 430AV2 indexing)
dfX.head(5)

Unnamed: 0_level_0,Ensembl Gene ID,Description,MGI symbol
Affy mouse430a 2 probeset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1417126_a_at,ENSMUSG00000039221,ribosomal protein L22 like 1 [Source:MGI Symbo...,Rpl22l1
,ENSMUSG00000095611,predicted gene 10597 [Source:MGI Symbol;Acc:MG...,Gm10597
1417730_at,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1
1417730_at,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1
,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1


In [24]:
df_Join = df.join(dfX, how='left', sort=True)
df_FINAL1 = df_Join.groupby('MGI symbol').mean()
df_FINAL1[df_FINAL1.index.duplicated()==True]   # checking that no duplicate entries exist in the dataframe

Unnamed: 0_level_0,GSM239832_AK_S.CEL,GSM239833_AK_S.CEL,GSM239834_AK_S.CEL,avg-AK_S,log_fold-AK_S_vs_AK_SD,fold-AK_S_vs_AK_SD,rawp-AK_S_vs_AK_SD,adjp-AK_S_vs_AK_SD,GSM239835_AK_SD.CEL,GSM239836_AK_SD.CEL,...,ANOVA-adjp,largest fold,430AV2_WB_AK_Enrich,430AV2_WB_AK_poolStDev,430AV2_WB_AK_Cohens_d,430AV2_WB_AK_J,430AV2_WB_AK_Hedges_g,430AV2_WB_AK_Var_d,430AV2_WB_AK_Var_g,430AV2_WB_AK_SEg
MGI symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


### Columns from the list above can then easily be picked to produce files for use later. Examples below given:
 #### df3 = average S and SD expression for the platform and the log-fold changes
 #### df4 = Hedges g  values and associated variance for Meta-analysis (after indexing)

In [25]:
# df3 = df_FINAL1.loc[:,[u'avg-SD', u'avg-S', u'log_fold-S_vs_SD']]
# df3.columns =[prefix+'avg-SD', prefix+'avg-S', prefix+'log_fold-S_vs_SD']
# df3.to_csv('input_files/430AV2_SymbolExpression_forIndex.csv')

In [26]:
df4 = df_FINAL1.loc[:,[u'430AV2_WB_AK_Enrich',u'430AV2_WB_AK_Hedges_g', u'430AV2_WB_AK_Var_g', u'430AV2_WB_AK_SEg']]
df4.to_csv('../FHS project/Sleep notebook Copy/IPython_notebooks/input_files/430AV2_WB_AK_SymbolforIndexHedges.csv')

In [27]:
df4.head(10)  # check final ouput

Unnamed: 0_level_0,430AV2_WB_AK_Enrich,430AV2_WB_AK_Hedges_g,430AV2_WB_AK_Var_g,430AV2_WB_AK_SEg
MGI symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0610005C13Rik,-0.096293,-0.623884,0.514103,0.717009
0610008F07Rik,0.06599,1.367883,0.637592,0.798494
0610009B22Rik,0.263427,1.029843,0.570048,0.755015
0610009D07Rik,-0.021945,-0.08448,0.486485,0.697481
0610009O20Rik,-0.095423,-0.902389,0.549525,0.7413
0610010K14Rik,-0.328433,-0.469238,0.500015,0.707118
0610012G03Rik,0.097141,0.499229,0.552501,0.740421
0610031J06Rik,-0.01314,-0.208273,0.485281,0.696621
0610037L13Rik,-0.09394,-0.478124,0.500717,0.707613
0610040J01Rik,0.099143,0.580902,0.509787,0.713994
