# Calculations of the Effect Size (ES) for each microarray study 

###  Using Hedges' g value, an adjusted Cohen's d  value

$$  {Enrichment} = \bar{X_2}-\bar{X_1}$$

Let Group 1 be 12h Sleeping Cerebral Cortex Expression values and Group 2 be 12h SD Cerebral Cortex Expression values 

(S mean - SD mean) **(Logged values, so minus gives ratio)** 

$$  {Pooled\ Standard\  Deviation} = \sqrt\frac{(n_1-1)S_1^2 +(n_2-1)S_2^2}{(n_2 +n_2) -2}  $$  

$$  {Cohen's\ d\ value} = \frac{Enrichment}{Pooled\ Standard\ Deviation} $$

$$  {Correction\ Factor (J\ Factor)} = 1- \frac{3}{4df-1} $$

$$  {Hedges'\ g\ value} = Cohen's\ d\ \text{x}\ J\ $$

$$  {Variance\ in\ d (V_d)} = \frac{n_1- +n_2}{n_1 n_2} + \frac{d^2}{2(n_1 +n_2)}  $$

$$  {Variance\ in\ g (V_g)} = J^2\  \text{x}\ V_d  $$

$$  {Standard\ Error\ in\ g (SE_g)} = \sqrt{V_g}  $$

## Setup working environment and import data

In [1]:
import pandas as pd # Dataframes and file IO
import numpy as np # numerical calculations
%cd /Users/Ella1/Desktop/data sets 430AV2


/Users/Ella1/Desktop/data sets 430AV2


In [2]:
prefix = '430AV2_CerCx_12h_'   # define a prefix to add to column names (making indexing easier later)

In [3]:
# import the data file to a data frame 'df'
df=pd.read_table('DATASET-GSE6514.txt', delimiter='\t',  index_col=0) #,nrows=500)  
df.shape

(45101, 137)

In [4]:
# remove probes that are know to cross-hybridise to more than one target
df =df[~df.index.str.contains('_x_|_s_')]    #   important reverse selector ~ 
df.shape

(40569, 137)

## Look at column names and then setup filters for grouping columns into S and SD groups

In [5]:
df.columns

Index(['Symbol', 'Definition', 'Ensembl_id', 'Entrez_id', 'Unigene_id',
       'GO-Process', 'GO-Function', 'GO-Component', 'Pathway_info',
       'Putative microRNA binding sites',
       ...
       'adjp-HypoT_12hS_vs_HypoT_12hSD', 'GSM149636_HypoT_12hSD.CEL',
       'GSM149637_HypoT_12hSD.CEL', 'GSM149648_HypoT_12hSD.CEL',
       'GSM149649_HypoT_12hSD.CEL', 'GSM149650_HypoT_12hSD.CEL',
       'avg-HypoT_12hSD', 'ANOVA-rawp', 'ANOVA-adjp', 'largest fold'],
      dtype='object', length=137)

In [6]:
# define regular expressions for sleep (S) and sleep dep (SD) filters 
s_filt ='CerCx_12hS.CEL'
sd_filt ='CerCx_12hSD.CEL'

In [7]:
df_s=df.filter(regex= s_filt)
df_s.head()

Unnamed: 0_level_0,GSM149560_CerCx_12hS.CEL,GSM149561_CerCx_12hS.CEL,GSM149562_CerCx_12hS.CEL,GSM149563_CerCx_12hS.CEL,GSM149564_CerCx_12hS.CEL
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1427138_at,6.92287,6.83596,6.63258,6.81333,6.82105
1425600_a_at,8.64916,8.65114,8.77258,8.49826,8.33323
1457168_at,6.11595,5.87641,5.99933,5.78631,5.94891
1450135_at,5.77024,6.03409,5.97371,5.75598,5.55642
1424014_at,8.04668,8.10112,7.6536,8.28115,8.31959


In [8]:
df_sd=df.filter(regex= sd_filt)
df_sd.head()

Unnamed: 0_level_0,GSM149565_CerCx_12hSD.CEL,GSM149566_CerCx_12hSD.CEL,GSM149567_CerCx_12hSD.CEL,GSM149568_CerCx_12hSD.CEL,GSM149575_CerCx_12hSD.CEL
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1427138_at,6.81348,7.21456,6.73234,6.78261,7.08053
1425600_a_at,8.56389,7.80109,8.36049,8.52911,8.82191
1457168_at,6.25838,5.91617,5.95915,6.01574,5.97865
1450135_at,6.03434,6.07553,6.08746,5.88884,6.18947
1424014_at,7.72964,8.00561,7.65199,7.89556,8.14387


## Calculations 

In [9]:
# Enrichment

df[prefix+'Enrich'] = df.filter(regex=sd_filt).mean(axis=1) - df.filter(regex=s_filt).mean(axis=1)

In [10]:
df[prefix+'Enrich'].head()

Probesets
1427138_at      0.119546
1425600_a_at   -0.165576
1457168_at      0.080236
1450135_at      0.237040
1424014_at     -0.195094
Name: 430AV2_CerCx_12h_Enrich, dtype: float64

In [11]:
# Calculating Pooled StDev
Scount = df.filter(regex=s_filt).count(axis=1)
SDcount = df.filter(regex=sd_filt).count(axis=1)

StdevS = (Scount-1) * df.filter(regex=s_filt).var(axis=1)
StdevSD = (SDcount-1) * df.filter(regex=sd_filt).var(axis=1)

df[prefix+'poolStDev'] = np.sqrt((StdevS+StdevSD)/(Scount+ SDcount-2))

In [12]:
# Calculating Cohen's d
df[prefix+'Cohens_d'] = df[prefix+'Enrich'] / df[prefix+'poolStDev']

In [13]:
#df[prefix+'poolStDev'].head()
df[prefix+'Cohens_d'] .head()

Probesets
1427138_at      0.716392
1425600_a_at   -0.561708
1457168_at      0.617758
1450135_at      1.525781
1424014_at     -0.830621
Name: 430AV2_CerCx_12h_Cohens_d, dtype: float64

In [14]:
# Calculating J value (Correction factor)

df[prefix+'J'] = 1-(3/(4*(Scount+SDcount-1)))                              


In [15]:
# Calculating Hedge's g

df[prefix+'Hedges_g'] = df[prefix+'Cohens_d'] * df[prefix+'J']

In [16]:
#df[prefix+'J'].head()
df[prefix+'Hedges_g'] .head()

Probesets
1427138_at      0.656693
1425600_a_at   -0.514899
1457168_at      0.566278
1450135_at      1.398632
1424014_at     -0.761402
Name: 430AV2_CerCx_12h_Hedges_g, dtype: float64

In [17]:
# Calculating Var_d
Scount = df.filter(regex=s_filt).count(axis=1)
SDcount = df.filter(regex=sd_filt).count(axis=1)

Ftop1 = Scount + SDcount
Ftop2 = Scount * SDcount
Fbottom1 = np.square(df[prefix+'Cohens_d']) 
Fbottom2 =  2*(Scount + SDcount)


df[prefix+'Var_d'] = (Ftop1/Ftop2) + (Fbottom1 /Fbottom2)

In [18]:
#check output
df[prefix+'Var_d'].head()

Probesets
1427138_at      0.425661
1425600_a_at    0.415776
1457168_at      0.419081
1450135_at      0.516400
1424014_at      0.434497
Name: 430AV2_CerCx_12h_Var_d, dtype: float64

In [19]:
df[prefix+'Var_g'] = df[prefix+'Var_d'] * np.square(df[prefix+'J'])

In [20]:
# Calculating SEg
df[prefix+'SEg'] = np.sqrt(df[prefix+'Var_g'])

In [21]:
df.sort_values(by= '430AV2_CerCx_12h_Hedges_g', ascending=False, inplace=True)
df

Unnamed: 0_level_0,Symbol,Definition,Ensembl_id,Entrez_id,Unigene_id,GO-Process,GO-Function,GO-Component,Pathway_info,Putative microRNA binding sites,...,ANOVA-adjp,largest fold,430AV2_CerCx_12h_Enrich,430AV2_CerCx_12h_poolStDev,430AV2_CerCx_12h_Cohens_d,430AV2_CerCx_12h_J,430AV2_CerCx_12h_Hedges_g,430AV2_CerCx_12h_Var_d,430AV2_CerCx_12h_Var_g,430AV2_CerCx_12h_SEg
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1436387_at,C330006P03Rik,homer homolog 1 (Drosophila) [Source:MGI Symbo...,ENSMUSG00000007617,320588,,,,,,"mmu-let-7a(RNAhybrid|miRanda), mmu-let-7b(RNAh...",...,5.979233e-29,5.778282,1.953484,0.190955,10.230069,0.916667,9.377563,5.632715,4.733046,2.175556
1436094_at,Vgf,VGF nerve growth factor inducible [Source:MGI ...,ENSMUSG00000037428,381677,,response to cold // response to cAMP // ovaria...,neuropeptide hormone activity,extracellular region // extracellular space //...,,"mmu-miR-141(miRanda), mmu-miR-185(miRanda), mm...",...,1.112763e-06,0.653702,0.535074,0.070950,7.541569,0.916667,6.913105,3.243763,2.725662,1.650958
1420720_at,LOC100044234 /// Nptx2,neuronal pentraxin 2,ENSMUSG00000059991,100044234|53324,,,calcium ion binding // sugar binding // metal ...,extracellular region,,"mmu-miR-1271(TargetScan), mmu-miR-128(TargetSc...",...,2.641582e-11,0.976680,0.553090,0.077475,7.138993,0.916667,6.544076,2.948261,2.477358,1.573962
1449906_at,Selp,"selectin, platelet",ENSMUSG00000026580,20344,,cell adhesion // inflammatory response // leuk...,sugar binding // glycoprotein binding // prote...,membrane fraction // membrane // external side...,IL-3 Signaling Pathway:WP373(WikiPathways) // ...,"mmu-miR-106a(miRanda), mmu-miR-106b(miRanda), ...",...,1.400974e-01,0.329564,0.180394,0.029315,6.153621,0.916667,5.640819,2.293353,1.927053,1.388183
1424482_at,Arhgef7,Rho guanine nucleotide exchange factor (GEF7),ENSMUSG00000031511,54126,,regulation of Rho protein signal transduction ...,protein binding // Rho guanyl-nucleotide excha...,intracellular,XPodNet - protein-protein interactions in the ...,"mmu-miR-101a(miRanda), mmu-miR-101b(miRanda), ...",...,1.535468e-04,0.811904,0.404076,0.069929,5.778376,0.916667,5.296845,2.069482,1.738940,1.318689
1421926_at,Mapk11,mitogen-activated protein kinase 11,ENSMUSG00000053137,19094,,protein kinase cascade // protein amino acid p...,MAP kinase activity // protein binding // MP k...,,Insulin Signaling:WP65(WikiPathways) // Parkin...,"mmu-let-7(TargetScan), mmu-let-7a(miRanda), mm...",...,2.820854e-19,0.794680,0.240486,0.042830,5.614951,0.916667,5.147038,1.976384,1.660711,1.288686
1457984_at,Crh,corticotropin releasing hormone [Source:MGI Sy...,ENSMUSG00000049796,12918,,inflammatory response // glucocorticoid biosyn...,hormone activity,extracellular region,Myometrial Relaxation and Contraction Pathways...,"mmu-miR-103(RNAhybrid|miRanda), mmu-miR-107(RN...",...,3.946359e-05,1.654352,1.224490,0.221618,5.525219,0.916667,5.064784,1.926402,1.618713,1.272287
1459637_at,,,,,,,,,,,...,4.482383e-04,0.483320,0.192522,0.037051,5.196134,0.916667,4.763123,1.749991,1.470478,1.212633
1434595_at,Trim9,tripartite motif-containing 9 [Source:MGI Symb...,ENSMUSG00000021071,94090,,synaptic vesicle exocytosis,protein binding // metal ion binding // zinc i...,cytoplasm // intracellular // synaptosome,,"mmu-miR-10a(RNAhybrid|miRanda), mmu-miR-10b(RN...",...,7.487603e-19,1.238976,0.477024,0.092774,5.141779,0.916667,4.713298,1.721895,1.446870,1.202859
1418937_at,Dio2,"deiodinase, iodothyronine, type II",ENSMUSG00000007682,13371,,hormone biosynthetic process // thyroid hormon...,thyroxine 5'-deiodinase activity // oxidoreduc...,integral to membrane // membrane,Selenium metabolism/Selenoproteins:WP108(WikiP...,"mmu-miR-1192(miRanda), mmu-miR-122(miRanda), m...",...,6.303485e-31,2.738436,0.746560,0.146548,5.094307,0.916667,4.669782,1.697598,1.426454,1.194343


In [22]:
df.columns

Index(['Symbol', 'Definition', 'Ensembl_id', 'Entrez_id', 'Unigene_id',
       'GO-Process', 'GO-Function', 'GO-Component', 'Pathway_info',
       'Putative microRNA binding sites',
       ...
       'ANOVA-adjp', 'largest fold', '430AV2_CerCx_12h_Enrich',
       '430AV2_CerCx_12h_poolStDev', '430AV2_CerCx_12h_Cohens_d',
       '430AV2_CerCx_12h_J', '430AV2_CerCx_12h_Hedges_g',
       '430AV2_CerCx_12h_Var_d', '430AV2_CerCx_12h_Var_g',
       '430AV2_CerCx_12h_SEg'],
      dtype='object', length=145)

### Import key file from BioMart and index probesets to MGI gene symbols

In [23]:
dfX=pd.read_table('../FHS project/Sleep notebook Copy/BioMart_Ensmbl_index/mart_export72_430v2430Av2.txt',index_col=[3])
 
dfX.pop('Affy mouse430 2 probeset') # remove 430V2 probeset info (not needed for 430AV2 indexing)
dfX.head(5)

Unnamed: 0_level_0,Ensembl Gene ID,Description,MGI symbol
Affy mouse430a 2 probeset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1417126_a_at,ENSMUSG00000039221,ribosomal protein L22 like 1 [Source:MGI Symbo...,Rpl22l1
,ENSMUSG00000095611,predicted gene 10597 [Source:MGI Symbol;Acc:MG...,Gm10597
1417730_at,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1
1417730_at,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1
,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1


In [24]:
df_Join = df.join(dfX, how='left', sort=True)
df_FINAL1 = df_Join.groupby('MGI symbol').mean()
df_FINAL1[df_FINAL1.index.duplicated()==True]   # checking that no duplicate entries exist in the dataframe

Unnamed: 0_level_0,GSM149516_CerCx_3hS.CEL,GSM149517_CerCx_3hS.CEL,GSM149518_CerCx_3hS.CEL,GSM149519_CerCx_3hS.CEL,GSM149520_CerCx_3hS.CEL,avg-CerCx_3hS,log_fold-CerCx_3hS_vs_CerCx_3hSD,fold-CerCx_3hS_vs_CerCx_3hSD,rawp-CerCx_3hS_vs_CerCx_3hSD,adjp-CerCx_3hS_vs_CerCx_3hSD,...,ANOVA-adjp,largest fold,430AV2_CerCx_12h_Enrich,430AV2_CerCx_12h_poolStDev,430AV2_CerCx_12h_Cohens_d,430AV2_CerCx_12h_J,430AV2_CerCx_12h_Hedges_g,430AV2_CerCx_12h_Var_d,430AV2_CerCx_12h_Var_g,430AV2_CerCx_12h_SEg
MGI symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


### Columns from the list above can then easily be picked to produce files for use later. Examples below given:
 #### df3 = average S and SD expression for the platform and the log-fold changes
 #### df4 = Hedges g  values and associated variance for Meta-analysis (after indexing)

In [25]:
# df3 = df_FINAL1.loc[:,[u'avg-SD', u'avg-S', u'log_fold-S_vs_SD']]
# df3.columns =[prefix+'avg-SD', prefix+'avg-S', prefix+'log_fold-S_vs_SD']
# df3.to_csv('input_files/430AV2_SymbolExpression_forIndex.csv')

In [26]:
df4 = df_FINAL1.loc[:,[u'430AV2_CerCx_12h_Enrich',u'430AV2_CerCx_12h_Hedges_g', u'430AV2_CerCx_12h_Var_g', u'430AV2_CerCx_12h_SEg']]
df4.to_csv('../FHS project/Sleep notebook Copy/IPython_notebooks/input_files/430AV2_CerCx_12h_SymbolforIndexHedges.csv')

In [27]:
df4.head(10)  # check final ouput

Unnamed: 0_level_0,430AV2_CerCx_12h_Enrich,430AV2_CerCx_12h_Hedges_g,430AV2_CerCx_12h_Var_g,430AV2_CerCx_12h_SEg
MGI symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0610005C13Rik,-0.030804,-0.248104,0.339189,0.582399
0610008F07Rik,0.070902,0.504746,0.34885,0.590635
0610009B22Rik,-0.108612,-0.939717,0.380265,0.616656
0610009D07Rik,-0.190724,-0.842346,0.373087,0.610691
0610009O20Rik,-0.039886,-0.280699,0.340051,0.583139
0610010K14Rik,-0.293218,-0.864538,0.373482,0.611132
0610012G03Rik,-0.008005,-0.000837,0.374294,0.611406
0610031J06Rik,0.031536,0.207522,0.338264,0.581605
0610037L13Rik,0.01758,0.140376,0.337096,0.5806
0610040J01Rik,-0.01609,-0.172246,0.337595,0.581029
