# Calculations of the Effect Size (ES) for each microarray study 

###  Using Hedges' g value, an adjusted Cohen's d  value

$$  {Enrichment} = \bar{X_2}-\bar{X_1}$$

Let Group 1 be Sleeping Astrocyte Expression values and Group 2 be SD Astrocyte Expression values 

(S mean - SD mean) **(Logged values, so minus gives ratio)** 

$$  {Pooled\ Standard\  Deviation} = \sqrt\frac{(n_1-1)S_1^2 +(n_2-1)S_2^2}{(n_2 +n_2) -2}  $$  

$$  {Cohen's\ d\ value} = \frac{Enrichment}{Pooled\ Standard\ Deviation} $$

$$  {Correction\ Factor (J\ Factor)} = 1- \frac{3}{4df-1} $$

$$  {Hedges'\ g\ value} = Cohen's\ d\ \text{x}\ J\ $$

$$  {Variance\ in\ d (V_d)} = \frac{n_1- +n_2}{n_1 n_2} + \frac{d^2}{2(n_1 +n_2)}  $$

$$  {Variance\ in\ g (V_g)} = J^2\  \text{x}\ V_d  $$

$$  {Standard\ Error\ in\ g (SE_g)} = \sqrt{V_g}  $$

## Setup working environment and import data

In [1]:
import pandas as pd # Dataframes and file IO
import numpy as np # numerical calculations
%cd /Users/Ella1/Desktop/data sets 430AV2


/Users/Ella1/Desktop/data sets 430AV2


In [2]:
prefix = '430AV2_Astro_'   # define a prefix to add to column names (making indexing easier later)

In [3]:
# import the data file to a data frame 'df'
df=pd.read_table('DATASET-GSE69079.txt', delimiter='\t',  index_col=0) #,nrows=500)  
df.shape

(45101, 43)

In [4]:
# remove probes that are know to cross-hybridise to more than one target
df =df[~df.index.str.contains('_x_|_s_')]    #   important reverse selector ~ 
df.shape

(40569, 43)

## Look at column names and then setup filters for grouping columns into S and SD groups

In [5]:
df.columns

Index(['Symbol', 'Definition', 'Ensembl_id', 'Entrez_id', 'Unigene_id',
       'GO-Process', 'GO-Function', 'GO-Component', 'Pathway_info',
       'Putative microRNA binding sites', 'Select Cellular Compartments',
       'Select Protein Classes', 'GSM1692599_Astro_S.CEL',
       'GSM1692600_Astro_S.CEL', 'GSM1692601_Astro_S.CEL',
       'GSM1692602_Astro_S.CEL', 'GSM1692603_Astro_S.CEL',
       'GSM1692604_Astro_S.CEL', 'avg-Astro_S', 'log_fold-Astro_S_vs_Astro_SD',
       'fold-Astro_S_vs_Astro_SD', 'rawp-Astro_S_vs_Astro_SD',
       'adjp-Astro_S_vs_Astro_SD', 'GSM1692611_Astro_SD.CEL',
       'GSM1692612_Astro_SD.CEL', 'GSM1692613_Astro_SD.CEL',
       'GSM1692614_Astro_SD_.CEL', 'GSM1692615_Astro_SD.CEL',
       'GSM1692616_Astro_SD.CEL', 'avg-Astro_SD', 'GSM1692617_NonAstro_S.CEL',
       'GSM1692618_NonAstro_S.CEL', 'avg-NonAstro_S',
       'log_fold-NonAstro_S_vs_NonAstro_SD', 'fold-NonAstro_S_vs_NonAstro_SD',
       'rawp-NonAstro_S_vs_NonAstro_SD', 'adjp-NonAstro_S_vs_NonAstro

In [6]:
# define regular expressions for sleep (S) and sleep dep (SD) filters 
s_filt ='Astro_S.CEL'
sd_filt ='Astro_SD.CEL'

In [7]:
df_s=df.filter(regex= s_filt)
df_s.head()

Unnamed: 0_level_0,GSM1692599_Astro_S.CEL,GSM1692600_Astro_S.CEL,GSM1692601_Astro_S.CEL,GSM1692602_Astro_S.CEL,GSM1692603_Astro_S.CEL,GSM1692604_Astro_S.CEL,GSM1692617_NonAstro_S.CEL,GSM1692618_NonAstro_S.CEL
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1427138_at,7.93047,7.78454,7.56618,7.09652,7.22021,7.00609,7.0899,7.19147
1425600_a_at,9.88229,10.11206,10.22082,10.01175,9.96849,9.92834,11.22009,11.34716
1457168_at,4.47018,4.43956,4.4132,4.29419,4.21937,4.00084,5.61581,4.07789
1450135_at,8.54655,8.22632,8.36909,9.08179,9.53777,9.01648,9.8343,9.94148
1424014_at,9.83192,10.1076,10.3285,10.295,10.20049,10.03741,10.18211,10.09195


In [8]:
df_sd=df.filter(regex= sd_filt)
df_sd.head()

Unnamed: 0_level_0,GSM1692611_Astro_SD.CEL,GSM1692612_Astro_SD.CEL,GSM1692613_Astro_SD.CEL,GSM1692615_Astro_SD.CEL,GSM1692616_Astro_SD.CEL,GSM1692621_NonAstro_SD.CEL,GSM1692622_NonAstro_SD.CEL
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1427138_at,7.06839,7.6891,7.18435,7.48069,7.28885,6.63031,7.24827
1425600_a_at,9.99086,9.7864,9.93582,9.95016,10.15582,11.33963,11.39472
1457168_at,4.52163,4.28775,4.40286,4.40388,4.71261,4.56403,4.42987
1450135_at,8.56851,8.52302,8.64692,9.58117,9.22119,10.47329,9.88955
1424014_at,10.24525,10.53669,10.44297,9.93712,10.1511,10.23977,10.22955


## Calculations 

In [9]:
# Enrichment

df[prefix+'Enrich'] = df.filter(regex=sd_filt).mean(axis=1) - df.filter(regex=s_filt).mean(axis=1)

In [10]:
df[prefix+'Enrich'].head()

Probesets
1427138_at     -0.133535
1425600_a_at    0.028398
1457168_at      0.033281
1450135_at      0.202728
1424014_at      0.120263
Name: 430AV2_Astro_Enrich, dtype: float64

In [11]:
# Calculating Pooled StDev
Scount = df.filter(regex=s_filt).count(axis=1)
SDcount = df.filter(regex=sd_filt).count(axis=1)

StdevS = (Scount-1) * df.filter(regex=s_filt).var(axis=1)
StdevSD = (SDcount-1) * df.filter(regex=sd_filt).var(axis=1)

df[prefix+'poolStDev'] = np.sqrt((StdevS+StdevSD)/(Scount+ SDcount-2))

In [12]:
# Calculating Cohen's d
df[prefix+'Cohens_d'] = df[prefix+'Enrich'] / df[prefix+'poolStDev']

In [13]:
#df[prefix+'poolStDev'].head()
df[prefix+'Cohens_d'] .head()

Probesets
1427138_at     -0.389380
1425600_a_at    0.044206
1457168_at      0.087240
1450135_at      0.288667
1424014_at      0.684724
Name: 430AV2_Astro_Cohens_d, dtype: float64

In [14]:
# Calculating J value (Correction factor)

df[prefix+'J'] = 1-(3/(4*(Scount+SDcount-1)))                              


In [15]:
# Calculating Hedge's g

df[prefix+'Hedges_g'] = df[prefix+'Cohens_d'] * df[prefix+'J']

In [16]:
#df[prefix+'J'].head()
df[prefix+'Hedges_g'] .head()

Probesets
1427138_at     -0.368520
1425600_a_at    0.041838
1457168_at      0.082566
1450135_at      0.273202
1424014_at      0.648042
Name: 430AV2_Astro_Hedges_g, dtype: float64

In [17]:
# Calculating Var_d
Scount = df.filter(regex=s_filt).count(axis=1)
SDcount = df.filter(regex=sd_filt).count(axis=1)

Ftop1 = Scount + SDcount
Ftop2 = Scount * SDcount
Fbottom1 = np.square(df[prefix+'Cohens_d']) 
Fbottom2 =  2*(Scount + SDcount)


df[prefix+'Var_d'] = (Ftop1/Ftop2) + (Fbottom1 /Fbottom2)

In [18]:
#check output
df[prefix+'Var_d'].head()

Probesets
1427138_at      0.272911
1425600_a_at    0.267922
1457168_at      0.268111
1450135_at      0.270635
1424014_at      0.283485
Name: 430AV2_Astro_Var_d, dtype: float64

In [19]:
df[prefix+'Var_g'] = df[prefix+'Var_d'] * np.square(df[prefix+'J'])

In [20]:
# Calculating SEg
df[prefix+'SEg'] = np.sqrt(df[prefix+'Var_g'])

In [21]:
df.sort_values(by= '430AV2_Astro_Hedges_g', ascending=False, inplace=True)
df

Unnamed: 0_level_0,Symbol,Definition,Ensembl_id,Entrez_id,Unigene_id,GO-Process,GO-Function,GO-Component,Pathway_info,Putative microRNA binding sites,...,ANOVA-adjp,largest fold,430AV2_Astro_Enrich,430AV2_Astro_poolStDev,430AV2_Astro_Cohens_d,430AV2_Astro_J,430AV2_Astro_Hedges_g,430AV2_Astro_Var_d,430AV2_Astro_Var_g,430AV2_Astro_SEg
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1417001_a_at,D4Wsu53e,"DNA segment, Chr 4, Wayne State University 53,...",ENSMUSG00000037266,27981,,,,,,"mmu-miR-106a(miRanda), mmu-miR-106b(miRanda), ...",...,0.000161,1.612658,1.525115,0.339239,4.495695,0.946429,4.254855,0.941566,0.843386,0.918361
1416812_at,Tia1,cytotoxic granule-associated RNA binding prote...,ENSMUSG00000071337,21841,,apoptosis // negative regulation of cytokine b...,RNA binding // nucleic acid binding // nucleot...,cytoplasm // nucleus,GenMAPP-mRNA_processing_binding_Reactome // mR...,"mmu-let-7(TargetScan), mmu-let-7a(miRanda), mm...",...,0.001490,0.760287,0.674160,0.208360,3.235553,0.946429,3.062220,0.616817,0.552500,0.743303
1448830_at,Dusp1,dual specificity phosphatase 1,ENSMUSG00000024190,19252,,protein amino acid dephosphorylation // intrac...,protein tyrosine phosphatase activity // phosp...,,EGFR1 Signaling Pathway:WP572(WikiPathways) //...,"mmu-let-7(TargetScan), mmu-let-7a(miRanda|pict...",...,0.000087,1.388683,0.911390,0.287874,3.165934,0.946429,2.996330,0.601962,0.539193,0.734298
1418322_at,Crem,cAMP responsive element modulator,ENSMUSG00000063889,12916,,transcription // spermatogenesis // regulation...,transcription factor activity // DNA binding /...,transcription factor complex // nucleus,Selenium metabolism/Selenoproteins:WP108(WikiP...,"mmu-let-7a(RNAhybrid|miRanda), mmu-let-7d(RNAh...",...,0.000002,2.243747,1.764489,0.567752,3.107850,0.946429,2.941358,0.589815,0.528313,0.726852
1449322_at,LOC100044742,protein tyrosine phosphatase type IVA 1-like,ENSMUSG00000026064|ENSMUSG00000058873|ENSMUSG0...,100044742|19243|433406,,,,,,"mmu-let-7f(RNAhybrid|miRanda), mmu-let-7g(RNAh...",...,0.001606,0.491730,0.389264,0.126274,3.082702,0.946429,2.917558,0.584626,0.523665,0.723647
1435357_at,D4Wsu53e,"DNA segment, Chr 4, Wayne State University 53,...",ENSMUSG00000037266,27981,,,,,,"mmu-miR-106a(miRanda), mmu-miR-106b(miRanda), ...",...,0.000017,2.175512,1.692230,0.550860,3.071981,0.946429,2.907410,0.582426,0.521695,0.722284
1452155_a_at,Ddx17,DEAD (Asp-Glu-Ala-Asp) box polypeptide 17,ENSMUSG00000055065,67040,,,nucleic acid binding // hydrolase activity // ...,nucleus,,"mmu-miR-1(miRanda), mmu-miR-106a(miRanda), mmu...",...,0.000013,1.025785,0.661564,0.222497,2.973367,0.946429,2.814079,0.562554,0.503895,0.709856
1416505_at,Nr4a1,"nuclear receptor subfamily 4, group A, member 1",ENSMUSG00000023034,15370,,"regulation of transcription, DNA-dependent // ...",transcription factor activity // protein homod...,nucleus,GenMAPP-Nuclear_Receptors // Spinal Cord Injur...,"mmu-let-7a(miRanda), mmu-let-7b(miRanda), mmu-...",...,0.000026,3.035430,1.970547,0.668362,2.948325,0.946429,2.790379,0.557611,0.499467,0.706730
1424638_at,Cdkn1a,cyclin-dependent kinase inhibitor 1A (P21),ENSMUSG00000023067,12575,,cellular response to extracellular stimulus //...,cyclin binding // zinc ion binding // kinase a...,cyclin-dependent protein kinase holoenzyme com...,GenMAPP-Cell_Cycle_KEGG // PluriNetWork:WP1763...,"mmu-miR-105(TargetScan), mmu-miR-106a(TargetSc...",...,0.001444,2.943762,2.555216,0.885714,2.884924,0.946429,2.730374,0.545283,0.488425,0.698874
1419427_at,Csf3,colony stimulating factor 3 (granulocyte),ENSMUSG00000038067,12985,,granulocyte differentiation // immune response,enzyme binding // cytokine activity // growth ...,extracellular region // extracellular space,Focal Adhesion-PI3K-Akt-mTOR-signaling pathway...,"mmu-miR-136(miRanda), mmu-miR-141(TargetScan|m...",...,0.007961,0.329335,0.281931,0.097964,2.877906,0.946429,2.723732,0.543935,0.487217,0.698010


In [22]:
df.columns

Index(['Symbol', 'Definition', 'Ensembl_id', 'Entrez_id', 'Unigene_id',
       'GO-Process', 'GO-Function', 'GO-Component', 'Pathway_info',
       'Putative microRNA binding sites', 'Select Cellular Compartments',
       'Select Protein Classes', 'GSM1692599_Astro_S.CEL',
       'GSM1692600_Astro_S.CEL', 'GSM1692601_Astro_S.CEL',
       'GSM1692602_Astro_S.CEL', 'GSM1692603_Astro_S.CEL',
       'GSM1692604_Astro_S.CEL', 'avg-Astro_S', 'log_fold-Astro_S_vs_Astro_SD',
       'fold-Astro_S_vs_Astro_SD', 'rawp-Astro_S_vs_Astro_SD',
       'adjp-Astro_S_vs_Astro_SD', 'GSM1692611_Astro_SD.CEL',
       'GSM1692612_Astro_SD.CEL', 'GSM1692613_Astro_SD.CEL',
       'GSM1692614_Astro_SD_.CEL', 'GSM1692615_Astro_SD.CEL',
       'GSM1692616_Astro_SD.CEL', 'avg-Astro_SD', 'GSM1692617_NonAstro_S.CEL',
       'GSM1692618_NonAstro_S.CEL', 'avg-NonAstro_S',
       'log_fold-NonAstro_S_vs_NonAstro_SD', 'fold-NonAstro_S_vs_NonAstro_SD',
       'rawp-NonAstro_S_vs_NonAstro_SD', 'adjp-NonAstro_S_vs_NonAstro

### Import key file from BioMart and index probesets to MGI gene symbols

In [23]:
dfX=pd.read_table('../FHS project/Sleep notebook Copy/BioMart_Ensmbl_index/mart_export72_430v2430Av2.txt',index_col=[3])
 
dfX.pop('Affy mouse430 2 probeset') # remove 430V2 probeset info (not needed for 430AV2 indexing)
dfX.head(5)

Unnamed: 0_level_0,Ensembl Gene ID,Description,MGI symbol
Affy mouse430a 2 probeset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1417126_a_at,ENSMUSG00000039221,ribosomal protein L22 like 1 [Source:MGI Symbo...,Rpl22l1
,ENSMUSG00000095611,predicted gene 10597 [Source:MGI Symbol;Acc:MG...,Gm10597
1417730_at,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1
1417730_at,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1
,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1


In [24]:
df_Join = df.join(dfX, how='left', sort=True)
df_FINAL1 = df_Join.groupby('MGI symbol').mean()
df_FINAL1[df_FINAL1.index.duplicated()==True]   # checking that no duplicate entries exist in the dataframe

Unnamed: 0_level_0,GSM1692599_Astro_S.CEL,GSM1692600_Astro_S.CEL,GSM1692601_Astro_S.CEL,GSM1692602_Astro_S.CEL,GSM1692603_Astro_S.CEL,GSM1692604_Astro_S.CEL,avg-Astro_S,log_fold-Astro_S_vs_Astro_SD,fold-Astro_S_vs_Astro_SD,rawp-Astro_S_vs_Astro_SD,...,ANOVA-adjp,largest fold,430AV2_Astro_Enrich,430AV2_Astro_poolStDev,430AV2_Astro_Cohens_d,430AV2_Astro_J,430AV2_Astro_Hedges_g,430AV2_Astro_Var_d,430AV2_Astro_Var_g,430AV2_Astro_SEg
MGI symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


### Columns from the list above can then easily be picked to produce files for use later. Examples below given:
 #### df3 = average S and SD expression for the platform and the log-fold changes
 #### df4 = Hedges g  values and associated variance for Meta-analysis (after indexing)

In [25]:
# df3 = df_FINAL1.loc[:,[u'avg-SD', u'avg-S', u'log_fold-S_vs_SD']]
# df3.columns =[prefix+'avg-SD', prefix+'avg-S', prefix+'log_fold-S_vs_SD']
# df3.to_csv('input_files/430AV2_SymbolExpression_forIndex.csv')

In [26]:
df4 = df_FINAL1.loc[:,[u'430AV2_Astro_Enrich',u'430AV2_Astro_Hedges_g', u'430AV2_Astro_Var_g', u'430AV2_Astro_SEg']]
df4.to_csv('../FHS project/Sleep notebook Copy/IPython_notebooks/input_files/430AV2_Astro_SymbolforIndexHedges.csv')

In [27]:
df4.head(10)  # check final ouput

Unnamed: 0_level_0,430AV2_Astro_Enrich,430AV2_Astro_Hedges_g,430AV2_Astro_Var_g,430AV2_Astro_SEg
MGI symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0610005C13Rik,-0.074648,-0.313418,0.243201,0.493154
0610008F07Rik,0.010761,0.040622,0.239982,0.489879
0610009B22Rik,0.085935,0.218759,0.241522,0.491449
0610009D07Rik,-0.061341,-0.103197,0.240315,0.490219
0610009O20Rik,-0.143238,-0.195236,0.241197,0.491119
0610010K14Rik,-0.331733,-0.245922,0.241943,0.491877
0610012G03Rik,0.036441,0.027742,0.241662,0.491589
0610031J06Rik,-0.184221,-0.134467,0.24053,0.490438
0610037L13Rik,-0.114165,-0.17395,0.240936,0.490852
0610040J01Rik,-0.489972,-0.424685,0.245939,0.495922
