In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyreadr import read_r

Dataset: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE102556

Dataset's original paper: https://www.nature.com/articles/nm.4386

Iara's paper: https://www.sciencedirect.com/science/article/pii/S0306452224004652?via%3Dihub

### __1. Obtaining differentially expressed genes:__

In [2]:
# Load the R data file that holds the gene table and keep the object stored
# under the name 'diff_df' (used below as a pandas DataFrame).
df_genes = read_r("dataset/diff_df.rda")['diff_df']

print(df_genes.shape)

df_genes.head()

(1194, 4)


Unnamed: 0,gene,hgnc_symbol,group,type
0,ENSG00000121542,SEC22A,Nac_male,DGE
1,ENSG00000161939,RNASEK-C17orf49,Nac_male,DGE
2,ENSG00000172476,RAB40A,Nac_male,DGE
3,ENSG00000177051,FBXO46,Nac_male,DGE
4,ENSG00000277075,H2AC8,Nac_male,DGE


In [3]:
# Extract the 'gene' column (gene identifiers) as a NumPy array
# and show its first four entries.
genes = df_genes.loc[:, 'gene'].to_numpy()
genes[0:4]

array(['ENSG00000121542', 'ENSG00000161939', 'ENSG00000172476',
       'ENSG00000177051'], dtype=object)

### __2. Obtaining the metadata categories:__

In [4]:
# Check the number of genes in each category (values of the 'group' column,
# e.g. "Nac_male").

group_list = df_genes['group'].to_numpy()

# Sort the distinct labels: iterating a bare set of strings yields an order that
# can change between kernel restarts, which made this listing irreproducible.
groups = np.array(sorted(set(group_list)))

# Count every label in a single pass instead of rescanning the column per group.
group_sizes = pd.Series(group_list).value_counts()

for g in groups:
    n = group_sizes.loc[g]
    print(f"{g} - {n}")
    

aINS_male - 14
OFC_female - 338
aINS_female - 38
Cg25_male - 198
dlPFC_male - 21
dlPFC_female - 216
Nac_male - 76
Sub_female - 24
Nac_female - 59
OFC_male - 35
Sub_male - 113
Cg25_female - 62


### __3. Obtaining the counts table:__

In [5]:
# Load the expression table (FPKM-normalised values) from the tab-separated file.

df_counts = pd.read_table("dataset/GSE102556_HumanMDD_fpkmtab.txt")

print(df_counts.shape)

df_counts.head()

(57773, 285)


Unnamed: 0,gene_id,14.AntIns,17.AntIns,20.AntIns,23.AntIns,28.AntIns,32.AntIns,40.AntIns,43.AntIns,45.AntIns,...,235.Subic,236.Subic,238.Subic,242.Subic,246.Subic,247.Subic,249.Subic,gene_name,biotype,length
0,ENSG00000000003,3.599772,3.965234,3.487418,3.154162,3.529485,3.002357,2.56092,3.601089,3.550312,...,4.231833,3.755271,11.066229,5.257674,5.581187,16.941318,3.075219,TSPAN6,protein_coding,2968
1,ENSG00000000005,0.11122,0.133513,0.213115,0.0,0.109048,0.041304,0.076145,0.128281,0.113825,...,0.108051,0.130209,0.26451,0.143947,0.141589,0.182637,0.403241,TNMD,protein_coding,1610
2,ENSG00000000419,24.8246,21.593504,24.873759,20.320854,24.630808,17.410097,24.681255,25.281778,21.066301,...,18.707798,22.723588,22.669098,18.752818,24.646684,21.821129,20.028039,DPM1,protein_coding,1207
3,ENSG00000000457,3.819466,3.188695,3.883921,3.775845,4.085334,3.442988,3.646076,3.536816,3.911153,...,2.373134,3.130108,4.31993,3.409801,3.541817,3.909859,4.043314,SCYL3,protein_coding,6876
4,ENSG00000000460,1.53118,1.217878,1.160999,0.992044,1.003926,1.088448,0.945403,1.267666,1.110391,...,1.122514,1.710124,1.700698,1.361686,2.038972,1.20982,0.907552,C1orf112,protein_coding,6354


<font size=3>

__Iara's paper:__\
"Our aim was to identify transcriptional alteration in the central nervous system of females and males with MDD. 

To achieve this, we analyzed publicly available post-mortem bulk RNA sequencing data of MDD individuals and healthy controls. 

This dataset comprises six brain regions: the orbitofrontal cortex (OFC), dorsolateral prefrontal cortex (dlPFC), ventromedial prefrontal cortex (BA25, Cg25), anterior insula (aINS), nucleus accumbens (Nac), and ventral subiculum (Sub)."

__Dataset's original paper:__\
"ventromedial prefrontal cortex (vmPFC; also known as subgenual PFC; Brodmann area (BA) 25), orbitofrontal cortex (OFC; BA11), dorsolateral PFC (dlPFC; BA8/9), anterior insula (aINS), nucleus accumbens (NAc) and ventral subiculum (vSUB)."

* ventromedial prefrontal cortex: BA25 -> Cg25
* orbitofrontal cortex: BA11 -> OFC
* dorsolateral PFC: BA8/9 (BA8_9) -> dlPFC
* anterior insula: aINS (AntIns) -> aINS
* nucleus accumbens: NAc -> Nac
* ventral subiculum: vSUB (Subic) -> Sub
  

In [6]:
# Select, from the counts table, the differentially expressed genes.
# np.intersect1d returns the sorted unique common IDs plus the positions of their
# first occurrence in each input: i indexes into genes, j into the df_counts rows.
# The array of common values itself is not needed; it is deliberately not bound
# to `_`, which IPython uses for the last displayed output.
i, j = np.intersect1d(genes, df_counts['gene_id'], return_indices=True)[1:]

genes = genes[i]

print(genes[:5])

# Preview the matching rows without the gene annotation columns. They are dropped
# by name rather than by position, so sample columns are never cut off if this
# cell is re-run after the annotation columns have already been removed.
df_counts.iloc[j].drop(columns=["gene_name", "biotype", "length"], errors="ignore").head()

['ENSG00000004059' 'ENSG00000004776' 'ENSG00000006015' 'ENSG00000006283'
 'ENSG00000007062']


Unnamed: 0,gene_id,14.AntIns,17.AntIns,20.AntIns,23.AntIns,28.AntIns,32.AntIns,40.AntIns,43.AntIns,45.AntIns,...,201.Subic,205.Subic,222.Subic,235.Subic,236.Subic,238.Subic,242.Subic,246.Subic,247.Subic,249.Subic
45,ENSG00000004059,8.996242,8.903824,11.186563,10.507356,11.416718,10.804911,13.333951,11.894081,10.137459,...,7.188093,5.612762,9.248735,8.070514,8.366017,8.393182,7.617786,6.843214,6.757867,7.40926
57,ENSG00000004776,0.103986,0.436902,0.365299,0.144972,0.305866,0.656504,0.783119,0.239875,0.266054,...,0.693557,0.766658,0.455944,0.424298,0.365219,0.401873,1.143966,0.33095,0.853792,0.377014
116,ENSG00000006015,2.621305,2.950058,3.003535,2.333444,2.083884,2.308752,2.382756,2.57399,2.773338,...,3.667201,3.009326,3.233732,2.756555,3.431726,3.727868,3.461411,3.134144,2.841986,3.218329
134,ENSG00000006283,1.610692,3.044226,2.308878,2.918354,1.916791,4.232874,2.304393,1.451829,2.416006,...,2.358586,1.84185,1.895517,1.820432,1.475448,1.759985,2.410873,1.017426,0.354778,1.92343
169,ENSG00000007062,1.495194,0.986233,1.807888,1.254059,1.371925,1.452042,1.436966,1.023668,1.166075,...,2.07288,1.334414,1.367282,1.076632,1.039053,2.588542,0.838227,1.465773,1.502444,1.693324


In [7]:
print(f"Before: genes = {genes.shape}, df-counts = {df_counts.shape}")

# Restrict the counts table to the selected gene rows (positions j) and remove the
# annotation columns by name. The previous positional slice (0:-3) would strip
# three real sample columns if it were ever applied to an already-trimmed table;
# dropping by name (ignoring columns that are already gone) cannot do that.
df_counts = df_counts.iloc[j].drop(columns=["gene_name", "biotype", "length"], errors="ignore")

print(f"After:  genes = {genes.shape}, df-counts = {df_counts.shape}")

Before: genes = (1052,), df-counts = (57773, 285)
After:  genes = (1052,), df-counts = (1052, 282)


In [8]:
# relabeling sample names:

def relabel(x):
    """Rename the brain-region suffix of a sample column name.

    Names of the form "<sample>.<region>" have the region code used in the
    counts table replaced by the code used for the gene groups
    (e.g. "14.AntIns" -> "14.aINS"). Any other name is returned unchanged:
    names that do not split into exactly two parts on "." (such as "gene_id")
    and names whose region code is not in the mapping.

    Parameters
    ----------
    x : str
        Column name to convert.

    Returns
    -------
    str
        The relabelled name, or x itself when no mapping applies.
    """
    region_alias = {
        "BA25": "Cg25",
        "BA11": "OFC",
        "BA8_9": "dlPFC",
        "AntIns": "aINS",
        "Nac": "Nac",
        "Subic": "Sub",
    }

    parts = x.split(".")
    if len(parts) != 2:
        return x

    n, label = parts
    # Unknown region codes are passed through untouched. Previously they fell out
    # of the match statement and the function returned None, which would have
    # silently produced None column labels.
    return n + "." + region_alias.get(label, label)

relabel("14.AntIns")

'14.aINS'

In [9]:
# Run every column name through relabel so the sample columns carry the new region codes.
df_counts.columns = [relabel(name) for name in df_counts.columns]
df_counts.head()

Unnamed: 0,gene_id,14.aINS,17.aINS,20.aINS,23.aINS,28.aINS,32.aINS,40.aINS,43.aINS,45.aINS,...,201.Sub,205.Sub,222.Sub,235.Sub,236.Sub,238.Sub,242.Sub,246.Sub,247.Sub,249.Sub
45,ENSG00000004059,8.996242,8.903824,11.186563,10.507356,11.416718,10.804911,13.333951,11.894081,10.137459,...,7.188093,5.612762,9.248735,8.070514,8.366017,8.393182,7.617786,6.843214,6.757867,7.40926
57,ENSG00000004776,0.103986,0.436902,0.365299,0.144972,0.305866,0.656504,0.783119,0.239875,0.266054,...,0.693557,0.766658,0.455944,0.424298,0.365219,0.401873,1.143966,0.33095,0.853792,0.377014
116,ENSG00000006015,2.621305,2.950058,3.003535,2.333444,2.083884,2.308752,2.382756,2.57399,2.773338,...,3.667201,3.009326,3.233732,2.756555,3.431726,3.727868,3.461411,3.134144,2.841986,3.218329
134,ENSG00000006283,1.610692,3.044226,2.308878,2.918354,1.916791,4.232874,2.304393,1.451829,2.416006,...,2.358586,1.84185,1.895517,1.820432,1.475448,1.759985,2.410873,1.017426,0.354778,1.92343
169,ENSG00000007062,1.495194,0.986233,1.807888,1.254059,1.371925,1.452042,1.436966,1.023668,1.166075,...,2.07288,1.334414,1.367282,1.076632,1.039053,2.588542,0.838227,1.465773,1.502444,1.693324


In [10]:
# Reorient the counts table to (samples, genes): the gene IDs become the
# column labels and every sample becomes a row.

df_counts = df_counts.set_index('gene_id').transpose()
df_counts.head()

gene_id,ENSG00000004059,ENSG00000004776,ENSG00000006015,ENSG00000006283,ENSG00000007062,ENSG00000008300,ENSG00000008323,ENSG00000008382,ENSG00000011566,ENSG00000013725,...,ENSG00000263142,ENSG00000268500,ENSG00000268598,ENSG00000269343,ENSG00000269858,ENSG00000269955,ENSG00000270326,ENSG00000270585,ENSG00000271207,ENSG00000272636
14.aINS,8.996242,0.103986,2.621305,1.610692,1.495194,2.045752,0.0,2.184767,21.565145,0.049781,...,0.6117,0.0,0.529774,2.44207,0.862148,0.0,1.85558,0.0,0.0,5.121861
17.aINS,8.903824,0.436902,2.950058,3.044226,0.986233,2.577052,0.0,2.85873,18.0163,0.053784,...,1.02733,0.0,0.635963,2.128504,1.152073,0.0,2.227519,0.242431,0.039514,5.773299
20.aINS,11.186563,0.365299,3.003535,2.308878,1.807888,2.433533,0.0,2.651374,20.61409,0.095389,...,0.552249,0.0,0.676757,2.91219,1.356395,0.0,1.185201,0.085994,0.042048,4.829895
23.aINS,10.507356,0.144972,2.333444,2.918354,1.254059,2.282867,0.01465,2.406256,19.575888,0.093694,...,0.787202,0.0,0.369292,2.499131,1.252572,0.0,1.293479,0.1877,0.0,4.259085
28.aINS,11.416718,0.305866,2.083884,1.916791,1.371925,2.001595,0.0,1.999301,20.719654,0.087857,...,0.94577,0.0,0.173143,2.483541,1.112255,0.0,1.2129,0.176007,0.0,6.01041


### __4. Obtaining metadata:__

In [11]:
# Load the sample metadata from the tab-separated series-matrix file.

df_meta = pd.read_table("dataset/GSE102556-GPL11154_series_matrix.txt")
df_meta.head()

Unnamed: 0,!Sample_title,14: Orbitofrontal (OFC; BA11),17: Orbitofrontal (OFC; BA11),20: Orbitofrontal (OFC; BA11),23: Orbitofrontal (OFC; BA11),28: Orbitofrontal (OFC; BA11),32: Orbitofrontal (OFC; BA11),40: Orbitofrontal (OFC; BA11),43: Orbitofrontal (OFC; BA11),45: Orbitofrontal (OFC; BA11),...,201: Subiculum (Sub),205: Subiculum (Sub),222: Subiculum (Sub),235: Subiculum (Sub),236: Subiculum (Sub),238: Subiculum (Sub),242: Subiculum (Sub),246: Subiculum (Sub),247: Subiculum (Sub),249: Subiculum (Sub)
0,!Sample_characteristics_ch1,gender: Male,gender: Male,gender: Male,gender: Male,gender: Male,gender: Male,gender: Male,gender: Male,gender: Male,...,gender: Female,gender: Female,gender: Female,gender: Female,gender: Female,gender: Female,gender: Female,gender: Female,gender: Male,gender: Female
1,!Sample_characteristics_ch1,Cause of death: Natural,Cause of death: Natural,Cause of death: Accident,Cause of death: Suicide,Cause of death: Natural,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,...,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Natural,Cause of death: Suicide,Cause of death: Accident,Cause of death: Suicide
2,!Sample_characteristics_ch1,age: 47,age: 41,age: 31,age: 19,age: 46,age: 40,age: 49,age: 33,age: 38,...,age: 45,age: 48,age: 36,age: 52,age: 60,age: 22,age: 68,age: 59,age: 59,age: 51
3,!Sample_characteristics_ch1,phenotype: CTRL,phenotype: CTRL,phenotype: CTRL,phenotype: CTRL,phenotype: CTRL,phenotype: CTRL,phenotype: MDD,phenotype: CTRL,phenotype: CTRL,...,phenotype: CTRL,phenotype: MDD,phenotype: MDD,phenotype: MDD,phenotype: CTRL,phenotype: CTRL,phenotype: CTRL,phenotype: MDD,phenotype: CTRL,phenotype: CTRL


In [12]:
# Reshape the metadata (index): overwrite the '!Sample_title' column with a short
# label for each of the four rows, then use that column as the index.

df_meta = df_meta.assign(
    **{"!Sample_title": ["gender", "cause of death", "age", "phenotype"]}
).set_index("!Sample_title")

df_meta.head()

Unnamed: 0_level_0,14: Orbitofrontal (OFC; BA11),17: Orbitofrontal (OFC; BA11),20: Orbitofrontal (OFC; BA11),23: Orbitofrontal (OFC; BA11),28: Orbitofrontal (OFC; BA11),32: Orbitofrontal (OFC; BA11),40: Orbitofrontal (OFC; BA11),43: Orbitofrontal (OFC; BA11),45: Orbitofrontal (OFC; BA11),51: Orbitofrontal (OFC; BA11),...,201: Subiculum (Sub),205: Subiculum (Sub),222: Subiculum (Sub),235: Subiculum (Sub),236: Subiculum (Sub),238: Subiculum (Sub),242: Subiculum (Sub),246: Subiculum (Sub),247: Subiculum (Sub),249: Subiculum (Sub)
!Sample_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
gender,gender: Male,gender: Male,gender: Male,gender: Male,gender: Male,gender: Male,gender: Male,gender: Male,gender: Male,gender: Male,...,gender: Female,gender: Female,gender: Female,gender: Female,gender: Female,gender: Female,gender: Female,gender: Female,gender: Male,gender: Female
cause of death,Cause of death: Natural,Cause of death: Natural,Cause of death: Accident,Cause of death: Suicide,Cause of death: Natural,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,...,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Natural,Cause of death: Suicide,Cause of death: Accident,Cause of death: Suicide
age,age: 47,age: 41,age: 31,age: 19,age: 46,age: 40,age: 49,age: 33,age: 38,age: 53,...,age: 45,age: 48,age: 36,age: 52,age: 60,age: 22,age: 68,age: 59,age: 59,age: 51
phenotype,phenotype: CTRL,phenotype: CTRL,phenotype: CTRL,phenotype: CTRL,phenotype: CTRL,phenotype: CTRL,phenotype: MDD,phenotype: CTRL,phenotype: CTRL,phenotype: MDD,...,phenotype: CTRL,phenotype: MDD,phenotype: MDD,phenotype: MDD,phenotype: CTRL,phenotype: CTRL,phenotype: CTRL,phenotype: MDD,phenotype: CTRL,phenotype: CTRL


In [13]:
# Reshape the metadata (column names): keep only the text before the first ":"
# of each column name, e.g. "14: Orbitofrontal (OFC; BA11)" -> "14".

df_meta.columns = [title.split(":")[0] for title in df_meta.columns]

df_meta.head()

Unnamed: 0_level_0,14,17,20,23,28,32,40,43,45,51,...,201,205,222,235,236,238,242,246,247,249
!Sample_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
gender,gender: Male,gender: Male,gender: Male,gender: Male,gender: Male,gender: Male,gender: Male,gender: Male,gender: Male,gender: Male,...,gender: Female,gender: Female,gender: Female,gender: Female,gender: Female,gender: Female,gender: Female,gender: Female,gender: Male,gender: Female
cause of death,Cause of death: Natural,Cause of death: Natural,Cause of death: Accident,Cause of death: Suicide,Cause of death: Natural,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,...,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Suicide,Cause of death: Natural,Cause of death: Suicide,Cause of death: Accident,Cause of death: Suicide
age,age: 47,age: 41,age: 31,age: 19,age: 46,age: 40,age: 49,age: 33,age: 38,age: 53,...,age: 45,age: 48,age: 36,age: 52,age: 60,age: 22,age: 68,age: 59,age: 59,age: 51
phenotype,phenotype: CTRL,phenotype: CTRL,phenotype: CTRL,phenotype: CTRL,phenotype: CTRL,phenotype: CTRL,phenotype: MDD,phenotype: CTRL,phenotype: CTRL,phenotype: MDD,...,phenotype: CTRL,phenotype: MDD,phenotype: MDD,phenotype: MDD,phenotype: CTRL,phenotype: CTRL,phenotype: CTRL,phenotype: MDD,phenotype: CTRL,phenotype: CTRL


In [14]:
# Reshape the metadata (cell contents): strip the "<field>: " prefix from every
# value, keeping only the text after the last ": " (e.g. "gender: Male" -> "Male").

for n in df_meta.index:
    df_meta.loc[n] = df_meta.loc[n].map(lambda cell: cell.rsplit(": ", 1)[-1])

df_meta.head()

Unnamed: 0_level_0,14,17,20,23,28,32,40,43,45,51,...,201,205,222,235,236,238,242,246,247,249
!Sample_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
gender,Male,Male,Male,Male,Male,Male,Male,Male,Male,Male,...,Female,Female,Female,Female,Female,Female,Female,Female,Male,Female
cause of death,Natural,Natural,Accident,Suicide,Natural,Suicide,Suicide,Suicide,Suicide,Suicide,...,Suicide,Suicide,Suicide,Suicide,Suicide,Suicide,Natural,Suicide,Accident,Suicide
age,47,41,31,19,46,40,49,33,38,53,...,45,48,36,52,60,22,68,59,59,51
phenotype,CTRL,CTRL,CTRL,CTRL,CTRL,CTRL,MDD,CTRL,CTRL,MDD,...,CTRL,MDD,MDD,MDD,CTRL,CTRL,CTRL,MDD,CTRL,CTRL


In [15]:
# Remove repeated subjects: the same subject label can head several columns
# (presumably one per brain region - confirm), so keep only the first column
# found for each label.
df_meta = df_meta.iloc[:, np.flatnonzero(~df_meta.columns.duplicated(keep="first"))]
df_meta.head()

Unnamed: 0_level_0,14,17,20,23,28,32,40,43,45,51,...,205,212,222,235,236,238,242,246,247,249
!Sample_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
gender,Male,Male,Male,Male,Male,Male,Male,Male,Male,Male,...,Female,Female,Female,Female,Female,Female,Female,Female,Male,Female
cause of death,Natural,Natural,Accident,Suicide,Natural,Suicide,Suicide,Suicide,Suicide,Suicide,...,Suicide,Natural,Suicide,Suicide,Suicide,Suicide,Natural,Suicide,Accident,Suicide
age,47,41,31,19,46,40,49,33,38,53,...,48,82,36,52,60,22,68,59,59,51
phenotype,CTRL,CTRL,CTRL,CTRL,CTRL,CTRL,MDD,CTRL,CTRL,MDD,...,MDD,CTRL,MDD,MDD,CTRL,CTRL,CTRL,MDD,CTRL,CTRL


### __5. Saving all as .csv:__