# Synthetic Data Generation Strategy 3: Upsampling KEGG Pathways and Sampling from Independent Normals

## Outline:

1.  Get means and variances from empirical data
2.  Get pathway information from KEGG.
3.  Change mean values of proteins in pathway
4.  Sample from independent normals for each protein

In [2]:
%pylab inline
import numpy as np
import pandas as pd
import pickle as pkl

# Load the ovarian proteomics expression matrix. The first column of the file
# is used as the row index (sample IDs); the remaining columns are proteins.
# NOTE(review): this is an absolute, machine-specific path -- consider moving
# it to a configurable data directory so the notebook runs elsewhere.
OVARIAN_EXPRESSION_PATH = '/Users/alex/Documents/proteomics/data_preparation/proteomics_data/ovarian_inbiomap_exp.tsv'
ovarian = pd.read_csv(OVARIAN_EXPRESSION_PATH, index_col=0)

# Preview the first few samples.
ovarian.head()

Populating the interactive namespace from numpy and matplotlib


Unnamed: 0,ZNF91,NDEL1,ELAVL1,SUMO1,SUMO3,CHMP5,UBC,HTT,E2F4,ACP5,...,SPANXN4,ZNF605,SERPINB10,ANKAR,RRH,DHH,CYSLTR1,ZNF268,COL23A1,MEDAG
PNNL-TCGA-09-1664,0.0,-0.119,-0.188,0.571,-0.224,0.332,0.0,-0.216,0.205,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PNNL-TCGA-13-1484,0.0,0.0,-0.121,-1.45,0.079,0.103,0.0,-0.975,0.0,0.175,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PNNL-TCGA-13-1488,0.0,0.0,-0.219,-0.0765,-1.34,-0.42,0.0,0.0158,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PNNL-TCGA-13-1489,0.0,0.0,0.236,0.749,0.107,-0.0168,0.0,-0.147,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PNNL-TCGA-13-1494,0.0,0.433,0.0158,-0.21,0.000859,-0.527,0.0,0.0926,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# I. Get Means and Variances

In [3]:
# Per-protein (column-wise) summary statistics of the empirical data.
# `DataFrame.var` uses the sample variance (ddof=1) by default.
variances = ovarian.var(axis=0)
means = ovarian.mean()

# Both should have one entry per protein column.
means.shape, variances.shape

((16349,), (16349,))

# II. Get Pathways

In [12]:
# Load the mapping of KEGG pathway IDs to gene lists.
# A context manager is used so the file handle is closed deterministically.
# SECURITY NOTE: unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
with open("../KEGG_pathway_gene_lists.pkl", "rb") as pathway_file:
    pathways = pkl.load(pathway_file)

# Pathway to up-sample, and its de-duplicated (sorted) list of member genes.
pathway_id = "hsa00830"
pathway_genes = np.unique(pathways[pathway_id])
pathway_genes

array(['ADH1A', 'ADH1B', 'ADH1C', 'ADH4', 'ADH5', 'ADH6', 'ADH7',
       'ALDH1A1', 'ALDH1A2', 'AOX1', 'AWAT2', 'BCO1', 'CYP1A1', 'CYP1A2',
       'CYP26A1', 'CYP26B1', 'CYP26C1', 'CYP2A6', 'CYP2B6', 'CYP2C18',
       'CYP2C8', 'CYP2C9', 'CYP2S1', 'CYP3A4', 'CYP3A5', 'CYP3A7',
       'CYP3A7-CYP3A51P', 'CYP4A11', 'DGAT1', 'DHRS3', 'DHRS4', 'DHRS4L1',
       'DHRS4L2', 'DHRS9', 'HSD17B6', 'LRAT', 'PNPLA4', 'RDH10', 'RDH11',
       'RDH12', 'RDH16', 'RDH5', 'RDH8', 'RETSAT', 'RPE65', 'SDR16C5',
       'UGT1A1', 'UGT1A10', 'UGT1A3', 'UGT1A4', 'UGT1A5', 'UGT1A6',
       'UGT1A7', 'UGT1A8', 'UGT1A9', 'UGT2A1', 'UGT2A2', 'UGT2A3',
       'UGT2B10', 'UGT2B11', 'UGT2B15', 'UGT2B17', 'UGT2B28', 'UGT2B4',
       'UGT2B7'], 
      dtype='<U15')

# III. Update means

In [4]:
# Draw one random mean-shift per protein, then keep only the shifts that belong
# to genes of the selected pathway.
#
# * `np.random.normal` takes the *standard deviation* as its `scale` argument,
#   so the empirical variances are square-rooted before being passed in.
# * `.reindex(pathway_genes)` is used instead of `[pathway_genes]` because some
#   pathway genes may be absent from the data; reindex yields NaN for those
#   (then filled with 0) instead of raising a KeyError.
# NOTE: no random seed is set in the cells shown, so the shifts differ per run.
random_shifts = pd.Series(np.random.normal(0, np.sqrt(variances)), index=variances.index)
new_pathway_means = random_shifts.reindex(pathway_genes).fillna(0)

# Add the pathway shifts to the empirical means. Proteins outside the pathway
# get a shift of 0; missing empirical means are treated as 0. The result keeps
# the order and labels of `means`.
new_means = means.fillna(0).add(new_pathway_means.reindex(means.index, fill_value=0))
new_means

ZNF91        0.000000
NDEL1        0.010325
ELAVL1       0.031412
SUMO1        0.126737
SUMO3       -0.039357
CHMP5        0.038141
UBC          0.000000
HTT         -0.099602
E2F4         0.168939
ACP5        -0.106778
SLC35F6     -0.133397
HSPA5        0.032161
SNRNP40      0.106933
RPA2         0.078114
HSD17B7      0.145818
ZBTB16       0.000000
KLHL15      -0.031931
CIZ1         0.043799
EGR2         0.000000
TNFSF10     -0.067006
MAPK9        0.035393
PPP2CB      -0.046517
PDCD5        0.003451
SLC27A2      0.039796
EID3         0.000000
LPCAT1       0.037730
FAM177A1    -0.031166
HAUS7       -0.016349
KPNB1       -0.024028
SMPD1        0.326922
               ...   
EEF1DP3      0.000000
OMP          0.000000
FREM1        0.000000
TCEAL7       0.000000
MCF2L2       0.000000
ITGA11      -0.055196
SPAG11B      0.000000
CMA1        -0.182264
ANKRD33B     0.000000
CNR2         0.000000
GIG44        0.000000
LINC00588    0.000000
BPIFB4       0.000000
CAMK2N2      0.000000
TAAR2     

# IV. Sample from independent normals for each protein (with updated means)

In [30]:
# Draw synthetic "positive" samples: each protein is sampled independently from
# a normal distribution centred on its pathway-shifted mean.
# `np.random.normal` expects the standard deviation as `scale`, so the
# empirical variances are converted with a square root.
n_positives = 100
positives = pd.DataFrame(
    np.random.normal(new_means, np.sqrt(variances), size=(n_positives, len(means))),
    columns=ovarian.columns,
)

# Label every positive row with the ID of the up-sampled pathway.
positives.index = [pathway_id] * n_positives
positives

Unnamed: 0,ZNF91,NDEL1,ELAVL1,SUMO1,SUMO3,CHMP5,UBC,HTT,E2F4,ACP5,...,SPANXN4,ZNF605,SERPINB10,ANKAR,RRH,DHH,CYSLTR1,ZNF268,COL23A1,MEDAG
hsa00830,0.0,0.109015,-0.020573,0.137357,-0.012751,0.032148,0.0,-0.203069,0.173460,-0.065268,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hsa00830,0.0,0.106028,-0.092185,0.085059,-0.342775,0.047271,0.0,0.073079,-0.023571,-0.029832,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hsa00830,0.0,-0.077424,0.013955,-0.005742,0.028695,0.114396,0.0,-0.200138,0.353028,-0.380131,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hsa00830,0.0,0.010270,-0.042695,0.266221,0.395558,-0.094020,0.0,-0.128058,0.297858,-0.038056,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hsa00830,0.0,-0.045191,0.188900,0.407073,-0.801294,0.035595,0.0,-0.263882,0.041411,-0.024873,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hsa00830,0.0,-0.063957,-0.008080,0.156604,-0.258679,0.055340,0.0,-0.120246,-0.155399,0.116142,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hsa00830,0.0,0.080117,0.232033,0.046239,0.021808,-0.045195,0.0,-0.089839,0.130172,0.283613,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hsa00830,0.0,0.013982,0.041078,0.299031,-0.017405,-0.025250,0.0,0.056596,0.438443,-0.327568,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hsa00830,0.0,0.064187,-0.077644,0.057558,-0.427874,0.044305,0.0,-0.187147,0.099690,-0.214498,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hsa00830,0.0,0.061300,-0.033522,0.356256,-0.191490,-0.111456,0.0,-0.260958,0.300827,0.153562,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# V. Sample from independent normals for each protein (with old means)

This is effectively the same as "Strategy 1" -- see the other notebook for details.

In [31]:
# Draw synthetic "negative" samples: each protein is sampled independently from
# a normal distribution centred on its unmodified empirical mean.
# `np.random.normal` expects the standard deviation as `scale`, so the
# empirical variances are converted with a square root.
n_negatives = 100
negatives = pd.DataFrame(
    np.random.normal(means, np.sqrt(variances), size=(n_negatives, len(means))),
    columns=ovarian.columns,
)

# Label every negative row as such.
negatives.index = ['negative'] * n_negatives
negatives

Unnamed: 0,ZNF91,NDEL1,ELAVL1,SUMO1,SUMO3,CHMP5,UBC,HTT,E2F4,ACP5,...,SPANXN4,ZNF605,SERPINB10,ANKAR,RRH,DHH,CYSLTR1,ZNF268,COL23A1,MEDAG
negative,0.0,-0.129899,0.086811,0.203287,0.216322,-0.065877,0.0,-0.085900,0.276047,0.045291,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
negative,0.0,0.031473,0.008599,-0.092172,0.638810,0.153265,0.0,-0.198953,0.202982,0.022069,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
negative,0.0,-0.116067,0.146476,0.260898,0.013141,0.035435,0.0,0.036591,0.123292,-0.238619,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
negative,0.0,0.046648,-0.004047,0.153365,-0.338085,-0.037647,0.0,-0.160163,0.349540,-0.077165,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
negative,0.0,-0.105501,0.054660,0.121175,-0.143017,0.156828,0.0,-0.042327,0.339567,-0.108608,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
negative,0.0,-0.040432,0.098273,0.113101,0.049741,0.129538,0.0,-0.164375,-0.076003,-0.216587,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
negative,0.0,0.015895,0.153022,0.002184,-0.189237,0.141463,0.0,0.024630,-0.095067,-0.007922,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
negative,0.0,-0.159445,-0.090044,-0.081687,0.117919,0.133038,0.0,-0.043923,0.101122,-0.090644,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
negative,0.0,0.048355,0.074644,0.512239,0.000093,-0.012329,0.0,-0.083018,0.203551,-0.042769,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
negative,0.0,-0.203974,-0.045865,0.361870,0.113284,-0.022076,0.0,-0.166849,0.134867,-0.028935,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Stack the positive and negative samples, then randomly permute the rows
# (sampling 100% of the rows without replacement is a shuffle).
dataset = (
    pd.concat((positives, negatives), axis=0)
    .sample(frac=1.0)
)
dataset

Unnamed: 0,ZNF91,NDEL1,ELAVL1,SUMO1,SUMO3,CHMP5,UBC,HTT,E2F4,ACP5,...,SPANXN4,ZNF605,SERPINB10,ANKAR,RRH,DHH,CYSLTR1,ZNF268,COL23A1,MEDAG
negative,0.0,-0.019874,-0.080834,0.207478,0.265390,0.137612,0.0,0.073087,-0.039293,-0.292087,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hsa00830,0.0,-0.056755,-0.007682,0.182622,0.118059,0.122067,0.0,-0.085794,-0.141778,-0.105173,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hsa00830,0.0,-0.059636,-0.089287,-0.254298,0.434062,0.014554,0.0,-0.317753,0.134906,-0.286403,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
negative,0.0,0.079505,0.024603,0.237344,0.113744,-0.095339,0.0,-0.179155,0.016426,0.108480,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
negative,0.0,0.010122,0.098030,0.126540,-0.238149,0.067100,0.0,-0.294073,0.065689,-0.473700,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
negative,0.0,0.067421,0.016609,0.095344,-0.031912,0.023939,0.0,-0.057045,0.101819,-0.271301,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
negative,0.0,0.072702,-0.006949,0.116425,0.130332,0.118648,0.0,-0.165960,0.327440,-0.134976,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hsa00830,0.0,0.010270,-0.042695,0.266221,0.395558,-0.094020,0.0,-0.128058,0.297858,-0.038056,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hsa00830,0.0,-0.033334,0.040909,-0.017024,0.020302,0.014048,0.0,-0.095650,0.056969,-0.376962,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hsa00830,0.0,0.195071,0.107620,0.283208,-0.157650,0.218903,0.0,-0.022918,0.111819,-0.115953,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Write the shuffled dataset to a CSV named after the pathway.
# Row labels and the column header are written (the `to_csv` defaults).
filename = 'synthetic_{}_100pos_100neg.csv'.format(pathway_id)
dataset.to_csv(filename)