# Extract peak and transcript data from Kevin's dataset

In [3]:
import os
import glob

import pylab as plt
import matplotlib
from IPython.display import display, HTML

import numpy as np
from scipy.sparse import lil_matrix
import pandas as pd
from pandas import HDFStore

from collections import defaultdict
import math

%matplotlib inline

### Load Kevin's peak data in positive mode

In [5]:
# Root directory holding the input CSV files read by the cells below.
# The location can be overridden with the OMICS_DATA_DIR environment variable so the
# notebook is not tied to one machine; when unset, the original path is used unchanged.
basedir = os.environ.get('OMICS_DATA_DIR',
                         '/Users/joewandy/Dropbox/Analysis/omics_integration/data')

In [6]:
# Load the positive-mode peak intensity table; the first CSV column becomes the row index.
peaks = pd.read_csv(basedir + '/intensities_pos.csv', index_col=0)

# Cast the column labels to integers so they can be compared with the integer peak ids
# used further down (e.g. `metadata_peaks.index.isin(pp.columns)`).
peaks.columns = peaks.columns.values.astype(int)

# Preview the table. This must be the final expression of the cell: a bare expression
# in the middle of a cell is evaluated and discarded without being rendered.
peaks.head()

In [7]:
# Sample-level metadata for the peak data; the first CSV column is used as the index.
samples_peaks_path = basedir + '/metadata_samples.csv'
samples_peaks = pd.read_csv(samples_peaks_path, index_col=0)

# Show the first few rows as the cell output.
samples_peaks.head()

Unnamed: 0_level_0,Time,Parasite,Treatment
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
UN_1,7,UN,Unsorted
UN4,7,UN,Unsorted
INFEC_1,7,INFEC,Unsorted
INFEC_2,7,INFEC,Unsorted
INFEC_3,7,INFEC,Unsorted


### Load Kevin's transcript data

In [8]:
# Transcript table; the first CSV column is used as the row index.
rna_path = basedir + '/rna_all.csv'
rna = pd.read_csv(rna_path, index_col=0)

# Rich preview of the first few rows.
display(rna.head())

Unnamed: 0,ENSMUSG00000000001,ENSMUSG00000000003,ENSMUSG00000000028,ENSMUSG00000000031,ENSMUSG00000000037,ENSMUSG00000000049,ENSMUSG00000000056,ENSMUSG00000000058,ENSMUSG00000000078,ENSMUSG00000000085,...,ENSMUSG00000110415,ENSMUSG00000110416,ENSMUSG00000110417,ENSMUSG00000110418,ENSMUSG00000110419,ENSMUSG00000110420,ENSMUSG00000110421,ENSMUSG00000110422,ENSMUSG00000110423,ENSMUSG00000110424
HK1cnt,4390,0,44,0,2,0,312,1910,10297,436,...,0,0,0,0,53,0,0,0,0,38
HK2cnt,4003,0,47,0,0,1,366,1901,9329,457,...,0,0,0,0,52,0,0,0,0,25
HK3cnt,5739,0,57,0,3,1,418,2582,14173,732,...,0,0,0,0,95,0,0,0,0,29
INF2cnt,3005,0,43,0,0,0,352,928,9478,399,...,0,0,0,0,40,0,0,0,0,51
INF3cnt,3674,0,46,1,0,0,370,1868,9162,429,...,0,0,0,0,47,0,0,0,0,43


In [9]:
# Sample-level metadata for the transcript data; the first CSV column is used as the index.
samples_rna_path = basedir + '/metadata_rna.csv'
samples_rna = pd.read_csv(samples_rna_path, index_col=0)

# Render the complete metadata table.
display(samples_rna)

Unnamed: 0_level_0,Time,Parasite,Treatment
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
INF2cnt,7,INFEC,Unsorted
M01cnt,7,UN,Unsorted
M03cnt,7,UN,Unsorted
M02cnt,7,UN,Unsorted
HK3cnt,7,HK,Unsorted
HK2cnt,7,HK,Unsorted
INF4cnt,7,INFEC,Unsorted
INF3cnt,7,INFEC,Unsorted
HK1cnt,7,HK,Unsorted


### Select a group for analysis

In [10]:
# Criteria defining the sample group to analyse; each value is compared against the
# 'Time', 'Parasite' and 'Treatment' metadata columns in the selection cells below.
time, parasite, treatment = 7, 'INFEC', 'Unsorted'

Selected samples for the metabolomics data

In [11]:
# One boolean mask per selection criterion.
time_matches = samples_peaks['Time'] == time
parasite_matches = samples_peaks['Parasite'] == parasite
treatment_matches = samples_peaks['Treatment'] == treatment

# A sample is selected only when all three criteria hold.
pos = time_matches & parasite_matches & treatment_matches

# Metadata rows of the selected samples.
groups_peaks = samples_peaks[pos]
display(groups_peaks)

Unnamed: 0_level_0,Time,Parasite,Treatment
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
INFEC_1,7,INFEC,Unsorted
INFEC_2,7,INFEC,Unsorted
INFEC_3,7,INFEC,Unsorted
INFEC_4,7,INFEC,Unsorted


Selected samples for the transcript data

In [13]:
# Row mask: True where the transcript sample matches every selection criterion.
pos = (
    samples_rna['Time'].eq(time)
    & samples_rna['Parasite'].eq(parasite)
    & samples_rna['Treatment'].eq(treatment)
)

# Metadata rows of the selected transcript samples, followed by their sample ids.
groups_rna = samples_rna.loc[pos]
display(groups_rna)
print(groups_rna.index.values)

Unnamed: 0_level_0,Time,Parasite,Treatment
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
INF2cnt,7,INFEC,Unsorted
INF4cnt,7,INFEC,Unsorted
INF3cnt,7,INFEC,Unsorted


['INF2cnt' 'INF4cnt' 'INF3cnt']


Keep only the peaks that have no missing values (NAs) in the selected samples

In [14]:
# Inspect the column labels of the full intensity table before subsetting it.
print(peaks.columns)

Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            2914, 2915, 2916, 2917, 2918, 2919, 2920, 2921, 2922, 2923],
           dtype='int64', length=2923)


In [15]:
# Restrict the intensity table to the rows of the selected samples.
selected_sample_ids = groups_peaks.index.values
pp = peaks.loc[selected_sample_ids]
print(pp.shape)
print(pp.columns)

# Discard every column (peak) that has a missing value in any of the selected samples.
pp = pp.dropna(axis='columns', how='any')
print(pp.shape)
display(pp)

(4, 2923)
Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            2914, 2915, 2916, 2917, 2918, 2919, 2920, 2921, 2922, 2923],
           dtype='int64', length=2923)
(4, 2779)


Unnamed: 0,1,2,3,4,6,7,8,11,12,13,...,2912,2915,2916,2917,2918,2919,2920,2921,2922,2923
INFEC_1,257776768.0,5529046.0,6960773.5,654349.81,345849.22,219935.03,246547.3,118354888.0,5529046.0,4144117.5,...,46311.26,27899.9,46987.21,26511.22,27804.65,31506.33,21575.88,20833.71,31194.07,33556.29
INFEC_2,161851648.0,4672629.0,3889198.75,320942.19,175617.34,138481.02,140896.5,97892280.0,4672629.0,3702166.25,...,40102.05,28176.85,45702.19,24959.22,27148.91,30018.25,22636.13,28097.6,30959.71,29259.81
INFEC_3,403047296.0,10444900.0,9469953.0,1096783.63,570069.19,447617.34,558778.19,145060640.0,10444900.0,5183267.0,...,40972.51,34968.3,35957.28,23790.51,33914.49,35318.34,10543.33,14062.33,27639.48,26006.63
INFEC_4,182654768.0,3589490.5,4283163.0,291107.75,174276.03,72580.04,156987.55,97634784.0,3949721.25,2880351.25,...,33934.88,9922.9,40218.66,24715.55,24650.8,37442.79,23278.97,28183.72,35170.65,24254.83


Read peak metadata containing the identifications

In [16]:
# Peak-level metadata (identifications); the first CSV column is used as the index.
metadata_peaks = pd.read_csv(basedir + '/metadata_peaks.csv', index_col=0)

# Retain only the metadata rows whose index appears among the columns kept in `pp`.
in_selected_peaks = metadata_peaks.index.isin(pp.columns)
metadata_peaks = metadata_peaks.loc[in_selected_peaks]

# Remove rows with an NA in any column (e.g. in the PiMP Annotation column).
# NOTE(review): this is not limited to 'PiMP Annotation'; confirm that dropping rows
# with an NA in the other columns is intended.
metadata_peaks = metadata_peaks.dropna(axis='index', how='any')

display(metadata_peaks)

Unnamed: 0_level_0,Mass,RT,Polarity,FrAnK Annotation,PiMP Annotation,InChI Key
Peak id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,147.0764,905.00,positive,Annotate in FrAnK,"2-Amino-3-hydroxypropanoic acid,3-Ureidoisobut...","AEFLONBTGZFSGQ-UHFFFAOYSA-N,CXISPYVYMQWFLE-UHF..."
2,156.0768,917.24,positive,L-Histidine (C6H9N3O2) Prob = 98.8800000000,"2,5-Dioxopiperazine,3-(Pyrazol-1-yl)-L-alanine...","BXRLWGXPSRYJDZ-VKHMYHEASA-N,BXRNXXXXHLBUKK-UHF..."
3,171.0764,904.98,positive,Annotate in FrAnK,"(3R,5S)-1-pyrroline-3-hydroxy-5-carboxylic Aci...","AOMLMYXPXUTBQH-UHFFFAOYSA-N,HFXAFXVXPMUQCQ-BYP..."
4,151.0478,903.42,positive,Annotate in FrAnK,"2-Aminoacrylic acid,2-Oxazolidinone,2-amino-4-...","DXWQLTOXWVWMOH-UHFFFAOYSA-N,ICCHEGCKVBMSTF-UHF..."
6,358.1639,904.96,positive,No Fragments,"(2S)-4'-Hydroxy-5,7,3'-trimethoxyflavan,2'-Hyd...","ADHYECILSBTSIG-UHFFFAOYSA-N,GFHICTQGQGHRRY-UHF..."
7,380.1458,905.24,positive,No Fragments,"(2S)-4'-Hydroxy-5,7,3'-trimethoxyflavan,2'-Hyd...","ADHYECILSBTSIG-UHFFFAOYSA-N,GFHICTQGQGHRRY-UHF..."
11,132.0767,885.84,positive,Annotate in FrAnK,"3-Guanidinopropanoate,Beta-Guanidinopropionic ...","BJNBRIBHKLJMAG-ARJAWSKDSA-N,CDKXZKUBCGJTDG-UHF..."
12,156.0768,907.33,positive,L-Histidine (C6H9N3O2) Prob = 98.8900000000,"2,5-Dioxopiperazine,3-(Pyrazol-1-yl)-L-alanine...","BXRLWGXPSRYJDZ-VKHMYHEASA-N,BXRNXXXXHLBUKK-UHF..."
13,203.0526,888.61,positive,Annotate in FrAnK,"2-Deoxy-D-gluconate,3(S)-hydroxy-all-cis-8,11,...","BJHIKXHVCXFQLS-PQLUHFTBSA-N,BJHIKXHVCXFQLS-PYW..."
14,178.0587,891.23,positive,Annotate in FrAnK,"2,5-Dioxopiperazine,3-(Pyrazol-1-yl)-L-alanine...","BXRLWGXPSRYJDZ-VKHMYHEASA-N,BXRNXXXXHLBUKK-UHF..."


Keep only the peaks that also appear in the filtered peak metadata

In [17]:
# Boolean mask over the columns of `pp`: True where the column label is also present
# in the index of the filtered peak metadata.
selected = pp.columns
overlap = selected.isin(metadata_peaks.index)

# Select the matching columns directly. This replaces the former
# transpose -> row filter -> transpose round trip, which built two full copies of the
# frame to achieve the same column selection.
pp = pp.loc[:, overlap]

display(pp)

Unnamed: 0,1,2,3,4,6,7,11,12,13,14,...,2876,2879,2884,2887,2889,2891,2893,2903,2918,2920
INFEC_1,257776768.0,5529046.0,6960773.5,654349.81,345849.22,219935.03,118354888.0,5529046.0,4144117.5,543059.88,...,30643.2,41397.09,44042.79,40723.36,57655.79,62732.66,35946.98,46090.29,27804.65,21575.88
INFEC_2,161851648.0,4672629.0,3889198.75,320942.19,175617.34,138481.02,97892280.0,4672629.0,3702166.25,401106.16,...,30187.91,36199.68,34283.23,42700.02,74755.02,55266.55,36471.09,56350.35,27148.91,22636.13
INFEC_3,403047296.0,10444900.0,9469953.0,1096783.63,570069.19,447617.34,145060640.0,10444900.0,5183267.0,1353098.5,...,26590.68,37846.77,49374.02,44923.03,52484.71,54524.59,34264.76,43735.63,33914.49,10543.33
INFEC_4,182654768.0,3589490.5,4283163.0,291107.75,174276.03,72580.04,97634784.0,3949721.25,2880351.25,498905.44,...,59616.12,37599.96,33720.57,53287.47,50114.15,53082.81,41599.08,48580.25,24650.8,23278.97


Keep only the transcripts (columns) that are non-zero in at least one selected sample

In [19]:
# Restrict the transcript table to the rows of the selected samples.
rna_sample_ids = groups_rna.index.values
rr = rna.loc[rna_sample_ids]
print(rr.shape)

# Column mask: True when at least one selected sample has a non-zero value.
pos = rr.ne(0).any(axis='index')
rr = rr.loc[:, pos]

print(rr.shape)
display(rr)

(3, 48526)
(3, 21794)


Unnamed: 0,ENSMUSG00000000001,ENSMUSG00000000028,ENSMUSG00000000031,ENSMUSG00000000049,ENSMUSG00000000056,ENSMUSG00000000058,ENSMUSG00000000078,ENSMUSG00000000085,ENSMUSG00000000088,ENSMUSG00000000093,...,ENSMUSG00000110391,ENSMUSG00000110393,ENSMUSG00000110397,ENSMUSG00000110399,ENSMUSG00000110404,ENSMUSG00000110405,ENSMUSG00000110410,ENSMUSG00000110414,ENSMUSG00000110419,ENSMUSG00000110424
INF2cnt,3005,43,0,0,352,928,9478,399,4879,0,...,12,30,3,0,3,31,2,6,40,51
INF4cnt,2221,36,0,3,260,912,8990,365,4433,3,...,3,13,0,0,2,14,4,6,37,41
INF3cnt,3674,46,1,0,370,1868,9162,429,4338,1,...,5,38,0,2,0,16,0,5,47,43


In [21]:
# Write the filtered transcript table to CSV, transposed so that the transcript ids
# become rows and the selected samples become columns.
# NOTE(review): the destination is an absolute path on one machine.
rr.transpose().to_csv('/Users/joewandy/git/omics_integration/web_omics/static/data/gene_data.csv')

In [22]:
# Load the peak table `my_analysis_peaks.csv`; no index column is specified, so pandas
# assigns a default integer index.
# NOTE(review): this file is not produced by any cell shown here and is read from an
# absolute local path — confirm where it comes from.
df = pd.read_csv('/Users/joewandy/git/omics_integration/web_omics/static/data/my_analysis_peaks.csv')

In [26]:
# Transposed view of the filtered intensities: the column labels of `pp` become the
# row index and the selected samples become the columns.
temp = pp.T

In [27]:
# Display the transposed intensity table.
temp

Unnamed: 0,INFEC_1,INFEC_2,INFEC_3,INFEC_4
1,2.577768e+08,1.618516e+08,4.030473e+08,1.826548e+08
2,5.529046e+06,4.672629e+06,1.044490e+07,3.589490e+06
3,6.960774e+06,3.889199e+06,9.469953e+06,4.283163e+06
4,6.543498e+05,3.209422e+05,1.096784e+06,2.911078e+05
6,3.458492e+05,1.756173e+05,5.700692e+05,1.742760e+05
7,2.199350e+05,1.384810e+05,4.476173e+05,7.258004e+04
11,1.183549e+08,9.789228e+07,1.450606e+08,9.763478e+07
12,5.529046e+06,4.672629e+06,1.044490e+07,3.949721e+06
13,4.144118e+06,3.702166e+06,5.183267e+06,2.880351e+06
14,5.430599e+05,4.011062e+05,1.353098e+06,4.989054e+05


In [23]:
# Display the table loaded from my_analysis_peaks.csv.
# NOTE(review): this cell's execution count (23) is lower than those of the `temp`
# cells above it (26, 27); re-run the notebook top to bottom to confirm the order.
df

Unnamed: 0,pid,sec_id,mass,rt,polarity,c_id,formula,adduct,rc_id,compound,db,identifier
0,741583,1,147.076381,905.000626,positive,2825862,C5H10N2O3,M+H,3453657,L-Glutamine,kegg,C00064
1,741598,16,162.076139,891.966177,positive,2826034,C6H11NO4,M+H,3453882,L-2-Aminoadipate,kegg,C00956
2,741599,17,116.070596,771.412376,positive,2826041,C5H9NO2,M+H,3453891,L-Proline,kegg,C00148
3,741611,29,132.101919,677.261473,positive,2826103,C6H13NO2,M+H,3453976,L-Leucine,kegg,C00123
4,741680,98,132.101919,646.385592,positive,2826221,C6H13NO2,M+H,3454135,L-Leucine,kegg,C00123
5,741741,159,118.086238,675.043801,positive,2826784,C5H11NO2,M+H,3454797,Betaine,kegg,C00719
6,741742,160,127.050210,680.104996,positive,2826804,C5H6N2O2,M+H,3454828,Imidazole-4-acetate,kegg,C02835
7,741748,166,132.065550,870.746642,positive,2826829,C5H9NO3,M+H,3454869,Hydroxyproline,kegg,C01157
8,741751,169,180.086637,866.284697,positive,2826867,C6H13NO5,M+H,3454924,D-Glucosamine,kegg,C00329
9,741754,172,114.066220,575.769092,positive,2826884,C4H7N3O,M+H,3454944,Creatinine,kegg,C00791


In [30]:
# Attach the per-sample intensities to each row of `df` by matching its 'sec_id'
# column against the index of `temp`; rows without a match are dropped (default
# inner join).
new_df = df.merge(temp, left_on='sec_id', right_index=True)

In [34]:
# Display the merged table (columns of `df` followed by the sample columns of `temp`).
new_df

Unnamed: 0,pid,sec_id,mass,rt,polarity,c_id,formula,adduct,rc_id,compound,db,identifier,INFEC_1,INFEC_2,INFEC_3,INFEC_4
0,741583,1,147.076381,905.000626,positive,2825862,C5H10N2O3,M+H,3453657,L-Glutamine,kegg,C00064,257776800.0,161851600.0,403047300.0,182654800.0
1,741598,16,162.076139,891.966177,positive,2826034,C6H11NO4,M+H,3453882,L-2-Aminoadipate,kegg,C00956,115290.9,99417.42,159995.6,149107.0
2,741599,17,116.070596,771.412376,positive,2826041,C5H9NO2,M+H,3453891,L-Proline,kegg,C00148,68319220.0,104547600.0,168765700.0,61559460.0
3,741611,29,132.101919,677.261473,positive,2826103,C6H13NO2,M+H,3453976,L-Leucine,kegg,C00123,50557660.0,42253740.0,98747390.0,38298500.0
4,741680,98,132.101919,646.385592,positive,2826221,C6H13NO2,M+H,3454135,L-Leucine,kegg,C00123,43307320.0,34981930.0,86632200.0,32244140.0
5,741741,159,118.086238,675.043801,positive,2826784,C5H11NO2,M+H,3454797,Betaine,kegg,C00719,28304080.0,20651030.0,55936990.0,25157780.0
6,741742,160,127.05021,680.104996,positive,2826804,C5H6N2O2,M+H,3454828,Imidazole-4-acetate,kegg,C02835,64107.03,90700.8,131612.1,81136.29
7,741748,166,132.06555,870.746642,positive,2826829,C5H9NO3,M+H,3454869,Hydroxyproline,kegg,C01157,38442970.0,31296330.0,70251200.0,29767400.0
8,741751,169,180.086637,866.284697,positive,2826867,C6H13NO5,M+H,3454924,D-Glucosamine,kegg,C00329,211026.9,194151.5,374550.4,162274.5
9,741754,172,114.06622,575.769092,positive,2826884,C4H7N3O,M+H,3454944,Creatinine,kegg,C00791,28058660.0,17411060.0,63340690.0,20037900.0


In [38]:
# Keep only the compound identifier and the per-sample intensity columns.
# The sample column names come from `temp` (the frame merged in above) instead of
# being hard-coded, so this cell keeps working when a different group is selected.
# With the current selection this yields INFEC_1 .. INFEC_4, exactly as before.
sample_columns = list(temp.columns)
new_df = new_df[['identifier'] + sample_columns]

In [40]:
# Write the identifier and per-sample intensity table to CSV without the row index.
# NOTE(review): the destination is an absolute path on one machine.
new_df.to_csv('/Users/joewandy/git/omics_integration/web_omics/static/data/compound_data.csv', index=False)

In [43]:
# Write the transposed transcript table to gene_data.csv.
# NOTE(review): this is the same statement as the earlier export cell (In [21]) and no
# cell in between reassigns `rr`, so it appears to rewrite an identical file — confirm
# whether this cell is still needed.
rr.transpose().to_csv('/Users/joewandy/git/omics_integration/web_omics/static/data/gene_data.csv')