# Extract metabolomics peak and transcript data from Kevin's dataset

In [1]:
import os
import glob

import pylab as plt
import matplotlib
from IPython.display import display, HTML

import numpy as np
from scipy.sparse import lil_matrix
import pandas as pd
from pandas import HDFStore

from collections import defaultdict
import math

%matplotlib inline

### Load Kevin's peak data in positive mode

In [2]:
# Resolve the current user's home directory instead of hard-coding one
# person's absolute Windows path, so the notebook runs on other machines.
home = os.path.expanduser('~')

In [3]:
# Root folder of the input data; every file read below is addressed relative to it.
basedir = os.path.join(home, 'Dropbox/Analysis/omics_integration/data')

In [4]:
# Positive-mode peak intensities: one row per sample, one column per peak id.
peaks = pd.read_csv(basedir + '/intensities_pos.csv', index_col=0)

# The CSV header is parsed as strings; convert the column labels to integers
# so they can be matched against the integer peak ids of the peak metadata.
peaks.columns = peaks.columns.values.astype(int)

In [5]:
# Per-sample metadata for the metabolomics (peak) data, indexed by the
# first CSV column (the sample name).
samples_peaks = pd.read_csv(basedir + '/metadata_samples.csv', index_col=0)
samples_peaks

Unnamed: 0_level_0,Time,Parasite,Treatment
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
UN_1,7,UN,Unsorted
UN4,7,UN,Unsorted
INFEC_1,7,INFEC,Unsorted
INFEC_2,7,INFEC,Unsorted
INFEC_3,7,INFEC,Unsorted
INFEC_4,7,INFEC,Unsorted
HK1,7,HK,Unsorted
HK2,7,HK,Unsorted
HK3,7,HK,Unsorted
HK4,7,HK,Unsorted


### Load Kevin's transcript data

We only use the data we have at timepoint 7

In [6]:
# Map each RNA count column to its (time, parasite group, treatment) tuple.
# All samples here are time point 7 and 'Unsorted'; only the group differs.
col_mapping = {
    column: (7, group, 'Unsorted')
    for group, columns in (
        ('HK', ('HK1cnt', 'HK2cnt', 'HK3cnt')),
        ('INFEC', ('INF2cnt', 'INF3cnt', 'INF4cnt')),
        ('UN', ('M01cnt', 'M02cnt', 'M03cnt')),
    )
    for column in columns
}

In [7]:
# One metadata record per RNA sample: (Sample, Time, Parasite, Treatment).
ss = [(sample_name,) + col_mapping[sample_name] for sample_name in col_mapping]

# 'Sample' deliberately stays a regular column here: the metadata CSV export
# writes it with index=False, and the group-selection step sets it as the
# index later. (The previous bare `samples_rna.set_index(['Sample'])` call
# discarded its result and had no effect, so it has been removed.)
samples_rna = pd.DataFrame(ss, columns=['Sample', 'Time', 'Parasite', 'Treatment'])
samples_rna

Unnamed: 0,Sample,Time,Parasite,Treatment
0,HK1cnt,7,HK,Unsorted
1,HK2cnt,7,HK,Unsorted
2,HK3cnt,7,HK,Unsorted
3,INF2cnt,7,INFEC,Unsorted
4,INF3cnt,7,INFEC,Unsorted
5,INF4cnt,7,INFEC,Unsorted
6,M01cnt,7,UN,Unsorted
7,M02cnt,7,UN,Unsorted
8,M03cnt,7,UN,Unsorted


In [8]:
# Save the RNA sample metadata. index=False omits the default integer index,
# so 'Sample' is written as an ordinary column.
samples_rna.to_csv(basedir + '/metadata_rna.csv', index=False)

In [9]:
# Read every file under RNA/ into a one-column DataFrame named after the file
# and indexed by the first tab-separated field.
# glob returns paths in arbitrary, platform-dependent order, so sort them to
# keep the sample order reproducible across machines and runs.
dfs = []
for f in sorted(glob.glob(basedir + '/RNA/*')):
    basename = os.path.basename(f)
    df = pd.read_csv(f, header=None, index_col=0, sep='\t', names=[basename])
    dfs.append(df)

In [10]:
# Place the per-sample columns side by side, then flip the table so that
# each row is a sample and each column is a gene.
rna = pd.concat(dfs, axis=1).T
display(rna)

Unnamed: 0,ENSMUSG00000000001,ENSMUSG00000000003,ENSMUSG00000000028,ENSMUSG00000000031,ENSMUSG00000000037,ENSMUSG00000000049,ENSMUSG00000000056,ENSMUSG00000000058,ENSMUSG00000000078,ENSMUSG00000000085,...,ENSMUSG00000110415,ENSMUSG00000110416,ENSMUSG00000110417,ENSMUSG00000110418,ENSMUSG00000110419,ENSMUSG00000110420,ENSMUSG00000110421,ENSMUSG00000110422,ENSMUSG00000110423,ENSMUSG00000110424
HK1cnt,4390,0,44,0,2,0,312,1910,10297,436,...,0,0,0,0,53,0,0,0,0,38
HK2cnt,4003,0,47,0,0,1,366,1901,9329,457,...,0,0,0,0,52,0,0,0,0,25
HK3cnt,5739,0,57,0,3,1,418,2582,14173,732,...,0,0,0,0,95,0,0,0,0,29
INF2cnt,3005,0,43,0,0,0,352,928,9478,399,...,0,0,0,0,40,0,0,0,0,51
INF3cnt,3674,0,46,1,0,0,370,1868,9162,429,...,0,0,0,0,47,0,0,0,0,43
INF4cnt,2221,0,36,0,0,3,260,912,8990,365,...,0,0,0,0,37,0,0,0,0,41
M01cnt,4255,0,60,0,4,0,495,1875,9600,515,...,0,0,0,0,60,0,0,0,0,73
M02cnt,4587,0,91,0,0,2,558,2140,10182,652,...,0,0,0,0,54,0,0,0,0,77
M03cnt,3927,0,57,0,0,3,467,1830,9252,494,...,0,0,0,0,40,0,0,1,0,64


In [11]:
# Save the full samples-by-genes count table; the sample names are written
# as the index column.
rna.to_csv(basedir + '/rna_all.csv')

### Select a group for analysis

In [12]:
# Selection criteria applied to both the metabolomics and the RNA samples.
treatment = 'Unsorted'                # single treatment value to keep
parasite = ['INFEC', 'HK', 'UN']      # parasite groups to keep
time = 7                              # time point to keep

Selected samples for the metabolomics data

In [13]:
# Boolean mask over the metabolomics samples that match the chosen
# time point, parasite groups and treatment.
pos = (
    samples_peaks['Time'].eq(time)
    & samples_peaks['Parasite'].isin(parasite)
    & samples_peaks['Treatment'].eq(treatment)
)

groups_peaks = samples_peaks.loc[pos]
display(groups_peaks)

Unnamed: 0_level_0,Time,Parasite,Treatment
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
UN_1,7,UN,Unsorted
UN4,7,UN,Unsorted
INFEC_1,7,INFEC,Unsorted
INFEC_2,7,INFEC,Unsorted
INFEC_3,7,INFEC,Unsorted
INFEC_4,7,INFEC,Unsorted
HK1,7,HK,Unsorted
HK2,7,HK,Unsorted
HK3,7,HK,Unsorted
HK4,7,HK,Unsorted


Selected samples for the transcript data

In [14]:
# Boolean mask over the RNA samples that match the chosen time point,
# parasite groups and treatment.
pos = (
    samples_rna['Time'].eq(time)
    & samples_rna['Parasite'].isin(parasite)
    & samples_rna['Treatment'].eq(treatment)
)

# Keep the matching rows and index them by sample name.
groups_rna = samples_rna.loc[pos].set_index('Sample')
display(groups_rna)

Unnamed: 0_level_0,Time,Parasite,Treatment
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HK1cnt,7,HK,Unsorted
HK2cnt,7,HK,Unsorted
HK3cnt,7,HK,Unsorted
INF2cnt,7,INFEC,Unsorted
INF3cnt,7,INFEC,Unsorted
INF4cnt,7,INFEC,Unsorted
M01cnt,7,UN,Unsorted
M02cnt,7,UN,Unsorted
M03cnt,7,UN,Unsorted


Keep only the peaks that have no missing values (NAs) in any of the selected samples

In [15]:
# Check the peak ids used as column labels (integers after the astype(int) conversion).
print(peaks.columns)

Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            2914, 2915, 2916, 2917, 2918, 2919, 2920, 2921, 2922, 2923],
           dtype='int64', length=2923)


In [16]:
# Inspect the peak intensity table: samples as rows, peak ids as columns.
peaks

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,2914,2915,2916,2917,2918,2919,2920,2921,2922,2923
UN_1,395020480.0,7686279.5,9079840.0,973114.19,505043.44,659134.19,350100.38,355398.47,188286.83,,...,38989.88,27659.14,32403.78,22160.38,37909.68,46042.89,19431.87,13727.0,32780.18,31106.39
UN4,289368320.0,6820364.5,6973341.0,943333.75,180859.09,300419.78,270301.81,243417.81,200151.91,,...,41791.73,33738.63,45259.1,18273.53,25420.12,31853.42,22797.44,19371.16,35270.98,32009.4
INFEC_1,257776768.0,5529046.0,6960773.5,654349.81,173370.22,345849.22,219935.03,246547.3,112748.26,,...,38172.43,27899.9,46987.21,26511.22,27804.65,31506.33,21575.88,20833.71,31194.07,33556.29
INFEC_2,161851648.0,4672629.0,3889198.75,320942.19,,175617.34,138481.02,140896.5,,,...,39713.55,28176.85,45702.19,24959.22,27148.91,30018.25,22636.13,28097.6,30959.71,29259.81
INFEC_3,403047296.0,10444900.0,9469953.0,1096783.63,589896.56,570069.19,447617.34,558778.19,205501.92,,...,52523.15,34968.3,35957.28,23790.51,33914.49,35318.34,10543.33,14062.33,27639.48,26006.63
INFEC_4,182654768.0,3589490.5,4283163.0,291107.75,,174276.03,72580.04,156987.55,,,...,,9922.9,40218.66,24715.55,24650.8,37442.79,23278.97,28183.72,35170.65,24254.83
HK1,285778848.0,6862975.5,6574686.0,713782.0,172630.77,383025.72,352576.13,298229.69,160399.34,,...,41550.18,34887.93,41833.13,27782.36,25872.33,33388.16,26810.99,21362.8,25340.38,33231.15
HK2,315729216.0,6720417.0,7404907.5,829484.81,296470.88,432502.41,365256.88,389146.53,173012.19,,...,34745.13,27462.69,39432.03,25374.36,30242.3,39366.46,29045.95,25033.17,38797.14,32380.69
HK3,314446880.0,7340054.5,7629849.0,842249.0,347985.09,583344.13,340454.91,379696.5,,,...,48050.95,36202.15,38275.44,23019.56,36487.66,32074.4,18855.39,13031.08,23728.7,26957.13
HK4,278574080.0,5869848.5,5938572.0,585176.44,,206749.66,338232.78,246873.34,87785.41,6540312.0,...,43273.16,27949.41,46750.65,25262.93,26524.61,31760.65,20257.75,20385.47,25894.04,30282.27


In [17]:
# Restrict the intensity table to the selected metabolomics samples.
pp = peaks.loc[groups_peaks.index]
print(pp.shape)
print(pp.columns)

# Drop every peak (column) that has a missing value in at least one sample.
pp = pp.dropna(axis='columns')
print(pp.shape)
display(pp)

(10, 2923)
Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            2914, 2915, 2916, 2917, 2918, 2919, 2920, 2921, 2922, 2923],
           dtype='int64', length=2923)
(10, 2738)


Unnamed: 0,1,2,3,4,6,7,8,11,12,13,...,2912,2915,2916,2917,2918,2919,2920,2921,2922,2923
UN_1,395020480.0,7686279.5,9079840.0,973114.19,659134.19,350100.38,355398.47,131537344.0,7686279.5,5569112.5,...,48248.29,27659.14,32403.78,22160.38,37909.68,46042.89,19431.87,13727.0,32780.18,31106.39
UN4,289368320.0,6820364.5,6973341.0,943333.75,300419.78,270301.81,243417.81,128101592.0,6820364.5,4658997.0,...,37505.11,33738.63,45259.1,18273.53,25420.12,31853.42,22797.44,19371.16,35270.98,32009.4
INFEC_1,257776768.0,5529046.0,6960773.5,654349.81,345849.22,219935.03,246547.3,118354888.0,5529046.0,4144117.5,...,46311.26,27899.9,46987.21,26511.22,27804.65,31506.33,21575.88,20833.71,31194.07,33556.29
INFEC_2,161851648.0,4672629.0,3889198.75,320942.19,175617.34,138481.02,140896.5,97892280.0,4672629.0,3702166.25,...,40102.05,28176.85,45702.19,24959.22,27148.91,30018.25,22636.13,28097.6,30959.71,29259.81
INFEC_3,403047296.0,10444900.0,9469953.0,1096783.63,570069.19,447617.34,558778.19,145060640.0,10444900.0,5183267.0,...,40972.51,34968.3,35957.28,23790.51,33914.49,35318.34,10543.33,14062.33,27639.48,26006.63
INFEC_4,182654768.0,3589490.5,4283163.0,291107.75,174276.03,72580.04,156987.55,97634784.0,3949721.25,2880351.25,...,33934.88,9922.9,40218.66,24715.55,24650.8,37442.79,23278.97,28183.72,35170.65,24254.83
HK1,285778848.0,6862975.5,6574686.0,713782.0,383025.72,352576.13,298229.69,121024176.0,6862975.5,4257002.5,...,40429.59,34887.93,41833.13,27782.36,25872.33,33388.16,26810.99,21362.8,25340.38,33231.15
HK2,315729216.0,6720417.0,7404907.5,829484.81,432502.41,365256.88,389146.53,122869776.0,6720417.0,4968785.5,...,49646.69,27462.69,39432.03,25374.36,30242.3,39366.46,29045.95,25033.17,38797.14,32380.69
HK3,314446880.0,7340054.5,7629849.0,842249.0,583344.13,340454.91,379696.5,125891144.0,7340054.5,4225476.5,...,50185.04,36202.15,38275.44,23019.56,36487.66,32074.4,18855.39,13031.08,23728.7,26957.13
HK4,278574080.0,5869848.5,5938572.0,585176.44,206749.66,338232.78,246873.34,126622480.0,5869848.5,4098876.5,...,39691.23,27949.41,46750.65,25262.93,26524.61,31760.65,20257.75,20385.47,25894.04,30282.27


Read peak metadata containing the identifications

In [18]:
# Load the per-peak annotations (indexed by peak id), restrict them to the
# peaks still present in `pp`, then drop every row that has a missing value
# in any column (this includes, but is not limited to, 'PiMP Annotation').
metadata_peaks = (
    pd.read_csv(basedir + '/metadata_peaks.csv', index_col=0)
    .loc[lambda meta: meta.index.isin(pp.columns)]
    .dropna()
)

display(metadata_peaks)

Unnamed: 0_level_0,Mass,RT,Polarity,FrAnK Annotation,PiMP Annotation,InChI Key
Peak id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,147.0764,905.00,positive,Annotate in FrAnK,"2-Amino-3-hydroxypropanoic acid,3-Ureidoisobut...","AEFLONBTGZFSGQ-UHFFFAOYSA-N,CXISPYVYMQWFLE-UHF..."
2,156.0768,917.24,positive,L-Histidine (C6H9N3O2) Prob = 98.8800000000,"2,5-Dioxopiperazine,3-(Pyrazol-1-yl)-L-alanine...","BXRLWGXPSRYJDZ-VKHMYHEASA-N,BXRNXXXXHLBUKK-UHF..."
3,171.0764,904.98,positive,Annotate in FrAnK,"(3R,5S)-1-pyrroline-3-hydroxy-5-carboxylic Aci...","AOMLMYXPXUTBQH-UHFFFAOYSA-N,HFXAFXVXPMUQCQ-BYP..."
4,151.0478,903.42,positive,Annotate in FrAnK,"2-Aminoacrylic acid,2-Oxazolidinone,2-amino-4-...","DXWQLTOXWVWMOH-UHFFFAOYSA-N,ICCHEGCKVBMSTF-UHF..."
6,358.1639,904.96,positive,No Fragments,"(2S)-4'-Hydroxy-5,7,3'-trimethoxyflavan,2'-Hyd...","ADHYECILSBTSIG-UHFFFAOYSA-N,GFHICTQGQGHRRY-UHF..."
7,380.1458,905.24,positive,No Fragments,"(2S)-4'-Hydroxy-5,7,3'-trimethoxyflavan,2'-Hyd...","ADHYECILSBTSIG-UHFFFAOYSA-N,GFHICTQGQGHRRY-UHF..."
11,132.0767,885.84,positive,Annotate in FrAnK,"3-Guanidinopropanoate,Beta-Guanidinopropionic ...","BJNBRIBHKLJMAG-ARJAWSKDSA-N,CDKXZKUBCGJTDG-UHF..."
12,156.0768,907.33,positive,L-Histidine (C6H9N3O2) Prob = 98.8900000000,"2,5-Dioxopiperazine,3-(Pyrazol-1-yl)-L-alanine...","BXRLWGXPSRYJDZ-VKHMYHEASA-N,BXRNXXXXHLBUKK-UHF..."
13,203.0526,888.61,positive,Annotate in FrAnK,"2-Deoxy-D-gluconate,3(S)-hydroxy-all-cis-8,11,...","BJHIKXHVCXFQLS-PQLUHFTBSA-N,BJHIKXHVCXFQLS-PYW..."
14,178.0587,891.23,positive,Annotate in FrAnK,"2,5-Dioxopiperazine,3-(Pyrazol-1-yl)-L-alanine...","BXRLWGXPSRYJDZ-VKHMYHEASA-N,BXRNXXXXHLBUKK-UHF..."


Keep only the peaks that also appear in the filtered peak metadata

In [19]:
# Keep only the intensity columns whose peak id also appears in the
# filtered peak metadata.
selected = pp.columns
overlap = selected.isin(metadata_peaks.index)

# Select the columns directly with the boolean mask; this replaces the
# earlier transpose -> row-select -> transpose round trip, which copied the
# whole table twice to reach the same result.
pp = pp.loc[:, overlap]

display(pp)

Unnamed: 0,1,2,3,4,6,7,11,12,13,14,...,2876,2879,2884,2887,2889,2891,2893,2903,2918,2920
UN_1,395020480.0,7686279.5,9079840.0,973114.19,659134.19,350100.38,131537344.0,7686279.5,5569112.5,1022177.69,...,33254.56,37994.09,30680.23,36238.76,50253.92,59501.92,45891.71,42384.07,37909.68,19431.87
UN4,289368320.0,6820364.5,6973341.0,943333.75,300419.78,270301.81,128101592.0,6820364.5,4658997.0,717768.25,...,24425.09,36192.95,38991.07,45580.55,58139.01,56197.18,38990.04,50729.49,25420.12,22797.44
INFEC_1,257776768.0,5529046.0,6960773.5,654349.81,345849.22,219935.03,118354888.0,5529046.0,4144117.5,543059.88,...,30643.2,41397.09,44042.79,40723.36,57655.79,62732.66,35946.98,46090.29,27804.65,21575.88
INFEC_2,161851648.0,4672629.0,3889198.75,320942.19,175617.34,138481.02,97892280.0,4672629.0,3702166.25,401106.16,...,30187.91,36199.68,34283.23,42700.02,74755.02,55266.55,36471.09,56350.35,27148.91,22636.13
INFEC_3,403047296.0,10444900.0,9469953.0,1096783.63,570069.19,447617.34,145060640.0,10444900.0,5183267.0,1353098.5,...,26590.68,37846.77,49374.02,44923.03,52484.71,54524.59,34264.76,43735.63,33914.49,10543.33
INFEC_4,182654768.0,3589490.5,4283163.0,291107.75,174276.03,72580.04,97634784.0,3949721.25,2880351.25,498905.44,...,59616.12,37599.96,33720.57,53287.47,50114.15,53082.81,41599.08,48580.25,24650.8,23278.97
HK1,285778848.0,6862975.5,6574686.0,713782.0,383025.72,352576.13,121024176.0,6862975.5,4257002.5,664675.69,...,28759.28,38788.33,49704.8,34597.14,36835.69,58465.02,37052.05,53930.09,25872.33,26810.99
HK2,315729216.0,6720417.0,7404907.5,829484.81,432502.41,365256.88,122869776.0,6720417.0,4968785.5,663646.81,...,25699.9,40668.06,38886.83,43714.02,22172.4,52621.23,39187.09,42766.06,30242.3,29045.95
HK3,314446880.0,7340054.5,7629849.0,842249.0,583344.13,340454.91,125891144.0,7340054.5,4225476.5,858572.81,...,26149.55,32954.94,33760.33,47650.07,63001.7,62812.85,35329.72,49079.79,36487.66,18855.39
HK4,278574080.0,5869848.5,5938572.0,585176.44,206749.66,338232.78,126622480.0,5869848.5,4098876.5,608192.38,...,25133.33,36231.22,34022.88,48271.92,36807.02,61029.51,33066.66,41525.3,26524.61,20257.75


Keep only the transcripts that have a non-zero count in at least one selected sample

In [20]:
# Show the selected metabolomics samples again for reference.
groups_peaks

Unnamed: 0_level_0,Time,Parasite,Treatment
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
UN_1,7,UN,Unsorted
UN4,7,UN,Unsorted
INFEC_1,7,INFEC,Unsorted
INFEC_2,7,INFEC,Unsorted
INFEC_3,7,INFEC,Unsorted
INFEC_4,7,INFEC,Unsorted
HK1,7,HK,Unsorted
HK2,7,HK,Unsorted
HK3,7,HK,Unsorted
HK4,7,HK,Unsorted


In [21]:
# Show the selected RNA samples again for reference.
groups_rna

Unnamed: 0_level_0,Time,Parasite,Treatment
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HK1cnt,7,HK,Unsorted
HK2cnt,7,HK,Unsorted
HK3cnt,7,HK,Unsorted
INF2cnt,7,INFEC,Unsorted
INF3cnt,7,INFEC,Unsorted
INF4cnt,7,INFEC,Unsorted
M01cnt,7,UN,Unsorted
M02cnt,7,UN,Unsorted
M03cnt,7,UN,Unsorted


In [22]:
# Restrict the count table to the selected RNA samples (rows).
rr = rna.loc[groups_rna.index]
print(rr.shape)

# True for every gene (column) with a non-zero count in at least one sample.
pos = rr.ne(0).any(axis=0)

# Keep those genes and flip the table so genes are rows and samples columns.
rr = rr.loc[:, pos].T

print(rr.shape)
display(rr)

(9, 48526)
(24850, 9)


Unnamed: 0,HK1cnt,HK2cnt,HK3cnt,INF2cnt,INF3cnt,INF4cnt,M01cnt,M02cnt,M03cnt
ENSMUSG00000000001,4390,4003,5739,3005,3674,2221,4255,4587,3927
ENSMUSG00000000028,44,47,57,43,46,36,60,91,57
ENSMUSG00000000031,0,0,0,0,1,0,0,0,0
ENSMUSG00000000037,2,0,3,0,0,0,4,0,0
ENSMUSG00000000049,0,1,1,0,0,3,0,2,3
ENSMUSG00000000056,312,366,418,352,370,260,495,558,467
ENSMUSG00000000058,1910,1901,2582,928,1868,912,1875,2140,1830
ENSMUSG00000000078,10297,9329,14173,9478,9162,8990,9600,10182,9252
ENSMUSG00000000085,436,457,732,399,429,365,515,652,494
ENSMUSG00000000088,4787,4110,5123,4879,4338,4433,3815,4048,4094


In [24]:
# Export the filtered genes-by-samples count table; the index (gene ids)
# is written under the header 'Identifier'.
rr.to_csv('../static/data/uploads/gene_data.csv', index_label='Identifier')

In [56]:
# Also export only the first 1000 genes as a smaller file with the same layout.
rr.head(1000).to_csv('../static/data/uploads/gene_data_small.csv', index_label='Identifier')

Export peak data too

In [25]:
# Load the peak table from the uploads folder; its 'sec_id' and 'identifier'
# columns are used by the merge and column selection that follow.
# Note: this rebinds `df`, which previously held the last per-file RNA frame.
df = pd.read_csv('../static/data/uploads/my_analysis_peaks.csv')

In [26]:
# Transpose so that peaks become rows (indexed by peak id) and samples become columns.
temp = pp.transpose()

In [27]:
# Inspect the transposed intensity table.
temp

Unnamed: 0,UN_1,UN4,INFEC_1,INFEC_2,INFEC_3,INFEC_4,HK1,HK2,HK3,HK4
1,3.950205e+08,2.893683e+08,2.577768e+08,1.618516e+08,4.030473e+08,1.826548e+08,2.857788e+08,3.157292e+08,3.144469e+08,2.785741e+08
2,7.686280e+06,6.820364e+06,5.529046e+06,4.672629e+06,1.044490e+07,3.589490e+06,6.862976e+06,6.720417e+06,7.340054e+06,5.869848e+06
3,9.079840e+06,6.973341e+06,6.960774e+06,3.889199e+06,9.469953e+06,4.283163e+06,6.574686e+06,7.404908e+06,7.629849e+06,5.938572e+06
4,9.731142e+05,9.433338e+05,6.543498e+05,3.209422e+05,1.096784e+06,2.911078e+05,7.137820e+05,8.294848e+05,8.422490e+05,5.851764e+05
6,6.591342e+05,3.004198e+05,3.458492e+05,1.756173e+05,5.700692e+05,1.742760e+05,3.830257e+05,4.325024e+05,5.833441e+05,2.067497e+05
7,3.501004e+05,2.703018e+05,2.199350e+05,1.384810e+05,4.476173e+05,7.258004e+04,3.525761e+05,3.652569e+05,3.404549e+05,3.382328e+05
11,1.315373e+08,1.281016e+08,1.183549e+08,9.789228e+07,1.450606e+08,9.763478e+07,1.210242e+08,1.228698e+08,1.258911e+08,1.266225e+08
12,7.686280e+06,6.820364e+06,5.529046e+06,4.672629e+06,1.044490e+07,3.949721e+06,6.862976e+06,6.720417e+06,7.340054e+06,5.869848e+06
13,5.569112e+06,4.658997e+06,4.144118e+06,3.702166e+06,5.183267e+06,2.880351e+06,4.257002e+06,4.968786e+06,4.225476e+06,4.098876e+06
14,1.022178e+06,7.177682e+05,5.430599e+05,4.011062e+05,1.353098e+06,4.989054e+05,6.646757e+05,6.636468e+05,8.585728e+05,6.081924e+05


In [28]:
# Inspect the loaded peak table.
df

Unnamed: 0,pid,sec_id,mass,rt,polarity,c_id,formula,adduct,rc_id,compound,db,identifier
0,741583,1,147.076381,905.000626,positive,2825862,C5H10N2O3,M+H,3453657,L-Glutamine,kegg,C00064
1,741598,16,162.076139,891.966177,positive,2826034,C6H11NO4,M+H,3453882,L-2-Aminoadipate,kegg,C00956
2,741599,17,116.070596,771.412376,positive,2826041,C5H9NO2,M+H,3453891,L-Proline,kegg,C00148
3,741611,29,132.101919,677.261473,positive,2826103,C6H13NO2,M+H,3453976,L-Leucine,kegg,C00123
4,741680,98,132.101919,646.385592,positive,2826221,C6H13NO2,M+H,3454135,L-Leucine,kegg,C00123
5,741741,159,118.086238,675.043801,positive,2826784,C5H11NO2,M+H,3454797,Betaine,kegg,C00719
6,741742,160,127.050210,680.104996,positive,2826804,C5H6N2O2,M+H,3454828,Imidazole-4-acetate,kegg,C02835
7,741748,166,132.065550,870.746642,positive,2826829,C5H9NO3,M+H,3454869,Hydroxyproline,kegg,C01157
8,741751,169,180.086637,866.284697,positive,2826867,C6H13NO5,M+H,3454924,D-Glucosamine,kegg,C00329
9,741754,172,114.066220,575.769092,positive,2826884,C4H7N3O,M+H,3454944,Creatinine,kegg,C00791


In [29]:
# Attach the per-sample intensities to each row of the peak table by matching
# its 'sec_id' against the peak-id index of `temp`. The join is the default
# inner join, so rows without a matching peak id are dropped.
new_df = df.merge(temp, left_on='sec_id', right_index=True)

In [30]:
# Inspect the merged table.
new_df

Unnamed: 0,pid,sec_id,mass,rt,polarity,c_id,formula,adduct,rc_id,compound,...,UN_1,UN4,INFEC_1,INFEC_2,INFEC_3,INFEC_4,HK1,HK2,HK3,HK4
0,741583,1,147.076381,905.000626,positive,2825862,C5H10N2O3,M+H,3453657,L-Glutamine,...,395020500.0,289368300.0,257776800.0,161851600.0,403047300.0,182654800.0,285778800.0,315729200.0,314446900.0,278574100.0
1,741598,16,162.076139,891.966177,positive,2826034,C6H11NO4,M+H,3453882,L-2-Aminoadipate,...,154625.8,120014.6,115290.9,99417.42,159995.6,149107.0,140585.8,113691.5,102967.8,101951.2
2,741599,17,116.070596,771.412376,positive,2826041,C5H9NO2,M+H,3453891,L-Proline,...,103344800.0,116003700.0,68319220.0,104547600.0,168765700.0,61559460.0,124077600.0,59860940.0,97139540.0,72831090.0
3,741611,29,132.101919,677.261473,positive,2826103,C6H13NO2,M+H,3453976,L-Leucine,...,82202220.0,59370210.0,50557660.0,42253740.0,98747390.0,38298500.0,66018810.0,58046620.0,66078220.0,59227070.0
4,741680,98,132.101919,646.385592,positive,2826221,C6H13NO2,M+H,3454135,L-Leucine,...,72792220.0,53542500.0,43307320.0,34981930.0,86632200.0,32244140.0,59959860.0,49939480.0,55866370.0,52418660.0
5,741741,159,118.086238,675.043801,positive,2826784,C5H11NO2,M+H,3454797,Betaine,...,41812720.0,39106570.0,28304080.0,20651030.0,55936990.0,25157780.0,33059600.0,29422450.0,33477050.0,30216740.0
6,741742,160,127.05021,680.104996,positive,2826804,C5H6N2O2,M+H,3454828,Imidazole-4-acetate,...,83046.88,127687.8,64107.03,90700.8,131612.1,81136.29,78452.04,53573.14,90366.37,64429.58
7,741748,166,132.06555,870.746642,positive,2826829,C5H9NO3,M+H,3454869,Hydroxyproline,...,64405150.0,45643670.0,38442970.0,31296330.0,70251200.0,29767400.0,46127270.0,43981860.0,47054120.0,45183850.0
8,741751,169,180.086637,866.284697,positive,2826867,C6H13NO5,M+H,3454924,D-Glucosamine,...,379360.3,284598.7,211026.9,194151.5,374550.4,162274.5,276855.0,255602.1,283920.7,271148.0
9,741754,172,114.06622,575.769092,positive,2826884,C4H7N3O,M+H,3454944,Creatinine,...,46515700.0,33964100.0,28058660.0,17411060.0,63340690.0,20037900.0,36862300.0,31986020.0,35870710.0,30911850.0


In [31]:
# Columns to export: the compound identifier followed by one column per
# selected metabolomics sample.
selected = ['identifier', *groups_peaks.index.tolist()]

In [32]:
# Check the list of columns to export.
selected

['identifier',
 'UN_1',
 'UN4',
 'INFEC_1',
 'INFEC_2',
 'INFEC_3',
 'INFEC_4',
 'HK1',
 'HK2',
 'HK3',
 'HK4']

In [33]:
# Keep only the identifier and per-sample intensity columns, in that order.
new_df = new_df.loc[:, selected]

In [34]:
# Export the compound identifier plus one intensity column per sample;
# the row index is not written.
new_df.to_csv('../static/data/uploads/compound_data.csv', index=False)

### Generate other dataframes we need

In [51]:
# Design table for the compound data: one row per metabolomics sample with
# its parasite group, exported with the columns 'sample' and 'group'.
compound_design = groups_peaks[['Parasite']].rename(columns={'Parasite': 'group'})
compound_design.index.name = 'sample'
compound_design.to_csv('../static/data/uploads/compound_design.csv', index=True)
compound_design

Unnamed: 0_level_0,group
sample,Unnamed: 1_level_1
UN_1,UN
UN4,UN
INFEC_1,INFEC
INFEC_2,INFEC
INFEC_3,INFEC
INFEC_4,INFEC
HK1,HK
HK2,HK
HK3,HK
HK4,HK


In [52]:
# Design table for the gene data: one row per RNA sample with its parasite
# group, exported with the columns 'sample' and 'group'.
gene_design = groups_rna[['Parasite']].rename(columns={'Parasite': 'group'})
gene_design.index.name = 'sample'
gene_design.to_csv('../static/data/uploads/gene_design.csv', index=True)
gene_design

Unnamed: 0_level_0,group
sample,Unnamed: 1_level_1
HK1cnt,HK
HK2cnt,HK
HK3cnt,HK
INF2cnt,INFEC
INF3cnt,INFEC
INF4cnt,INFEC
M01cnt,UN
M02cnt,UN
M03cnt,UN
