# Creating special dataframe for Pilar

1. Makes a dataframe from raw metagenome data.
2. Subsets the raw dataframe into a new dataframe containing only the rows (metagenomes) that have values for the specific columns Pilar cares about.
3. Gets the EC lists associated with the subset metagenomes.
4. Computes # of ECs, # of associated compounds, and # of associated reactions for each subset metagenome based on EC list.
5. Writes the combined data out to a CSV file.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import glob
import os

In [None]:
# Directory searched for the metagenome JSON files; the final CSV is written here too.
pilar_path = '/Users/dgagler/School/SecondaryProject/RedoxEnzymes/JGI_2019/Metagenomes/taxon_ids/'

# glob returns its matches in arbitrary, filesystem-dependent order. Sorting keeps
# the processing order (and therefore the row order of everything built from it)
# reproducible from run to run and machine to machine.
path = sorted(glob.glob(os.path.join(pilar_path, '*.json')))

In [None]:
def _load_json_file(json_path):
    """Parse the JSON document stored at ``json_path`` and return the result."""
    with open(json_path, 'r') as handle:
        return json.load(handle)


# KEGG lookup tables. The functions below index them with 'ec:'-prefixed EC
# numbers to obtain the reactions / compounds linked to each enzyme.
ec_rxn_links = _load_json_file('/Users/dgagler/School/SecondaryProject/RedoxEnzymes/KEGG/enzyme_reaction.json')
ec_cpd_links = _load_json_file('/Users/dgagler/School/SecondaryProject/RedoxEnzymes/KEGG/enzyme_compound.json')

In [None]:
# Environmental metadata of every metagenome file, keyed by its 'Taxon Object ID'.
pilar_metadata = {}

# Fields copied unconditionally, as (output column, key in the file's 'metadata').
# A file lacking one of these keys raises KeyError.
# NOTE(review): the 'Ecosystem Subtype' column is filled from 'Ecosystem Type';
# the names disagree, so confirm which of the two was intended. Kept as-is so
# the output columns do not change.
required_fields = [
    ('Ecosystem', 'Ecosystem'),
    ('Ecosystem Category', 'Ecosystem Category'),
    ('Ecosystem Subtype', 'Ecosystem Type'),
]

# Fields copied only when present, as (output column, acceptable source keys).
# Each guard tests exactly the key that is read. Previously three guards looked
# for a different key than the one read ('Oxygen Concentration' vs
# 'Oxygen concentration', 'Nitrate' vs 'Nitrate Concentration', 'Temperature'
# vs 'Temperature Range'), which either raised KeyError or silently dropped data.
# NOTE(review): both capitalisations of the oxygen key are accepted because the
# original used one for the check and the other for the read -- confirm which
# spelling the files really use.
optional_fields = [
    ('Ecotype', ('Ecotype',)),
    ('Specific Ecosystem', ('Specific Ecosystem',)),
    ('Geographic Location', ('Geographic Location',)),
    ('Altitude (m)', ('Altitude In Meters',)),
    ('Depth (m)', ('Depth In Meters',)),
    ('pH', ('pH',)),
    ('Salinity', ('Salinity',)),
    ('Oxygen concentration', ('Oxygen concentration', 'Oxygen Concentration')),
    ('Oxygen Requirement', ('Oxygen Requirement',)),
    ('Nitrate Concentration', ('Nitrate Concentration',)),
    ('Temperature Range', ('Temperature Range',)),
    ('Metabolism', ('Metabolism',)),
    ('Source', ('Energy Source',)),
]

for json_path in path:
    with open(json_path, 'r') as f:
        data = json.load(f)

    metadata = data['metadata']
    taxon_id = metadata['Taxon Object ID']

    # Insertion order follows the two tables above, i.e. the original column order.
    organism_dict = {column: metadata[key] for column, key in required_fields}
    for column, candidate_keys in optional_fields:
        for key in candidate_keys:
            if key in metadata:
                organism_dict[column] = metadata[key]
                break  # first matching spelling wins

    pilar_metadata[taxon_id] = organism_dict

In [None]:
# Building a frame from the dict-of-dicts puts the taxon IDs on the columns;
# transposing gives one row per taxon ID and one column per metadata field.
pilar_metadata_df = pd.DataFrame(pilar_metadata).transpose()

In [None]:
# Taxon ID -> list of EC numbers taken from the keys of the file's 'assembled' section.
pilar_enzymes = {}

for json_file in path:
    with open(json_file, 'r') as handle:
        record = json.load(handle)

    # Files without an 'assembled' section contribute no entry.
    if 'assembled' not in record:
        continue

    # Each key loses its first three characters (presumably an 'EC:'-style
    # prefix -- TODO confirm against the data files).
    ec_numbers = [ec_key[3:] for ec_key in record['assembled'].keys()]
    pilar_enzymes[record['metadata']['Taxon Object ID']] = ec_numbers


In [None]:
def get_cpd_dict(ec_dict):
    """Collect the unique compounds linked to each organism's EC numbers.

    ``ec_dict`` maps an identifier to an iterable of EC-number strings without
    the 'ec:' prefix. Each EC number is looked up as ``'ec:' + number`` in the
    module-level ``ec_cpd_links`` table; numbers missing from the table are
    ignored.

    Returns a dict with the same keys whose values are
    ``[number_of_unique_compounds, list_of_unique_compounds]``. The list comes
    from a set, so its order is unspecified.
    """
    ec_cpd_dict = {}

    for organism, ec_numbers in ec_dict.items():
        compounds = set()
        for ec_number in ec_numbers:
            link_key = 'ec:' + ec_number
            if link_key in ec_cpd_links:
                compounds.update(ec_cpd_links[link_key])

        unique_compounds = list(compounds)
        ec_cpd_dict[organism] = [len(unique_compounds), unique_compounds]

    return ec_cpd_dict

In [None]:
def get_rxn_dict(ec_dict):
    """Collect the unique reactions linked to each organism's EC numbers.

    ``ec_dict`` maps an identifier to an iterable of EC-number strings without
    the 'ec:' prefix. Each EC number is looked up as ``'ec:' + number`` in the
    module-level ``ec_rxn_links`` table; numbers missing from the table are
    ignored.

    Returns a dict with the same keys whose values are
    ``[number_of_unique_reactions, list_of_unique_reactions]``. The list comes
    from a set, so its order is unspecified.
    """
    ec_rxn_dict = {}

    for organism, ec_numbers in ec_dict.items():
        reactions = set()
        for ec_number in ec_numbers:
            link_key = 'ec:' + ec_number
            if link_key in ec_rxn_links:
                reactions.update(ec_rxn_links[link_key])

        unique_reactions = list(reactions)
        ec_rxn_dict[organism] = [len(unique_reactions), unique_reactions]

    return ec_rxn_dict

In [None]:
# For every taxon ID: [count, unique IDs] of the compounds and of the reactions
# linked to its EC numbers.
pilar_cpd_dict, pilar_rxn_dict = (
    get_cpd_dict(pilar_enzymes),
    get_rxn_dict(pilar_enzymes),
)

In [None]:
# Reduce each [count, ids] entry to just its count, updating both dicts in place.
# NOTE: running this a second time fails, because the values are then plain ints
# and can no longer be indexed.
for totals in (pilar_cpd_dict, pilar_rxn_dict):
    for taxon in totals:
        totals[taxon] = totals[taxon][0]

# Number of EC entries listed for each taxon ID.
pilar_ecnum_dict = {taxon: len(ec_list) for taxon, ec_list in pilar_enzymes.items()}



In [None]:
def _count_frame(counts, column):
    """Turn a {taxon ID: count} dict into a one-column frame indexed by 'Taxon ID'."""
    # The dict keys start out as columns of a single row labelled `column`;
    # transposing turns them into the row index.
    frame = pd.DataFrame(counts, index=[column]).T
    frame.index.name = 'Taxon ID'
    return frame


pilar_cpd_df = _count_frame(pilar_cpd_dict, '#cpds')
pilar_rxn_df = _count_frame(pilar_rxn_dict, '#rxns')
pilar_ecnum_df = _count_frame(pilar_ecnum_dict, '#ecs')

In [None]:
# Put the compound, reaction and EC counts side by side. DataFrame.join aligns
# on the index (the taxon IDs) and keeps every row of the left frame.
core_df = pilar_cpd_df.join(pilar_rxn_df).join(pilar_ecnum_df)

# Attach each metagenome's EC list by looking it up under the row's own taxon ID.
# Assigning list(pilar_enzymes.values()) directly is positional: it is only
# correct while the frame's row order happens to match the dict's insertion
# order, and it would silently mislabel rows otherwise.
core_df['ecs'] = [pilar_enzymes[taxon_id] for taxon_id in core_df.index]


In [None]:
# Left join on the taxon ID index: every row of core_df is kept, and a taxon
# without a metadata entry gets NaN in the metadata columns.
core_df = core_df.join(pilar_metadata_df, how='left')

In [None]:
# Write the combined table into the same directory as the input JSON files.
# os.path.join is used because pilar_path already ends with a separator, so
# pilar_path + '/...' produced a doubled '//' in the output path.
core_df.to_csv(os.path.join(pilar_path, 'pilar_oxygen_metagenomes.csv'))