# Code & example to update refineGEMs' internal database
The following code transforms a human-readable TSV file in the FILE_DIRECTORY (I) into the data format required to update the internal database of refineGEMs (II & III).

In [1]:
from refinegems.medium import update_db_multi, updated_db_to_schema
import pandas as pd
import numpy as np
import ntpath
import os

  from .autonotebook import tqdm as notebook_tqdm
* 'underscore_attrs_are_private' has been removed


## (I) File directory
The file directory needs to be specified here. All files in this directory whose names end with '_substances.tsv' will be used to generate new TSV files compatible with the database; the generated files are written to the same directory.

In [2]:
# Directory that is scanned for '*_substances.tsv' input files and that receives the generated '*_update.tsv' files.
# Keep the trailing '/': the loop cells below build file paths by plain string concatenation.
FILE_DIRECTORY = '../../../'

## (II) Transformation of human-readable TSV files to database-readable files
The following function `transform_medium_tsv_tables_for_update` does the main job of transforming the TSV files into database-readable TSV files. <br>
This function currently only sets up the input for the database tables 'medium2substance' and 'substance2db'. <br>
The code cell underneath the function definition cell specifies that only files ending with '_substances.tsv' should be transformed (files with 'already' in their name are skipped).

In [4]:
def transform_medium_tsv_tables_for_update(table_path: str, file_directory: str) -> None:
   """Transforms a TSV table containing a medium definition into TSV file(s) usable
   with the medium.py setup to be used to automatically update the database

   Two files may be written into ``file_directory``:

   - ``<medium>_substances_for_m2s_update.tsv``: always written; one row per substance
     with the columns 'new_value', 'table', 'column' and 'conditions'.
   - ``<medium>_substances_for_s2db_update.tsv``: only written if more than four columns
     remain after dropping 'flux' and 'formula' and at least one database ID is present.

   The medium name is the part of the file name before the first '_', extended by
   '_medium' or '_subset' if that is the second part of the file name.

   Args:
       - table_path (str): Path to the TSV file defining a medium
       - file_directory (str): Path to the directory the generated files are written to
         (with or without a trailing path separator)

   Raises:
       KeyError: If the table does not contain the columns 'name' and 'source'
   """
   # Get medium name & DataFrame
   name_parts = ntpath.basename(table_path).split('_')
   medium_name = name_parts[0]
   # Guard the index access: a file name without '_' consists of a single part only
   if len(name_parts) > 1 and name_parts[1] in ('medium', 'subset'):
      medium_name += f'_{name_parts[1]}'
   medium_df = pd.read_csv(table_path, sep='\t')

   # Rename relevant & drop unnecessary columns (tolerate tables that lack them)
   medium_df = (
      medium_df
      .drop(columns=['flux', 'formula'], errors='ignore')
      .rename(columns={'name': 'substance'})
   )

   ### Get the m2s table for update
   # The former 'source' column becomes the 'new_value' to be written to the database
   m2s_df = medium_df[['source']].rename(columns={'source': 'new_value'})
   m2s_df['table'] = 'medium2substance'
   m2s_df['column'] = 'source'
   # A list comprehension (instead of a row-wise apply) also works for a table without rows
   m2s_df['conditions'] = [
      f'substance={substance};medium={medium_name}' for substance in medium_df['substance']
   ]

   # Extract m2s table
   m2s_path = os.path.join(file_directory, f'{medium_name}_substances_for_m2s_update.tsv')
   m2s_df.to_csv(m2s_path, sep='\t', index=False)

   # NOTE(review): this threshold is evaluated AFTER 'flux' & 'formula' were dropped, i.e. it
   # requires 'substance', 'source' and at least three further columns - confirm this is intended
   if len(medium_df.columns) > 4:
      ### Get the s2db table for update, if possible
      s2db_df = medium_df.drop(columns='source')

      # Merge VMH & BiGG if they have the same ID & the merged column does not already exist.
      # Skipped if one of the two source columns is missing, as there is nothing to merge then.
      if 'BiGG+VMH' not in s2db_df.columns and {'BiGG', 'VMH'} <= set(s2db_df.columns):
         # NaN never compares equal, so rows without IDs are not merged
         same_id = s2db_df['BiGG'].eq(s2db_df['VMH'])
         s2db_df['BiGG+VMH'] = s2db_df['BiGG'].where(same_id)
         # Remove the original entries that moved to the merged column
         s2db_df.loc[same_id, ['BiGG', 'VMH']] = np.nan

      # Transform table into long format & remove all NaNs
      s2db_df = pd.melt(
         s2db_df, id_vars='substance', var_name='db_type', value_name='db_id', ignore_index=True
      ).dropna()

      # Without any database ID there is nothing to add to 'substance2db'
      if s2db_df.empty:
         return

      # Add new columns table & column to s2db table
      s2db_df['table'] = 'substance2db'
      s2db_df['column'] = 'substance_id, db_id, db_type'

      # Create 'new_value' from 'db_id' & 'db_type'
      s2db_df['new_value'] = [
         f'{db_id}, {db_type}' for db_id, db_type in zip(s2db_df['db_id'], s2db_df['db_type'])
      ]

      # Add conditions column to s2db table
      s2db_df['conditions'] = [f'substance={substance}' for substance in s2db_df['substance']]
      s2db_df = s2db_df.drop(columns=['substance', 'db_type', 'db_id'])

      # Extract s2db table
      s2db_path = os.path.join(file_directory, f'{medium_name}_substances_for_s2db_update.tsv')
      s2db_df.to_csv(s2db_path, sep='\t', index=False)

In [6]:
# Transform every medium definition ('*_substances.tsv') found in FILE_DIRECTORY;
# files with 'already' in their name are left out.
for file_name in os.listdir(FILE_DIRECTORY):
    is_medium_table = file_name.endswith('_substances.tsv')
    if not is_medium_table or 'already' in file_name:
        continue
    print(file_name)
    transform_medium_tsv_tables_for_update(f'{FILE_DIRECTORY}{file_name}', FILE_DIRECTORY)

RPMI_substances.tsv
CasA_subset_substances.tsv
CGXII_substances.tsv
dGMM_substances.tsv
M9_substances.tsv
CasA_medium_substances.tsv
MP-AU_substances.tsv
SNM3_substances.tsv
LB_substances.tsv


## (III) Add data from the database-readable TSV file to the database
The following code iterates over all newly generated TSV files and updates the database tables 'medium2substance' and 'substance2db' accordingly.

In [4]:
# Feed every generated '*_update.tsv' file in FILE_DIRECTORY into update_db_multi.
for file_name in os.listdir(FILE_DIRECTORY):
    if not file_name.endswith('_update.tsv'):
        continue
    print(file_name)
    update_table = pd.read_csv(f'{FILE_DIRECTORY}{file_name}', sep='\t')
    # Files with 'm2s' in their name are passed with update_entries=True, all others with False
    update_db_multi(update_table, update_entries='m2s' in file_name)

CasA_subset_substances_for_s2db_update.tsv
UNIQUE constraint failed: substance2db.substance_id, substance2db.db_id
Ocurred with: column=substance_id, db_id, db_type, new_value=cpd00395, SEED, condition=substance=L-Cysteate
UNIQUE constraint failed: substance2db.substance_id, substance2db.db_id
Ocurred with: column=substance_id, db_id, db_type, new_value=MNXM713, MetaNetX, condition=substance=L-Cysteate
UNIQUE constraint failed: substance2db.substance_id, substance2db.db_id
Ocurred with: column=substance_id, db_id, db_type, new_value=Lcyst, BiGG+VMH, condition=substance=L-Cysteate
CasA_medium_substances_for_m2s_update.tsv
RPMI_substances_for_m2s_update.tsv
SNM3_substances_for_m2s_update.tsv
MP-AU_substances_for_m2s_update.tsv
CGXII_substances_for_m2s_update.tsv
M9_substances_for_m2s_update.tsv
CasA_subset_substances_for_m2s_update.tsv
CGXII_substances_for_s2db_update.tsv
UNIQUE constraint failed: substance2db.substance_id, substance2db.db_id
Ocurred with: column=substance_id, db_id, db_

## Update Schema with updated database
**Be careful to check the changes between the current SQL schema file and the new one!**

In [2]:
# Calls updated_db_to_schema, imported from refinegems.medium in the first cell.
# NOTE(review): presumably this regenerates the SQL schema file from the updated database -
# confirm against the refineGEMs documentation and diff the schema before committing.
updated_db_to_schema()