## Representative's biosample id extraction

In [2]:
import os
import sys
import pandas as pd

In [37]:
# Load the MGnify genome metadata tables for the gut and oral catalogues.
# The 'Genome' and 'Sample_accession' columns of the representative genomes
# are extracted from these frames into separate tables further down.

gut_metadata_path = 'files/input/mgnify_meta/representative-all_metadata_gut.tsv'
oral_metadata_path = 'files/input/mgnify_meta/representative-all_metadata_oral.tsv'

df_gut = pd.read_csv(gut_metadata_path, sep='\t')
df_oral = pd.read_csv(oral_metadata_path, sep='\t')

def sample_repr(df):
    """Return the 'Genome' and 'Sample_accession' columns of representative rows.

    A row is kept when its 'Genome' value equals its 'Species_rep' value.
    The returned frame keeps the original index labels of the kept rows.
    """
    is_representative = df['Genome'].eq(df['Species_rep'])
    wanted_columns = ['Genome', 'Sample_accession']
    return df.loc[is_representative, wanted_columns]


# Write the representative (Genome, Sample_accession) table of each body site
# as a tab-separated file without the index column.
oral_representatives = sample_repr(df_oral)
oral_representatives.to_csv('files/output/mgnify_biosample_meta/sampleoral.txt', sep='\t', index=False)

gut_representatives = sample_repr(df_gut)
gut_representatives.to_csv('files/output/mgnify_biosample_meta/samplegut.txt', sep='\t', index=False)

# Preview the first rows of the oral table.
sample_repr(df_oral).head()

Unnamed: 0,Genome,Sample_accession
0,MGYG000298013,SAMN14570735
1,MGYG000298020,ERS6080737
2,MGYG000298021,SAMN14570687
3,MGYG000298022,SAMN14570798
4,MGYG000298023,ERS7876796


## Biosample metadata extraction

#### Oral metadata

In [None]:
# Shell escape: runs scripts/biosample_oral.sh with bash.
# Presumably this script produces the per-genome BioSample metadata files for the
# oral representatives that are read in the next section — verify in the script.
# NOTE(review): the script path is absolute and user-specific (/home/azat/...);
# the cell fails wherever that path does not exist. Consider a repository-relative path.
! bash /home/azat/Notebooks/humigec/git/scripts/biosample_oral.sh

#### Gut metadata

In [None]:
# Shell escape: runs scripts/biosample_gut.sh with bash.
# Presumably this script produces the per-genome BioSample metadata files for the
# gut representatives that are read in the next section — verify in the script.
# NOTE(review): the script path is absolute and user-specific (/home/azat/...);
# the cell fails wherever that path does not exist. Consider a repository-relative path.
! bash /home/azat/Notebooks/humigec/git/scripts/biosample_gut.sh

### Example of output (files/biosamples_data/biosamples_ids_oral/MGYG000299120.txt)

| BioSample  |  ENA first public  |  ENA last update  |  ENA-CHECKLIST  |  External Id  |  INSDC center alias  |  INSDC center name  |  INSDC first public  |  INSDC last update  |  INSDC status  |  Submitter Id  |  assembly quality  |  assembly software  |  binning parameters  |  binning software  |  broker name  |  collection_date  |  completeness score  |  completeness software  |  contamination score  |  env_broad_scale  |  env_local_scale  |  env_medium  |  geo_loc_name  |  geographic location (latitude)  |  geographic location (longitude)  |  investigation_type  |  isolation_source  |  metagenomic source  |  project_name  |  sample derived from  |  sample_name  |  sequencing method  |  taxonomic identity marker |
| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |-- | -- |
| SAMEA8395290  |  2021-03-23  |  2021-03-22  |  ERC000047  |  SAMEA8395290  |  EMG  |  EMG  |  2021-03-23T08:57:14Z  |  2021-03-22T20:42:55Z  |  public  |  SRR9217479-mag-bin.15  |  Many fragments with little to no review of assembly other than reporting of standard assembly statistics  |  metaspades_v3.14.1  |  default  |  MetaWRAP v1.1  |  EMG broker account, EMBL-EBI  |  not provided  |  92.94  |  CheckM v1.1.3  |  0  |  human-associated habitat  |  human oral cavity  |  not provided  |  not provided  |  not provided  |  not provided  |  metagenome-assembled genome  |  not provided  |  human oral metagenome  |  Human Oral MAGs  |  SRS4893097  |  SRR9217479-mag-bin.15  |  Illumina  |  multi-marker approach |


## Creation of table with MAG id, Biosample id and metadata columns

In [46]:
# Build one summary table per body site from the per-genome BioSample files.
# Each directory below holds one tab-separated '<Genome>.txt' file per genome.

directory_gut = 'files/input/mgnify_meta/biosamples_data/biosamples_ids_gut/'
directory_oral = 'files/input/mgnify_meta/biosamples_data/biosamples_ids_oral/'

# Fields copied from the first row of every per-genome file: the BioSample id
# and the four columns that describe the body site.
BIOSAMPLE_FIELDS = ['BioSample', 'env_broad_scale', 'env_local_scale', 'env_medium', 'isolation_source']
BODY_SITE_FIELDS = BIOSAMPLE_FIELDS[1:]


def collect_biosample_table(directory):
    """Collect BioSample id and body-site fields for every '.txt' file in a directory.

    Parameters
    ----------
    directory : str
        Folder containing tab-separated files named '<Genome>.txt'.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns 'Genome' (file name without the
        '.txt' suffix) followed by BIOSAMPLE_FIELDS. A field is 'NA' when the
        file has no data rows or lacks that column; missing values in the four
        body-site columns are also replaced by 'NA'.
    """
    records = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the output table reproducible between runs and machines.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue

        genome = filename[:-4]
        try:
            # A local name is used on purpose: the notebook-level df_gut / df_oral
            # metadata frames loaded earlier must not be overwritten here.
            sample_df = pd.read_csv(os.path.join(directory, filename), sep='\t')
        except pd.errors.EmptyDataError:
            # A zero-byte file has no header to parse; treat it like a file
            # without data rows instead of aborting the whole run.
            sample_df = pd.DataFrame()

        if sample_df.empty:
            records.append([genome] + ['NA'] * len(BIOSAMPLE_FIELDS))
            continue

        values = [
            sample_df[field].values[0] if field in sample_df.columns else 'NA'
            for field in BIOSAMPLE_FIELDS
        ]
        records.append([genome] + values)

    table = pd.DataFrame(records, columns=['Genome'] + BIOSAMPLE_FIELDS)
    return table.fillna({field: 'NA' for field in BODY_SITE_FIELDS})


#### Gut ####
df_result_gut = collect_biosample_table(directory_gut)
df_result_gut.to_csv('files/output/mgnify_biosample_meta/biosample_gut_table.tsv', sep='\t', index=False)

#### Oral ####
df_result_oral = collect_biosample_table(directory_oral)
df_result_oral.to_csv('files/output/mgnify_biosample_meta/biosample_oral_table.tsv', sep='\t', index=False)

### Example of output (biosample_oral_table.tsv):

|Genome | BioSample | env_broad_scale | env_local_scale | env_medium | isolation_source |
| --- | --- | --- | --- | --- | --- |
|MGYG000298071 | SAMN14571080 | human oral | human oral | saliva | saliva |

***Manual annotation was then performed to classify all samples; the results are stored in mgnify_biosample_specimen_gut.tsv and mgnify_biosample_specimen_oral.tsv.***

***For each sample, a corresponding 'Specimen' column was added.***

### Example of output (mgnify_biosample_specimen_oral.tsv)
|env_broad_scale | env_local_scale | env_medium | isolation_source | Body_site | Specimen_type  |  Specimen|
| -- | -- | -- |-- | -- |-- | -- |
| human-associated habitat   |   human oral cavity   |   not provided  |  not provided  |  Oral  |  Oral_unclassified   |   Oral_unclassified|

## Merge several tables related to one body site (Oral/Gut) into one

In [66]:
# Attach the manually curated 'Specimen' label to every genome by matching the
# four body-site columns of the BioSample table against the classification
# table of the same body site.

site_columns = ['env_broad_scale', 'env_local_scale', 'env_medium', 'isolation_source']
new_order = ['Genome', 'Specimen', 'env_broad_scale', 'env_local_scale', 'env_medium', 'isolation_source']


def read_site_table(path):
    """Read a tab-separated table and replace missing body-site values with 'NA'.

    read_csv parses the literal text 'NA' as a missing value by default, so the
    placeholder is restored in the four body-site columns after loading.
    """
    return pd.read_csv(path, sep='\t').fillna({column: 'NA' for column in site_columns})


def add_specimen(biosample_df, classification_df):
    """Left-join the 'Specimen' column onto the BioSample table.

    Rows are matched on the four body-site columns. Genomes without a matching
    classification keep a missing 'Specimen'. The result has the columns listed
    in new_order.
    """
    # Exact repeats of a classification row would multiply the matching genome
    # rows in the left join, so they are collapsed first.
    lookup = classification_df[site_columns + ['Specimen']].drop_duplicates()
    merged = pd.merge(biosample_df, lookup, on=site_columns, how='left')
    return merged[new_order]


#### Gut ####
classification_df_gut = read_site_table('files/input/specimen_classifications/mgnify_biosample_specimen_gut.tsv')
biosample_df_gut = read_site_table('files/output/mgnify_biosample_meta/biosample_gut_table.tsv')

merged_df_gut = add_specimen(biosample_df_gut, classification_df_gut)
merged_df_gut.to_csv('files/output/mgnify_biosample_meta/mgnify_meta_sample_gut.tsv', sep='\t', index=False)

#### Oral ####
classification_df_oral = read_site_table('files/input/specimen_classifications/mgnify_biosample_specimen_oral.tsv')
biosample_df_oral = read_site_table('files/output/mgnify_biosample_meta/biosample_oral_table.tsv')

merged_df_oral = add_specimen(biosample_df_oral, classification_df_oral)
merged_df_oral.to_csv('files/output/mgnify_biosample_meta/mgnify_meta_sample_oral.tsv', sep='\t', index=False)


