In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0,"/u/project/ngarud/michaelw/Diversity-Along-Gut/Shalon_2023/scripts")
import config

from datetime import datetime

### Directories

In [2]:
# Both input tables are located under the metadata directory set in `config`.
metadata_path = f"{config.metadata_directory}metadata_shalon_2023.txt"
srr_information_path = f"{config.metadata_directory}shalon_metadata_unprocessed.txt"

In [3]:
# Display the resolved metadata path as a quick sanity check.
metadata_path

'/u/project/ngarud/Garud_lab/metagenomic_fastq_files/Shalon_2023/metadata/metadata_shalon_2023.txt'

# Load the data

In [4]:
# Sample metadata (tab-separated); 'Sample Name' is renamed to 'sample_alias'.
metadata = pd.read_csv(metadata_path, sep="\t")
metadata = metadata.rename(columns={'Sample Name': 'sample_alias'})

# Second tab-separated table, read from srr_information_path.
srr_information = pd.read_csv(srr_information_path, sep="\t")

In [5]:
# Left-join the run accession onto the sample metadata by sample alias,
# keeping every metadata row.
run_lookup = srr_information[['sample_alias', 'run_accession']]
metadata = metadata.merge(run_lookup, how='left', on='sample_alias')


### Add year, month, day, hour, minute, and date columns

In [6]:
# Derive calendar fields from one reference timestamp per sample: capsule
# samples use the swallow time, every other sample type uses the recovery time.
_CAPSULE_TYPES = ('Capsule 1', 'Capsule 2', 'Capsule 3', 'Capsule 4')
_TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

# Parse each reference timestamp exactly once (previously the same string was
# re-parsed for every derived column).
_reference_times = [
    datetime.strptime(
        swallowed if sample_type in _CAPSULE_TYPES else recovered,
        _TIMESTAMP_FORMAT,
    )
    for sample_type, swallowed, recovered in zip(
        metadata['Type'],
        metadata['swallow_date_time'],
        metadata['recover_date_time'],
    )
]

# One integer column per datetime attribute.
for _field in ('year', 'month', 'day', 'hour', 'minute'):
    metadata[_field] = [getattr(_ts, _field) for _ts in _reference_times]

# Capsule rows get the swallow date formatted as YYYY-MM-DD; all other rows
# reuse the existing Recover_date column unchanged.
# NOTE(review): for non-capsule rows `date` comes from Recover_date while
# year/month/day come from recover_date_time; in the displayed rows the time in
# Recover_time differs from the time in recover_date_time, so the two may
# disagree near midnight -- confirm this is intended.
metadata['date'] = [
    _ts.strftime("%Y-%m-%d") if sample_type in _CAPSULE_TYPES else recover_date
    for _ts, sample_type, recover_date in zip(
        _reference_times, metadata['Type'], metadata['Recover_date']
    )
]

    

In [7]:
# List the columns available after the merge and the derived date fields.
metadata.columns

Index(['Run', 'Sample_ID', 'Subject_ID', 'sample_alias', 'sample_set',
       'Host_age', 'host_se', 'location', 'DeviceID', 'Type',
       'geo_loc_name_countr', 'geo_loc_name_country_continent',
       'swallow_date_time', 'Collection_Date', 'recover_date_time',
       'Recover_date', 'Recover_time', 'Hours_in_bod', 'Time_of_da',
       'experiment', 'pH', 'O2', 'clear_contam', 'possible_contam',
       'run_accession', 'year', 'month', 'day', 'hour', 'minute', 'date'],
      dtype='object')

# Process

### Functions

In [8]:
# Extract gut site

def extract_gut_location(Type):
    """Translate a sample ``Type`` label into a gut-location label.

    Parameters
    ----------
    Type : str
        Sample type label: "Stool", "Saliva", or "Capsule 1" through
        "Capsule 4".

    Returns
    -------
    str
        "Stool" and "Saliva" are returned unchanged. Capsules 1-3 map to
        "Small intestine 1"-"Small intestine 3" and Capsule 4 maps to
        "Ascending colon".

    Raises
    ------
    ValueError
        If ``Type`` is not one of the recognized labels. Previously this case
        failed with an UnboundLocalError because no location was assigned.
    """
    capsule_locations = {
        "Capsule 1": "Small intestine 1",
        "Capsule 2": "Small intestine 2",
        "Capsule 3": "Small intestine 3",
        "Capsule 4": "Ascending colon",
    }

    # Non-capsule sample types are already location labels.
    if Type in ("Stool", "Saliva"):
        return Type

    if Type in capsule_locations:
        return capsule_locations[Type]

    raise ValueError("Unrecognized sample Type: %r" % (Type,))
    

### Annotation

In [9]:
# Annotate every sample with its gut location, derived from the Type column.
metadata['gut_location'] = metadata['Type'].map(extract_gut_location)


In [18]:
# Spot-check: rows for subject 1 in sample set "7".
is_subject_1_set_7 = (metadata['Subject_ID'] == 1) & (metadata['sample_set'] == "7")
metadata.loc[is_subject_1_set_7]

Unnamed: 0,Run,Sample_ID,Subject_ID,sample_alias,sample_set,Host_age,host_se,location,DeviceID,Type,...,clear_contam,possible_contam,run_accession,year,month,day,hour,minute,date,gut_location
40,SRR18584992,1117,1,1117_1_S7_TCapsule 3_meta,7,55,male,Capsule,623F36B3,Capsule 3,...,False,False,SRR18584992,2020,6,30,6,30,2020-06-30,Small intestine 3
41,SRR18584992,1117,1,1117_1_S7_TCapsule 3_meta,7,55,male,Capsule,623F36B3,Capsule 3,...,False,False,SRR18794791,2020,6,30,6,30,2020-06-30,Small intestine 3
62,SRR18585003,1116,1,1116_1_S7_TCapsule 3_meta,7,55,male,Capsule,37F69DC5,Capsule 3,...,False,False,SRR18585003,2020,6,30,6,30,2020-06-30,Small intestine 3
63,SRR18585003,1116,1,1116_1_S7_TCapsule 3_meta,7,55,male,Capsule,37F69DC5,Capsule 3,...,False,False,SRR18794802,2020,6,30,6,30,2020-06-30,Small intestine 3
161,SRR18585053,1118,1,1118_1_S7_TCapsule 3_meta,7,55,male,Capsule,37F63B96,Capsule 3,...,False,True,SRR18794955,2020,6,30,6,30,2020-06-30,Small intestine 3
162,SRR18585053,1118,1,1118_1_S7_TCapsule 3_meta,7,55,male,Capsule,37F63B96,Capsule 3,...,False,True,SRR18585053,2020,6,30,6,30,2020-06-30,Small intestine 3
465,SRR18585206,1115,1,1115_1_S7_TCapsule 3_meta,7,55,male,Capsule,2E9A8C56,Capsule 3,...,False,True,SRR18585206,2020,6,30,6,30,2020-06-30,Small intestine 3
466,SRR18585206,1115,1,1115_1_S7_TCapsule 3_meta,7,55,male,Capsule,2E9A8C56,Capsule 3,...,False,True,SRR18794741,2020,6,30,6,30,2020-06-30,Small intestine 3


### Select columns

In [11]:
# Columns carried into the final metadata table, in output order.
selected_columns = [
    'Sample_ID', 'Subject_ID', 'run_accession', 'Type',
    'swallow_date_time', 'Collection_Date', 'recover_date_time',
    'Recover_date', 'Recover_time',
    'sample_set', 'location', 'gut_location',
    'day',
    "host_se",
]

# Keep only the selected columns and expose 'host_se' under the name 'sex'.
metadata_final = metadata[selected_columns].rename(columns={'host_se': "sex"})

# Preview the first five rows.
metadata_final.head(5)

Unnamed: 0,Sample_ID,Subject_ID,run_accession,Type,swallow_date_time,Collection_Date,recover_date_time,Recover_date,Recover_time,sample_set,location,gut_location,day,sex
0,2161,12,SRR18794727,Stool,,2020-09-13T00:00:00Z,2020-09-13T14:30:00Z,2020-09-13,07:30:00,Stool,Stool,Stool,13,male
1,2161,12,SRR18584972,Stool,,2020-09-13T00:00:00Z,2020-09-13T14:30:00Z,2020-09-13,07:30:00,Stool,Stool,Stool,13,male
2,1131,1,SRR18584973,Capsule 4,2020-07-02T07:30:00Z,2020-07-02T00:00:00Z,2020-07-02T15:00:00Z,2020-07-02,08:00:00,9,Capsule,Ascending colon,2,male
3,1131,1,SRR18794724,Capsule 4,2020-07-02T07:30:00Z,2020-07-02T00:00:00Z,2020-07-02T15:00:00Z,2020-07-02,08:00:00,9,Capsule,Ascending colon,2,male
4,2160,12,SRR18794728,Stool,,2020-09-12T00:00:00Z,2020-09-12T18:00:00Z,2020-09-12,11:00:00,Stool,Stool,Stool,12,male


In [13]:
# Spot-check the final table: rows for subject 1 in sample set "7".
final_subject_1_set_7 = (metadata_final['Subject_ID'] == 1) & (metadata_final['sample_set'] == "7")
metadata_final.loc[final_subject_1_set_7]

Unnamed: 0,Sample_ID,Subject_ID,run_accession,Type,swallow_date_time,Collection_Date,recover_date_time,Recover_date,Recover_time,sample_set,location,gut_location,day,sex
40,1117,1,SRR18584992,Capsule 3,2020-06-30T06:30:00Z,2020-06-29T00:00:00Z,2020-06-29T15:30:00Z,2020-06-29,08:30:00,7,Capsule,Small intestine 3,30,male
41,1117,1,SRR18794791,Capsule 3,2020-06-30T06:30:00Z,2020-06-29T00:00:00Z,2020-06-29T15:30:00Z,2020-06-29,08:30:00,7,Capsule,Small intestine 3,30,male
62,1116,1,SRR18585003,Capsule 3,2020-06-30T06:30:00Z,2020-06-29T00:00:00Z,2020-06-29T15:30:00Z,2020-06-29,08:30:00,7,Capsule,Small intestine 3,30,male
63,1116,1,SRR18794802,Capsule 3,2020-06-30T06:30:00Z,2020-06-29T00:00:00Z,2020-06-29T15:30:00Z,2020-06-29,08:30:00,7,Capsule,Small intestine 3,30,male
161,1118,1,SRR18794955,Capsule 3,2020-06-30T06:30:00Z,2020-06-29T00:00:00Z,2020-06-29T15:30:00Z,2020-06-29,08:30:00,7,Capsule,Small intestine 3,30,male
162,1118,1,SRR18585053,Capsule 3,2020-06-30T06:30:00Z,2020-06-29T00:00:00Z,2020-06-29T15:30:00Z,2020-06-29,08:30:00,7,Capsule,Small intestine 3,30,male
465,1115,1,SRR18585206,Capsule 3,2020-06-30T06:30:00Z,2020-06-29T00:00:00Z,2020-06-29T15:30:00Z,2020-06-29,08:30:00,7,Capsule,Small intestine 3,30,male
466,1115,1,SRR18794741,Capsule 3,2020-06-30T06:30:00Z,2020-06-29T00:00:00Z,2020-06-29T15:30:00Z,2020-06-29,08:30:00,7,Capsule,Small intestine 3,30,male


### Saving

In [None]:
# Write the final table as a tab-separated file, without the index column,
# under the analysis directory set in `config`.
metadata_output_path = f"{config.analysis_directory}metadata/shalon_metadata.txt"
metadata_final.to_csv(metadata_output_path, sep="\t", index=False)

In [None]:
# Rebuild and display the output path for reference.
metadata_output_path = f"{config.analysis_directory}metadata/shalon_metadata.txt"
metadata_output_path