# Merge NCBI GEO metadata and Table S3 (C-R / C-N / N) status metadata

In [2]:
import pandas as pd

In [5]:
# Read the tab-separated GEO metadata; the file's first column becomes the row index.
geo_metadata = pd.read_csv(
    "../data/geo_metadata.tsv",
    sep="\t",
    index_col=0,
)
# Leave the frame as the cell's final expression so the notebook renders it.
geo_metadata

Unnamed: 0_level_0,Age,Gender
Sample name,Unnamed: 1_level_1,Unnamed: 2_level_1
C1,34.44,Female
C2,43.35,Female
C3,43.85,Female
C4,41.42,Female
C5,42.12,Female
...,...,...
N28,29.00,Female
N29,32.00,Female
N30,41.00,Female
N31,42.00,Female


In [6]:
# Read the tab-separated Table S3 status metadata; the file's first column becomes the row index.
status_metadata = pd.read_csv(
    "../data/table_s3.tsv",
    sep="\t",
    index_col=0,
)
# Leave the frame as the cell's final expression so the notebook renders it.
status_metadata

Unnamed: 0_level_0,Status,TotalNumReads,NumUniquelyMappedReads,UniquelyMappedReadsPercentage
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C1,C-R,7293272,6044825,82.88%
C2,C-R,7870895,6526361,82.92%
C3,C-R,6622312,5620525,84.87%
C4,C-R,8248171,7041599,85.37%
C5,C-R,9233397,7922603,85.80%
...,...,...,...,...
N28,N,8579794,7286744,84.93%
N29,N,10808607,8789442,81.32%
N30,N,7312205,5604035,76.64%
N31,N,9232461,6992056,75.73%


## Double-check that we can merge these metadata files

In [9]:
# Sanity checks that must hold before the two tables are placed side by side.

# The sample IDs should match up perfectly: same labels, in the same order.
# Index.equals returns False when the two indexes have different lengths,
# whereas an elementwise `==` raises a ValueError in that case, so a mismatch
# always surfaces here as an AssertionError with a readable message.
assert geo_metadata.index.equals(status_metadata.index), (
    "Sample IDs differ between the GEO metadata and the Table S3 metadata"
)

# ...and there shouldn't be any overlap between the (non-index) columns,
# otherwise the merged table would end up with duplicate column names.
shared_columns = set(geo_metadata.columns) & set(status_metadata.columns)
assert not shared_columns, f"Columns present in both tables: {shared_columns}"

## Merge the metadata files!

In [14]:
# Place the two tables side by side (rows are aligned on the sample index),
# and label the resulting index "SampleID" for consistency's sake.
merged_metadata = pd.concat(
    [geo_metadata, status_metadata],
    axis=1,
).rename_axis("SampleID")
# Leave the frame as the cell's final expression so the notebook renders it.
merged_metadata

Unnamed: 0_level_0,Age,Gender,Status,TotalNumReads,NumUniquelyMappedReads,UniquelyMappedReadsPercentage
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C1,34.44,Female,C-R,7293272,6044825,82.88%
C2,43.35,Female,C-R,7870895,6526361,82.92%
C3,43.85,Female,C-R,6622312,5620525,84.87%
C4,41.42,Female,C-R,8248171,7041599,85.37%
C5,42.12,Female,C-R,9233397,7922603,85.80%
...,...,...,...,...,...,...
N28,29.00,Female,N,8579794,7286744,84.93%
N29,32.00,Female,N,10808607,8789442,81.32%
N30,41.00,Female,N,7312205,5604035,76.64%
N31,42.00,Female,N,9232461,6992056,75.73%


## Export the merged metadata

In [15]:
# Write the merged table out as a tab-separated file next to the input data.
merged_metadata.to_csv(
    path_or_buf="../data/merged_metadata.tsv",
    sep="\t",
)