----
# Exporting data
- Want to export the results of the different maps to a csv for easy input to Postgres
    - data for the temp_map already exported from the notebook

In [3]:
import csv
import json
from itertools import zip_longest

import numpy as np
import pandas as pd

----
## Tissue map
- results of tissue map located in two different jsonl files
- each biosample_id has one SRS associated with it and a number of tissue / bto_ids associated with it
- for each biosample, want to create a row that has the biosample_id, sra_id (written to the csv as `run_id`), tissue, and BTO_id (written to the csv as `bto_id`)
    - if there are multiple hits for the tissue / BTO_id, these should be in separate rows

In [3]:
# Read both tissue-map result files (JSON Lines) and stack them into one frame.
tissue_df_1, tissue_df_2 = (
    pd.read_json(shard_path, lines=True)
    for shard_path in (
        '../../data/results/tissue_output_1.jsonl',
        '../../data/results/tissue_output_2.jsonl',
    )
)
tissue_df = pd.concat([tissue_df_1, tissue_df_2])

In [4]:
def write_to_file(writer, row):
    """Write one CSV row per (tissue, BTO id) pair of a tissue-map record.

    Parameters
    ----------
    writer : csv.writer-like
        Any object exposing ``writerow``.
    row : mapping
        Record providing ``biosample_id``, ``sra_id``, ``tissue`` and
        ``bto_matches``. The last two are comma-separated strings whose
        entries are paired up by position.

    Notes
    -----
    A missing ``tissue`` / ``bto_matches`` value (``None`` or NaN) is written
    as an empty field instead of raising on ``.split``. When the two lists
    differ in length the shorter one is padded with empty strings, so no
    entry is silently dropped.
    """
    biosample_id = row['biosample_id']
    run_id = row['sra_id']
    # Missing values are not strings and have no .split(); treat them as
    # empty text, which yields a single row with an empty field.
    tissue = '' if pd.isna(row['tissue']) else row['tissue']
    bto_id = '' if pd.isna(row['bto_matches']) else row['bto_matches']
    # Plain zip() stops at the shorter list; zip_longest keeps every entry.
    pairs = zip_longest(tissue.split(','), bto_id.split(','), fillvalue='')
    for tissue_split, bto_id_split in pairs:
        writer.writerow([biosample_id, run_id, tissue_split, bto_id_split])


# Export the tissue map to CSV. newline='' is what the csv module expects for
# files it writes to; without it, platforms that translate '\n' on write end
# up with an extra blank line after every record.
with open('../exports/tissue_map.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['biosample_id', 'run_id', 'tissue', 'bto_id'])
    # Explicit loop rather than DataFrame.apply: the call is made purely for
    # its side effect, apply would build and discard a result object, and
    # older pandas releases could invoke the function twice on the first row
    # (which would duplicate that row in the CSV).
    for _index, row in tissue_df.iterrows():
        write_to_file(writer, row)


----
## Sex map
- sex_output file has a line for each biosample 
- each line can be directly written to a csv for postgres

In [6]:
# Load the sex-map results (JSON Lines: one JSON object per line).
sex_df = pd.read_json(
    path_or_buf='../../data/results/sex_output_1.jsonl',
    lines=True,
)

In [12]:
# NOTE(review): this reuses the name of the tissue-map writer defined earlier
# in the notebook and shadows it once this cell has run.
def write_to_file(writer, row):
    """Write a single sex-map record as one CSV row.

    Parameters
    ----------
    writer : csv.writer-like
        Any object exposing ``writerow``.
    row : mapping
        Record providing ``biosample_id``, ``sra_id`` and the three flags
        ``male``, ``female`` and ``other``.

    Notes
    -----
    Each flag is written as a ``bool``; a missing flag is written as an empty
    string. ``pd.isna`` is used for the missing-value check because, unlike
    ``np.isnan``, it also accepts ``None``, ``pd.NA`` and non-float values
    without raising ``TypeError``.
    """
    flags = [
        '' if pd.isna(row[column]) else bool(row[column])
        for column in ('male', 'female', 'other')
    ]
    writer.writerow([row['biosample_id'], row['sra_id'], *flags])

# Export the sex map to CSV. newline='' is what the csv module expects for
# files it writes to; without it, platforms that translate '\n' on write end
# up with an extra blank line after every record.
with open('../exports/sex_map.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['biosample_id', 'run_id', 'male', 'female', 'other'])
    # Explicit loop rather than DataFrame.apply: the call is made purely for
    # its side effect, apply would build and discard a result object, and
    # older pandas releases could invoke the function twice on the first row
    # (which would duplicate that row in the CSV).
    for _index, row in sex_df.iterrows():
        write_to_file(writer, row)
    