----
# Exporting data
- Export the results of the different maps to CSV files for easy import into Postgres
    - data for the temp_map already exported from the notebook

In [1]:
import pandas as pd
import numpy as np
import csv
import json

----
## Tissue map
- results of tissue map located in two different jsonl files
- each biosample_id has one SRS associated with it and a number of tissue / bto_ids associated with it
- for each biosample, want to create a row that has the biosample_id, sra_id, tissue, and BTO_id
    - if there are multiple hits for the tissue / BTO_id, these should be in separate rows

In [None]:
# Read both tissue-map result files (JSON Lines) and stack them into one dataframe
tissue_df_1, tissue_df_2 = (
    pd.read_json(results_path, lines=True)
    for results_path in (
        '../../data/results/tissue_output_1.jsonl',
        '../../data/results/tissue_output_2.jsonl',
    )
)
tissue_df = pd.concat([tissue_df_1, tissue_df_2])

In [None]:
def write_to_file(writer, row):
    """Write one CSV row per tissue / BTO id pair of a tissue-map record.

    Args:
        writer: object exposing ``writerow`` (e.g. a ``csv.writer``).
        row: record providing 'biosample_id', 'sra_id' and the
            comma-separated strings 'tissue' and 'bto_matches'.
    """
    sample_and_run = [row['biosample_id'], row['sra_id']]
    tissue_names = row['tissue'].split(',')
    bto_ids = row['bto_matches'].split(',')
    # entries are paired by position; zip stops at the shorter of the two lists
    for pair in zip(tissue_names, bto_ids):
        writer.writerow(sample_and_run + list(pair))


# newline='' lets the csv module control line endings itself (without it,
# rows end in \r\r\n on Windows); the explicit utf-8 encoding keeps the export
# independent of the platform's default encoding.
with open('../exports/tissue_map.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['biosample_id', 'run_id', 'tissue', 'bto_id'])
    # plain loop rather than DataFrame.apply: only the side effect of writing
    # rows is wanted, not a result Series
    for _row_index, row in tissue_df.iterrows():
        write_to_file(writer, row)


### Tissue map rerun export
- create dataframe for new rerun data
- previous table deleted in postgres to avoid conflicts
----
- rerun after fixing bugs for multiple matches in single source

In [2]:
# Read both rerun tissue-map result files (JSON Lines) and stack them into one dataframe
tissue_df_1, tissue_df_2 = (
    pd.read_json(results_path, lines=True)
    for results_path in (
        '../../data/results/tissue_output_1_rerun.jsonl',
        '../../data/results/tissue_output_2_rerun.jsonl',
    )
)
tissue_df = pd.concat([tissue_df_1, tissue_df_2])

In [7]:
# Inspect the BTO matches recorded for a single biosample
is_target_sample = tissue_df['biosample_id'] == "SAMN00002389"
print(tissue_df.loc[is_target_sample, "bto_matches"])

220    BTO_0000725\tBTO_0001413
Name: bto_matches, dtype: object


In [5]:
# Example record with several tab-separated matches, used to check how the fields split
a = {
    "biosample_id": "SAMN04571587",
    "sra_id": "",
    "source": "tissue\tcell_type\tcell_type",
    "text": "ear\tear fibroblast culture, passage 3\tear fibroblast culture, passage 3",
    "tissue": "ear\tear\tfibroblast",
    "bto_matches": "BTO_0000368\tBTO_0000368\tBTO_0000452",
}
a["source"].split("\t")

['tissue', 'cell_type', 'cell_type']

In [8]:
def write_to_file(writer, row):
    """Write the de-duplicated tissue matches of one biosample record as CSV rows.

    The 'source', 'text', 'tissue' and 'bto_matches' fields of ``row`` are
    tab-separated strings whose entries are paired by position. Only the first
    entry for each BTO id is written; later entries with the same id are skipped.

    Args:
        writer: object exposing ``writerow`` (e.g. a ``csv.writer``).
        row: record providing 'biosample_id', 'sra_id', 'source', 'text',
            'tissue' and 'bto_matches'.
    """
    sample_and_srs = [row['biosample_id'], row['sra_id']]
    split_fields = [
        row[field].split('\t')
        for field in ('source', 'text', 'tissue', 'bto_matches')
    ]
    # BTO ids already written for this record
    seen_bto_ids = set()
    for entry in zip(*split_fields):
        matched_bto = entry[-1]
        if matched_bto not in seen_bto_ids:
            writer.writerow(sample_and_srs + list(entry))
            seen_bto_ids.add(matched_bto)


# newline='' lets the csv module control line endings itself (without it,
# rows end in \r\r\n on Windows); the explicit utf-8 encoding keeps the export
# independent of the platform's default encoding.
with open('../exports/tissue_map_rerun.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['biosample_id', 'srs_id', 'source', 'text', 'tissue', 'bto_id'])
    # plain loop rather than DataFrame.apply: only the side effect of writing
    # rows is wanted, not a result Series
    for _row_index, row in tissue_df.iterrows():
        write_to_file(writer, row)

In [15]:
# Show the raw 'text' value(s) recorded for a single biosample
sample_mask = tissue_df['biosample_id'] == 'SAMN18623547'
for text_value in tissue_df.loc[sample_mask, 'text']:
    print(text_value)

['gonzalez et al "genomic analysis provides insights into the functional capacity of soil bacteria communities inhabiting an altitudinal gradient in the atacama desert." microbiome journal (to be submitted)', ''],soil alkaline flat,soil metagenome


Samples like 'SAMEA112151282' cause issues when a large experiment description is included in the XML: the tissue map then returns multiple different hits for the same sample:

- ovary
- digestive tube 
- bile
- gut
- feces
- digestive system
- liver

----
## Sex
- sex_output file has a line for each biosample 
- each line can be directly written to a csv for postgres

In [6]:
# Load the sex-map results; the file is JSON Lines, so lines=True parses one record per line
sex_df = pd.read_json(
    '../../data/results/sex_output_1.jsonl',
    lines=True,
)

In [12]:
def write_to_file(writer, row):
    """Write the sex flags of one biosample record as a single CSV row.

    Each of 'male', 'female' and 'other' is written as a bool, or as an empty
    string when the value is missing.

    Args:
        writer: object exposing ``writerow`` (e.g. a ``csv.writer``).
        row: record providing 'biosample_id', 'sra_id', 'male', 'female'
            and 'other'.
    """
    def _flag(value):
        # pd.isna treats NaN, None and pd.NA alike as missing; np.isnan would
        # raise TypeError on None or other non-numeric values.
        return '' if pd.isna(value) else bool(value)

    writer.writerow(
        [row['biosample_id'], row['sra_id']]
        + [_flag(row[column]) for column in ('male', 'female', 'other')]
    )

# newline='' lets the csv module control line endings itself (without it,
# rows end in \r\r\n on Windows); the explicit utf-8 encoding keeps the export
# independent of the platform's default encoding.
with open('../exports/sex_map.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['biosample_id', 'run_id', 'male', 'female', 'other'])
    # plain loop rather than DataFrame.apply: only the side effect of writing
    # rows is wanted, not a result Series
    for _row_index, row in sex_df.iterrows():
        write_to_file(writer, row)
    