In [1]:
import csv
import gzip
import os
import sys
import requests
from io import StringIO


import boto3
import pandas as pd

In [50]:
# Cookie jar passed to every `requests.get` call in this notebook.
# The value is read from the CZID_COOKIE environment variable so a real
# session credential never has to be saved inside the notebook; the literal
# 'x' placeholder is kept as the fallback, so behaviour is unchanged when the
# variable is not set.
# NOTE(review): `requests` treats each key of `cookies=` as a cookie *name*,
# so this sends a cookie literally named "Cookie" — confirm that is intended
# rather than sending a raw header via `headers=`.
cookies = {
    'Cookie': os.environ.get('CZID_COOKIE', 'x'),
}

def get_sample_name_to_id_for_project(project_id):
    """Return a ``{sample name: sample id}`` mapping for one project.

    The staging ``samples/index_v2.json`` endpoint is queried once per
    workflow (short-read mNGS first, then long-read mNGS) and the results
    are merged into a single dict.

    Args:
        project_id: Project identifier interpolated into the ``projectId``
            query parameter.

    Returns:
        dict mapping each sample's ``name`` to its ``id``. If the same name
        is returned for both workflows, the long-read id wins because that
        workflow is processed last.

    Raises:
        requests.HTTPError: If either request comes back with a 4xx/5xx
            status (e.g. an expired session cookie).
    """
    # Order matters: entries from later workflows overwrite earlier ones.
    workflows = ("short-read-mngs", "long-read-mngs")

    sample_name_to_id = {}
    for workflow in workflows:
        samples_url = f"https://staging.czid.org/samples/index_v2.json?projectId={project_id}&domain=all_data&offset=0&listAllIds=true&basic=false&workflow={workflow}"
        response = requests.get(samples_url, cookies=cookies)
        # Surface HTTP failures directly instead of letting them show up
        # later as a confusing JSON-decode or KeyError.
        response.raise_for_status()
        sample_name_to_id.update(
            {sample["name"]: sample["id"] for sample in response.json()["samples"]}
        )
    return sample_name_to_id


def get_report_csv_for_sample(sample_id, pipeline_version="8.3"):
    """Download one sample's report CSV and return it as a DataFrame.

    Args:
        sample_id: Sample identifier interpolated into the report URL.
        pipeline_version: Value of the ``pipeline_version`` query parameter.
            Defaults to ``"8.3"``, the version this notebook was written
            against.

    Returns:
        pandas.DataFrame indexed by the ``tax_id`` column, with the final
        row of the downloaded CSV removed.

    Raises:
        requests.HTTPError: If the request comes back with a 4xx/5xx status.
    """
    report_url = f"https://staging.czid.org/samples/{sample_id}/report_csv?pipeline_version={pipeline_version}"
    report_response = requests.get(report_url, cookies=cookies)
    # Do not try to parse an error page as CSV.
    report_response.raise_for_status()

    report_csv = pd.read_csv(StringIO(report_response.text), index_col='tax_id')
    # The last row is always discarded.
    # NOTE(review): presumably a trailer/summary row in the export — confirm.
    return report_csv.iloc[:-1]
    

def find_changed_taxa(report_csv, changed_csv):
    """Return the lineage changes that apply to taxa present in a report.

    Both frames are inner-joined on their indexes, trimmed to the columns of
    interest, and then restricted to changes at family, genus, species or
    taxon-name level.

    Args:
        report_csv: Sample report DataFrame; must supply ``tax_level``,
            ``nt_count``, ``nt_contigs``, ``nr_count`` and ``nr_contigs``.
        changed_csv: Lineage changelog DataFrame; must supply ``tax_name``,
            ``changed_field``, ``old_value`` and ``new_value``.

    Returns:
        pandas.DataFrame with one row per matching change.
    """
    columns_of_interest = [
        'tax_level',
        'tax_name',
        'changed_field',
        'old_value',
        'new_value',
        'nt_count',
        'nt_contigs',
        'nr_count',
        'nr_contigs',
    ]
    # Many changes are at phylum level; keep only these fields so the more
    # important lower-level changes are visible.
    lower_level_fields = ["family_name", "genus_name", "species_name", "tax_name"]

    merged = report_csv.join(changed_csv, how='inner')
    trimmed = merged[columns_of_interest]
    is_lower_level_change = trimmed['changed_field'].isin(lower_level_fields)
    return trimmed[is_lower_level_change]



def create_changelog(sample_report, sample_name, changed_lineage_taxa):
    """Write the lineage changes affecting one sample to a CSV file.

    Args:
        sample_report: Report DataFrame for the sample.
        sample_name: Sample name; used in the output file name and in the
            message printed when nothing changed.
        changed_lineage_taxa: Lineage changelog DataFrame.

    Writes ``changed_lineages_<sample_name>.csv`` in the current working
    directory when at least one change is found; otherwise only prints a
    message. Returns nothing.
    """
    changes_for_sample = find_changed_taxa(sample_report, changed_lineage_taxa)

    # Nothing to write: report it and stop.
    if changes_for_sample.empty:
        print(f"no lineage changes have been found for sample: {sample_name}")
        return

    changes_for_sample.to_csv(f"changed_lineages_{sample_name}.csv")


def get_taxon_lineage_changelog(bucket_name, key):
    """Load the gzip-compressed taxon-lineage changelog CSV from S3.

    Args:
        bucket_name: Name of the S3 bucket holding the changelog.
        key: Object key of the gzipped CSV inside that bucket.

    Returns:
        pandas.DataFrame indexed by the ``taxid`` column.
    """
    s3_object = boto3.client('s3').get_object(Bucket=bucket_name, Key=key)

    # Decompress the object's body as it is read and parse it as CSV.
    with gzip.GzipFile(fileobj=s3_object['Body']) as decompressed:
        return pd.read_csv(decompressed, index_col='taxid')

In [51]:
# S3 location of the lineage changelog.
# NOTE(review): both values look like redacted placeholders — fill in the
# real bucket and key prefix before running.
bucket_name = "<bucket>"
key = "<key>/changed_lineage_taxa.csv.gz"
changed_lineage_taxa = get_taxon_lineage_changelog(bucket_name=bucket_name, key=key)

In [52]:
# Preview the first five changelog rows to sanity-check the columns.
changed_lineage_taxa.head(n=5)

Unnamed: 0_level_0,tax_name,changed_field,old_value,new_value,superkingdom
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,root,tax_name,,root,
6,Azorhizobium,phylum_name,Proteobacteria,Pseudomonadota,Bacteria
7,Azorhizobium caulinodans,phylum_name,Proteobacteria,Pseudomonadota,Bacteria
9,Buchnera aphidicola,phylum_name,Proteobacteria,Pseudomonadota,Bacteria
10,Cellvibrio,phylum_name,Proteobacteria,Pseudomonadota,Bacteria


In [57]:
# Create one changelog CSV per sample in the chosen project.
# Project ids used with this notebook:
#   1281 - diamond redone
#   1408 - 2024 baselines
sample_name_to_ids = get_sample_name_to_id_for_project(1408)
for sample_name, sample_id in sample_name_to_ids.items():
    try:
        sample_report = get_report_csv_for_sample(sample_id)
        create_changelog(sample_report, sample_name, changed_lineage_taxa)
    except Exception as err:
        # Best-effort: one failing sample must not stop the others. Catch
        # Exception (not a bare except) so Ctrl-C / kernel interrupt still
        # works, and print the cause so real bugs are not mistaken for
        # missing data.
        print(f"no data found for sample id {sample_id},  sample name {sample_name} ({type(err).__name__}: {err})")

no lineage changes have been found for sample: mWGS_SE_SRR7002140_TA.252.DNA_blaC_vanP_10p_36157_original
no lineage changes have been found for sample: mWGS_RNA_human-128-lung-rna_10p_36159_original
no lineage changes have been found for sample: mWGS_PE_SRR7002140_TA.252.DNA_blaC_vanP_10p_36156_original
no data found for sample id 41256,  sample name HG002_long_reads_metaG_36160_original
no data found for sample id 41255,  sample name 28A-idseq-mosq.2to4mil_subsample_10p_36161_original
