In [7]:
import gzip
import os
import sys

import boto3
import pandas as pd

In [3]:
def find_changed_taxa(report_csv, changed_csv):
    """Find report rows whose taxon has a changed low-level lineage name.

    The two frames are inner-joined on their indexes, so only index values
    present in both survive. In this notebook both frames are indexed by
    taxon id.

    Args:
        report_csv: DataFrame holding a sample report table.
        changed_csv: DataFrame holding the lineage changelog.
            Between them the two frames must supply every column listed in
            ``report_columns`` below, without sharing a column name
            (``DataFrame.join`` raises on overlapping columns when no
            suffix is given).

    Returns:
        A DataFrame restricted to the nine columns in ``report_columns``
        and to rows whose ``changed_field`` is one of ``family_name``,
        ``genus_name``, ``species_name`` or ``tax_name``.
    """
    report_columns = [
        'tax_level',
        'tax_name',
        'changed_field',
        'old_value',
        'new_value',
        'nt_count',
        'nt_contigs',
        'nr_count',
        'nr_contigs',
    ]
    # Many changes are at the phylum level; keep only these fields so the
    # more important lower-level changes are easy to see.
    fields_of_interest = ["family_name", "genus_name", "species_name", "tax_name"]

    matched = report_csv.join(changed_csv, how='inner')
    matched_columns = matched[report_columns]
    is_of_interest = matched_columns['changed_field'].isin(fields_of_interest)
    return matched_columns[is_of_interest]


In [12]:
# S3 location of the gzipped lineage changelog.
bucket_name = "idseq-database"
key = "<key>/changed_lineage_taxa.csv.gz"

# Fetch the object; its 'Body' is a stream that is read below.
s3_client = boto3.client("s3")
obj = s3_client.get_object(Bucket=bucket_name, Key=key)

# Decompress the stream on the fly and parse it as CSV, indexed by 'taxid'.
with gzip.GzipFile(fileobj=obj["Body"]) as gzipfile:
    changed_lineage_taxa = pd.read_csv(gzipfile, index_col="taxid")

In [13]:
# Preview the first five rows of the changelog.
changed_lineage_taxa.head(n=5)

Unnamed: 0_level_0,tax_name,changed_field,old_value,new_value,superkingdom
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,root,tax_name,,root,
6,Azorhizobium,phylum_name,Proteobacteria,Pseudomonadota,Bacteria
7,Azorhizobium caulinodans,phylum_name,Proteobacteria,Pseudomonadota,Bacteria
9,Buchnera aphidicola,phylum_name,Proteobacteria,Pseudomonadota,Bacteria
10,Cellvibrio,phylum_name,Proteobacteria,Pseudomonadota,Bacteria


In [19]:
# Sample report tables to compare against the changelog. Download each one
# from CZID using the report-table download on the sample report page.
# The files are looked up in the current user's Downloads folder rather than
# a hard-coded home directory, so the notebook runs under any account.
report_dir = os.path.join(os.path.expanduser("~"), "Downloads")
sample_reports = [
    os.path.join(report_dir, report_name)
    for report_name in (
        "atcc_even_34239_original_report.csv",
        "norg_13__nacc_35__uniform_weight_per_organism__hiseq_reads__v10__34241_original_1_report.csv",
        "UnAmbiguouslyMapped_ds.hous1_34246_original_report.csv",
    )
]

In [20]:
# For each sample report, write the subset of its taxa that appear in the
# changelog to "changed_lineages_<report file name>" in the working directory.
for report_csv_name in sample_reports:
    full_report = pd.read_csv(report_csv_name, index_col='tax_id')
    # The last row of the file is dropped before joining.
    # NOTE(review): presumably a trailing non-taxon row; confirm.
    report_csv = full_report[:-1]
    changed_report_taxa = find_changed_taxa(report_csv, changed_lineage_taxa)
    output_csv_name = f"changed_lineages_{os.path.basename(report_csv_name)}"
    changed_report_taxa.to_csv(output_csv_name)
    