# 2a Clean metadata

In this notebook, we will clean our metadata. Primarily, we will be de-duplicating our data in preparation for __Mash__ filtration & clustering

## Setup

In [None]:
import os
import yaml
import pandas as pd

from pyphylon.util import load_config

# Project configuration via pyphylon's loader; WORKDIR is read from it.
CONFIG = load_config("config.yml")
WORKDIR = CONFIG["WORKDIR"]

# The same file is also parsed as plain YAML so the optional
# REUSE_TEMP_DIR override can be looked up.
with open("config.yml", 'r') as f:
    config = yaml.safe_load(f)

# A truthy REUSE_TEMP_DIR replaces the default temp location.
temp_folder = config.get("REUSE_TEMP_DIR") or os.path.join("../temp/")
output_folder = os.path.join("../output/")

print(f"Temp folder: {temp_folder}")
print(f"Output folder: {output_folder}")

In [None]:
# Paths of the `1b_*` genome tables inside the temp folder.
summary_csv = os.path.join(temp_folder, '1b_genome_summary.csv')
metadata_csv = os.path.join(temp_folder, '1b_genome_metadata.csv')

# The first CSV column becomes the index; `genome_id` is read as text
# rather than being parsed as a number.
downloaded_species_summary = pd.read_csv(summary_csv, index_col=0, dtype={'genome_id': str})
downloaded_species_metadata = pd.read_csv(metadata_csv, index_col=0, dtype={'genome_id': str})

# Show the dimensions and first rows of each table.
display(
    downloaded_species_summary.shape,
    downloaded_species_summary.head(),
    downloaded_species_metadata.shape,
    downloaded_species_metadata.head(),
)

## De-duplicate entries

### Ensure `biosample_accession` is unique & drop duplicates

In [None]:
# `drop_duplicates` treats NaN as equal to NaN, so it would collapse every genome
# that lacks a `biosample_accession` into a single row. A missing accession means
# "unknown", not "identical", so keep those rows and drop only true repeats
# (the first occurrence of each non-missing accession is retained).
biosample = downloaded_species_metadata['biosample_accession']
keep_row = biosample.isna() | ~biosample.duplicated()
# Index with the raw boolean array so no label alignment is involved.
downloaded_species_metadata = downloaded_species_metadata[keep_row.to_numpy()]

# Sync summary to only include genomes that survived dedup
# (`.loc` raises KeyError if a metadata index label is absent from the summary).
downloaded_species_summary = downloaded_species_summary.loc[downloaded_species_metadata.index]

display(
    f"Summary shape after sync: {downloaded_species_summary.shape}",
    f"Metadata shape after dedup: {downloaded_species_metadata.shape}",
    downloaded_species_metadata.head()
)

### (Optional) Ensure `assembly_accession` is unique

Many species have strains with no value in this column; for those species it is worth skipping this step. Apply this de-duplication only when most strains have an `assembly_accession`.

Note: in this example 100+ entries are NaN for this column, so we skip this step and leave the cells below commented out.

In [None]:
#num_na = downloaded_species_metadata['assembly_accession'].isna().sum()

#print(f"{num_na} out of {downloaded_species_metadata.shape[0]} are NaNs: {int(100*num_na/downloaded_species_metadata.shape[0])}%")

In [None]:
#downloaded_species_metadata = downloaded_species_metadata.drop_duplicates(subset=['assembly_accession'])

#downloaded_species_metadata.shape

In [None]:
#downloaded_species_summary = downloaded_species_summary.loc[downloaded_species_metadata.index]

#downloaded_species_summary.shape

## Save files

In [None]:
# Write the summary and metadata tables to the temp folder under the `2a_` prefix
summary_out = os.path.join(temp_folder, '2a_genome_summary.csv')
metadata_out = os.path.join(temp_folder, '2a_genome_metadata.csv')

downloaded_species_summary.to_csv(summary_out)
downloaded_species_metadata.to_csv(metadata_out)