In [None]:
from io import BytesIO
from codecs import EncodedFile
from copy import deepcopy
from chardet import detect
from pathlib import Path

In [None]:
# Directory holding the input fasta files (a later cell globs its non-hidden entries)
# NOTE(review): absolute, machine-specific path - edit before running on another machine
data_set = "/Users/dolteanu/local_documents/Coding/Ontario covid data/Ontario_covid/OneDrive_1_2022-02-06/"

In [None]:
# Path to the metadata csv that is later read with csv2dict
# (first column -> key, second column -> value; used as Accession ID -> cluster name)
# NOTE(review): absolute, machine-specific path - edit before running on another machine
metadata = Path("/Users/dolteanu/local_documents/Coding/Ontario covid data/Ontario_covid/gisaid_filtered<10.csv")

In [None]:
def csv2dict(infile):
    """
    Simple function to read a headerless csv into a dictionary (faster than pandas)

    Blank and whitespace-only lines are skipped. The file is decoded as UTF-8
    and a leading byte-order mark (as written by Excel's "CSV UTF-8" export)
    is dropped so that it does not end up glued to the first key.

    Args:
        infile: Path to input file, comma delimited, no header row

    Returns:dictionary with first field as key and second as value; fields
        after the second are ignored and, if a key occurs more than once,
        the last occurrence wins

    Raises:
        IndexError: if a non-blank line has fewer than two comma separated fields
    """
    dictionary = {}
    # 'utf-8-sig' reads plain ASCII / UTF-8 unchanged and strips a BOM if one is present
    with open(infile, mode='r', encoding='utf-8-sig') as reader:
        for line in reader:
            # Strip before testing: a raw line still carries its newline, so an
            # "empty" line is truthy until the surrounding whitespace is removed
            line = line.strip()
            if not line:
                continue
            fields = line.split(',')
            dictionary[fields[0]] = fields[1]
    return dictionary

In [None]:
# Additional imports used by the cells below
import random
import pandas as pd
from Bio import SeqIO
from collections import Counter

# Dictionary to store SeqIO records, keyed by fasta record id
seq_dict = {}
# Iterate through all fasta files (non-hidden entries directly inside data_set)
data_set = Path(data_set).resolve()
# sorted() fixes the load order, so when the same record id occurs in more than
# one file it is always the record from the last file (by path) that is kept
for file in sorted(data_set.glob('[!.]*')):
    # The pattern also matches sub-directories, which cannot be opened as files
    if not file.is_file():
        continue
    with open(file) as handle:
        # SeqIO.to_dict raises ValueError if a record id is repeated within one file
        seq_dict.update(SeqIO.to_dict(SeqIO.parse(handle, "fasta")))
print(len(seq_dict))

In [None]:
# dictionary with Accession ID as keys and cluster name as values
# (no empty placeholder is assigned first: if reading the metadata fails, the
# name must not be left bound to an empty dict, because the cells below would
# then treat every sequence as unlabelled and remove it)
cluster_dict = csv2dict(metadata)
# Prints out size of classes (cluster) in metadata
# N.B. in MLDSP updated to print class size from seq_dict
cluster_stats = Counter(cluster_dict.values())
print(cluster_stats)

In [None]:
# Accessions that have a fasta record but no entry in the metadata dict.
# (Metadata entries without a sequence are fine, which lets the same metadata
# file be re-used after deduplication or subsampling.)
missing_samples = seq_dict.keys() - cluster_dict.keys()
print(missing_samples)

In [None]:
#Remove filtered samples from fasta that do not have a corresponding metadata label
for accession in missing_samples:
    # The default keeps this cell safe to re-run: an accession that was already
    # removed by a previous execution is ignored instead of raising KeyError
    seq_dict.pop(accession, None)




In [None]:
# Find classes with < X samples per class (useful for min class size)
# Class sizes come from cluster_stats, i.e. they are counted over the metadata file
x = 10
bad_class = [label for label, count in cluster_stats.items() if count < x]
print(f'Classes with fewer than {x} samples:{bad_class}')

In [None]:
# Drop every fasta record whose class was flagged as having fewer than X samples
for accession, label in cluster_dict.items():
    # Nothing to do for labels that are large enough or accessions with no sequence
    if label not in bad_class or accession not in seq_dict:
        continue
    print(f'Sample removed from fasta:\n{accession}({label})')
    del seq_dict[accession]


In [None]:
# Save the cleaned fasta: writes every record still in seq_dict in fasta format.
# Edit the path & filename below as needed; a bare filename is created in the current working directory.
# NOTE(review): '<' is not a legal filename character on Windows - confirm the target platform
SeqIO.write(seq_dict.values(),'cleaned_nextstrain<20.fasta','fasta')