In [None]:
import pandas as pd
import numpy as np
from collections import Counter

In [None]:
# Load the list of removed duplicates written by `seqkit rmdup -s`.
# CAREFUL: inside a line the entries are separated by a comma *followed by a
# space* (", "), which Excel and a default pandas read silently mask. The
# separator is therefore a regex that splits on runs of tabs as well as on
# ", ", and the python parsing engine is selected to handle that regex.
# NOTE(review): duplicate sets presumably differ in size, so rows are ragged --
# verify the parser accepts rows that are wider than the first one.
duplicates = pd.read_csv(
    './gisaid_duplicated.txt',
    sep='\t+|, ',
    header=None,
    engine='python',
)

In [None]:
# Reshape the parsed table so that every row is one set of identical sequences
# that was collapsed during duplicate removal: column 0 (the number of
# duplicates in the set) becomes the index, relabelled as e.g. "3_duplicates",
# and the remaining columns hold the members of the set.
# The frame is rebuilt instead of being mutated with `inplace=True`, and the
# guard keeps the cell safe to re-run: once column 0 has been moved into the
# index, a second `set_index` would raise KeyError and a second rename would
# append the suffix twice.
if 0 in duplicates.columns:
    duplicates = duplicates.set_index(0).rename(index='{}_duplicates'.format)

In [None]:
# Read the metadata table used to check whether a set of duplicates spans more
# than one class (clades, in this case). The file is read without a header row
# and its first column becomes the index that accession ids are matched against.
# Provenance, as recorded by the original author: `cleaned_gisaid<20.csv` can be
# generated by downloading the dataset from
# /Users/dolteanu/local_documents/Coding/MSc_github/Data/Nextstrain/Nextstrain_gisaid>20_EPI.csv
# in this repository.
# NOTE(review): that is an absolute, machine-specific path and the file name
# differs from the one loaded here -- confirm and replace with a repo-relative path.
metadata = pd.read_csv(
    './gisaid_metadata.csv',
    index_col=0,
    header=None,
)

In [None]:
# Build a table with the same rows as `duplicates` in which every accession-id
# column is immediately followed by the metadata (class label) of that id, so
# the table keeps its shape and only gains the interleaved label columns.
# Each column is left-joined against the metadata index (much like a `map`);
# the left join keeps ids without a metadata entry and leaves their label NaN.
table = [
    pd.merge(duplicates[column], metadata, how='left',
             left_on=column, right_index=True)
    for column in duplicates
]
# `ignore_index=True` drops the column names; the pieces are numbered left to right.
final_table = pd.concat(table, axis=1, ignore_index=True)
# `header=False` is the documented way to omit the header row; the previous
# `header=None` only had that effect because None is falsy.
final_table.to_csv('removed_duplicates_nextstrain_gisaid.csv', header=False)
   

In [None]:
# Lookup of fasta header (the metadata index) -> class label (the first
# metadata column), used to check whether a set of duplicates carries more than
# one class label.
# The mapping is taken straight from the column rather than by looping over
# `iterrows()`, which builds a Series per row just to read one cell. If a header
# occurs more than once, its last label wins, exactly as with the loop.
meta_dict = metadata.iloc[:, 0].to_dict()

In [None]:
# Report every row (one set of identical duplicates) whose members do not share
# exactly one class label, together with the count of each label in that row.
# `nunique` ignores NaN, so ids missing from `meta_dict` do not count as a
# label -- but a row in which no id could be mapped (0 labels) is printed too.
for i in range(duplicates.shape[0]):
    label = duplicates.iloc[i].map(meta_dict)
    if label.nunique() == 1:
        continue
    print(f'Table row {i}:\n{label.value_counts()}\n')