In [1]:
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
# Load the duplicate sets written by `seqkit rmdup -D`.
# CAREFUL: this file has a space after every comma, which Excel/pandas views hide.
# Fields are therefore split on either a run of tabs or the two-character sequence ", ".
# A regex separator like this one requires the python parsing engine.
duplicates = pd.read_table(
    './duplicates.txt',
    sep='\t+|, ',
    engine='python',
    header=None,
)

In [3]:
# Shape the frame parsed from duplicates.txt: each row is one set of identical
# sequences that were removed.
# Column 0 (the number of duplicates in the set) becomes the row index, rendered as
# '<count>_duplicates'.
# The guard plus the non-inplace chain make this cell safe to re-run: once column 0 has
# been moved into the index it is no longer among the columns, so an unguarded second
# run would raise KeyError.
if 0 in duplicates.columns:
    duplicates = duplicates.set_index([0]).rename(index='{}_duplicates'.format)

In [4]:
# Metadata table used to check whether a set of duplicates spans different classes
# (clades in this case).
# The file is read without a header row and its first column becomes the index.
metadata = pd.read_csv(
    './epoch_metadata.csv',
    index_col=0,
    header=None,
)

In [5]:
# Build a new table column by column: every accession-id column of `duplicates` is
# followed by the metadata (class label) looked up for that id.
# This preserves the table's row layout and only widens it (the column count doubles,
# assuming `metadata` carries a single label column).
# The left merge behaves like a map from accession id to its metadata row, keeping ids
# that have no metadata entry.
table = [
    pd.merge(duplicates[column], metadata, how='left', left_on=column, right_index=True)
    for column in duplicates
]
final_table = pd.concat(table, axis=1, ignore_index=True)
final_table.to_csv('removed_duplicates_nextstrain_clades.csv', header=None)
   

In [6]:
# Lookup from each metadata index entry (fasta header) to the value in the first
# metadata column (class label).
# Used below to check whether a set of duplicates carries more than one class label.
meta_dict = {header: row.iloc[0] for header, row in metadata.iterrows()}

In [7]:
# Report each row (set of identical duplicates) whose accession ids do not resolve to
# exactly one class label, along with how often each label occurs in that row.
# Ids missing from `meta_dict` map to NaN, which nunique()/value_counts() leave out.
for row_number in range(duplicates.shape[0]):
    row_labels = duplicates.iloc[row_number].map(meta_dict)
    if row_labels.nunique() == 1:
        continue
    label_counts = row_labels.value_counts()
    print(f'Table row {row_number}:\n{label_counts}\n')

Table row 0:
3    78
4    19
Name: 97_duplicates, dtype: int64

Table row 1:
3.0    60
4.0     8
Name: 68_duplicates, dtype: int64

Table row 2:
1.0    37
0.0    25
Name: 62_duplicates, dtype: int64

Table row 3:
2.0    37
3.0     7
Name: 44_duplicates, dtype: int64

Table row 4:
1.0    20
0.0    18
Name: 38_duplicates, dtype: int64

Table row 5:
2.0    25
3.0    10
Name: 35_duplicates, dtype: int64

Table row 9:
1.0    22
0.0     9
Name: 31_duplicates, dtype: int64

Table row 10:
3.0    30
2.0     1
Name: 31_duplicates, dtype: int64

Table row 16:
0.0    18
1.0     9
Name: 27_duplicates, dtype: int64

Table row 23:
1.0    17
0.0     3
Name: 20_duplicates, dtype: int64

Table row 26:
3.0    17
4.0     2
Name: 19_duplicates, dtype: int64

Table row 38:
1.0    10
0.0     6
Name: 16_duplicates, dtype: int64

Table row 42:
3.0    13
2.0     2
Name: 15_duplicates, dtype: int64

Table row 46:
3.0    14
4.0     1
Name: 15_duplicates, dtype: int64

Table row 53:
3.0    12
4.0     2
Name: 14_du