In [3]:
def fasta_parse(fasta_file: str, comment='#'):
    """
    Parse a FASTA file into parallel lists of record names and sequences.

    Every record starts with a header line beginning with '>'; the text after
    the '>' (the whole rest of the line) becomes the record name. All following
    lines, up to the next header, are upper-cased and concatenated into a single
    string for that record. Comment lines and blank lines are skipped.

    Parameters:
    ----------
    fasta_file : str
        The path to the FASTA file to be parsed.
    comment : str, optional
        Prefix marking comment lines that should be ignored. Default is '#'.
        An empty string or None disables comment skipping.

    Returns:
    -------
    tuple:
        A tuple containing two lists:
        - names : List[str]
            A list of sequence names extracted from lines starting with '>'.
        - sequences : List[str]
            A list of corresponding sequences, with each sequence represented as a string.

    Example:
    --------
    >>> names, sequences = fasta_parse("example.fasta")
    >>> print(names)
    ['sequence1', 'sequence2']
    >>> print(sequences)
    ['ATCGATCG', 'GGCTAAGT']

    Notes:
    ------
    - Leading and trailing whitespace is stripped from every line before it is
      classified as a comment, a header, or sequence data.
    - Sequence names are taken from lines starting with '>', with the '>' character removed.
    - Sequences are converted to uppercase.
    - Sequence lines that appear before the first header are discarded.
    - A header that is not followed by any sequence line yields an empty string.

    """
    names = []
    sequences = []
    name = None
    sequence = []
    with open(fasta_file, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no sequence data.
                continue
            # Guard against a falsy prefix: ''.startswith('') is always True,
            # which would otherwise drop every line of the file.
            if comment and line.startswith(comment):
                continue
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if name is not None:
                    names.append(name)
                    sequences.append(''.join(sequence))
                name = line[1:]
                sequence = []
            elif name is not None:
                sequence.append(line.upper())
        # Flush the final record, which has no following header to close it.
        if name is not None:
            names.append(name)
            sequences.append(''.join(sequence))

    return names, sequences

def dedup_sequences(data_tuple):
    """
    Removes duplicate sequences from a tuple containing sequence names and sequences.

    The records are walked in order and a record is kept only the first time its
    sequence string is seen, so each surviving sequence keeps the name of its first
    occurrence. Sequences are compared as exact strings (case-sensitive). The number
    of dropped records is printed to standard output.

    Parameters:
    ----------
    data_tuple : tuple
        A tuple containing two lists:
        - names (list of str): A list of sequence names.
        - sequences (list of str): A list of nucleotide or protein sequences.
        The two lists are paired with zip(), so surplus entries in the longer
        list are ignored.

    Returns:
    -------
    tuple
        A tuple containing two lists:
        - result_names (list of str): A list of names corresponding to the unique sequences.
        - result_sequences (list of str): A list of unique sequences.

    Example:
    --------
    >>> names = ["seq1", "seq2", "seq3"]
    >>> sequences = ["AGCT", "CGTA", "AGCT"]
    >>> data_tuple = (names, sequences)
    >>> dedup_sequences(data_tuple)
    1 duplicate sequences removed.
    (['seq1', 'seq2'], ['AGCT', 'CGTA'])
    """
    names, sequences = data_tuple
    seen_sequences = set()
    result_names = []
    result_sequences = []
    # Only the count of duplicates is reported, so the duplicates themselves are not stored.
    duplicates_removed = 0

    for name, seq in zip(names, sequences):
        if seq in seen_sequences:
            duplicates_removed += 1
            continue
        seen_sequences.add(seq)
        result_names.append(name)
        result_sequences.append(seq)

    print(f"{duplicates_removed} duplicate sequences removed.")
    return (result_names, result_sequences)

def write_fasta(data_tuple, output_fasta):
    """
    Save paired names and sequences as FASTA records.

    Each record is written as a '>' header line holding the name, followed by
    one line holding the full sequence. An existing file at the target path is
    overwritten.

    Parameters:
    ----------
    data_tuple : tuple
        A tuple where the first element is a list of names and the second element is a list of sequences.
    output_fasta : str
        The name of the output FASTA file.
    """
    headers, bodies = data_tuple
    with open(output_fasta, 'w') as handle:
        for header, body in zip(headers, bodies):
            # Two lines per record: the header, then the sequence.
            handle.writelines((f">{header}\n", f"{body}\n"))

# File locations for the exact-duplicate removal step, named once so they are easy to change.
INPUT_FASTA = "data.fasta"
DEDUP_FASTA = "dedup_data.fasta"

# Parse the raw records, then keep only the first record for each distinct sequence string.
names, sequences = fasta_parse(INPUT_FASTA)
data_tuple = dedup_sequences((names, sequences))

# Persist the de-duplicated records; this file is the input of the mmseqs clustering cells.
write_fasta(data_tuple, DEDUP_FASTA)

515 duplicate sequences removed.


In [4]:
# Cluster the de-duplicated sequences with MMseqs2 using 8 threads.
# --min-seq-id 0.9 is the minimum sequence identity and -c 0.8 the coverage threshold.
# Positional arguments: input FASTA, output prefix ("data"), scratch directory.
!mmseqs easy-cluster --min-seq-id 0.9 -c 0.8 --threads 8 dedup_data.fasta data clustering_tmp

Create directory clustering_tmp
easy-cluster --min-seq-id 0.9 -c 0.8 --threads 8 dedup_data.fasta data clustering_tmp 

MMseqs Version:                     	15-6f452
Substitution matrix                 	aa:blosum62.out,nucl:nucleotide.out
Seed substitution matrix            	aa:VTML80.out,nucl:nucleotide.out
Sensitivity                         	4
k-mer length                        	0
Target search mode                  	0
k-score                             	seq:2147483647,prof:2147483647
Alphabet size                       	aa:21,nucl:5
Max sequence length                 	65535
Max results per query               	20
Split database                      	0
Split mode                          	2
Split memory limit                  	0
Coverage threshold                  	0.8
Coverage mode                       	0
Compositional bias                  	1
Compositional bias                  	1
Diagonal scoring                    	true
Exact k-mer matching                	0
Mask residues   

In [6]:
# Looser clustering of the same input: minimum sequence identity 0.85, coverage threshold 0.7.
# Results go to the "data_less_strict" output prefix; clustering_tmp is the scratch directory.
!mmseqs easy-cluster --min-seq-id 0.85 -c 0.7 --threads 8 dedup_data.fasta data_less_strict clustering_tmp

easy-cluster --min-seq-id 0.85 -c 0.7 --threads 8 dedup_data.fasta data_less_strict clustering_tmp 

MMseqs Version:                     	15-6f452
Substitution matrix                 	aa:blosum62.out,nucl:nucleotide.out
Seed substitution matrix            	aa:VTML80.out,nucl:nucleotide.out
Sensitivity                         	4
k-mer length                        	0
Target search mode                  	0
k-score                             	seq:2147483647,prof:2147483647
Alphabet size                       	aa:21,nucl:5
Max sequence length                 	65535
Max results per query               	20
Split database                      	0
Split mode                          	2
Split memory limit                  	0
Coverage threshold                  	0.7
Coverage mode                       	0
Compositional bias                  	1
Compositional bias                  	1
Diagonal scoring                    	true
Exact k-mer matching                	0
Mask residues                      

In [7]:
# Tighter clustering of the same input: minimum sequence identity 0.95, coverage threshold 0.9.
# Results go to the "data_more_strict" output prefix; clustering_tmp is the scratch directory.
!mmseqs easy-cluster --min-seq-id 0.95 -c 0.9 --threads 8 dedup_data.fasta data_more_strict clustering_tmp

easy-cluster --min-seq-id 0.95 -c 0.9 --threads 8 dedup_data.fasta data_more_strict clustering_tmp 

MMseqs Version:                     	15-6f452
Substitution matrix                 	aa:blosum62.out,nucl:nucleotide.out
Seed substitution matrix            	aa:VTML80.out,nucl:nucleotide.out
Sensitivity                         	4
k-mer length                        	0
Target search mode                  	0
k-score                             	seq:2147483647,prof:2147483647
Alphabet size                       	aa:21,nucl:5
Max sequence length                 	65535
Max results per query               	20
Split database                      	0
Split mode                          	2
Split memory limit                  	0
Coverage threshold                  	0.9
Coverage mode                       	0
Compositional bias                  	1
Compositional bias                  	1
Diagonal scoring                    	true
Exact k-mer matching                	0
Mask residues                      

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the header-less, tab-separated cluster table into a two-column DataFrame
# (cluster representative, cluster member).
file_path = 'data_cluster.tsv'
cluster_df = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=['Cluster_Rep', 'Cluster_Member'],
)

# Preview the top rows
cluster_df.head()

Unnamed: 0,Cluster_Rep,Cluster_Member
0,tmRNA_Dich.nodo._TRW-35819_1-352,tmRNA_Dich.nodo._TRW-35819_1-352
1,srp_Myco.hyop._AE017332,srp_Myco.hyop._AE017332
2,5s_Fusarium-cerealis-7,5s_Fusarium-cerealis-7
3,5s_Fusarium-cerealis-7,5s_Fusarium-asiaticum-6
4,5s_Fusarium-cerealis-7,5s_Fusarium-lunulosporum-4


In [10]:
# Count how many member rows each cluster representative has
cluster_size = (
    cluster_df
    .groupby('Cluster_Rep')
    .size()
    .reset_index(name='Cluster_Size')
)

# Rank the clusters from largest to smallest
cluster_size_sorted = cluster_size.sort_values(
    by='Cluster_Size',
    ascending=False,
)

# Show the ranked table
cluster_size_sorted

Unnamed: 0,Cluster_Rep,Cluster_Size
501,5s_Vibrio-vulnificus-1,41
273,5s_Escherichia-coli-11,35
388,5s_Nocardia-farcinica-1,33
460,5s_Streptomyces-coelicolor-4,29
509,5s_Xenopus-laevis-8,28
...,...,...
758,RNaseP_SM-A3153,1
757,RNaseP_SM-A3048,1
756,RNaseP_SM-A2946,1
755,RNaseP_SM-A2642,1
