In [1]:
import os

import pandas as pd

In [2]:
# Base directory for the clustering data; the cells below join it with
# "mmseqs2/..." to read the search results and to write the parsed table.
# The path is relative, so it is resolved against the kernel's working directory.
DATA_DIR = "../data/clustering/"

In [3]:
# Column names applied positionally to the 12 fields of resultDB.m8
mmseqs_header = (
    "query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits"
).split(",")

# Read the table lazily, 10000 rows at a time, instead of loading it in one go.
# The returned reader is single-use: once iterated it is exhausted and this
# cell must be re-run before the table can be consumed again.
m8_path = os.path.join(DATA_DIR, "mmseqs2", "resultDB.m8")
data_iterator = pd.read_table(m8_path, names=mmseqs_header, chunksize=10000)

In [4]:
def _best_hit_per_query(hits):
    """Return the lowest-E-value row of every query in ``hits``.

    Only the ``query``, ``target`` and ``evalue`` columns are kept. Rows are
    selected through ``idxmin`` labels rather than ``groupby().min()`` so the
    ``target`` belonging to the minimum is not lost
    (see https://stackoverflow.com/a/73356169).
    """
    best_rows = hits.groupby(["query"])["evalue"].idxmin()
    return hits.loc[best_rows, ["query", "target", "evalue"]]


# First pass: reduce every chunk of the table to one best hit per query
per_chunk_best = [_best_hit_per_query(chunk) for chunk in data_iterator]
stacked_hits = pd.concat(per_chunk_best, axis=0, ignore_index=True)

# Second pass: a query whose rows were split between different chunks still
# has one candidate per chunk, so reduce once more over the stacked result
cluster_df = _best_hit_per_query(stacked_hits)

# Add taxon_id (the part of the query before the first "_") and a count column
# for pivoting.
# NOTE(review): every query is unique at this point and value_counts() groups
# on all columns (query included), so each row's count appears to be 1 —
# confirm whether a per-(taxon_id, target) count was intended.
cluster_df = cluster_df.assign(
    taxon_id=cluster_df["query"].str.split("_").str[0]
)
cluster_df = cluster_df.value_counts().reset_index(name="count", drop=False)

cluster_df

Unnamed: 0,query,target,evalue
264069,1008305_WP_014451924.1,UniRef90_A7HM48,2.788000e-54
196846,1008305_WP_069292120.1,UniRef90_A0A1E3G6G4,4.804000e-36
355223,1008305_WP_069292121.1,UniRef90_A0A1E3G4P7,7.966000e-146
252529,1008305_WP_069292123.1,UniRef90_A0A1E3G4V5,5.366000e-65
13683,1008305_WP_069292124.1,UniRef90_A0A1E3G5S1,0.000000e+00
...,...,...,...
110792,981385_WP_330873140.1,UniRef90_UPI002ED48C44,3.951000e-298
307956,981385_WP_330873141.1,UniRef90_A0A7R6PZ10,2.963000e-187
31477,981385_WP_330873142.1,UniRef90_UPI002ED28E5A,5.554000e-186
264098,981385_WP_330873143.1,UniRef90_UPI002ED0B763,5.077000e-125


In [9]:
# Build the taxon_id x target count matrix.
#
# Several queries of one taxon can share the same best-hit target, so
# (taxon_id, target) pairs repeat in cluster_df; those repeats are what make
# pivot() fail with duplicate index entries. Sum the per-row counts for each
# pair instead of dropping the repeats, so every cell holds the number of
# queries of that taxon assigned to that target rather than a flat 1.
cluster_df_long = (
    cluster_df
    .groupby(["taxon_id", "target"], as_index=False)["count"]
    .sum()
)

cluster_df_long = cluster_df_long.pivot(
    index="taxon_id",
    columns="target",
    values="count"
)
# Pairs that never occur are NaN after the pivot; they mean "no hit"
cluster_df_long = cluster_df_long.fillna(0.0)

# NOTE(review): index=False leaves the taxon_id row labels out of the CSV, so
# rows in the file can only be identified by position — confirm that readers
# of resultDB_parsed.csv do not need the taxon_id column.
cluster_df_long.to_csv(
    os.path.join(
        DATA_DIR,
        "mmseqs2",
        "resultDB_parsed.csv"
    ),
    index=False
)

cluster_df_long

target,UniRef90_A0A010ZWL7,UniRef90_A0A011RAA4,UniRef90_A0A011U4X9,UniRef90_A0A011UUK4,UniRef90_A0A011UVK9,UniRef90_A0A017HG54,UniRef90_A0A017HSI5,UniRef90_A0A017RQU7,UniRef90_A0A017RQW2,UniRef90_A0A017RQW8,...,UniRef90_W9TJZ7,UniRef90_W9TPK3,UniRef90_W9TTV1,UniRef90_W9VMG4,UniRef90_X0S6B6,UniRef90_X4ZJM1,UniRef90_X7E510,UniRef90_X7F5P6,UniRef90_X7F7U7,UniRef90_X7F7W5
taxon_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1008305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105229,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1073253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1076588,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
108150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
908809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
936138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
971279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
