In [1]:
import os
import re

import numpy as np
import pandas as pd

from Bio import SeqIO

In [2]:
# Root directory of the raw input files; the loading cells join sub-paths onto it
DATA_DIR = "./raw/"

In [3]:
def save_sequences(
    df: pd.DataFrame,
    out_dir: str,
    out_prefix: str,
    group_col: str = "type",
    name_cols: "list[str] | None" = None,
    subgroup_col: str = "group",
) -> None:
    """Write the sequences in ``df`` to one FASTA file per (group, subgroup).

    Files are named ``{out_prefix}_{group}_{subgroup}.fasta`` and placed in
    ``out_dir``, which is created if it does not exist yet. Every record
    header is ``>`` followed by the ``name_cols`` values joined with ``|``;
    each run of whitespace in the header is replaced by a single underscore.
    The sequence is read from the ``seq`` column.

    Rows whose group or subgroup value is missing are skipped.

    Parameters
    ----------
    df
        Table with at least ``group_col``, ``subgroup_col``, ``seq`` and
        every column listed in ``name_cols``.
    out_dir
        Directory that receives the FASTA files.
    out_prefix
        Prefix of every output file name.
    group_col
        Column that defines the first level of the file split.
    name_cols
        Columns whose values make up the FASTA header, in order. Defaults to
        ``["id", "name", "type", "group"]``.
    subgroup_col
        Column that defines the second level of the file split.
    """
    # Resolve the default here so no list object is shared between calls
    if name_cols is None:
        name_cols = ["id", "name", "type", "group"]
    header_cols = list(name_cols)

    # An empty out_dir means "current directory"; makedirs("") would raise
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # sort=False keeps first-appearance order; groupby drops missing keys,
    # so no empty "..._nan.fasta" files are produced
    for group, group_df in df.groupby(group_col, sort=False, observed=True):
        for subgroup, subgroup_df in group_df.groupby(
            subgroup_col, sort=False, observed=True
        ):
            subgroup_path = os.path.join(
                out_dir,
                f"{out_prefix}_{group}_{subgroup}.fasta"
            )

            with open(subgroup_path, "w") as handle:
                for _, row in subgroup_df.iterrows():
                    # str() lets numeric or missing header fields be joined
                    header = ">" + "|".join(
                        str(row[col]) for col in header_cols
                    )

                    # Replace each run of whitespace with an underscore
                    header = re.sub(
                        pattern="\\s+",
                        repl="_",
                        string=header
                    )

                    handle.write(header + "\n" + row["seq"] + "\n")


## HydDB

In [4]:
# Load the "Offline version" sheet of the HydDB spreadsheet and rename
# its columns to the names used in the rest of the notebook
hyd_path = os.path.join(
    DATA_DIR,
    "hyddb",
    "41598_2016_BFsrep34212_MOESM2_ESM.xls"
)
hyd_df = pd.read_excel(hyd_path, sheet_name="Offline version").rename(
    columns={
        "NCBI Accession": "id",
        "Protein Sequence": "seq",
        "Organism": "name",
        "New Class": "type"
    }
)

# Cut the class label at "]": the left part is the type, the right part the group
hyd_df[["type", "group"]] = hyd_df["type"].str.split("]", expand=True)

# Drop the opening bracket that is still attached to the type
hyd_df["type"] = hyd_df["type"].str.lstrip("[")

# Keep only the text that follows " Group "
group_labels = hyd_df["group"].str.split(" Group ").str[-1]

# Add type if no group is present (Fe hydrogenases)
hyd_df["group"] = group_labels.replace("", np.nan).fillna(value=hyd_df["type"])

## Review hydrogenases

In [5]:
review_path = os.path.join(
    DATA_DIR,
    "review",
    "MC_HYD_DATABASE_derep_FINAL_VERSION.fasta"
)

# Fields expected in every FASTA header, in order
review_header_fields = ["id", "name", "group", "other"]

# Collect plain rows and build the frame once, instead of creating one
# single-row DataFrame per record and concatenating them afterwards
review_records = []

for record in SeqIO.parse(review_path, "fasta"):

    # Fix headers: " _" is treated as another field delimiter
    fields = record.description.replace(" _", "|").split("|")

    # Fail with the offending header rather than an opaque pandas error
    if len(fields) != len(review_header_fields):
        raise ValueError(
            f"Expected {len(review_header_fields)} header fields, "
            f"got {len(fields)}: {record.description!r}"
        )

    review_records.append(fields + [str(record.seq)])

review_df = pd.DataFrame(
    review_records,
    columns=review_header_fields + ["seq"]
)

# Add hydrogenase type by looking up each group in the HydDB table
review_df = pd.merge(
    left=review_df,
    right=hyd_df[["type", "group"]].drop_duplicates(),
    on="group",
    how="left"
)

# Add hydrogenases missing in HydDB
review_df.loc[
    review_df["group"] == "1l",
    "type"
] = "NiFe"

## Concatenate and save

In [6]:
# Columns shared by both datasets in the combined table
final_cols = ["id", "name", "type", "group", "seq", "source"]

# Tag every row with the dataset it came from
for frame, label in ((hyd_df, "HydDB"), (review_df, "Review")):
    frame["source"] = label

# Stack HydDB first, then the review set, restricted to the shared columns
stacked_df = pd.concat(
    [frame[final_cols] for frame in (hyd_df, review_df)]
)

# Drop potential duplicates; keep="first" means the HydDB row wins
# when the same id is present in both datasets
final_df = stacked_df.drop_duplicates(subset="id", keep="first")

In [7]:
# Count how many sequences remain from each source after de-duplication
final_df["source"].value_counts()

source
Review    4195
HydDB     3248
Name: count, dtype: int64

In [8]:
# Every column except the sequence itself goes into the FASTA header
header_cols = [col for col in final_cols if col != "seq"]

save_sequences(
    df=final_df,
    out_dir="./sequences/",
    out_prefix="mixed",
    name_cols=header_cols
)