In [None]:
import os
from dotenv import load_dotenv

import pandas as pd
from reading_util import enzyme_split30_preprocessing, read_h5, apply_prott5, read_fasta

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import DecisionBoundaryDisplay

In [None]:
# Run python-dotenv's load_dotenv() before any lookup below so that values it
# provides are visible in the process environment.
load_dotenv()

# --- Raw inputs -------------------------------------------------------------
# Three of the lookups fall back to the literal string "not found" when the
# variable is unset; the other three fall back to None.
path_to_csv_split30 = os.environ.get("CSV30_ENZYMES", "not found")
path_to_non_enzymes = os.environ.get("FASTA_NON_ENZYMES")

# --- H5 files passed to read_h5 ----------------------------------------------
path_to_prott5 = os.environ.get("PROTT5_ENZYMES_SPLIT_X", "not found")
path_to_prott5_ne = os.environ.get("PROTT5_NON_ENZYMES", "not found")

# --- CSV files used to cache frames after apply_prott5 ------------------------
path_to_csv_split30_prott5 = os.environ.get("CSV30_ENZYMES_PROTT5_APPLIED")
path_to_non_enzymes_prott5 = os.environ.get("NON_ENZYMES_PROTT5_APPLIED")

# Import and format data

In [None]:
# Read the split-30 enzyme CSV and hand the resulting frame straight to the
# project's preprocessing helper.
enzymes = pd.read_csv(path_to_csv_split30, sep=",").pipe(enzyme_split30_preprocessing)

# Preview the first five rows.
enzymes.head(n=5)


In [None]:
# Combine the contents of the enzyme H5 file with the enzyme frame via
# apply_prott5 (presumably attaching the per-protein embeddings — see
# reading_util for the exact behaviour).
# NOTE: this cell rebinds `enzymes` from its own previous value, so running it
# twice feeds an already-processed frame back into apply_prott5.
enzymes = apply_prott5(
    read_h5(path_to_prott5),
    enzymes,
)

# Preview the first five rows.
enzymes.head(n=5)

In [None]:
# Cache the processed enzyme frame as CSV; the same path is read back by the
# "already applied" section further down.
# NOTE(review): CSV stores every cell as text — if the embedding column holds
# array objects, confirm that they survive the round trip.
enzymes.to_csv(path_or_buf=path_to_csv_split30_prott5)

In [None]:
# Load the non-enzyme entries from the FASTA file using the project's reader.
non_enzymes = read_fasta(path_to_non_enzymes)

# Preview the first five rows.
non_enzymes.head(n=5)

In [None]:
# Combine the contents of the non-enzyme H5 file with the non-enzyme frame via
# apply_prott5 (presumably attaching the per-protein embeddings — see
# reading_util for the exact behaviour).
non_enzymes = apply_prott5(read_h5(path_to_prott5_ne), non_enzymes)

# Cache the processed frame, mirroring the enzyme save: the "already applied"
# section reads NON_ENZYMES_PROTT5_APPLIED back, but no visible cell ever
# writes it. The save is skipped when the variable is unset, because
# DataFrame.to_csv(None) would only build the CSV text in memory and return it
# instead of writing a file.
if path_to_non_enzymes_prott5:
    non_enzymes.to_csv(path_to_non_enzymes_prott5)

# Preview the first five rows.
non_enzymes.head()

# Alternatively: load the cached CSVs if ProtT5 has already been applied

In [None]:
# Replace the in-memory enzyme frame with the cached CSV version.
# NOTE(review): values come back as parsed by read_csv — an embedding column
# written as text will be read as strings unless converted afterwards.
enzymes = pd.read_csv(path_to_csv_split30_prott5, sep=",")

# Preview the first five rows.
enzymes.head(n=5)

In [None]:
# Replace the in-memory non-enzyme frame with the cached CSV version.
# NOTE(review): no visible cell writes this file, so it must already exist at
# the path given by NON_ENZYMES_PROTT5_APPLIED.
non_enzymes = pd.read_csv(path_to_non_enzymes_prott5, sep=",")

# Preview the first five rows.
non_enzymes.head(n=5)

# Apply KNN

In [None]:
# Tag each frame with its class: 1 for enzymes, 0 for non-enzymes.
# (This adds a "Label" column to both source frames in place.)
enzymes["Label"] = 1
non_enzymes["Label"] = 0

# Stack the label/embedding columns of both classes into one frame, then
# shuffle the rows with a fixed seed and renumber the index from zero.
# NOTE(review): the name `bin` shadows Python's builtin bin(); it is kept
# because later cells may refer to it.
bin = (
    pd.concat(
        [frame[["Label", "Embedding"]] for frame in (enzymes, non_enzymes)],
        ignore_index=True,
    )
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Hold out 20 % of the rows as the test set, again with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    bin["Embedding"],
    bin["Label"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build the classifier (not fitted in this cell): inputs are standardised by
# the "scaler" step, then classified by the "knn" step using 11 neighbours.
clf = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=11)),
    ]
)