In [None]:
import pandas as pd
import numpy as np
import datetime

from sklearn.model_selection import GroupShuffleSplit 
from Bio import pairwise2


from scipy.stats import gmean

from collections import Counter

from sklearn.metrics import r2_score


In [None]:
def removeoutlier_col(df, cols):
    """Drop rows whose value in ``cols`` lies outside the 1.5 * IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    cols : column label or list of column labels
        Column(s) to screen. A single label behaves as it always has; a list
        removes a row when it is an outlier in any of the listed columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that are not outliers, with their original index.
        Rows holding NaN in a screened column are kept, because NaN compares
        False against both fences.
    """
    columns = cols if isinstance(cols, list) else [cols]
    values = df[columns]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    # axis=1 lines the per-column fence Series up with the frame's columns,
    # so every column is compared against its own fences.
    below = values.lt(q1 - 1.5 * iqr, axis=1)
    above = values.gt(q3 + 1.5 * iqr, axis=1)
    return df[~(below | above).any(axis=1)]


# Function to compute the average activity for a given row of indices
def average_activity(row_indices, df, output):
    """Return the log10 geometric mean activity of the selected rows.

    Parameters
    ----------
    row_indices : iterable of index labels
        Labels to average over; labels absent from ``df.index`` are skipped.
    df : pandas.DataFrame
        Frame holding log10 activities in a 'y_val' column. It is only read,
        never modified.
    output : str
        Kept for call compatibility. The back-transformed activities are no
        longer written to ``df[output]``, so the argument is unused.

    Returns
    -------
    float
        ``log10(gmean(10 ** y_val))`` over the valid rows, or NaN when none of
        the requested labels exist in ``df``.
    """
    # Filter out indices that do not exist in the dataframe
    valid_indices = [idx for idx in row_indices if idx in df.index]
    if not valid_indices:
        return np.nan

    # Back-transform only the rows that are needed instead of adding a
    # full-length column to the caller's frame on every call.
    activities = 10 ** df.loc[valid_indices, 'y_val']

    # Compute the average activity
    return np.log10(gmean(activities))

In [None]:
def gmeanaverage(df, output, aln, Temperature, state, n_neighbors=3):
    """Score a nearest-sequence geometric-mean baseline on one grouped split.

    The activity in ``df[output]`` is log10-transformed, IQR outliers are
    removed, and the remaining rows are split with ``GroupShuffleSplit``
    (``test_size=0.20``) so that rows sharing a sequence stay on the same
    side. Each validation row is then predicted as the log10 geometric mean
    activity of the ``n_neighbors`` training rows with the highest
    global-alignment identity to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``output``, 'Sequence' and 'Index'. The frame is
        copied first, so the caller's object is left untouched.
    output : str
        Name of the activity column; its values go through ``np.log10``.
    aln, Temperature
        Accepted for call compatibility; not used by this function.
    state : int
        ``random_state`` handed to ``GroupShuffleSplit``.
    n_neighbors : int, default 3
        Number of most-similar training rows averaged per validation row.

    Returns
    -------
    (pandas.DataFrame, float)
        A frame indexed like the validation rows with the columns 'y_val'
        (observed log10 activity), 'Average' (prediction) and
        'Average Identity' (mean identity of the neighbours used), together
        with the R^2 of 'Average' against 'y_val'.
    """
    # Work on a private copy: the caller's frame (often a filtered slice) must
    # not gain a 'y_val' column as a side effect of every call.
    df = df.copy()
    df['y_val'] = np.log10(df[output])

    df_clean = removeoutlier_col(df, 'y_val').copy()

    # Create a mapping of unique sequences to unique codes; the codes are the
    # groups that keep identical sequences on one side of the split.
    sequence_to_code = {seq: f"ENZYME_{i+1}" for i, seq in enumerate(df_clean['Sequence'].unique())}
    df_clean['Sequence Code'] = df_clean['Sequence'].map(sequence_to_code)
    df_clean = df_clean.set_index('Index')

    # Only the first of the generated splits is consumed.
    splitter = GroupShuffleSplit(test_size=.20, n_splits=10, random_state=state)
    train_inds, val_inds = next(splitter.split(df_clean, groups=df_clean['Sequence Code']))

    y = df_clean[['y_val']]
    X = df_clean[['Sequence']]

    # Independent copy so nothing done to the training targets downstream can
    # reach back into df_clean.
    y_train_gmean = y.iloc[train_inds].copy()
    y_val_gmean = y.iloc[val_inds]

    X_train_seq = X.iloc[train_inds]['Sequence']
    X_val_seq = X.iloc[val_inds]['Sequence']

    # Pairwise identity of every validation sequence against every training
    # sequence. Rows frequently share a sequence, so each distinct pair is
    # aligned once and the score reused.
    identity_cache = {}
    similarity_rows = []
    for sequence_val in X_val_seq:
        row_scores = []
        for sequence_train in X_train_seq:
            pair = (sequence_val, sequence_train)
            if pair not in identity_cache:
                alignment = pairwise2.align.globalxx(sequence_val, sequence_train)[0]
                aligned_length = len(alignment.seqA)
                identical_positions = sum(a == b for a, b in zip(alignment.seqA, alignment.seqB))
                identity_cache[pair] = identical_positions / aligned_length
            row_scores.append(identity_cache[pair])
        similarity_rows.append(row_scores)

    similarity_df = pd.DataFrame(similarity_rows, columns=y_train_gmean.index, index=y_val_gmean.index)

    # For every validation row keep the labels of its closest training rows
    # and their mean identity. Iterating rows positionally avoids label
    # look-ups, and no fixed-width reshape is needed afterwards.
    index_list = []
    average_similarity_list = []
    for _, scores in similarity_df.iterrows():
        top_hits = scores.nlargest(n_neighbors)
        index_list.append(top_hits.index)
        average_similarity_list.append(top_hits.mean())

    # Predicted log10 activity per validation row
    averages = [average_activity(labels, y_train_gmean, output) for labels in index_list]

    df_y_val = y_val_gmean[['y_val']].copy()
    df_y_val['Average'] = np.asarray(averages, dtype=float)
    df_y_val['Average Identity'] = average_similarity_list

    r2_mean3 = r2_score(df_y_val['y_val'], df_y_val["Average"])

    return df_y_val, r2_mean3

In [None]:
# Enzyme under study; also the stem of the alignment file name below.
enzyme = 'betaGlucosidase'

df = pd.read_excel('betaGlucosidasewithMutantsOptimumTemperature.xlsx')

# Activity column and alignment file name passed on to gmeanaverage.
output = 'pNP-Glc kcat/Km (1/smM)'
aln = enzyme + '.aln'

# Zero-padded YYYYMMDD stamp. Concatenating the unpadded fields made
# e.g. 2024-01-11 and 2024-11-01 both come out as "2024111".
x = datetime.datetime.now()
date = x.strftime('%Y%m%d')

Temperature = True

In [None]:
# Keep only the rows flagged with a value of 1 in the optimum-temperature
# activity column. The explicit copy makes the subset an independent frame, so
# later column assignments on it do not raise SettingWithCopyWarning.
# NOTE(review): exact == 1 comparison; confirm the column is not a rounded float.
df_kcatKmTopt = df[df['Percentage Activity Depending on Optimum Temp'] == 1].copy()

In [None]:
# Seeds for the repeated grouped splits; one evaluation is run per seed.
random_state = [202, 1, 42, 101, 2022, 5, 10, 22, 1995, 0]

# Per-seed outputs, in the same order as `random_state`.
list_yval_values, list_r2mean3 = [], []

for state in random_state:
    y_val, r2_mean3 = gmeanaverage(
        df_kcatKmTopt, output, aln, Temperature, state
    )
    list_yval_values += [y_val]
    list_r2mean3 += [r2_mean3]