In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit 
from sklearn.preprocessing import OneHotEncoder

from Bio import pairwise2
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score

from sklearn import preprocessing
from Bio.SeqUtils.ProtParam import ProteinAnalysis


from Bio import AlignIO
from Bio import SeqIO
from Bio.Align.Applications import MuscleCommandline
from Bio.Align import AlignInfo
import pandascharm as pc

from Bio.SubsMat.MatrixInfo import blosum62
from Bio.SubsMat.MatrixInfo import blosum45

from sklearn import preprocessing



In [2]:
import joblib

# Persist a model object to disk with joblib
def save_model(model, model_name):
    """Dump ``model`` to the file ``<model_name>.joblib``.

    Parameters
    ----------
    model : object
        Object handed to ``joblib.dump``.
    model_name : str
        File name (or path) without the ``.joblib`` extension.
    """
    target_path = '{}.joblib'.format(model_name)
    joblib.dump(model, target_path)

# Read a model object back from disk with joblib
def load_model(model_name):
    """Return the object stored in the file ``<model_name>.joblib``.

    Parameters
    ----------
    model_name : str
        File name (or path) without the ``.joblib`` extension.

    NOTE(review): joblib files are pickle-based, so only load files from a
    trusted source.
    """
    source_path = '{}.joblib'.format(model_name)
    return joblib.load(source_path)

In [3]:
def removeoutlier_col(df,cols):
    """Return the rows of ``df`` whose ``cols`` value is not an IQR outlier.

    A value is treated as an outlier when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile of the column. Rows whose
    value is NaN fail both comparisons and are therefore kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : column label
        A single column label (it is wrapped in a list for the comparison).

    Returns
    -------
    pandas.DataFrame
        The non-outlier rows, with their original index labels.
    """
    column_frame = df[[cols]]
    lower_quartile = df[cols].quantile(0.25)
    upper_quartile = df[cols].quantile(0.75)
    spread = upper_quartile - lower_quartile

    below_fence = column_frame < (lower_quartile - 1.5 * spread)
    above_fence = column_frame > (upper_quartile + 1.5 * spread)
    is_outlier = (below_fence | above_fence).any(axis=1)

    return df.loc[~is_outlier]

In [4]:
# For every (a, b) entry add the mirrored (b, a) key with the same score, so a
# lookup works whichever residue comes first. The items are snapshotted with
# list() because the dict is being extended while we read from it.
for _substitution_matrix in (blosum62, blosum45):
    _mirrored_entries = {
        (second, first): score
        for (first, second), score in list(_substitution_matrix.items())
    }
    _substitution_matrix.update(_mirrored_entries)
del _substitution_matrix, _mirrored_entries

def score_pairwise(seq1, seq2, matrix, gap_s, gap_e, gap = True):
    """Yield one score per aligned position of ``seq1`` and ``seq2``.

    The two sequences are walked in lockstep (``zip``), so scoring stops at
    the end of the shorter one.

    Parameters
    ----------
    seq1, seq2 : iterable of str
        Aligned residues; ``'-'`` marks a gap.
    matrix : mapping
        Substitution scores keyed by ``(residue1, residue2)`` tuples.
    gap_s : number
        Score yielded for a gap position that follows a non-gap position.
    gap_e : number
        Score yielded for a gap position that follows another gap position.
    gap : bool, default True
        Whether the position before the first one counts as a gap; with the
        default, a gap at the very first position yields ``gap_e``.

    Yields
    ------
    number
        ``gap_e``/``gap_s`` for gap positions, otherwise ``matrix[(a, b)]``.
    """
    previous_was_gap = gap
    for residue1, residue2 in zip(seq1, seq2):
        if ('-' == residue1) or ('-' == residue2):
            if previous_was_gap:
                yield gap_e
            else:
                yield gap_s
            previous_was_gap = True
        else:
            yield matrix[(residue1, residue2)]
            previous_was_gap = False

In [5]:
# Encoding With Temperature

# Residue alphabet used to enumerate n-gram features; its order fixes the
# order of the n-gram feature columns.
_AMINO_ACIDS = ['E','G','L','Y','T','H','R','A','C','D','P','I','F','N','K','S','V','M','W','Q']

# n-gram encoding name -> window length.
_NGRAM_SIZES = {'bigram': 2, 'trigram': 3, 'quadrogram': 4}

# Gap scores passed to score_pairwise for the BLOSUM encodings.
_GAP_OPEN_SCORE = -5
_GAP_EXTEND_SCORE = -1

# The BLOSUM encodings drop this many trailing columns of the pairwise score
# matrix. NOTE(review): presumably these are the scores against the six
# hybrid/ancestor sequences appended last by the caller - confirm.
_N_TRAILING_COLUMNS_DROPPED = 6


def _aligned_sequences(aln, index):
    """Read the FASTA alignment ``aln`` and return the rows labelled by ``index``."""
    alignment = AlignIO.read(aln, 'fasta')
    per_sequence = pc.from_bioalignment(alignment).transpose()
    return per_sequence.loc[index]


def _ngram_count_table(raw_sequences, n, index):
    """Count, per sequence, every length-``n`` window made of ``_AMINO_ACIDS``.

    Windows containing any other character are skipped. The result has one
    column per possible n-gram (20**n columns) and is indexed by ``index``.
    """
    vocabulary = ['']
    for _ in range(n):
        vocabulary = [prefix + residue for prefix in vocabulary for residue in _AMINO_ACIDS]

    columns = {gram: [] for gram in vocabulary}
    for sequence in raw_sequences:
        residues = list(sequence)
        counts = dict.fromkeys(vocabulary, 0)
        for start in range(len(residues) - n + 1):
            gram = ''.join(residues[start:start + n])
            # Only n-grams built from the 20 listed residues are features.
            if gram in counts:
                counts[gram] += 1
        for gram, count in counts.items():
            columns[gram].append(count)

    return pd.DataFrame.from_dict(columns).set_index(index)


def _pairwise_score_matrix(sequences, matrix):
    """Return the square array of summed ``score_pairwise`` scores between all rows."""
    labels = list(sequences.index)
    scores = np.zeros((len(sequences), len(sequences)))
    for i, label_a in enumerate(labels):
        row_a = sequences.loc[label_a]
        for j, label_b in enumerate(labels):
            scores[i, j] = sum(score_pairwise(row_a, sequences.loc[label_b], matrix,
                                              _GAP_OPEN_SCORE, _GAP_EXTEND_SCORE))
    return scores


def encode_temp(encoding, output, df_clean, aln, esm1b ,key = None):
    """Build the feature matrix for one sequence encoding plus reaction temperature.

    The 'Reaction Temperature' column of ``df_clean`` is always appended as
    the last feature column. The returned features are NOT scaled; callers
    that need standardised inputs must scale them themselves.

    Parameters
    ----------
    encoding : str
        One of 'One-Hot-Encoder', 'Bag-of-Words', 'bigram', 'trigram',
        'quadrogram', 'BLOSUM62', 'BLOSUM45' or 'ESM1b'.
    output : str
        Not used by the encoding; kept so existing calls keep working.
    df_clean : pandas.DataFrame
        Must contain the columns 'Index', 'Sequence' and
        'Reaction Temperature'. 'Index' becomes the row index and is used to
        look rows up in the alignment / ``esm1b``.
    aln : str
        Path of a FASTA alignment. Only read for the 'One-Hot-Encoder',
        'BLOSUM62' and 'BLOSUM45' encodings.
    esm1b : pandas.DataFrame
        Embedding table indexed like ``df_clean['Index']``. Only used for
        the 'ESM1b' encoding; it is not modified.
    key : optional
        Not used; kept so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        One row per row of ``df_clean``.

    Raises
    ------
    ValueError
        If ``encoding`` is not one of the supported names.
    """
    df_clean = df_clean.set_index('Index')
    temperature = df_clean[['Reaction Temperature']]

    if encoding == 'One-Hot-Encoder':
        sequences = _aligned_sequences(aln, df_clean.index)
        sequences_encoded = OneHotEncoder().fit_transform(sequences).toarray()
        X = np.concatenate((sequences_encoded, temperature), axis =1)

    elif encoding == 'Bag-of-Words':
        composition = pd.DataFrame([ProteinAnalysis(i).count_amino_acids() for i in df_clean['Sequence']])
        X = np.concatenate((composition, temperature), axis =1)

    elif encoding in _NGRAM_SIZES:
        # Note: 'quadrogram' enumerates 20**4 = 160000 feature columns.
        X = _ngram_count_table(df_clean['Sequence'], _NGRAM_SIZES[encoding], df_clean.index)
        X['Reaction Temperature'] = df_clean['Reaction Temperature']

    elif encoding in ('BLOSUM62', 'BLOSUM45'):
        sequences = _aligned_sequences(aln, df_clean.index)
        substitution_matrix = blosum62 if encoding == 'BLOSUM62' else blosum45
        enc_seq = _pairwise_score_matrix(sequences, substitution_matrix)
        X = np.concatenate((enc_seq[:, :-_N_TRAILING_COLUMNS_DROPPED], temperature), axis =1)

    elif encoding == 'ESM1b':
        # Work on an explicit copy so the caller's embedding table is never
        # touched and pandas does not warn about writing to a selection.
        encoded = esm1b.loc[df_clean.index].copy()
        encoded['Reaction Temperature'] = df_clean['Reaction Temperature']
        X = encoded

    else:
        raise ValueError(f'Unknown encoding: {encoding!r}')

    return np.array(X)

In [6]:
# Each entry is [model algorithm, sequence encoding]; the prediction loop
# unpacks the pairs in exactly this order.
model_list = [
    ['Random Forest', 'BLOSUM62'],
    ['SVM', 'ESM1b'],
    ['SVM', 'bigram'],
    ['Random Forest', 'ESM1b'],
]


In [7]:
enzyme = 'betaGlucosidasewithMutants'

# Measurements table; the target column named in `output` is read from it.
df = pd.read_excel('betaGlucosidasewithMutantsOptimumTemperatureHybrid.xlsx')

output = 'pNP-Glc kcat/Km (1/smM)'
aln = 'betaGlucosidasewithMutantsHybrid.fa'

# Zero-padded YYYYMMDD stamp. Concatenating year/month/day without padding is
# ambiguous (22 January and 2 December of 2025 both become '2025122').
x = datetime.datetime.now()
date = x.strftime('%Y%m%d')

# Log10-transformed target, stored next to the raw column.
# NOTE(review): zero or negative activities would become -inf / NaN here.
df['Log'+output] = np.log10(df[output])
esm1b = pd.read_excel('betaGlucosidasewithMutantsHybridESM1b_embeddings.xlsx', index_col = 0)

# One train/validation split (and one set of saved models) per random state.
random_states = [202 , 1, 42, 101, 2022,5 , 10, 22, 1995, 0]


In [8]:
# Number of trailing rows of `df` that are held out as hybrid/ancestor
# sequences and only ever predicted, never used for the train/validation split.
N_HYBRID = 6

list_df_predicted_activity = []
list_df_train_predicted_activity  = []
list_df_hybrid_predicted_activity  = []

# ---------------------------------------------------------------------------
# State-independent preparation. None of this depends on the random state, so
# it is done once instead of being recomputed for every state (the BLOSUM
# encoding alone is quadratic in the number of sequences).
# ---------------------------------------------------------------------------
df_ancestor = df.tail(N_HYBRID)

df_measured = removeoutlier_col(df.iloc[:-N_HYBRID], 'Log' + output).reset_index()

# Create a mapping of unique sequences to unique codes
sequence_to_code = {seq: f"ENZYME_{i+1}" for i, seq in enumerate(df_measured['Sequence'].unique())}

# Map these codes to a new column in the DataFrame using .loc
df_measured.loc[:, 'Sequence Code'] = df_measured['Sequence'].map(sequence_to_code)

# Hybrid rows are appended last; ignore_index gives a 0..n-1 index, so the
# positional split indices below are also valid row labels of df_clean.
df_clean = pd.concat([df_measured, df_ancestor], ignore_index=True)
hybrid_inds = df_clean.tail(N_HYBRID).index
y = df_clean['Log'+output]

X_BLOSUM62 = encode_temp('BLOSUM62', output, df_clean, aln, esm1b ,key = None)
X_ESM1b = encode_temp('ESM1b', output, df_clean, aln, esm1b ,key = None)
X_bigram = encode_temp('bigram', output, df_clean, aln, esm1b ,key = None)

sequence_dictionary = {'ESM1b': X_ESM1b,'BLOSUM62': X_BLOSUM62,'bigram' : X_bigram}

for state in random_states:

    predicted_activity_list = []
    predicted_train_activity_list = []
    predicted_hybrid_activity_list = []
    model_name_list = []

    # Grouping by sequence keeps all rows of one enzyme on the same side of
    # the split. Only the first of the generated splits is used.
    splitter = GroupShuffleSplit(test_size=.20, n_splits=10, random_state = state)
    split = splitter.split(df_measured, groups=df_measured['Sequence Code'])
    train_inds, val_inds = next(split)

    for model_algorithm, model_method in model_list:

        X = sequence_dictionary[model_method]

        model_name = f'{model_method}_{model_algorithm}_singleLayer_state_{state}'
        model = load_model(model_name)

        if model_algorithm in ['LASSO', 'SVM', 'Neural Network']:
            # The scaler is fitted on every non-hybrid row and then applied to
            # all rows. NOTE(review): confirm this matches the scaling used
            # when the saved models were trained.
            scaler = preprocessing.StandardScaler()
            scaler.fit(X[:-N_HYBRID])
            X = scaler.transform(X)

        prediction_yval = model.predict(X[val_inds])
        train_pred = model.predict(X[train_inds])
        hybrid_pred = model.predict(X[hybrid_inds])

        predicted_activity_list.append(prediction_yval)
        predicted_train_activity_list.append(train_pred)
        predicted_hybrid_activity_list.append(hybrid_pred)

        model_name_list.append(model_name)

    # One row per sample (labelled by its 'Index' value), one column per
    # model, plus the measured value in 'y_val'.
    df_predicted_activity = pd.DataFrame(predicted_activity_list, index = model_name_list ,
                                         columns=df_clean.loc[val_inds, 'Index'].values).transpose()
    df_predicted_activity['y_val'] = y[val_inds].values

    df_train_predicted_activity = pd.DataFrame(predicted_train_activity_list, index = model_name_list,
                                               columns=df_clean.loc[train_inds, 'Index'].values ).transpose()
    df_train_predicted_activity['y_val'] = y[train_inds].values

    df_hybrid_predicted_activity = pd.DataFrame(predicted_hybrid_activity_list, index = model_name_list,
                                               columns=df_clean.loc[hybrid_inds, 'Index'].values ).transpose()
    df_hybrid_predicted_activity['y_val'] = y[hybrid_inds].values

    list_df_predicted_activity.append(df_predicted_activity)
    list_df_train_predicted_activity.append(df_train_predicted_activity)
    list_df_hybrid_predicted_activity.append(df_hybrid_predicted_activity)
    

In [10]:
names = ['Trial 1', 'Trial 2', 'Trial 3', 'Trial 4', 'Trial 5', 'Trial 6', 'Trial 7', 'Trial 8', 'Trial 9', 'Trial 10']

# One sheet per trial with the validation predictions. The context manager
# closes (and saves) the workbook even if writing one of the sheets fails.
# NOTE(review): the output path is absolute and machine-specific.
with pd.ExcelWriter(r"C:\Users\memre\Desktop\Research\Predicting Enzyme Properties Based on Various Organisms\Code\Temperature Profile Prediction\20250122 SingleLayerbetaGlucosidase-y_val.xlsx") as writer:
    for i, A in enumerate(list_df_predicted_activity):
        A.to_excel(writer, sheet_name="{0}".format(names[i]))

In [11]:
# One sheet per trial with the training predictions. The context manager
# closes (and saves) the workbook even if writing one of the sheets fails.
# NOTE(review): the output path is absolute and machine-specific.
with pd.ExcelWriter(r"C:\Users\memre\Desktop\Research\Predicting Enzyme Properties Based on Various Organisms\Code\Temperature Profile Prediction\20250122 SingleLayerbetaGlucosidase-y_train.xlsx") as writer:
    for i, A in enumerate(list_df_train_predicted_activity):
        A.to_excel(writer, sheet_name="{0}".format(names[i]))

In [12]:
# One sheet per trial with the hybrid-sequence predictions. The context
# manager closes (and saves) the workbook even if writing a sheet fails.
# NOTE(review): the output path is absolute and machine-specific.
with pd.ExcelWriter(r"C:\Users\memre\Desktop\Research\Predicting Enzyme Properties Based on Various Organisms\Code\Temperature Profile Prediction\20250122 SingleLayerbetaGlucosidase-y_hybrid.xlsx") as writer:
    for i, A in enumerate(list_df_hybrid_predicted_activity):
        A.to_excel(writer, sheet_name="{0}".format(names[i]))

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from scipy.stats import pearsonr
# Import the module, not `from datetime import datetime`: the notebook's first
# cell binds `datetime` to the module and an earlier cell calls
# `datetime.datetime.now()`, which would break on re-run if the name were
# rebound to the class here.
import datetime
current_date = datetime.datetime.now().strftime("%Y%m%d")

In [None]:
# Sheet / row labels for the ten trials: 'Trial 1' ... 'Trial 10'.
names = ['Trial {}'.format(trial_number) for trial_number in range(1, 11)]

In [None]:
# Passing a list of sheet names makes read_excel return a dict that maps each
# sheet name to its DataFrame, so despite their names these two variables are
# dicts keyed by trial label.
# NOTE(review): these file names carry a different date prefix than the
# workbooks written by the export cells above - confirm the intended files.
validation_workbook = '20241125 SingleLayerbetaGlucosidase-y_val.xlsx'
training_workbook = '20241125 SingleLayerbetaGlucosidase-y_train.xlsx'

list_df_predicted_activity = pd.read_excel(
    validation_workbook,
    index_col =0,
    sheet_name =names,
)
list_df_train_predicted_activity = pd.read_excel(
    training_workbook,
    index_col =0,
    sheet_name =names,
)

In [None]:
# Combining Training and Validation: for every trial, stack the training rows
# on top of the validation rows (original row labels are kept).
list_df_entire_predicted_activity = {
    trial: pd.concat([list_df_train_predicted_activity[trial], list_df_predicted_activity[trial]])
    for trial in names
}

In [None]:
# Parallel lists: dataset_types[k] is the label of data_set[k].
dataset_types = ["Validation", "Training", "Entire"]
data_set = [
    list_df_predicted_activity,
    list_df_train_predicted_activity,
    list_df_entire_predicted_activity,
]

In [None]:
# Metric lists for storing results: one DataFrame per dataset type
# (validation, training, entire), appended in that order.
list_R2_df = []
list_RMSE_df = []
list_MAE_df = []
list_PCC_df = []
list_pValue_df = []

# Process each dataset (validation, training, entire)
for data, dataset_type in zip(data_set, dataset_types):
    list_R2, list_RMSE, list_MAE, list_PCC, list_pValue = [], [], [], [], []

    # Row labels come from the dict actually being iterated, so they always
    # line up with the rows collected below.
    trial_labels = list(data)
    column_names = None

    # Loop through each split in the dataset dictionary
    for state_key in trial_labels:
        state = data[state_key]  # Access the DataFrame
        y_true = state['y_val']

        # Every column except the measured values is a model's predictions.
        model_columns = state.columns.drop('y_val')

        r2_list, rmse_list, mae_list, pcc_list, pvalue_list = [], [], [], [], []

        # Calculate metrics for each model in the dataframe
        for model in model_columns:
            y_pred = state[model]
            pcc, pValue = pearsonr(y_true, y_pred)

            r2_list.append(r2_score(y_true, y_pred))
            # RMSE as sqrt(MSE): the `squared=False` argument of
            # mean_squared_error is not available in recent scikit-learn.
            rmse_list.append(np.sqrt(mean_squared_error(y_true, y_pred)))
            mae_list.append(mean_absolute_error(y_true, y_pred))
            pcc_list.append(pcc)
            pvalue_list.append(pValue)

        # Append results for this split
        list_R2.append(r2_list)
        list_RMSE.append(rmse_list)
        list_MAE.append(mae_list)
        list_PCC.append(pcc_list)
        list_pValue.append(pvalue_list)

        # The result columns are labelled with the model columns of the last
        # split processed; values are matched to columns by position.
        column_names = model_columns

    # Convert metric lists to DataFrames (rows: splits, columns: models)
    Results_R2 = pd.DataFrame(list_R2, columns=column_names, index=trial_labels)
    Results_RMSE = pd.DataFrame(list_RMSE, columns=column_names, index=trial_labels)
    Results_MAE = pd.DataFrame(list_MAE, columns=column_names, index=trial_labels)
    Results_PCC = pd.DataFrame(list_PCC, columns=column_names, index=trial_labels)
    Results_pValue = pd.DataFrame(list_pValue, columns=column_names, index=trial_labels)

    # Append each metric's result DataFrame
    list_R2_df.append(Results_R2)
    list_RMSE_df.append(Results_RMSE)
    list_MAE_df.append(Results_MAE)
    list_PCC_df.append(Results_PCC)
    list_pValue_df.append(Results_pValue)

In [None]:
# Function to write metrics to Excel file with specified structure
def write_metrics_to_excel(filename, metric_dfs, metric_name):
    """Write one metric's per-trial values and summary statistics to a workbook.

    Nine sheets are written, in this order: the raw tables 'Validation',
    'Training', 'Entire' (without their row index), then 'Avg_<dataset>'
    with the column-wise mean, then 'Std_<dataset>' with the column-wise
    standard deviation.

    Parameters
    ----------
    filename : str
        Path of the Excel file to create.
    metric_dfs : sequence of pandas.DataFrame
        ``metric_dfs[0]``, ``[1]`` and ``[2]`` are the validation, training
        and entire-dataset tables (rows: trials, columns: models).
    metric_name : str
        Not used inside the function.
    """
    dataset_labels = ("Validation", "Training", "Entire")
    # (sheet-name prefix, value-column header, DataFrame reduction method)
    summaries = (("Avg", "Average", "mean"), ("Std", "Std_Dev", "std"))

    with pd.ExcelWriter(filename) as writer:
        tables = [metric_dfs[0], metric_dfs[1], metric_dfs[2]]

        # Raw per-trial values, one sheet per dataset type.
        for label, table in zip(dataset_labels, tables):
            table.to_excel(writer, sheet_name=label, index=False)

        # The validation table's columns label the rows of every summary sheet.
        model_names = tables[0].columns

        for prefix, header, reduction in summaries:
            for label, table in zip(dataset_labels, tables):
                per_model = getattr(table, reduction)(axis=0)
                summary_df = pd.DataFrame(per_model, columns=[header], index = model_names)
                summary_df.to_excel(writer, sheet_name="{}_{}".format(prefix, label))


In [None]:
# Write each metric to separate Excel files with date in the filename
metric_tables = {
    "R2": list_R2_df,
    "RMSE": list_RMSE_df,
    "MAE": list_MAE_df,
    "PCC": list_PCC_df,
    "pValue": list_pValue_df,
}

for metric_name, metric_dfs in metric_tables.items():
    filename = "{}_{}_singleLayerResults_.xlsx".format(current_date, metric_name)
    write_metrics_to_excel(filename, metric_dfs, metric_name)