# Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay,f1_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error




## Importing data

In [2]:
train_data_file_path = r"drive-download-20230926T045156Z-001\concatenated_data.xlsx"  
train_label_data_file_path = r"drive-download-20230926T045156Z-001\concatenated_label_data.xlsx"

# Read the Excel file into a DataFrame
train_data = pd.read_excel(train_data_file_path)
train_label_data = pd.read_excel(train_label_data_file_path)

train_data = train_data.drop(["Unnamed: 0"], axis=1, errors="ignore") #droping a unnamed column in train data
train_label_data = train_label_data.drop(["Unnamed: 0"], axis=1, errors="ignore") #droping a unnamed column in train label data

# Display the DataFrame
display(train_data)
display(train_label_data)

Unnamed: 0,experimentalTechnique,residueCount,resolution,structureMolecularWeight,crystallizationTempK,densityMatthews,densityPercentSol,phValue,crystallizationMethod_1,crystallizationMethod_2,crystallizationMethod_3,pdbxDetails_1,pdbxDetails_2,pdbxDetails_3,pdbxDetails_4,pdbxDetails_5,pdbxDetails_6,pdbxDetails_7
0,X-RAY DIFFRACTION,438,1.70,50637.53,298.0,1.95,36.97,6.6,VAPOR DIFFUSION,HANGING DROP,,30% (w/v) PEG 4000,0.1M Sodium Cacodylate pH 6.6,0.15M Sodium acetate,VAPOR DIFFUSION,HANGING DROP,temperature 298.0K,
1,X-RAY DIFFRACTION,1324,1.60,148853.81,293.0,2.48,50.38,7.5,VAPOR DIFFUSION,HANGING DROP,,20mg/mL protein with 2mM NADH/L-oxamate; well ...,0.1M HEPES,pH7.5,,,,
2,X-RAY DIFFRACTION,252,1.55,29054.37,,2.03,39.29,6.5,,,,pH 6.5,,,,,,
3,X-RAY DIFFRACTION,180,2.30,20674.23,289.0,2.63,52.00,7.5,VAPOR DIFFUSION,SITTING DROP,,Crystal Screen HT condition C6: 200mM Ammonium...,30% (w/v) PEG 8000,protein conc. 10 mg/ml,cryo 20% ethylene glycol,,,
4,X-RAY DIFFRACTION,436,3.50,51456.98,297.0,2.70,54.00,5.6,VAPOR DIFFUSION,SITTING DROP,,0.2M sodium citrate,2M ammonium sulphate,pH 5.6,VAPOR DIFFUSION,SITTING DROP,temperature 297K,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120690,X-RAY DIFFRACTION,192,2.43,22023.59,,2.97,58.58,,,,,0.2 M SODIUM THIOCYANATE,20% PEG 3350,,,,,
120691,X-RAY DIFFRACTION,127,0.84,14470.12,,2.74,55.19,7.5,,,,AMMONIUM SULPHATE,TRIS PH 7.5 .,,,,,
120692,SOLUTION NMR,26,,3083.67,,,,,,,,,,,,,,
120693,X-RAY DIFFRACTION,613,2.05,69425.64,298.0,2.20,44.09,7.4,VAPOR DIFFUSION,HANGING DROP,,22-26% PEG3350,0.1 M HEPES,pH 7.4,10 mM manganese chloride,10 mM GTP,VAPOR DIFFUSION,HANGING DROP


Unnamed: 0,Class label
0,TRANSFERASE
1,OXIDOREDUCTASE
2,CHAPERONE
3,OXIDOREDUCTASE
4,TRANSFERASE
...,...
120690,APOPTOSIS
120691,ELECTRON TRANSPORT
120692,HYDROLASE
120693,LYASE


In [3]:
# Display the DataFrame information, including column names and data types
print("Train Data Column Information:")
train_data_info = train_data.dtypes.reset_index()
train_data_info.columns = ['Column Name', 'Data Type']
print(train_data_info)


Train Data Column Information:
                 Column Name Data Type
0      experimentalTechnique    object
1               residueCount     int64
2                 resolution   float64
3   structureMolecularWeight   float64
4       crystallizationTempK   float64
5            densityMatthews   float64
6          densityPercentSol   float64
7                    phValue    object
8    crystallizationMethod_1    object
9    crystallizationMethod_2    object
10   crystallizationMethod_3    object
11             pdbxDetails_1    object
12             pdbxDetails_2    object
13             pdbxDetails_3    object
14             pdbxDetails_4    object
15             pdbxDetails_5    object
16             pdbxDetails_6    object
17             pdbxDetails_7    object


### Getting only the top 10 values for One-Hot Encoding

In [4]:
# Columns to get top 10 values
columns_to_get_top_10 = [
    'experimentalTechnique',
    'crystallizationMethod_1',
    'crystallizationMethod_2',
    'crystallizationMethod_3',
    'pdbxDetails_1',
    'pdbxDetails_2',
    'pdbxDetails_3',
    'pdbxDetails_4',
    'pdbxDetails_5',
    'pdbxDetails_6',
    'pdbxDetails_7'
]

# Display the top 10 values for each specified column as lists
for column in columns_to_get_top_10:
    top_10_occurrences = train_data[column].value_counts().head(10).index.tolist()
    print(f"Top 10 occurrences for {column}:\n{top_10_occurrences}\n")


Top 10 occurrences for experimentalTechnique:
['X-RAY DIFFRACTION', 'SOLUTION NMR', 'ELECTRON MICROSCOPY', 'SOLID-STATE NMR', 'ELECTRON CRYSTALLOGRAPHY', 'NEUTRON DIFFRACTION', 'FIBER DIFFRACTION', 'X-RAY DIFFRACTION, NEUTRON DIFFRACTION', 'NEUTRON DIFFRACTION, X-RAY DIFFRACTION', 'SOLUTION SCATTERING']

Top 10 occurrences for crystallizationMethod_1:
['VAPOR DIFFUSION', 'EVAPORATION', 'MICROBATCH', 'hanging drop', 'LIPIDIC CUBIC PHASE', 'MICRODIALYSIS', 'BATCH MODE', 'LIQUID DIFFUSION', 'SMALL TUBES', 'batch']

Top 10 occurrences for crystallizationMethod_2:
[' HANGING DROP', ' SITTING DROP', 'SITTING DROP', ' hanging drop', ' vapor diffusion', ' RECRYSTALLIZATION', ' VAPOR DIFFUSION', ' under oil', ' bicelle method', ' sitting drops']

Top 10 occurrences for crystallizationMethod_3:
['NANODROP', ' NANODROP', ' MICROSEEDING', ' HANGING DROP', ' STREAK SEEDING', ' seeding', ' Cocrystaliization', ' MICRO-SEEDING', ' MACROSEEDING', ' SOAKING']

Top 10 occurrences for pdbxDetails_1:
['PEG

In [7]:
# # Converting the categorical data into binary encoding

# for label in top_10:
# data[label] = np.where(data['column name']==label,1,0)

# data[['column name']+top_10].head(40)

In [8]:
# # Display the DataFrame information, including column names and data types
# print("Train Data Column Information:")
# train_data_info = train_data.dtypes.reset_index()
# train_data_info.columns = ['Column Name', 'Data Type']
# print(train_data_info)


In [9]:
# # Choose the top 10 values for 'experimentalTechnique'
# top_10_experimental_technique = train_data['experimentalTechnique'].value_counts().head(10).index.tolist()

# # Converting the 'experimentalTechnique' column into binary encoding
# for label in top_10_experimental_technique:
#     train_data[label] = np.where(train_data['experimentalTechnique'] == label, 1, 0)

# # Display the relevant columns for 'experimentalTechnique'
# display(train_data[['experimentalTechnique'] + top_10_experimental_technique].head(40))
# print("Sixe of the train_data : ",train_data.shape)

### only the top 10 values are used for One-Hot Encoding

In [5]:
def one_hot_encode_columns(data, columns_to_encode):
    encoded_data = data.copy()  

    for column in columns_to_encode:
        # Choose the top 10 values for the current column
        top_values = data[column].value_counts().head(10).index.tolist()

        # Converting the current column into binary encoding
        for label in top_values[:-1]:  # Exclude the last value
            encoded_data[f"{column}_{label}"] = np.where(data[column] == label, 1, 0)

        # Drop the original column after encoding
        encoded_data = encoded_data.drop(column, axis=1)

    return encoded_data


# Specify the columns you want to one-hot encode
columns_to_encode = [
    'experimentalTechnique',
    'crystallizationMethod_1',
    'crystallizationMethod_2',
    'crystallizationMethod_3',
    'pdbxDetails_1',
    'pdbxDetails_2',
    'pdbxDetails_3',
    'pdbxDetails_4',
    'pdbxDetails_5',
    'pdbxDetails_6',
    'pdbxDetails_7'
]

# Call the function with your DataFrame and columns to encode
encoded_train_data = one_hot_encode_columns(train_data, columns_to_encode)

# Concatenate the original and encoded DataFrames
ohe_train_data = pd.concat([train_data, encoded_train_data], axis=1)

# Display the resulting DataFrame and its shape
display(ohe_train_data)
print("Shape of the DataFrame:", ohe_train_data.shape)


  encoded_data[f"{column}_{label}"] = np.where(data[column] == label, 1, 0)
  encoded_data[f"{column}_{label}"] = np.where(data[column] == label, 1, 0)


Unnamed: 0,experimentalTechnique,residueCount,resolution,structureMolecularWeight,crystallizationTempK,densityMatthews,densityPercentSol,phValue,crystallizationMethod_1,crystallizationMethod_2,...,pdbxDetails_6_ temperature 289K,pdbxDetails_7_ temperature 293K,pdbxDetails_7_ HANGING DROP,pdbxDetails_7_ temperature 298K,pdbxDetails_7_ VAPOR DIFFUSION,pdbxDetails_7_ temperature 277K,pdbxDetails_7_ SITTING DROP,pdbxDetails_7_ temperature 291K,pdbxDetails_7_ temperature 295K,pdbxDetails_7_ temperature 289K
0,X-RAY DIFFRACTION,438,1.70,50637.53,298.0,1.95,36.97,6.6,VAPOR DIFFUSION,HANGING DROP,...,0,0,0,0,0,0,0,0,0,0
1,X-RAY DIFFRACTION,1324,1.60,148853.81,293.0,2.48,50.38,7.5,VAPOR DIFFUSION,HANGING DROP,...,0,0,0,0,0,0,0,0,0,0
2,X-RAY DIFFRACTION,252,1.55,29054.37,,2.03,39.29,6.5,,,...,0,0,0,0,0,0,0,0,0,0
3,X-RAY DIFFRACTION,180,2.30,20674.23,289.0,2.63,52.00,7.5,VAPOR DIFFUSION,SITTING DROP,...,0,0,0,0,0,0,0,0,0,0
4,X-RAY DIFFRACTION,436,3.50,51456.98,297.0,2.70,54.00,5.6,VAPOR DIFFUSION,SITTING DROP,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120690,X-RAY DIFFRACTION,192,2.43,22023.59,,2.97,58.58,,,,...,0,0,0,0,0,0,0,0,0,0
120691,X-RAY DIFFRACTION,127,0.84,14470.12,,2.74,55.19,7.5,,,...,0,0,0,0,0,0,0,0,0,0
120692,SOLUTION NMR,26,,3083.67,,,,,,,...,0,0,0,0,0,0,0,0,0,0
120693,X-RAY DIFFRACTION,613,2.05,69425.64,298.0,2.20,44.09,7.4,VAPOR DIFFUSION,HANGING DROP,...,0,0,1,0,0,0,0,0,0,0


Shape of the DataFrame: (120695, 124)


In [7]:
display(ohe_train_data)

Unnamed: 0,experimentalTechnique,residueCount,resolution,structureMolecularWeight,crystallizationTempK,densityMatthews,densityPercentSol,phValue,crystallizationMethod_1,crystallizationMethod_2,...,pdbxDetails_6_ temperature 289K,pdbxDetails_7_ temperature 293K,pdbxDetails_7_ HANGING DROP,pdbxDetails_7_ temperature 298K,pdbxDetails_7_ VAPOR DIFFUSION,pdbxDetails_7_ temperature 277K,pdbxDetails_7_ SITTING DROP,pdbxDetails_7_ temperature 291K,pdbxDetails_7_ temperature 295K,pdbxDetails_7_ temperature 289K
0,X-RAY DIFFRACTION,438,1.70,50637.53,298.0,1.95,36.97,6.6,VAPOR DIFFUSION,HANGING DROP,...,0,0,0,0,0,0,0,0,0,0
1,X-RAY DIFFRACTION,1324,1.60,148853.81,293.0,2.48,50.38,7.5,VAPOR DIFFUSION,HANGING DROP,...,0,0,0,0,0,0,0,0,0,0
2,X-RAY DIFFRACTION,252,1.55,29054.37,,2.03,39.29,6.5,,,...,0,0,0,0,0,0,0,0,0,0
3,X-RAY DIFFRACTION,180,2.30,20674.23,289.0,2.63,52.00,7.5,VAPOR DIFFUSION,SITTING DROP,...,0,0,0,0,0,0,0,0,0,0
4,X-RAY DIFFRACTION,436,3.50,51456.98,297.0,2.70,54.00,5.6,VAPOR DIFFUSION,SITTING DROP,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120690,X-RAY DIFFRACTION,192,2.43,22023.59,,2.97,58.58,,,,...,0,0,0,0,0,0,0,0,0,0
120691,X-RAY DIFFRACTION,127,0.84,14470.12,,2.74,55.19,7.5,,,...,0,0,0,0,0,0,0,0,0,0
120692,SOLUTION NMR,26,,3083.67,,,,,,,...,0,0,0,0,0,0,0,0,0,0
120693,X-RAY DIFFRACTION,613,2.05,69425.64,298.0,2.20,44.09,7.4,VAPOR DIFFUSION,HANGING DROP,...,0,0,1,0,0,0,0,0,0,0


In [8]:
print("Train Data Column Information:")
train_d_info = pd.DataFrame({
    'Column Name': ohe_train_data.columns,
    'Data Type': ohe_train_data.dtypes
})

display(train_d_info)
print(train_d_info.shape)

Train Data Column Information:


Unnamed: 0,Column Name,Data Type
experimentalTechnique,experimentalTechnique,object
residueCount,residueCount,int64
resolution,resolution,float64
structureMolecularWeight,structureMolecularWeight,float64
crystallizationTempK,crystallizationTempK,float64
...,...,...
pdbxDetails_7_ temperature 277K,pdbxDetails_7_ temperature 277K,int32
pdbxDetails_7_ SITTING DROP,pdbxDetails_7_ SITTING DROP,int32
pdbxDetails_7_ temperature 291K,pdbxDetails_7_ temperature 291K,int32
pdbxDetails_7_ temperature 295K,pdbxDetails_7_ temperature 295K,int32


(124, 2)


In [9]:
# # Display the DataFrame information, including column names and data types
# print("Train Data Column Information:")
# train_data_info = ohe_train_data.dtypes.reset_index()
# train_data_info.columns = ['Column Name', 'Data Type']
# print(train_data_info)

# Display the DataFrame information, including column names and data types
print("Train Data Column Information:")
train_data_info = ohe_train_data.dtypes.reset_index()
train_data_info.columns = ['Column Name', 'Data Type']

# Print only columns with data type 'object'
object_columns = train_data_info[train_data_info['Data Type'] == 'object']['Column Name'].tolist()
print("Columns with Data Type 'object':", object_columns)


Train Data Column Information:
Columns with Data Type 'object': ['experimentalTechnique', 'phValue', 'crystallizationMethod_1', 'crystallizationMethod_2', 'crystallizationMethod_3', 'pdbxDetails_1', 'pdbxDetails_2', 'pdbxDetails_3', 'pdbxDetails_4', 'pdbxDetails_5', 'pdbxDetails_6', 'pdbxDetails_7', 'phValue']


In [10]:
ohe_train_data = ohe_train_data.drop(['experimentalTechnique', 'crystallizationMethod_1', 'crystallizationMethod_2', 'crystallizationMethod_3', 'pdbxDetails_1', 'pdbxDetails_2', 'pdbxDetails_3', 'pdbxDetails_4', 'pdbxDetails_5', 'pdbxDetails_6', 'pdbxDetails_7'], axis=1, errors="ignore")
display(ohe_train_data)

Unnamed: 0,residueCount,resolution,structureMolecularWeight,crystallizationTempK,densityMatthews,densityPercentSol,phValue,residueCount.1,resolution.1,structureMolecularWeight.1,...,pdbxDetails_6_ temperature 289K,pdbxDetails_7_ temperature 293K,pdbxDetails_7_ HANGING DROP,pdbxDetails_7_ temperature 298K,pdbxDetails_7_ VAPOR DIFFUSION,pdbxDetails_7_ temperature 277K,pdbxDetails_7_ SITTING DROP,pdbxDetails_7_ temperature 291K,pdbxDetails_7_ temperature 295K,pdbxDetails_7_ temperature 289K
0,438,1.70,50637.53,298.0,1.95,36.97,6.6,438,1.70,50637.53,...,0,0,0,0,0,0,0,0,0,0
1,1324,1.60,148853.81,293.0,2.48,50.38,7.5,1324,1.60,148853.81,...,0,0,0,0,0,0,0,0,0,0
2,252,1.55,29054.37,,2.03,39.29,6.5,252,1.55,29054.37,...,0,0,0,0,0,0,0,0,0,0
3,180,2.30,20674.23,289.0,2.63,52.00,7.5,180,2.30,20674.23,...,0,0,0,0,0,0,0,0,0,0
4,436,3.50,51456.98,297.0,2.70,54.00,5.6,436,3.50,51456.98,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120690,192,2.43,22023.59,,2.97,58.58,,192,2.43,22023.59,...,0,0,0,0,0,0,0,0,0,0
120691,127,0.84,14470.12,,2.74,55.19,7.5,127,0.84,14470.12,...,0,0,0,0,0,0,0,0,0,0
120692,26,,3083.67,,,,,26,,3083.67,...,0,0,0,0,0,0,0,0,0,0
120693,613,2.05,69425.64,298.0,2.20,44.09,7.4,613,2.05,69425.64,...,0,0,1,0,0,0,0,0,0,0


# Model Train

## KNN

In [11]:
# Display the count of null values in each column
null_counts = ohe_train_data.isnull().sum().reset_index()
null_counts.columns = ['Column Name', 'Null Count']
print("Null counts in each column:")
display(null_counts)


Null counts in each column:


Unnamed: 0,Column Name,Null Count
0,residueCount,0
1,resolution,10484
2,structureMolecularWeight,0
3,crystallizationTempK,37960
4,densityMatthews,13840
...,...,...
108,pdbxDetails_7_ temperature 277K,0
109,pdbxDetails_7_ SITTING DROP,0
110,pdbxDetails_7_ temperature 291K,0
111,pdbxDetails_7_ temperature 295K,0


In [12]:
# Concatenate the DataFrames along columns
con_train_data = pd.concat([ohe_train_data, train_label_data], axis=1)

# Display the concatenated DataFrame
display(con_train_data)



Unnamed: 0,residueCount,resolution,structureMolecularWeight,crystallizationTempK,densityMatthews,densityPercentSol,phValue,residueCount.1,resolution.1,structureMolecularWeight.1,...,pdbxDetails_7_ temperature 293K,pdbxDetails_7_ HANGING DROP,pdbxDetails_7_ temperature 298K,pdbxDetails_7_ VAPOR DIFFUSION,pdbxDetails_7_ temperature 277K,pdbxDetails_7_ SITTING DROP,pdbxDetails_7_ temperature 291K,pdbxDetails_7_ temperature 295K,pdbxDetails_7_ temperature 289K,Class label
0,438,1.70,50637.53,298.0,1.95,36.97,6.6,438,1.70,50637.53,...,0,0,0,0,0,0,0,0,0,TRANSFERASE
1,1324,1.60,148853.81,293.0,2.48,50.38,7.5,1324,1.60,148853.81,...,0,0,0,0,0,0,0,0,0,OXIDOREDUCTASE
2,252,1.55,29054.37,,2.03,39.29,6.5,252,1.55,29054.37,...,0,0,0,0,0,0,0,0,0,CHAPERONE
3,180,2.30,20674.23,289.0,2.63,52.00,7.5,180,2.30,20674.23,...,0,0,0,0,0,0,0,0,0,OXIDOREDUCTASE
4,436,3.50,51456.98,297.0,2.70,54.00,5.6,436,3.50,51456.98,...,0,0,0,0,0,0,0,0,0,TRANSFERASE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120690,192,2.43,22023.59,,2.97,58.58,,192,2.43,22023.59,...,0,0,0,0,0,0,0,0,0,APOPTOSIS
120691,127,0.84,14470.12,,2.74,55.19,7.5,127,0.84,14470.12,...,0,0,0,0,0,0,0,0,0,ELECTRON TRANSPORT
120692,26,,3083.67,,,,,26,,3083.67,...,0,0,0,0,0,0,0,0,0,HYDROLASE
120693,613,2.05,69425.64,298.0,2.20,44.09,7.4,613,2.05,69425.64,...,0,1,0,0,0,0,0,0,0,LYASE


In [36]:
# con_train_data.to_csv(r"C:\Users\xghostrider\Desktop\1-ppt\ML\drive-download-20230926T045156Z-001\con_train_data.csv",index="False")

#### Removing row 39702 due to error

In [13]:
# # non_numeric_columns = con_train_data.select_dtypes(exclude=['number']).columns
# # print("Columns with non-numeric values:", non_numeric_columns)
# # row_39702 = con_train_data.iloc[39702]
# # print(row_39702)
# # row_39702_dict = row_39702.to_dict()
# # print(row_39702_dict)

# non_numeric_columns = con_train_data.select_dtypes(exclude=['number']).columns
# print("Columns with non-numeric values:", non_numeric_columns)

con_train_data.drop(39702, inplace=True)


row_39702 = con_train_data.iloc[39702]
print(row_39702)
row_39702_dict = row_39702.to_dict()
print(row_39702_dict)

residueCount                                     264
resolution                                       2.0
structureMolecularWeight                    27817.88
crystallizationTempK                           291.0
densityMatthews                                  2.3
                                         ...        
pdbxDetails_7_ SITTING DROP                        0
pdbxDetails_7_ temperature 291K                    0
pdbxDetails_7_ temperature 295K                    0
pdbxDetails_7_ temperature 289K                    0
Class label                        TRANSPORT PROTEIN
Name: 39703, Length: 114, dtype: object
{'residueCount': 264, 'resolution': 2.0, 'structureMolecularWeight': 27817.88, 'crystallizationTempK': 291.0, 'densityMatthews': 2.3, 'densityPercentSol': 46.7, 'phValue': '6.3', 'experimentalTechnique_X-RAY DIFFRACTION': 1, 'experimentalTechnique_SOLUTION NMR': 0, 'experimentalTechnique_ELECTRON MICROSCOPY': 0, 'experimentalTechnique_SOLID-STATE NMR': 0, 'experimentalTechni

In [15]:
# Assuming 'Class label' is your target variable
target_variable = 'Class label'

# Separate features and target variable
X = con_train_data.drop(target_variable, axis=1)
y = con_train_data[target_variable]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Impute NaN values using SimpleImputer with mean strategy

imputer = SimpleImputer(missing_values = np.nan,strategy ='most_frequent')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)





In [16]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Initialize the KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)  

# Fit the model on the standardized training data
knn_model.fit(X_train_scaled, y_train)

# Predict on the standardized test data
y_pred_scaled = knn_model.predict(X_test_scaled)

# Calculate accuracy and other metrics for scaled data
accuracy_scaled = accuracy_score(y_test, y_pred_scaled)
classification_rep_scaled = classification_report(y_test, y_pred_scaled, zero_division=1)


# Print the results for scaled data
print(f"Accuracy (scaled): {accuracy_scaled:.4f}")
print("Classification Report (scaled):\n", classification_rep_scaled)

Accuracy (scaled): 0.2604
Classification Report (scaled):
                                               precision    recall  f1-score   support

                     5'-3' EXO/ENDO NUCLEASE       0.00      1.00      0.00         0
                                 ACETYLATION       0.00      1.00      0.00         0
               ACETYLCHOLINE BINDING PROTEIN       0.00      1.00      0.00         0
                      ACETYLCHOLINE RECEPTOR       0.00      1.00      0.00         0
               ACETYLCHOLINE-BINDING PROTEIN       0.00      0.00      1.00         3
                           ACETYLTRANSFERASE       1.00      0.00      0.00         1
                    ACID ANHYDRIDE HYDROLASE       0.00      0.00      1.00         2
                               ACTIN BINDING       0.00      0.00      1.00         2
                       ACTIN BINDING PROTEIN       0.00      0.00      1.00         3
                               ACTIN-BINDING       0.00      1.00      0.00     

In [26]:
# Assuming 'Class label' is your target variable
target_variable = 'Class label'

# Separate features and target variable
X = con_train_data.drop(target_variable, axis=1)
y = con_train_data[target_variable]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Impute NaN values using SimpleImputer with mean strategy

imputer = SimpleImputer(missing_values = np.nan,strategy ='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)
# Initialize the KNN model
knn_model = KNeighborsClassifier(n_neighbors=17)  # You can adjust the number of neighbors

# Fit the model on the standardized training data
knn_model.fit(X_train_scaled, y_train)

# Predict on the standardized test data
y_pred_scaled = knn_model.predict(X_test_scaled)

# Calculate accuracy and other metrics for scaled data
accuracy_scaled = accuracy_score(y_test, y_pred_scaled)
classification_rep_scaled = classification_report(y_test, y_pred_scaled, zero_division=1)


# Print the results for scaled data
print(f"Accuracy (scaled): {accuracy_scaled:.4f}")
print("Classification Report (scaled):\n", classification_rep_scaled)

Accuracy (scaled): 0.2418
Classification Report (scaled):
                                               precision    recall  f1-score   support

                     5'-3' EXO/ENDO NUCLEASE       0.00      1.00      0.00         0
               ACETYLCHOLINE BINDING PROTEIN       0.00      1.00      0.00         0
               ACETYLCHOLINE-BINDING PROTEIN       1.00      0.00      0.00         3
                           ACETYLTRANSFERASE       1.00      0.00      0.00         1
                    ACID ANHYDRIDE HYDROLASE       1.00      0.00      0.00         2
                               ACTIN BINDING       0.00      0.00      1.00         2
                       ACTIN BINDING PROTEIN       1.00      0.00      0.00         3
                       ACTIN-BINDING PROTEIN       0.00      0.00      1.00         2
                             ACYLTRANSFERASE       0.25      0.50      0.33         2
      ADAPTOR PROTEIN CONTAINING SH2 AND SH3       1.00      0.00      0.00     

### Grid Search For KNN

In [20]:
from sklearn.model_selection import GridSearchCV

k_range = list(range(5, 25))
n_neighbors = k_range

# # Algorithm to be used by the KNN
# algorithm = ['auto']

# Create the parameter grid
param_grid = {'n_neighbors': n_neighbors
            #   'algorithm': algorithm
             }
print(param_grid)


# from sklearn.neighbors import KNeighborsClassifier


knn = KNeighborsClassifier()

# Create a GridSearchCV object
grid = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10, scoring='f1_micro', n_jobs=-1)

# Fit the model on the training data
grid.fit(X_train_imputed, y_train)

# Print the best parameters and corresponding accuracy
print("Best Parameters:", grid.best_params_)
print("Best Score:", grid.best_score_)


{'n_neighbors': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]}




Best Parameters: {'n_neighbors': 17}
Best Score: 0.21142353082763798


In [21]:
grid.best_score_


0.21142353082763798

In [22]:
from sklearn.model_selection import GridSearchCV

k_range = list(range(5, 25))
n_neighbors = k_range

# # Algorithm to be used by the KNN
# algorithm = ['auto']

# Create the parameter grid
param_grid = {'n_neighbors': n_neighbors
            #   'algorithm': algorithm
             }
print(param_grid)


# from sklearn.neighbors import KNeighborsClassifier


knn = KNeighborsClassifier()

# Create a GridSearchCV object
grid = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10, scoring='f1_macro', n_jobs=-1)

# Fit the model on the training data
grid.fit(X_train_imputed, y_train)

# Print the best parameters and corresponding accuracy
print("Best Parameters:", grid.best_params_)
print("Best Score:", grid.best_score_)


{'n_neighbors': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]}




Best Parameters: {'n_neighbors': 5}
Best Score: 0.0468443115727424


## Random-Forest 

In [31]:
# Assuming 'Class label' is your target variable
target_variable = 'Class label'

# Separate features and target variable
X = con_train_data.drop(target_variable, axis=1)
y = con_train_data[target_variable]

# # reducing the size of data for tesing and memory issues #change this accordingly
# X =  (X[:20000])
# y = list(y[:20000])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
imputer = SimpleImputer(missing_values = np.nan,strategy ='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

#### Parameter 1

In [29]:
rfc_model = RandomForestClassifier(n_estimators=20, min_samples_split=2, min_samples_leaf=2, max_samples =1.0, max_features=0.6,
max_depth = 8, bootstrap =True)  # You can adjust the number of estimators

# Fit the model on the imputed training data
rfc_model.fit(X_train_imputed, y_train)

# Predict on the imputed test data
y_pred_imputed = rfc_model.predict(X_test_imputed)

# Calculate accuracy and other metrics for imputed data
accuracy_imputed = accuracy_score(y_test, y_pred_imputed)
classification_rep_imputed = classification_report(y_test, y_pred_imputed, zero_division=1)

# Print the results for imputed data
print(f"Accuracy (imputed): {accuracy_imputed:.4f}")
print("Classification Report (imputed):\n", classification_rep_imputed)

Accuracy (imputed): 0.2205
Classification Report (imputed):
                                           precision    recall  f1-score   support

           ACETYLCHOLINE-BINDING PROTEIN       1.00      0.00      0.00         1
                         ACYLTRANSFERASE       1.00      0.00      0.00         2
                        ADP-RIBOSYLATION       1.00      0.00      0.00         1
                                ALLERGEN       1.00      0.00      0.00         2
                                 AMIDASE       1.00      0.00      0.00         1
               AMINOACYL-TRNA SYNTHETASE       1.00      0.00      0.00         1
                        AMINOTRANSFERASE       1.00      0.00      0.00         1
                              ANTIBIOTIC       1.00      0.33      0.50         3
                 ANTIBIOTIC BIOSYNTHESIS       1.00      0.00      0.00         1
                    ANTIBIOTIC INHIBITOR       1.00      0.00      0.00         1
                   ANTIBIOTIC RESIST

#### Parameter 2

In [None]:
rfc_model = RandomForestClassifier(n_estimators=40, min_samples_split=2, min_samples_leaf=2, max_samples =0.5, max_features=0.6,
max_depth=8, bootstrap =True)  # You can adjust the number of estimators

# Fit the model on the imputed training data
rfc_model.fit(X_train_imputed, y_train)

# Predict on the imputed test data
y_pred_imputed = rfc_model.predict(X_test_imputed)

# Calculate accuracy and other metrics for imputed data
accuracy_imputed = accuracy_score(y_test, y_pred_imputed)
classification_rep_imputed = classification_report(y_test, y_pred_imputed, zero_division=1)

# Print the results for imputed data
print(f"Accuracy (imputed): {accuracy_imputed:.4f}")
print("Classification Report (imputed):\n", classification_rep_imputed)

#### Parameter 3 for 10000 values

In [30]:
rfc_model = RandomForestClassifier(n_estimators=120, min_samples_split=5, min_samples_leaf=1, max_samples =0.75, max_features=0.6,
max_depth=None, bootstrap =True)  # You can adjust the number of estimators

# Fit the model on the imputed training data
rfc_model.fit(X_train_imputed, y_train)

# Predict on the imputed test data
y_pred_imputed = rfc_model.predict(X_test_imputed)

# Calculate accuracy and other metrics for imputed data
accuracy_imputed = accuracy_score(y_test, y_pred_imputed)
classification_rep_imputed = classification_report(y_test, y_pred_imputed, zero_division=1)

# Print the results for imputed data
print(f"Accuracy (imputed): {accuracy_imputed:.4f}")
print("Classification Report (imputed):\n", classification_rep_imputed)

Accuracy (imputed): 0.2970
Classification Report (imputed):
                                           precision    recall  f1-score   support

           ACETYLCHOLINE-BINDING PROTEIN       1.00      0.00      0.00         1
                         ACYLTRANSFERASE       1.00      0.00      0.00         2
                        ADP-RIBOSYLATION       1.00      0.00      0.00         1
                                ALLERGEN       1.00      0.00      0.00         2
                                 AMIDASE       1.00      1.00      1.00         1
               AMINOACYL-TRNA SYNTHETASE       1.00      0.00      0.00         1
                        AMINOTRANSFERASE       1.00      0.00      0.00         1
                           ANTI-ONCOGENE       0.00      1.00      0.00         0
                              ANTIBIOTIC       0.50      0.33      0.40         3
                 ANTIBIOTIC BIOSYNTHESIS       1.00      0.00      0.00         1
                    ANTIBIOTIC INHIB

In [None]:
rfc_model = RandomForestClassifier(n_estimators=120, min_samples_split=5, min_samples_leaf=1, max_samples =0.75, max_features=0.6,
max_depth=None, bootstrap =True)  # You can adjust the number of estimators

# Fit the model on the imputed training data
rfc_model.fit(X_train_imputed, y_train)

# Predict on the imputed test data
y_pred_imputed = rfc_model.predict(X_test_imputed)

# Calculate accuracy and other metrics for imputed data
accuracy_imputed = accuracy_score(y_test, y_pred_imputed)
classification_rep_imputed = classification_report(y_test, y_pred_imputed, zero_division=1)

# Print the results for imputed data
print(f"Accuracy (imputed): {accuracy_imputed:.4f}")
print("Classification Report (imputed):\n", classification_rep_imputed)

# --------------------Random Search CV----------------------------
#### for random forest classifier

In [48]:
# Number of trees in random forest
n_estimators = [20,60,75,80,100,120]

# Number of features to consider at every split
max_features = [0.2,0.4,0.6,0.75]

# Maximum number of levels in tree
max_depth = [2,6,8,None]

# Number of samples
max_samples = [0.5,0.75]

# Bootstrap samples
bootstrap = [True,False]

# Minimum number of samples required to split a node
min_samples_split = [2,3,5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]

In [49]:
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
              'max_samples':max_samples,
              'bootstrap':bootstrap,
              'min_samples_split':min_samples_split,
              'min_samples_leaf':min_samples_leaf
             }
print(param_grid)

{'n_estimators': [20, 60, 75, 80, 100, 120], 'max_features': [0.2, 0.4, 0.6, 0.75], 'max_depth': [2, 6, 8, None], 'max_samples': [0.5, 0.75], 'bootstrap': [True, False], 'min_samples_split': [2, 3, 5], 'min_samples_leaf': [1, 2]}


In [50]:
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestClassifier()
rf_grid = RandomizedSearchCV(estimator = rf, 
                       param_distributions = param_grid, 
                       cv = 5, 
                       verbose=2, 
                       n_jobs = -1)

In [51]:
rf_grid.fit(X_train_imputed, y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xghostrider\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\xghostrider\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xghos

MemoryError: could not allocate 555220992 bytes

In [52]:
rf_grid.best_params_

{'n_estimators': 80,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_samples': 0.5,
 'max_features': 0.6,
 'max_depth': None,
 'bootstrap': True}

In [None]:
rf_grid.best_score_
