## Classification on the data set March_2024_AF2_01.txt. Predicting the structure name attribute.

In [1]:
import pandas as pd

In [2]:
# The file is whitespace-delimited and has no header row, so the column names
# are supplied here and attached after loading.
column_names = [
    'Protein_number', 'res_number',
    'PB1', 'PB2',
    'AA1', 'AA2',
    'S2_1', 'S2_2',
    'expected_frequency', 'plDDT',
    'RSA1', 'RSA2',
]
df = pd.read_csv("March_2024_AF2_01.txt", sep=r"\s+", header=None)
df.columns = column_names
df

Unnamed: 0,Protein_number,res_number,PB1,PB2,AA1,AA2,S2_1,S2_2,expected_frequency,plDDT,RSA1,RSA2
0,0,7,j,j,G,G,C,S,0.880055,60.180000,100.000000,100.000000
1,1,95,n,l,E,Q,T,T,0.736402,96.220001,89.762611,98.582039
2,1,219,i,d,G,N,S,C,0.040571,31.870001,100.000000,100.000000
3,1,222,d,i,T,V,C,C,0.021758,47.259998,100.000000,100.000000
4,1,223,i,d,V,P,C,C,0.040571,38.200001,100.000000,93.366096
...,...,...,...,...,...,...,...,...,...,...,...,...
671150,23508,14909273,j,d,G,L,C,C,0.623739,95.459999,1.531394,20.793951
671151,23508,14909291,f,d,C,D,P,S,0.047665,95.379997,11.473963,100.000000
671152,23508,14909390,j,d,G,P,S,P,0.623739,80.699997,65.849922,88.452087
671153,23508,14909434,g,f,L,T,C,C,0.430306,79.760002,60.491493,46.452702


In [5]:
def concat_structures(data: pd.DataFrame, rm_columns: list, req_col_str: str) -> pd.DataFrame:
    """
    Build a new column by concatenating existing columns row by row.

    The identifier columns 'Protein_number' and 'res_number' are always
    removed first (when present); they cannot be used as components.

    Input:
        - data: DataFrame to be processed (a copy is used to avoid modifying the original).
        - rm_columns: List of column names (strings) to be removed after the
          new column has been built.
        - req_col_str: Name of the new column. It is split on "-" and every
          piece must name an existing column; the pieces are concatenated in
          that order (e.g. 'AA1-AA2' -> AA1 + AA2).
    Output:
        - A modified copy of the DataFrame with the requested changes.
    Raises:
        - KeyError: if a piece of req_col_str or an entry of rm_columns is
          not a column of the frame.
    """
    id_columns = ['Protein_number', 'res_number']
    # errors="ignore" lets the function run on frames that no longer carry the
    # identifier columns (for example, its own output).
    result = data.drop(columns=id_columns, errors="ignore").copy()

    # Accumulate in a temporary instead of pre-filling result[req_col_str]:
    # when req_col_str names a single existing column, pre-filling would blank
    # that column before it is read. Starting from "" keeps the operation a
    # string concatenation (str + Series is evaluated element-wise).
    combined = ""
    for part in req_col_str.split('-'):
        combined = combined + result[part]
    result[req_col_str] = combined

    return result.drop(columns=rm_columns)

In [6]:
# Merge the two amino-acid columns into a single 'AA1-AA2' column and drop the originals.
df1 = concat_structures(
    df,
    rm_columns=['AA1', 'AA2'],
    req_col_str='AA1-AA2',
)
df1

Unnamed: 0,PB1,PB2,S2_1,S2_2,expected_frequency,plDDT,RSA1,RSA2,AA1-AA2
0,j,j,C,S,0.880055,60.180000,100.000000,100.000000,GG
1,n,l,T,T,0.736402,96.220001,89.762611,98.582039,EQ
2,i,d,S,C,0.040571,31.870001,100.000000,100.000000,GN
3,d,i,C,C,0.021758,47.259998,100.000000,100.000000,TV
4,i,d,C,C,0.040571,38.200001,100.000000,93.366096,VP
...,...,...,...,...,...,...,...,...,...
671150,j,d,C,C,0.623739,95.459999,1.531394,20.793951,GL
671151,f,d,P,S,0.047665,95.379997,11.473963,100.000000,CD
671152,j,d,S,P,0.623739,80.699997,65.849922,88.452087,GP
671153,g,f,C,C,0.430306,79.760002,60.491493,46.452702,LT


In [7]:
df1['AA1-AA2'].nunique(dropna=False)  # number of distinct AA1-AA2 classes

400

In [8]:
# Merge the two PB columns into a single 'PB1-PB2' column and drop the originals.
df2 = concat_structures(
    df,
    rm_columns=['PB1', 'PB2'],
    req_col_str='PB1-PB2',
)
df2

Unnamed: 0,AA1,AA2,S2_1,S2_2,expected_frequency,plDDT,RSA1,RSA2,PB1-PB2
0,G,G,C,S,0.880055,60.180000,100.000000,100.000000,jj
1,E,Q,T,T,0.736402,96.220001,89.762611,98.582039,nl
2,G,N,S,C,0.040571,31.870001,100.000000,100.000000,id
3,T,V,C,C,0.021758,47.259998,100.000000,100.000000,di
4,V,P,C,C,0.040571,38.200001,100.000000,93.366096,id
...,...,...,...,...,...,...,...,...,...
671150,G,L,C,C,0.623739,95.459999,1.531394,20.793951,jd
671151,C,D,P,S,0.047665,95.379997,11.473963,100.000000,fd
671152,G,P,S,P,0.623739,80.699997,65.849922,88.452087,jd
671153,L,T,C,C,0.430306,79.760002,60.491493,46.452702,gf


In [9]:
# Merge the two S2 columns into a single 'S2_1-S2_2' column and drop the originals.
df3 = concat_structures(
    df,
    rm_columns=['S2_1', 'S2_2'],
    req_col_str='S2_1-S2_2',
)
df3

Unnamed: 0,PB1,PB2,AA1,AA2,expected_frequency,plDDT,RSA1,RSA2,S2_1-S2_2
0,j,j,G,G,0.880055,60.180000,100.000000,100.000000,CS
1,n,l,E,Q,0.736402,96.220001,89.762611,98.582039,TT
2,i,d,G,N,0.040571,31.870001,100.000000,100.000000,SC
3,d,i,T,V,0.021758,47.259998,100.000000,100.000000,CC
4,i,d,V,P,0.040571,38.200001,100.000000,93.366096,CC
...,...,...,...,...,...,...,...,...,...
671150,j,d,G,L,0.623739,95.459999,1.531394,20.793951,CC
671151,f,d,C,D,0.047665,95.379997,11.473963,100.000000,PS
671152,j,d,G,P,0.623739,80.699997,65.849922,88.452087,SP
671153,g,f,L,T,0.430306,79.760002,60.491493,46.452702,CC


# CatBoost

In [10]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [13]:
# Column to predict: the concatenated S2_1/S2_2 pair.
target = 'S2_1-S2_2'
# Columns treated as categorical inputs by the model.
categorical_features = ['PB1', 'PB2', 'AA1', 'AA2']
# Full input set: the categorical columns followed by the numeric ones.
features = categorical_features + ['expected_frequency', 'plDDT', 'RSA1', 'RSA2']

In [14]:
# Separate the model inputs from the label column, then hold out 40% of the rows.
X, y = df3[features], df3[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,  # fixed seed so the split is reproducible
)

In [15]:
# Small CatBoost run: 10 boosting iterations of depth-6 trees, with the
# categorical columns declared so CatBoost handles them natively.
model = CatBoostClassifier(
    iterations=10,
    learning_rate=0.1,
    depth=6,
    cat_features=categorical_features,
    verbose=2,  # progress is logged every second iteration
)
model.fit(X_train, y_train)

0:	learn: 1.0469445	total: 1m 36s	remaining: 14m 26s
2:	learn: 0.9229750	total: 6m	remaining: 14m 2s
4:	learn: 0.8347738	total: 10m 49s	remaining: 10m 49s
6:	learn: 0.7718864	total: 15m 31s	remaining: 6m 39s
8:	learn: 0.7268368	total: 20m 13s	remaining: 2m 14s
9:	learn: 0.7102316	total: 22m 34s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x76d1539fe2f0>

In [17]:
# Evaluate on the held-out rows. Many rare classes are never predicted, which
# makes their precision undefined; zero_division=0 reports 0.0 for them (the
# same value as before) without emitting an UndefinedMetricWarning each time.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

          BB       0.00      0.00      0.00        13
          BC       0.60      0.16      0.25       351
          BE       0.00      0.00      0.00         4
          BG       0.00      0.00      0.00         6
          BH       0.00      0.00      0.00         4
          BS       0.00      0.00      0.00       132
          BT       0.00      0.00      0.00       144
          CB       0.32      0.33      0.32       457
          CC       0.92      0.98      0.95    212083
          CE       0.44      0.46      0.45       463
          CG       0.00      0.00      0.00       253
          CH       0.36      0.08      0.14       756
          CP       0.00      0.00      0.00       690
          CS       0.00      0.00      0.00      4700
          CT       0.42      0.14      0.21      1664
          EB       0.00      0.00      0.00        12
          EC       0.48      0.69      0.56       873
          EE       0.49    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# RandomForest

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [26]:
# Column to predict: the concatenated S2_1/S2_2 pair.
target = 'S2_1-S2_2'
# Inputs split by kind: categorical columns get one-hot encoded, numeric ones do not.
categorical_features = ['PB1', 'PB2', 'AA1', 'AA2']
numerical_features = ['expected_frequency', 'plDDT', 'RSA1', 'RSA2']
# Full input set: categorical columns first, then the numeric ones.
features = categorical_features + numerical_features

In [28]:
# Same 60/40 split as before (same seed), rebuilt from df3 for this section.
X, y = df3[features], df3[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [29]:
# One-hot encode the categorical columns; categories unseen during fitting are
# ignored at transform time. Every other column (the numerical features) is
# passed through unchanged.
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough',
)

In [30]:
# Fit the encoder on the training rows only, then apply the fitted encoder to
# the test rows (the tuple is evaluated left to right, so fitting comes first).
X_train_transformed, X_test_transformed = (
    preprocessor.fit_transform(X_train),
    preprocessor.transform(X_test),
)

In [32]:
# Random forest of 100 trees capped at depth 10; the seed fixes the result.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
model.fit(X_train_transformed, y_train)

In [33]:
# Evaluate on the encoded held-out rows. Classes that are never predicted have
# undefined precision; zero_division=0 reports 0.0 for them (the same value as
# before) without emitting an UndefinedMetricWarning each time.
y_pred = model.predict(X_test_transformed)
print(classification_report(y_test, y_pred, zero_division=0))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

          BB       0.00      0.00      0.00        13
          BC       0.00      0.00      0.00       351
          BE       0.00      0.00      0.00         4
          BG       0.00      0.00      0.00         6
          BH       0.00      0.00      0.00         4
          BS       0.00      0.00      0.00       132
          BT       1.00      0.01      0.01       144
          CB       0.84      0.18      0.30       457
          CC       0.89      1.00      0.94    212083
          CE       0.85      0.21      0.33       463
          CG       0.00      0.00      0.00       253
          CH       0.71      0.35      0.46       756
          CP       0.00      0.00      0.00       690
          CS       0.87      0.01      0.01      4700
          CT       0.75      0.15      0.25      1664
          EB       0.00      0.00      0.00        12
          EC       0.64      0.32      0.43       873
          EE       0.56    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Artificial Neural Network

In [34]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

ModuleNotFoundError: No module named 'tensorflow'