## Classification on data set March_2024_AF2_01.txt. Predicting the attribute structure name.

In [1]:
import pandas as pd

In [2]:
# The file is whitespace-delimited and has no header row, so the twelve
# column labels are attached right after parsing.
df = pd.read_csv(
    "March_2024_AF2_01.txt",
    sep=r"\s+",
    header=None,
).set_axis(
    [
        'Protein_number', 'res_number',
        'PB1', 'PB2',
        'AA1', 'AA2',
        'S2_1', 'S2_2',
        'expected_frequency', 'plDDT',
        'RSA1', 'RSA2',
    ],
    axis=1,
)
df

Unnamed: 0,Protein_number,res_number,PB1,PB2,AA1,AA2,S2_1,S2_2,expected_frequency,plDDT,RSA1,RSA2
0,0,7,j,j,G,G,C,S,0.880055,60.180000,100.000000,100.000000
1,1,95,n,l,E,Q,T,T,0.736402,96.220001,89.762611,98.582039
2,1,219,i,d,G,N,S,C,0.040571,31.870001,100.000000,100.000000
3,1,222,d,i,T,V,C,C,0.021758,47.259998,100.000000,100.000000
4,1,223,i,d,V,P,C,C,0.040571,38.200001,100.000000,93.366096
...,...,...,...,...,...,...,...,...,...,...,...,...
671150,23508,14909273,j,d,G,L,C,C,0.623739,95.459999,1.531394,20.793951
671151,23508,14909291,f,d,C,D,P,S,0.047665,95.379997,11.473963,100.000000
671152,23508,14909390,j,d,G,P,S,P,0.623739,80.699997,65.849922,88.452087
671153,23508,14909434,g,f,L,T,C,C,0.430306,79.760002,60.491493,46.452702


In [8]:
def concat_structures(data: "pd.DataFrame", rm_columns: list, req_col_str: str) -> "pd.DataFrame":
    """
    Return a copy of ``data`` with several columns joined into one new column.

    The names obtained by splitting ``req_col_str`` on "-" are looked up as
    columns of ``data`` and concatenated element-wise, left to right, starting
    from an empty string. The result is stored under the column name
    ``req_col_str`` itself, and afterwards the columns in ``rm_columns`` are
    dropped.

    Input:
        - data: DataFrame to be processed (a copy is used to avoid modifying the original).
        - rm_columns: List of column names (strings) to be removed.
        - req_col_str: String of a new column name with "-" indicating concatenation,
          e.g. "AA1-AA2" concatenates columns "AA1" and "AA2".
    Output:
        - A modified copy of the DataFrame with the requested changes.
    Raises:
        - KeyError: if a name in req_col_str is not a column of data
          (dropping an unknown name from rm_columns also raises KeyError).
    """
    source_columns = req_col_str.split('-')

    missing = [name for name in source_columns if name not in data.columns]
    if missing:
        raise KeyError(
            f"Columns {missing} requested by {req_col_str!r} are not present in the DataFrame"
        )

    # Build the concatenation from the untouched input *before* writing the
    # new column. If req_col_str names an existing column (for instance when
    # it contains no "-"), writing first would blank that column and the
    # result would silently be all empty strings.
    combined = ""
    for name in source_columns:
        combined = combined + data[name]

    result = data.copy()
    result[req_col_str] = combined
    return result.drop(columns=rm_columns)

In [9]:
# Merge the two amino-acid columns into a single pair label and drop the originals.
df1 = concat_structures(df, rm_columns=['AA1', 'AA2'], req_col_str='AA1-AA2')
df1

Unnamed: 0,Protein_number,res_number,PB1,PB2,S2_1,S2_2,expected_frequency,plDDT,RSA1,RSA2,AA1-AA2
0,0,7,j,j,C,S,0.880055,60.180000,100.000000,100.000000,GG
1,1,95,n,l,T,T,0.736402,96.220001,89.762611,98.582039,EQ
2,1,219,i,d,S,C,0.040571,31.870001,100.000000,100.000000,GN
3,1,222,d,i,C,C,0.021758,47.259998,100.000000,100.000000,TV
4,1,223,i,d,C,C,0.040571,38.200001,100.000000,93.366096,VP
...,...,...,...,...,...,...,...,...,...,...,...
671150,23508,14909273,j,d,C,C,0.623739,95.459999,1.531394,20.793951,GL
671151,23508,14909291,f,d,P,S,0.047665,95.379997,11.473963,100.000000,CD
671152,23508,14909390,j,d,S,P,0.623739,80.699997,65.849922,88.452087,GP
671153,23508,14909434,g,f,C,C,0.430306,79.760002,60.491493,46.452702,LT


In [10]:
df1['AA1-AA2'].nunique(dropna=False)  # number of distinct classes (a missing value counts as one)

400

In [11]:
# Merge the two PB columns into a single pair label and drop the originals.
df2 = concat_structures(df, rm_columns=['PB1', 'PB2'], req_col_str='PB1-PB2')
df2

Unnamed: 0,Protein_number,res_number,AA1,AA2,S2_1,S2_2,expected_frequency,plDDT,RSA1,RSA2,PB1-PB2
0,0,7,G,G,C,S,0.880055,60.180000,100.000000,100.000000,jj
1,1,95,E,Q,T,T,0.736402,96.220001,89.762611,98.582039,nl
2,1,219,G,N,S,C,0.040571,31.870001,100.000000,100.000000,id
3,1,222,T,V,C,C,0.021758,47.259998,100.000000,100.000000,di
4,1,223,V,P,C,C,0.040571,38.200001,100.000000,93.366096,id
...,...,...,...,...,...,...,...,...,...,...,...
671150,23508,14909273,G,L,C,C,0.623739,95.459999,1.531394,20.793951,jd
671151,23508,14909291,C,D,P,S,0.047665,95.379997,11.473963,100.000000,fd
671152,23508,14909390,G,P,S,P,0.623739,80.699997,65.849922,88.452087,jd
671153,23508,14909434,L,T,C,C,0.430306,79.760002,60.491493,46.452702,gf


In [12]:
# Merge the two S2 columns into a single pair label and drop the originals.
df3 = concat_structures(df, rm_columns=['S2_1', 'S2_2'], req_col_str='S2_1-S2_2')
df3

Unnamed: 0,Protein_number,res_number,PB1,PB2,AA1,AA2,expected_frequency,plDDT,RSA1,RSA2,S2_1-S2_2
0,0,7,j,j,G,G,0.880055,60.180000,100.000000,100.000000,CS
1,1,95,n,l,E,Q,0.736402,96.220001,89.762611,98.582039,TT
2,1,219,i,d,G,N,0.040571,31.870001,100.000000,100.000000,SC
3,1,222,d,i,T,V,0.021758,47.259998,100.000000,100.000000,CC
4,1,223,i,d,V,P,0.040571,38.200001,100.000000,93.366096,CC
...,...,...,...,...,...,...,...,...,...,...,...
671150,23508,14909273,j,d,G,L,0.623739,95.459999,1.531394,20.793951,CC
671151,23508,14909291,f,d,C,D,0.047665,95.379997,11.473963,100.000000,PS
671152,23508,14909390,j,d,G,P,0.623739,80.699997,65.849922,88.452087,SP
671153,23508,14909434,g,f,L,T,0.430306,79.760002,60.491493,46.452702,CC
