In [3]:
import numpy as np
import pandas as pd
import pickle #to un-encode the dictionaries
from pathlib import Path #used for looping through representation dictionaries
from itertools import product #used for gridsearch on hyperparameters

#data processing libraries
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

#model libraries
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

In [4]:
proteindictspath = Path(r"C:\Users\20243625\OneDrive - TU Eindhoven\Desktop\group-4\docs\Sep's picklebestanden\protein dicts to use in gridsearch").glob('*')
moldictspath = Path(r"C:\Users\20243625\OneDrive - TU Eindhoven\Desktop\group-4\docs\mol representatie picklebestanden").glob('*')
combinations = []
for protpath in proteindictspath:
    # print(protpath)
    moldictspath = Path(r"C:\Users\20243625\OneDrive - TU Eindhoven\Desktop\group-4\docs\mol representatie picklebestanden").glob('*')
    for molpath in moldictspath:
        if 'train' in molpath.stem:
            combinations.append((protpath,molpath))
            # print('1')



In [71]:
print(len(combinations))

24


In [5]:
def data_loader(data_location):
        data = pd.read_csv(rf"{data_location}")
        return data
traindata = data_loader(r"C:\Users\20243625\OneDrive - TU Eindhoven\Desktop\group-4\data\train.csv")
testdata = data_loader(r"C:\Users\20243625\OneDrive - TU Eindhoven\Desktop\group-4\data\test.csv")

In [7]:
#selecting the preferred dicitionaries
iteration = 1
for protdict,moltraindict in combinations[0:2]:
    molecule_features_dict_train = pickle.load(moltraindict.open('rb'))
    molecule_features_dict_test = pickle.load(open(str(moltraindict).replace('train','test'),'rb'))
    protein_features_dict = pickle.load(protdict.open('rb'))
    # print("dictionaries loaded")
    #loading the training set
    train_df = pd.read_csv(r"C:\Users\20243625\OneDrive - TU Eindhoven\Desktop\group-4\data\train.csv")
    test_df = pd.read_csv(r"C:\Users\20243625\OneDrive - TU Eindhoven\Desktop\group-4\data\test.csv")

    X = []
    y = []

    #feature concatenation to combining each ligand-protein pair

    for _, row in train_df.iterrows():
        smiles = row["molecule_SMILES"]
        protein = row["UniProt_ID"]
        affinity_score = row["affinity_score"]
        #quick check if all elements are available
        if smiles not in molecule_features_dict_train: 
            raise FileNotFoundError(
                f"The following SMILES exists in the trainingset but not in the molecule-features dictionary: {smiles}"
            )
        if protein not in protein_features_dict: 
            raise FileNotFoundError(
                f"The following Uniprot_ID exists in the trainingset but not in the protein-features dictionary: {protein}"
            )

        #feature concatenation
        if isinstance(molecule_features_dict_train[smiles], np.ndarray):
            molecule_features_dict_train[smiles] = molecule_features_dict_train[smiles].tolist()
        if isinstance(protein_features_dict[protein], np.ndarray):
            protein_features_dict[protein] = protein_features_dict[protein].tolist()
        combined = molecule_features_dict_train[smiles] + protein_features_dict[protein]

        #data seperation
        X.append(combined)
        y.append(affinity_score)

    X = np.array(X, dtype=float)
    y = np.array(y, dtype=float)

    X_predict = []
    for _, row in test_df.iterrows():
        smiles = row["molecule_SMILES"]
        protein = row["UniProt_ID"]
        #quick check if all elements are available
        if smiles not in molecule_features_dict_test: 
            raise FileNotFoundError(
                f"The following SMILES exists in the testset but not in the molecule-features dictionary: {smiles}"
            )
        if protein not in protein_features_dict: 
            raise FileNotFoundError(
                f"The following Uniprot_ID exists in the testset but not in the protein-features dictionary: {protein}"
            )

        #feature concatenation
        if isinstance(molecule_features_dict_test[smiles], np.ndarray):
            molecule_features_dict_test[smiles] = molecule_features_dict_test[smiles].tolist()
        if isinstance(protein_features_dict[protein], np.ndarray):
            protein_features_dict[protein] = protein_features_dict[protein].tolist()
        combined = molecule_features_dict_test[smiles] + protein_features_dict[protein]

        #data seperation
        X_predict.append(combined)
    X_predict = np.array(X_predict, dtype=float)
    # print("feature concatenation complete")

    #splitting the data in training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        #ONLY CHANGE THE TEST_SIZE BY PREFERENCE
        X, y, test_size=0.33, random_state=42 
    )
    # print("data splitting complete")

    #BELOW ARE OPTIONS FOR SCALING AND PCA, REMOVE DOCSTRINGS FOR
    #THE PREFERRED OPTION(S)

    #choose one of the following scaling option, or leave them out if preferred
    """
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    X_predict = scaler.transform(X_predict)
    print("standard scaling complete")
    """

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    X = scaler.fit_transform(X)
    #X_predict = scaler.transform(X_predict)
    # print("minmax scaling complete")

    #apply PCA if preferred
    r"""
    ValueError: Input X contains NaN.
    PCA does not accept missing values encoded as NaN natively. 
    For supervised learning, you might want to consider 
    sklearn.ensemble.HistGradientBoostingClassifier and Regressor 
    which accept missing values encoded as NaNs natively. 
    Alternatively, it is possible to preprocess the data, 
    for instance by using an imputer transformer in a pipeline 
    or drop samples with missing values. 
    See https://scikit-learn.org/stable/modules/impute.html 
    You can find a list of all estimators that handle NaN values 
    at the following page: 
    https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
    """
    """
    pca = PCA(n_components=8)
    X_train = pca.fit_transform(X_train)
    X_test  = pca.transform(X_test)
    print("PCA application complete")
    """
    #NOW APPLY YOUR PREFERRED MODEL TYPE
    r"""
    ValueError: Input X contains NaN.
    MLPRegressor does not accept missing values encoded as NaN natively. 
    For supervised learning, you might want to consider 
    sklearn.ensemble.HistGradientBoostingClassifier and Regressor 
    which accept missing values encoded as NaNs natively. 
    Alternatively, it is possible to preprocess the data, 
    for instance by using an imputer transformer in a pipeline 
    or drop samples with missing values. 
    See https://scikit-learn.org/stable/modules/impute.html 
    You can find a list of all estimators that handle NaN values 
    at the following page: 
    https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
    """

    """
    model = MLPRegressor(
        hidden_layer_sizes=( 16, 8),
        activation='logistic',
        learning_rate='adaptive',
        max_iter=400,
        random_state=42
    )
    """

    model = RandomForestRegressor(
        max_features='sqrt',
        max_depth=500
    )


    # model = HistGradientBoostingRegressor(
    #     loss= "absolute_error",
    #     learning_rate= 0.1,
    #     max_iter= 100
    # )

    model.fit(X_train, y_train)
    print(iteration)
    print("Train score:", model.score(X_train, y_train))
    print("Test score:", model.score(X_test, y_test))
    print('abs_loss',np.average(abs(model.predict(X_test)-y_test)))
    iteration += 1

#FOR MAKING THE ACTUAL PREDICTIONS

# model.fit(X, y)
# y_predict = model.predict(X_predict)

# submission = pd.DataFrame({
#     "ID": test_df["ID"],
#     "affinity_score": y_predict
# })
# submission.to_csv("data/submission2.csv", index=False)

1
Train score: 0.9243730871252673
Test score: 0.45191905564364765
abs_loss 2.8096194918495603
2
Train score: 0.9310305612545288
Test score: 0.5023131799100911
abs_loss 2.7230725966010914


In [75]:
# output = r"""C:\Users\20243625\OneDrive - TU Eindhoven\Desktop\group-4\docs\mol representatie picklebestanden\train_molecule_combined_representation.pkl
# 1
# Train score: 0.9198618309025929
# Test score: 0.4132078638660188
# 1
# Train score: 0.9106905198968767
# Test score: 0.36201019375805454
# 1
# Train score: 0.9154302764984878
# Test score: 0.39535249872184675
# 1
# Train score: 0.9217580044408616
# Test score: 0.4403328464414671
# 1
# Train score: 0.9144708846458509
# Test score: 0.38929312470093436
# 1
# Train score: 0.916061305145672
# Test score: 0.4175202727265849
# 1
# Train score: 0.9250683514242007
# Test score: 0.4665612680171197
# 1
# Train score: 0.9198473856543024
# Test score: 0.4308613345462996
# 1
# Train score: 0.9207919543543407
# Test score: 0.4333794372279679
# 1
# Train score: 0.9160679349490032
# Test score: 0.4060377352501878
# 1
# Train score: 0.9098172481334615
# Test score: 0.3555330787816271
# 1
# Train score: 0.9125425638520753
# Test score: 0.3872406418381015
# 1
# Train score: 0.9209937180786263
# Test score: 0.4374619619907618
# 1
# Train score: 0.913969733032996
# Test score: 0.38382895900473246
# 1
# Train score: 0.9180675592138209
# Test score: 0.41368486171675833
# 1
# Train score: 0.9255721339825949
# Test score: 0.4676036406847044
# 1
# Train score: 0.9207632928689484
# Test score: 0.42859873959420247
# 1
# Train score: 0.9185952247691547
# Test score: 0.4347679750982181
# 1
# Train score: 0.9187214559759393
# Test score: 0.4246162106944663
# 1
# Train score: 0.9105303881704226
# Test score: 0.35639243847420454
# 1
# Train score: 0.9139819061436268
# Test score: 0.3911542199450856
# 1
# Train score: 0.9223797794144107
# Test score: 0.4397697100044584
# 1
# Train score: 0.9148718261798853
# Test score: 0.3857563952715357
# 1
# Train score: 0.9157622810587089
# Test score: 0.41379466995774206
# 1
# Train score: 0.926512018438649
# Test score: 0.4724489172044446
# 1
# Train score: 0.9207637288881297
# Test score: 0.4290221167193343
# 1
# Train score: 0.9203528751850473
# Test score: 0.433935930381563
# 1
# Train score: 0.9170832375139956
# Test score: 0.4142223967784584
# 1
# Train score: 0.9110514358787315
# Test score: 0.3598677604133249
# 1
# Train score: 0.9142637208655607
# Test score: 0.3961998937286748
# 1
# Train score: 0.9222773829548987
# Test score: 0.44225718619046717
# 1
# Train score: 0.9156944269539136
# Test score: 0.39356215493202595
# 1
# Train score: 0.9174315538682665
# Test score: 0.414358345589078
# 1
# Train score: 0.9266605621113979
# Test score: 0.47176276787646687
# 1
# Train score: 0.921003536178896
# Test score: 0.42859143514024356
# 1
# Train score: 0.9206659385080908
# Test score: 0.44184176288002397
# 1
# Train score: 0.916425021746666
# Test score: 0.39922086416099656
# 1
# Train score: 0.9091399607592185
# Test score: 0.35485144633389387
# 1
# Train score: 0.9115055585777478
# Test score: 0.3806216695039252
# 1
# Train score: 0.920523085977137
# Test score: 0.4300408628852409
# 1
# Train score: 0.9133219471646856
# Test score: 0.3763656184994568
# 1
# Train score: 0.9151229854636909
# Test score: 0.39676229678832753
# 1
# Train score: 0.9242396977946533
# Test score: 0.4560898976125527
# 1
# Train score: 0.9194591526571937
# Test score: 0.4078578132759265
# 1
# Train score: 0.9190343085311231
# Test score: 0.42505981364577494
# 1
# Train score: 0.9153051057587928
# Test score: 0.4012128089448719
# 1
# Train score: 0.9089037121868182
# Test score: 0.35214052568063947
# 1
# Train score: 0.9120395417959742
# Test score: 0.3810440247544239
# 1
# Train score: 0.9202395703313662
# Test score: 0.43217193920614616
# 1
# Train score: 0.9121591098141268
# Test score: 0.3696351776242641
# 1
# Train score: 0.9158154278356758
# Test score: 0.406662500179213
# 1
# Train score: 0.9232536486275712
# Test score: 0.4522030778989492
# 1
# Train score: 0.9173823353142686
# Test score: 0.4125953431073084
# 1
# Train score: 0.9190869220140677
# Test score: 0.4301017168060264
# """
output_randomforrest = r"""1
Train score: 0.9198618309025929
Test score: 0.4132078638660188
2
Train score: 0.9106905198968767
Test score: 0.36201019375805454
3
Train score: 0.9154302764984878
Test score: 0.39535249872184675
4
Train score: 0.9217580044408616
Test score: 0.4403328464414671
5
Train score: 0.9144708846458509
Test score: 0.38929312470093436
6
Train score: 0.916061305145672
Test score: 0.4175202727265849
7
Train score: 0.9250683514242007
Test score: 0.4665612680171197
8
Train score: 0.9198473856543024
Test score: 0.4308613345462996
9
Train score: 0.9207919543543407
Test score: 0.4333794372279679
10
Train score: 0.9160679349490032
Test score: 0.4060377352501878
11
Train score: 0.9098172481334615
Test score: 0.3555330787816271
12
Train score: 0.9125425638520753
Test score: 0.3872406418381015
13
Train score: 0.9209937180786263
Test score: 0.4374619619907618
14
Train score: 0.913969733032996
Test score: 0.38382895900473246
15
Train score: 0.9180675592138209
Test score: 0.41368486171675833
16
Train score: 0.9255721339825949
Test score: 0.4676036406847044
17
Train score: 0.9207632928689484
Test score: 0.42859873959420247
18
Train score: 0.9185952247691547
Test score: 0.4347679750982181
19
Train score: 0.9187214559759393
Test score: 0.4246162106944663
20
Train score: 0.9105303881704226
Test score: 0.35639243847420454
21
Train score: 0.9139819061436268
Test score: 0.3911542199450856
22
Train score: 0.9223797794144107
Test score: 0.4397697100044584
23
Train score: 0.9148718261798853
Test score: 0.3857563952715357
24
Train score: 0.9157622810587089
Test score: 0.41379466995774206
25
Train score: 0.926512018438649
Test score: 0.4724489172044446
26
Train score: 0.9207637288881297
Test score: 0.4290221167193343
27
Train score: 0.9203528751850473
Test score: 0.433935930381563
28
Train score: 0.9170832375139956
Test score: 0.4142223967784584
29
Train score: 0.9110514358787315
Test score: 0.3598677604133249
30
Train score: 0.9142637208655607
Test score: 0.3961998937286748
31
Train score: 0.9222773829548987
Test score: 0.44225718619046717
32
Train score: 0.9156944269539136
Test score: 0.39356215493202595
33
Train score: 0.9174315538682665
Test score: 0.414358345589078
34
Train score: 0.9266605621113979
Test score: 0.47176276787646687
35
Train score: 0.921003536178896
Test score: 0.42859143514024356
36
Train score: 0.9206659385080908
Test score: 0.44184176288002397
37
Train score: 0.916425021746666
Test score: 0.39922086416099656
38
Train score: 0.9091399607592185
Test score: 0.35485144633389387
39
Train score: 0.9115055585777478
Test score: 0.3806216695039252
40
Train score: 0.920523085977137
Test score: 0.4300408628852409
41
Train score: 0.9133219471646856
Test score: 0.3763656184994568
42
Train score: 0.9151229854636909
Test score: 0.39676229678832753
43
Train score: 0.9242396977946533
Test score: 0.4560898976125527
44
Train score: 0.9194591526571937
Test score: 0.4078578132759265
45
Train score: 0.9190343085311231
Test score: 0.42505981364577494
46
Train score: 0.9153051057587928
Test score: 0.4012128089448719
47
Train score: 0.9089037121868182
Test score: 0.35214052568063947
48
Train score: 0.9120395417959742
Test score: 0.3810440247544239
49
Train score: 0.9202395703313662
Test score: 0.43217193920614616
50
Train score: 0.9121591098141268
Test score: 0.3696351776242641
51
Train score: 0.9158154278356758
Test score: 0.406662500179213
52
Train score: 0.9232536486275712
Test score: 0.4522030778989492
53
Train score: 0.9173823353142686
Test score: 0.4125953431073084
54
Train score: 0.9190869220140677
Test score: 0.4301017168060264"""
output_randomforrest2 = r"""1
Train score: 0.9189687945638008
Test score: 0.41469148611760565
abs_loss 2.9011179845289137
2
Train score: 0.9088404357359092
Test score: 0.3605281537805022
abs_loss 3.0432322173215116
3
Train score: 0.9124694854838481
Test score: 0.3944772527638962
abs_loss 2.9374720896499626
4
Train score: 0.921558992840867
Test score: 0.4382743095292425
abs_loss 2.840040453257743
5
Train score: 0.9156341479067052
Test score: 0.38497534249949805
abs_loss 2.9877318653438314
6
Train score: 0.9168974684844132
Test score: 0.4158640885519338
abs_loss 2.886466770471716
7
Train score: 0.9254034000968875
Test score: 0.4733139346289962
abs_loss 2.7641662251913197
8
Train score: 0.9202767495583877
Test score: 0.4199001223040578
abs_loss 2.9041405328335124
9
Train score: 0.920651919165978
Test score: 0.4342819014985675
abs_loss 2.8502637899459
10
Train score: 0.9182462634277602
Test score: 0.41440356910878773
abs_loss 2.8952552225852557
11
Train score: 0.9096783329155272
Test score: 0.35557601300645614
abs_loss 3.0542966352278174
12
Train score: 0.913915054400281
Test score: 0.38755663154605313
abs_loss 2.956541770714819
13
Train score: 0.9224689468012227
Test score: 0.4357489378519662
abs_loss 2.854505178583582
14
Train score: 0.9150521587199245
Test score: 0.38832996903752104
abs_loss 2.9792626233748543
15
Train score: 0.9168116087448686
Test score: 0.41094518685173886
abs_loss 2.9009215829419364
16
Train score: 0.9257621791531205
Test score: 0.4660272052513804
abs_loss 2.7923912653033423
17
Train score: 0.9194630022563942
Test score: 0.4258989575329335
abs_loss 2.89653196774183
18
Train score: 0.9210379882149394
Test score: 0.44200423773409137
abs_loss 2.8216449253835334
19
Train score: 0.9177612556615404
Test score: 0.4205659003907085
abs_loss 2.8848561287928134
20
Train score: 0.9108690224205813
Test score: 0.35624222894680013
abs_loss 3.0619975639130677
21
Train score: 0.9143156186649758
Test score: 0.3958791854523428
abs_loss 2.9379964813248134
22
Train score: 0.9215824204772729
Test score: 0.44201850543754695
abs_loss 2.8459766487256526
23
Train score: 0.9146991697157911
Test score: 0.3850272155123762
abs_loss 2.98470296083446
24
Train score: 0.9156782414769025
Test score: 0.41652312100588795
abs_loss 2.8777024278712893
25
Train score: 0.9253493383137172
Test score: 0.4672893214288679
abs_loss 2.793780330604094
26
Train score: 0.9197698760945141
Test score: 0.42468116611705087
abs_loss 2.8960539426544676
27
Train score: 0.9204948479463643
Test score: 0.4367254087810247
abs_loss 2.8459815188113207
28
Train score: 0.9151682297979609
Test score: 0.41488913996661525
abs_loss 2.898720864896227
29
Train score: 0.909856600331276
Test score: 0.3576499293494064
abs_loss 3.0546849166642356
30
Train score: 0.9128694653344764
Test score: 0.3928223883849141
abs_loss 2.9427112545419756
31
Train score: 0.9219194166608577
Test score: 0.4440344529839412
abs_loss 2.837857998580276
32
Train score: 0.9157766275603879
Test score: 0.3895406779221431
abs_loss 2.9815172775163474
33
Train score: 0.9181265455362752
Test score: 0.41151797417022984
abs_loss 2.895700526318347
34
Train score: 0.9250993226343344
Test score: 0.4711567420159749
abs_loss 2.76861248626467
35
Train score: 0.9206486035597896
Test score: 0.4280451274705117
abs_loss 2.890837902943492
36
Train score: 0.9202880162945606
Test score: 0.4407214671045959
abs_loss 2.831733499129689
37
Train score: 0.9157097767571237
Test score: 0.39918860533079015
abs_loss 2.9396247929666712
38
Train score: 0.9094625127252879
Test score: 0.3532603566333773
abs_loss 3.055248015600445
39
Train score: 0.9121613875251504
Test score: 0.3814650981511887
abs_loss 2.9768723265945134
40
Train score: 0.9200384541950395
Test score: 0.43192793646962757
abs_loss 2.8636788364110175
41
Train score: 0.9128780899709011
Test score: 0.3726207929801332
abs_loss 3.0137590399016267
42
Train score: 0.9159636718837045
Test score: 0.40714482970669763
abs_loss 2.901415414118134
43
Train score: 0.923719058372384
Test score: 0.4548410653470162
abs_loss 2.8012705339517865
44
Train score: 0.9187464191079948
Test score: 0.4097356114898375
abs_loss 2.9319799352373157
45
Train score: 0.9189503722598085
Test score: 0.4269817951488821
abs_loss 2.853327275202504
46
Train score: 0.9163489288025973
Test score: 0.3981944129872611
abs_loss 2.9429854000343587
47
Train score: 0.9100951720585463
Test score: 0.3545887741466107
abs_loss 3.0442223227048624
48
Train score: 0.9121625425262131
Test score: 0.3759527134005065
abs_loss 2.976727704036484
49
Train score: 0.9201946716170712
Test score: 0.43030612129153245
abs_loss 2.863075322840999
50
Train score: 0.9124023601127401
Test score: 0.3740129457615927
abs_loss 3.0051276192745147
51
Train score: 0.91511739246267
Test score: 0.4083939329812317
abs_loss 2.913953349475382
52
Train score: 0.9234573471306683
Test score: 0.45633990101813304
abs_loss 2.8032973795099676
53
Train score: 0.9188538366353607
Test score: 0.41511866297103317
abs_loss 2.9170543302345773
54
Train score: 0.9188547054039172
Test score: 0.425226576191564
abs_loss 2.8567111297490193"""
output_randomforrest3 = r"""1
Train score: 0.9217068847945763
Test score: 0.4506702899441173
abs_loss 2.827148049826422
2
Train score: 0.930775540316791
Test score: 0.5012352711334682
abs_loss 2.7248949389973105
3
Train score: 0.9291194985768726
Test score: 0.4966607662239446
abs_loss 2.7147759103403097
4
Train score: 0.9238011588736426
Test score: 0.46970470974645484
abs_loss 2.7797186399350413
5
Train score: 0.9239566228399789
Test score: 0.45804589744209845
abs_loss 2.796727010949365
6
Train score: 0.9312931511402109
Test score: 0.5078330272793058
abs_loss 2.693235040646847
7
Train score: 0.929170989730366
Test score: 0.486204858520877
abs_loss 2.7358298897770243
8
Train score: 0.9239222196679715
Test score: 0.45752888122360846
abs_loss 2.7944785396630896
9
Train score: 0.9234728226710056
Test score: 0.45579458552530483
abs_loss 2.807490921474435
10
Train score: 0.9292097114689187
Test score: 0.4992909299150754
abs_loss 2.716072929247253
11
Train score: 0.9273519408823676
Test score: 0.48677592717214
abs_loss 2.740116983105949
12
Train score: 0.9248451274435582
Test score: 0.46085267115875017
abs_loss 2.7880522529990803
13
Train score: 0.9244178933234075
Test score: 0.45930321700626575
abs_loss 2.794259610774154
14
Train score: 0.9299881314530374
Test score: 0.4996598570994316
abs_loss 2.734000891297228
15
Train score: 0.9271544612919621
Test score: 0.49820103994765663
abs_loss 2.711401051178075
16
Train score: 0.9252457113139757
Test score: 0.4675855356780697
abs_loss 2.7822326368427692
17
Train score: 0.9218805891908307
Test score: 0.44336010386713376
abs_loss 2.833484976953818
18
Train score: 0.9267531997617291
Test score: 0.48419446781920894
abs_loss 2.740511121321441
19
Train score: 0.9257804774985677
Test score: 0.4660239953152897
abs_loss 2.775688304888788
20
Train score: 0.922703727347878
Test score: 0.45119680154005115
abs_loss 2.821271820696828
21
Train score: 0.9218774543443757
Test score: 0.44586167771027096
abs_loss 2.8285762279951103
22
Train score: 0.927374771513009
Test score: 0.4804669049147319
abs_loss 2.7453533193305883
23
Train score: 0.9271124172492266
Test score: 0.47440018033876385
abs_loss 2.753726318348452
24
Train score: 0.9218093188064336
Test score: 0.4517709373794567
abs_loss 2.8162865843519342"""
# print(float(output_randomforrest2.split("\n")[33*4+3].split(' ')[1] ))
for i in range(len(output_randomforrest3.split('\n'))):
    if i % 4 == 3 and (float((output_randomforrest3.split("\n")[i]).split(' ')[1]) < 2.72): #
        print(i//4 + 1)
        print(output_randomforrest3.split("\n")[i])
        # print(' '.split(output.split("\n")[i]))


3
abs_loss 2.7147759103403097
6
abs_loss 2.693235040646847
10
abs_loss 2.716072929247253
15
abs_loss 2.711401051178075


In [76]:
for i in [2,5,9,14]:
    print(i+1,combinations[i])

3 (WindowsPath("C:/Users/20243625/OneDrive - TU Eindhoven/Desktop/group-4/docs/Sep's picklebestanden/protein dicts to use in gridsearch/dict ID to BLOSUM62 vector in 3 pieces"), WindowsPath('C:/Users/20243625/OneDrive - TU Eindhoven/Desktop/group-4/docs/mol representatie picklebestanden/train_molecule_combined_representation.pkl'))
6 (WindowsPath("C:/Users/20243625/OneDrive - TU Eindhoven/Desktop/group-4/docs/Sep's picklebestanden/protein dicts to use in gridsearch/dict ID to feature vector 2 in 2 pieces"), WindowsPath('C:/Users/20243625/OneDrive - TU Eindhoven/Desktop/group-4/docs/mol representatie picklebestanden/train_molecule_combined_representation.pkl'))
10 (WindowsPath("C:/Users/20243625/OneDrive - TU Eindhoven/Desktop/group-4/docs/Sep's picklebestanden/protein dicts to use in gridsearch/dict ID to feature vector in one-hot in 2 pieces"), WindowsPath('C:/Users/20243625/OneDrive - TU Eindhoven/Desktop/group-4/docs/mol representatie picklebestanden/train_molecule_combined_represen

In [17]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor,RandomForestRegressor

#selecting the preferred dicitionaries
molecule_features_dict_train = pickle.load(open(r"C:\Users\20243625\OneDrive - TU Eindhoven\Desktop\group-4\docs\mol representatie picklebestanden\train_molecule_combined_representation.pkl",'rb'))
molecule_features_dict_test = pickle.load(open(r"C:\Users\20243625\OneDrive - TU Eindhoven\Desktop\group-4\docs\mol representatie picklebestanden\test_molecule_combined_representation.pkl",'rb'))
protein_features_dict = pickle.load(open(r"C:\Users\20243625\OneDrive - TU Eindhoven\Desktop\group-4\docs\Sep's picklebestanden\protein dicts to use in gridsearch\dict ID to feature vector 2 in 2 pieces", 'rb'))
print("dictionaries loaded")
#loading the training set
train_df = pd.read_csv(r"C:\Users\20243625\OneDrive - TU Eindhoven\Desktop\group-4\data\train.csv")
test_df = pd.read_csv(r"C:\Users\20243625\OneDrive - TU Eindhoven\Desktop\group-4\data\test.csv")

X = []
y = []

#feature concatenation to combining each ligand-protein pair

for _, row in train_df.iterrows():
    smiles = row["molecule_SMILES"]
    protein = row["UniProt_ID"]
    affinity_score = row["affinity_score"]
    #quick check if all elements are available
    if smiles not in molecule_features_dict_train: 
        raise FileNotFoundError(
            f"The following SMILES exists in the trainingset but not in the molecule-features dictionary: {smiles}"
        )
    if protein not in protein_features_dict: 
        raise FileNotFoundError(
            f"The following Uniprot_ID exists in the trainingset but not in the protein-features dictionary: {protein}"
        )

    #feature concatenation
    if isinstance(molecule_features_dict_train[smiles], np.ndarray):
        molecule_features_dict_train[smiles] = molecule_features_dict_train[smiles].tolist()
    if isinstance(protein_features_dict[protein], np.ndarray):
        protein_features_dict[protein] = protein_features_dict[protein].tolist()
    combined = molecule_features_dict_train[smiles] + protein_features_dict[protein]

    #data seperation
    X.append(combined)
    y.append(affinity_score)

X = np.array(X, dtype=float)
y = np.array(y, dtype=float)

X_predict = []
for _, row in test_df.iterrows():
    smiles = row["molecule_SMILES"]
    protein = row["UniProt_ID"]
    #quick check if all elements are available
    if smiles not in molecule_features_dict_test: 
        raise FileNotFoundError(
            f"The following SMILES exists in the testset but not in the molecule-features dictionary: {smiles}"
        )
    if protein not in protein_features_dict: 
        raise FileNotFoundError(
            f"The following Uniprot_ID exists in the testset but not in the protein-features dictionary: {protein}"
        )

    #feature concatenation
    if isinstance(molecule_features_dict_test[smiles], np.ndarray):
        molecule_features_dict_test[smiles] = molecule_features_dict_test[smiles].tolist()
    if isinstance(protein_features_dict[protein], np.ndarray):
        protein_features_dict[protein] = protein_features_dict[protein].tolist()
    combined = molecule_features_dict_test[smiles] + protein_features_dict[protein]

    #data seperation
    X_predict.append(combined)
X_predict = np.array(X_predict, dtype=float)
print('length X_predict',len(X_predict))
print("feature concatenation complete")

#splitting the data in training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    #ONLY CHANGE THE TEST_SIZE BY PREFERENCE
    X, y, test_size=0.33, random_state=42 
)
print("data splitting complete")

#BELOW ARE OPTIONS FOR SCALING AND PCA, REMOVE DOCSTRINGS FOR
#THE PREFERRED OPTION(S)

#choose one of the following scaling option, or leave them out if preferred
"""
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_predict = scaler.transform(X_predict)
print("standard scaling complete")
"""

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print('len X',len(X))
X = scaler.fit_transform(X)
X_predict = scaler.transform(X_predict)
print("minmax scaling complete")

#apply PCA if preferred
r"""
ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. 
For supervised learning, you might want to consider 
sklearn.ensemble.HistGradientBoostingClassifier and Regressor 
which accept missing values encoded as NaNs natively. 
Alternatively, it is possible to preprocess the data, 
for instance by using an imputer transformer in a pipeline 
or drop samples with missing values. 
See https://scikit-learn.org/stable/modules/impute.html 
You can find a list of all estimators that handle NaN values 
at the following page: 
https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
"""
"""
pca = PCA(n_components=8)
X_train = pca.fit_transform(X_train)
X_test  = pca.transform(X_test)
print("PCA application complete")
"""
#NOW APPLY YOUR PREFERRED MODEL TYPE
r"""
ValueError: Input X contains NaN.
MLPRegressor does not accept missing values encoded as NaN natively. 
For supervised learning, you might want to consider 
sklearn.ensemble.HistGradientBoostingClassifier and Regressor 
which accept missing values encoded as NaNs natively. 
Alternatively, it is possible to preprocess the data, 
for instance by using an imputer transformer in a pipeline 
or drop samples with missing values. 
See https://scikit-learn.org/stable/modules/impute.html 
You can find a list of all estimators that handle NaN values 
at the following page: 
https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
"""

"""
model = MLPRegressor(
    hidden_layer_sizes=( 16, 8),
    activation='logistic',
    learning_rate='adaptive',
    max_iter=400,
    random_state=42
)
"""

model = RandomForestRegressor(n_estimators=500,
                              max_features='sqrt',
                              max_depth = 400
)


# model = HistGradientBoostingRegressor(
#     loss= "absolute_error",
#     learning_rate= 0.1,
#     max_iter= 100
# )

model.fit(X_train, y_train)
print("Train score:", model.score(X_train, y_train))
print("Test score:", model.score(X_test, y_test))
print('abs_loss',np.average(abs(model.predict(X_test)-y_test)))

#FOR MAKING THE ACTUAL PREDICTIONS

model.fit(X, y)
y_predict = model.predict(X_predict)

submission = pd.DataFrame({
    "ID": test_df["ID"],
    "affinity_score": y_predict
})
submission.to_csv("data/submission2.csv", index=False)

dictionaries loaded
length X_predict 34626
feature concatenation complete
data splitting complete
len X 14839


ValueError: X has 854 features, but MinMaxScaler is expecting 853 features as input.

In [14]:
print(X_predict[-1])

[ 1.21740757e+01  1.97405149e-01  1.36250000e+01  5.35652000e+02
  0.00000000e+00 -4.92236956e-01  1.02500000e+00  1.64776751e+01
  1.01260722e+01  2.14239096e+00 -2.15826678e+00  2.31433929e+00
 -2.24536150e+00  5.94712585e+00 -1.15752168e-01  3.65400968e+00
  2.47479351e+09  0.00000000e+00  1.00000000e+00  1.00000000e+00
  3.00000000e+00  2.00000000e+00  5.00000000e+00  0.00000000e+00
  0.00000000e+00  3.00000000e+00  1.00000000e+01  0.00000000e+00
  1.00000000e+00  1.00000000e+00  0.00000000e+00  0.00000000e+00
  6.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  4.00000000e+00  1.00000000e+00  0.00000000e+00
  1.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  4.00000000e+00  3.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  3.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  2.00000000e+00
  0.00000000e+00  1.00000