In [2]:
%cd ..

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import sys
from data_preparation import *
import torch
import torch.nn as nn
import torch.optim as optim


In [5]:
PROTEIN = 'cyp2d6'
df = prepare_df(PROTEIN)
X, y = prepare_np_xy(df)

In [6]:
from sklearn.ensemble import RandomForestRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

grid = { 
    'n_estimators': [200,300,400,500],
    'max_features': ['sqrt','log2'],
    'max_depth' : [15,16,17,18,19,20,21,22],
    'random_state' : [18, 20, 48]
}

CV_rfr = GridSearchCV(estimator=RandomForestRegressor(), param_grid=grid, cv= 5)
CV_rfr.fit(np.array(X_train), np.array(y_train))

In [8]:
rfs = []
for train_idxs, test_idxs in get_kfold_split(X, y, 5):
    X_train, X_test = X[train_idxs], X[test_idxs]
    y_train, y_test = y[train_idxs], y[test_idxs]

    rf = RandomForestRegressor(n_estimators = 300, max_features = 'sqrt', max_depth = 18, random_state = 18).fit(X_train, y_train)

    rfs.append((rf, X_train, X_test, y_train, y_test))

mses = []
for rf, X_train, X_test, y_train, y_test in rfs:
    prediction = rf.predict(X_test)
    mse = mean_squared_error(y_test, prediction)
    mses.append(mse)
    rmse = mse**.5
    print("mse: ", mse)
    print("rmse: ", rmse)

print("\navg: ", np.mean(mses))
print("std: ", np.std(mses))

mse:  0.6297138357104747
rmse:  0.7935451062860098
mse:  0.5191693620258606
rmse:  0.720534081099472
mse:  0.5830932876699257
rmse:  0.7636054528812152
mse:  0.5762980012170471
rmse:  0.759142938593943
mse:  0.5850109922617616
rmse:  0.7648601128714725

avg:  0.578657095777014
std:  0.035245420371088326


In [11]:
setups = []
for train_idxs, test_idxs in get_kfold_split(X, y, 5):
    X_train, X_test = X[train_idxs], X[test_idxs]
    y_train, y_test = y[train_idxs], y[test_idxs]

    input_dim = X_train.shape[1]

    model = nn.Sequential(
        nn.Linear(input_dim, 64),
        nn.BatchNorm1d(64),
        nn.LeakyReLU(),
        nn.Linear(64, 32),
        nn.LeakyReLU(),
        nn.Linear(32, 1),
    )

    loss_function = nn.MSELoss()

    optimizer = optim.SGD(model.parameters(), lr=0.001)

    num_epochs = 3000

    train_data = torch.from_numpy(X_train).to(torch.float32)
    train_labels = torch.from_numpy(y_train).to(torch.float32)

    for epoch in range(num_epochs):

        inputs = train_data
        labels = train_labels.unsqueeze(-1)
        
        optimizer.zero_grad()

        outputs = model(inputs)

        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()
        
        epoch_loss = loss / len(train_data)

        if epoch % (num_epochs/10) == 0:
            print(100 * epoch/num_epochs, '%', end='   ')
            epoch_loss = loss / len(train_data)
            print(f"Epoch {epoch} loss: {epoch_loss:.6f}")

    setups.append((model, X_train, X_test, y_train, y_test))
    print()

mses = []
for model, X_train, X_test, y_train, y_test in setups:
    X_test_torch = torch.from_numpy(X_test).to(torch.float32)
    y_test_torch = torch.from_numpy(y_test).to(torch.float32).unsqueeze(-1)
    prediction = model(X_test_torch)
    mse = mean_squared_error(y_test_torch.detach().numpy(), prediction.detach().numpy())
    mses.append(mse)
    rmse = mse**.5
    print("mse: ", mse)
    print("rmse: ", rmse)

print("\navg: ", np.mean(mses))
print("std: ", np.std(mses))

0.0 %   Epoch 0 loss: 0.006086
10.0 %   Epoch 300 loss: 0.000246
20.0 %   Epoch 600 loss: 0.000215
30.0 %   Epoch 900 loss: 0.000203
40.0 %   Epoch 1200 loss: 0.000195
50.0 %   Epoch 1500 loss: 0.000189
60.0 %   Epoch 1800 loss: 0.000184
70.0 %   Epoch 2100 loss: 0.000180
80.0 %   Epoch 2400 loss: 0.000176
90.0 %   Epoch 2700 loss: 0.000173

0.0 %   Epoch 0 loss: 0.005520
10.0 %   Epoch 300 loss: 0.000261
20.0 %   Epoch 600 loss: 0.000231
30.0 %   Epoch 900 loss: 0.000217
40.0 %   Epoch 1200 loss: 0.000208
50.0 %   Epoch 1500 loss: 0.000201
60.0 %   Epoch 1800 loss: 0.000195
70.0 %   Epoch 2100 loss: 0.000191
80.0 %   Epoch 2400 loss: 0.000186
90.0 %   Epoch 2700 loss: 0.000183

0.0 %   Epoch 0 loss: 0.006152
10.0 %   Epoch 300 loss: 0.000255
20.0 %   Epoch 600 loss: 0.000223
30.0 %   Epoch 900 loss: 0.000210
40.0 %   Epoch 1200 loss: 0.000201
50.0 %   Epoch 1500 loss: 0.000194
60.0 %   Epoch 1800 loss: 0.000189
70.0 %   Epoch 2100 loss: 0.000185
80.0 %   Epoch 2400 loss: 0.000181
90.0