# Initialize datasets

In [1]:
# Data files available to this notebook and, at the matching position, the
# name of each file's label (target) column.
dataset_names = [
    "prostate.data.txt",
    "HousingData.csv",
]
label_column_names = [
    "lpsa",
    "MEDV",
]

# Regressor identifiers that are handed to train_model further down.
model_names = [
    "LinearRegressor",
    "SVMRegressor",
    "SGDRegressor",
    "RandomForestRegressor",
    "AdaBoostRegressor",
    "MLPRegressor",
]

In [3]:
# Position of the dataset to analyse in dataset_names / label_column_names:
#   0 -> prostate cancer, 1 -> Boston Housing Data
i = 0
dataset_name, label_column_name = dataset_names[i], label_column_names[i]

In [4]:
from src.data.make_dataset import import_clean_data, feature_select, standardizer, split_data_train_test

# Forwarded to split_data_train_test; presumably the fraction of rows held
# out for testing — TODO confirm against src.data.make_dataset.
test_size = 0.3

# Pipeline, in execution order: load and clean the file, standardize it, then
# run feature selection (PCA, per the original note) against the label column.
# NOTE(review): standardization runs on the full frame before the train/test
# split. If standardizer fits its statistics on that data, the test rows
# influence training — confirm whether this is intended.
data_df = feature_select(
    standardizer(import_clean_data(dataset_name)),
    label_column_name,
)
X_train, X_test, y_train, y_test = split_data_train_test(
    data_df, label_column_name, test_size
)

# Linear Regression

In [5]:
# Pick the model to train in this section; the value is one of the entries of
# model_names and is passed to train_model in the next cell.
model_name = "LinearRegressor"

In [6]:
from src.models.train_model import train_model
# Settings for the single-model run. They are now actually passed to
# train_model; previously the call hard-coded `{}` and `10`, so editing these
# two variables had no effect.
parameters = {}  # use default model parameters
n_splits = 10  # number of splits for K-Fold method
model = train_model(model_name, parameters, X_train, y_train, n_splits)

Training time:  0.049574851989746094       best score:   0.5250745599486663     best params:  {}


In [7]:
from src.visualization.performances import eval_linear
# Predict on the test split, then report the value returned by eval_linear
# (printed as the r2 score).
y_pred = model.predict(X_test)
r2_on_test = eval_linear(y_test, y_pred)
print(f"r2 score: {r2_on_test}")

r2 score: 0.5609815590935237


# Try multiple models with default parameters

In [8]:
# Models compared in this section. This repeats the list from the first cell;
# keep the two in sync when adding or removing a model.
model_names = [
    "LinearRegressor",
    "SVMRegressor",
    "SGDRegressor",
    "RandomForestRegressor",
    "AdaBoostRegressor",
    "MLPRegressor",
]

In [9]:
# Train every model in model_names; trained_models[k] is the fitted model for
# model_names[k], and the validation cell relies on that ordering.
#
# A comprehension is used so the iteration variable does not leak into the
# notebook namespace. The previous for-loop rebound the globals `model_name`
# and `model`, so re-running the Linear Regression evaluation cell afterwards
# would silently score the last model trained here instead.
#
# Each call gets a fresh empty dict (default parameters) and the K-Fold split
# count configured in the Linear Regression section.
trained_models = [
    train_model(candidate_name, {}, X_train, y_train, n_splits)
    for candidate_name in model_names
]

Training time:  0.0645749568939209       best score:   0.5394928874366907     best params:  {}
Training time:  2.560211181640625       best score:   0.5127892199899098     best params:  {'C': 1, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}
Training time:  0.16758203506469727       best score:   0.5173653268849222     best params:  {'fit_intercept': False, 'penalty': 'l1'}
Training time:  11.670139789581299       best score:   0.4266703252566425     best params:  {'max_depth': 5, 'n_estimators': 70}
Training time:  10.007711172103882       best score:   0.4988243556585247     best params:  {'learning_rate': 0.5, 'n_estimators': 150}




Training time:  28.00094199180603       best score:   0.5297319876337113     best params:  {'activation': 'logistic', 'hidden_layer_sizes': (128, 256)}


# Validation

In [None]:
from sklearn.metrics import mean_squared_error

# Report the mean squared error on the test split for every trained model
# (lower is better). The pairs rely on trained_models being in the same order
# as model_names.
#
# The pairs are iterated with zip rather than an index named `i`, which would
# overwrite the dataset index `i` chosen at the top of the notebook.
for fitted_name, fitted_model in zip(model_names, trained_models):
    score = mean_squared_error(y_test, fitted_model.predict(X_test))
    print("Model name : ", fitted_name, "    Score : ", score)