# Baseline Algorithms

#### Import the required libraries

In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [96]:
from sklearn import model_selection
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.naive_bayes import GaussianNB
from scipy import stats

In [97]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

### Read the dataset

In [98]:
# read the data
df = pd.read_csv("../experiments/ruby.csv", index_col=0)

###### Prepare the data

In [99]:
# delete unnamed column
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
# convert all columns of DataFrame
df = df.dropna()
# find the objects columns
cols = df.columns[df.dtypes.eq('object')]
# convert to numeric these columns
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

In [100]:
df.shape

(131266, 47)

### Set the baseline algorithms

In [101]:
seed = 7
array = df.values
X = array[:,0:46]
Y = array[:,46]
validation_size = 0.2
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [102]:
# set the models
models = []
models.append(('LR', LinearRegression()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('CART', DecisionTreeRegressor()))
models.append(('SVM', LinearSVR()))
models.append(('RF', RandomForestRegressor()))
models.append

<function list.append(object, /)>

In [103]:
names = []
for name, model in models:
    names.append(name)

In [104]:
# evaluate each model in turn
def model_generation(evaluation_metric):
    results = []
    for name, model in models:
        kfold = model_selection.KFold(n_splits=10, random_state=seed)
        cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=evaluation_metric)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
    return(results)

In [105]:
models_execution = {}

#metrics = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']
metrics = ['r2']

for metric in metrics:
    models_execution[metric] = model_generation(metric)



LR: 0.646561 (0.008363)




KNN: 0.819254 (0.008252)




CART: 0.828292 (0.009809)




SVM: -2533.850922 (7554.512553)
RF: 0.882215 (0.008252)
