Libraries:
 - conda install pandas

In [18]:
import pandas as pd 

df = pd.read_csv("../data/computers.csv")
df = df.filter(items=df.columns[0:7])


df = df.round(2)

df.describe(include='all')

Unnamed: 0,price,speed,hd,ram,screen,cd,multi
count,6259.0,6259.0,6259.0,6259.0,6259.0,6259,6259
unique,,,,,,2,2
top,,,,,,no,no
freq,,,,,,3351,5386
mean,2219.57661,52.011024,416.601694,8.286947,14.608723,,
std,580.803956,21.157735,258.548445,5.631099,0.905115,,
min,949.0,25.0,80.0,2.0,14.0,,
25%,1794.0,33.0,214.0,4.0,14.0,,
50%,2144.0,50.0,340.0,8.0,14.0,,
75%,2595.0,66.0,528.0,8.0,15.0,,


For mixed data types provided via a DataFrame, the default is to return only an analysis of numeric columns. If the dataframe consists only of object and categorical data without any numeric columns, the default is to return an analysis of both the object and categorical columns. If include='all' is provided as an option, the result will include a union of attributes of each type.

In [2]:
df['cd'].value_counts()

no     3351
yes    2908
Name: cd, dtype: int64

In [3]:
df['multi'].value_counts()

no     5386
yes     873
Name: multi, dtype: int64

---

In [4]:
df['cd'] = pd.Categorical(df['cd'], categories=df['cd'].unique()).codes
df['multi'] = pd.Categorical(df['multi'], categories=df['multi'].unique()).codes
test_set = df.sample(n=1000, random_state=0)
main_set = df.drop(test_set.index)

In [20]:
main_set.describe()

Unnamed: 0,price,speed,hd,ram,screen,cd,multi
count,5259.0,5259.0,5259.0,5259.0,5259.0,5259.0,5259.0
mean,2224.561894,52.235026,417.625975,8.340749,14.618178,0.464537,0.139
std,579.301211,21.218069,258.554273,5.658289,0.91012,0.498788,0.345979
min,999.0,25.0,80.0,2.0,14.0,0.0,0.0
25%,1795.0,33.0,214.0,4.0,14.0,0.0,0.0
50%,2154.0,50.0,345.0,8.0,14.0,0.0,0.0
75%,2595.0,66.0,528.0,8.0,15.0,1.0,0.0
max,5399.0,100.0,2100.0,32.0,17.0,1.0,1.0


---

In [5]:
X = main_set.loc[:, main_set.columns != 'price'].to_numpy()
y = main_set['price'].to_numpy()
X_test = test_set.loc[:, test_set.columns != 'price'].to_numpy()
y_test = test_set['price'].to_numpy()

In [22]:
import numpy as np
from sklearn.model_selection import RepeatedKFold, cross_validate, GridSearchCV, RandomizedSearchCV

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LinearRegression

def mean_squared_error_(ground_truth, predictions):
    return np.sqrt(mean_squared_error(ground_truth, predictions))
RMSE = make_scorer(mean_squared_error_, greater_is_better=False) 


Tuning parameter 'shrinkage' was held constant at a value of 0.1

Tuning parameter 'n.minobsinnode' was held constant at a value of 10

RMSE was used to select the optimal model using the smallest value.

The final values used for the model were n.trees = 100, interaction.depth = 2, shrinkage = 0.1 and n.minobsinnode = 10.

In [7]:
rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=0)

clf = GradientBoostingRegressor() 
param_distributions = {'learning_rate': [0.1], 'min_samples_leaf': [10], 'n_estimators': [50, 100], 'max_depth': [1, 2]}
tmp = RandomizedSearchCV(clf, param_distributions, n_iter=4, scoring=RMSE, cv=rkf, random_state=0).fit(X, y)

all = pd.DataFrame() 

In [8]:
for param_id in range(0, len(tmp.cv_results_['params'])):
    tmp_test_scores = []
    for split_scores_id in range(0, 100): 
        tmp_test_scores.append(tmp.cv_results_[f'split{split_scores_id}_test_score'][param_id])
    params = tmp.cv_results_["params"][param_id]
    params_list = [params['learning_rate'], params['max_depth'], params['min_samples_leaf'], params['n_estimators']]
    all = all.append({'Model': f'gb[,{",".join(str(x) for x in params_list)},]', 'RMSE': -1*min(tmp_test_scores), "SD": np.std(tmp_test_scores)}, ignore_index=True)

In [23]:

parameters = {'n_neighbors': [1, 2, 3, 5, 7, 10]}
tmp = GridSearchCV(KNeighborsRegressor(), parameters, cv=rkf, scoring=RMSE).fit(X, y)


In [24]:
bestNNModel = tmp.best_estimator_
for param_id in range(0, len(tmp.cv_results_['params'])):
    tmp_test_scores = []
    for split_scores_id in range(0, 100): 
        tmp_test_scores.append(tmp.cv_results_[f'split{split_scores_id}_test_score'][param_id])
    all = all.append({'Model': f'nn[,{tmp.cv_results_["params"][param_id]["n_neighbors"]},]', 'RMSE': -1*min(tmp_test_scores), "SD": np.std(tmp_test_scores)}, ignore_index=True)

In [25]:
bestNNModel

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                     weights='uniform')

In [11]:
clf = LinearRegression()

tmp = cross_validate(clf, X, y, cv=rkf, scoring=RMSE)
all = all.append({'Model': 'lm', 'RMSE': -1*min(tmp['test_score']), "SD": np.std(tmp['test_score'])}, ignore_index=True)

In [12]:
all

Unnamed: 0,Model,RMSE,SD
0,"gb[,0.1,1,10,50,]",465.734108,17.891289
1,"gb[,0.1,1,10,100,]",447.622174,17.133322
2,"gb[,0.1,2,10,50,]",433.891617,17.185479
3,"gb[,0.1,2,10,100,]",407.898817,15.229875
4,"nn[,1,]",423.044016,18.534263
5,"nn[,2,]",384.245843,15.759044
6,"nn[,3,]",371.771094,14.135298
7,"nn[,5,]",370.264823,14.634447
8,"nn[,7,]",371.455809,15.485513
9,"nn[,10,]",380.230663,16.406543


In [26]:
all.sort_values('RMSE')

Unnamed: 0,Model,RMSE,SD
7,"nn[,5,]",370.264823,14.634447
8,"nn[,7,]",371.455809,15.485513
6,"nn[,3,]",371.771094,14.135298
9,"nn[,10,]",380.230663,16.406543
5,"nn[,2,]",384.245843,15.759044
3,"gb[,0.1,2,10,100,]",407.898817,15.229875
4,"nn[,1,]",423.044016,18.534263
11,"nn[,1,]",423.044016,18.534263
2,"gb[,0.1,2,10,50,]",433.891617,17.185479
12,"nn[,2,]",440.353185,18.082168


In [14]:
y_predicted = bestNNModel.predict(X_test)
round(np.sqrt(np.mean((y_predicted - y_test)**2)), 2)

320.45