## Train-val-test split
Split data into respective sets

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the prepared dataset; the first CSV column is used as the row index.
# A forward slash keeps the relative path portable across operating systems and
# avoids a backslash being parsed as the start of a string escape sequence.
coffee_df = pd.read_csv('data/coffee_desk_dataset_final.csv', index_col=0)
coffee_df

Unnamed: 0,brewing_method,roast,pure_arabica,grind,Fermented_closedtank,price_per_kg
0,drip (alternative brewing methods),light,True,beans,False,52.22
1,drip (alternative brewing methods),medium,True,beans,False,31.92
2,drip (alternative brewing methods),light,True,beans,False,39.20
3,drip (alternative brewing methods),light,True,beans,False,39.20
4,drip (alternative brewing methods),dark,True,beans,False,35.20
...,...,...,...,...,...,...
857,drip (alternative brewing methods),light,True,beans,False,73.33
858,espresso,light,False,beans,False,50.00
859,drip (alternative brewing methods),light,True,beans,False,36.00
860,drip (alternative brewing methods),light,True,beans,False,25.00


In [3]:
# Target variable: the price per kilogram.
y_df = coffee_df['price_per_kg']
# Predictors: every remaining column of the dataset.
X_df = coffee_df.drop(columns='price_per_kg')

In [4]:
# Step 1: keep 80% of the rows for training and set 20% aside as a hold-out pool.
# A fixed random_state makes the shuffled split identical on every run.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_df, y_df, test_size=0.2, random_state=42
)

# Step 2: cut the hold-out pool in half -> validation and test (10% of the data each).
X_validation, X_test, y_validation, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)


## Encode data
Convert categorical data into binary vectors

In [5]:
from sklearn.preprocessing import OneHotEncoder

In [6]:
# All predictors are categorical, so the whole frame is one-hot encoded.
# The encoder is fitted on the training split only; categories that appear
# only in later splits are ignored (encoded as all zeros) instead of raising.
encoder = OneHotEncoder(handle_unknown="ignore").fit(X_train)
encoder

OneHotEncoder(handle_unknown='ignore')

In [7]:
# Apply the encoder fitted on the training split to all three splits.
# NOTE: the raw frames are overwritten by their encoded versions, so this cell
# must not be re-run without re-running the split cell first.
X_train, X_validation, X_test = [
    encoder.transform(split) for split in (X_train, X_validation, X_test)
]

# Regression Models

1. Train Models - 3 linear models were chosen to compare their performance on the data
2. Evaluate Models using mean squared error, mean absolute error and R squared

In [8]:
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score

In [9]:
import matplotlib.pyplot as plt
import numpy as np

def plot_predictions(y_pred, y_true):
    """Scatter predictions and true labels against their sample position.

    Args:
        y_pred: predicted values; its length determines the x-axis positions.
        y_true: true values, drawn at the same x positions as ``y_pred``.

    Opens a new figure and shows it; nothing is returned.
    """
    sample_positions = np.arange(len(y_pred))

    plt.figure()
    plt.scatter(sample_positions, y_pred, c='r', label='predictions')
    plt.scatter(sample_positions, y_true, c='b', label='true labels', marker='x')
    plt.xlabel('Sample numbers')
    plt.ylabel('Values')
    plt.legend()
    plt.show()

In [13]:
# Candidate linear models, keyed by display name.
# SGDRegressor shuffles the training data between epochs, so it is seeded
# (same seed as the data split) to keep its scores reproducible across runs.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'SGDRegressor': SGDRegressor(random_state=42),
}
predictions_by_model = {}  # model name -> predictions on the test split
validation_r2_scores = {}  # model name -> R^2 on the validation split

for name, model in models.items():
    # Train on the training split only.
    model.fit(X_train, y_train)

    # Models are compared on the validation split.
    y_validation_pred = model.predict(X_validation)
    validation_r2_scores[name] = metrics.r2_score(y_validation, y_validation_pred)

    # Test predictions are stored but deliberately not scored here: the test
    # split is only used to check the final performance of the model chosen
    # among all candidates.
    predictions_by_model[name] = model.predict(X_test)

In [15]:
# Validation R^2 of each trained model, keyed by model name.
validation_r2_scores

{'LinearRegression': 0.4894543840561524,
 'Ridge': 0.48899543913289356,
 'SGDRegressor': 0.4855609062917867}

In [24]:
# One row per model, with a single column (labelled 0) holding the validation R^2.
val_scores = pd.DataFrame(validation_r2_scores, index=[0]).T
# Display the models ranked from best to worst validation score.
val_scores.sort_values(by=0, ascending=False)

Unnamed: 0,0
LinearRegression,0.489454
Ridge,0.488995
SGDRegressor,0.485561


In [25]:
# Final check of the selected model (LinearRegression) on the test split.
# The test score coming out slightly higher than the validation score is
# attributed to variance: with splits this small the difference is down to chance.
test_pred = predictions_by_model['LinearRegression']
test_r2 = metrics.r2_score(y_test, test_pred)
print(test_r2)

0.4942085356464391


## Adding polynomial features to dataset

In [41]:
from sklearn import preprocessing

def polynomial_datasets(degree: int, *datasets):
    """Expand each dataset with polynomial features of the given degree.

    The transformer is fitted once, on the first dataset (pass the training
    split first), and then only applied to the remaining ones. Every dataset
    is therefore expanded by the same fitted transformer, and a dataset whose
    number of input features differs from the first one is rejected instead
    of being silently re-fitted.

    Args:
        degree: degree of the polynomial features to generate.
        *datasets: feature matrices to expand; the first one is used for fitting.

    Returns:
        A list with one expanded matrix per input dataset, in the same order
        (an empty list when no dataset is given).
    """
    if not datasets:
        return []

    polynomial = preprocessing.PolynomialFeatures(degree=degree)
    fit_dataset, *other_datasets = datasets

    expanded = [polynomial.fit_transform(fit_dataset)]
    expanded.extend(polynomial.transform(dataset) for dataset in other_datasets)
    return expanded

In [110]:
# Expand the encoded train / validation / test matrices with degree-2 terms.
poly_splits = polynomial_datasets(2, X_train, X_validation, X_test)
X_train_poly, X_validation_poly, X_test_poly = poly_splits

# The 5 original categorical predictors end up as 91 columns after one-hot
# encoding followed by the degree-2 polynomial expansion.
X_train_poly.shape

(669, 91)

In [119]:
# `models` maps a display name to an estimator, so iterate over its values:
# iterating over the dict itself would yield the name strings, which cannot be fitted.
# NOTE(review): test metrics are printed for every candidate here; base model
# selection on the validation columns only.
for model in models.values():
    # Train each of the models on the polynomial training features
    model.fit(X_train_poly, y_train)

    # Make predictions on every split
    y_train_pred = model.predict(X_train_poly)
    y_validation_pred = model.predict(X_validation_poly)
    y_test_pred = model.predict(X_test_poly)

    # EVALUATION:

    # Train loss
    train_mse = metrics.mean_squared_error(y_train, y_train_pred)
    train_mae = metrics.mean_absolute_error(y_train, y_train_pred)
    train_r2 = metrics.r2_score(y_train, y_train_pred)

    # Validation loss
    validation_mse = metrics.mean_squared_error(y_validation, y_validation_pred)
    validation_mae = metrics.mean_absolute_error(y_validation, y_validation_pred)
    validation_r2 = metrics.r2_score(y_validation, y_validation_pred)

    # Test loss
    test_mse = metrics.mean_squared_error(y_test, y_test_pred)
    test_mae = metrics.mean_absolute_error(y_test, y_test_pred)
    test_r2 = metrics.r2_score(y_test, y_test_pred)

    print(
        f"{model.__class__.__name__}:\n"
        f"\t\tMean squared error: Train: {train_mse}, Validation {validation_mse} , Test {test_mse}\n"
        f"\t\tMean absolute error: Train: {train_mae}, Validation {validation_mae} , Test {test_mae}\n"
        f"\t\tR squared: Train: {train_r2}, Validation {validation_r2} , Test {test_r2}\n"
        f"\tModel's bias: {model.intercept_}"
    )

LinearRegression:
		Mean squared error: Train: 122.52588745815115, Validation 118.53740394015227 , Test 126.50246505520077
		Mean absolute error: Train: 7.874504095873349, Validation 7.6655383512565 , Test 8.08284606214806
		R squared: Train: 0.5377850632110095, Validation 0.5360127086020513 , Test 0.5388878974490876
	Model's bias: 31.030862550233625
Ridge:
		Mean squared error: Train: 122.76575621158102, Validation 119.00183708536403 , Test 126.51843975831676
		Mean absolute error: Train: 7.912742182759642, Validation 7.719744739385063 , Test 8.105163514362955
		R squared: Train: 0.5368801856948837, Validation 0.5341947923163949 , Test 0.5388296683154639
	Model's bias: 31.976091674607847
SGDRegressor:
		Mean squared error: Train: 126.12628519105185, Validation 122.81628920248984 , Test 129.4264005945734
		Mean absolute error: Train: 8.132962235358724, Validation 7.9778516790296825 , Test 8.28760977510171
		R squared: Train: 0.5242029733763505, Validation 0.5192640004552374 , Test 0.52

**Observations:** The R^2 has visibly increased with the use of polynomial features of 2nd degree; nevertheless, all three models yield similar results.
Moreover, the model's bias has decreased.

## Cross Validation
Using sklearn's `cross_val_score` with 5 folds, to ensure the results are not influenced by the initial split of the data.

In [70]:
# For cross-validation only a 10% test split is held out; the remaining 90% is
# used as a combined train+validation pool. The fixed random_state keeps the
# shuffled split identical on every run.
X_train_validation, X_test, y_train_validation, y_test = train_test_split(
    X_df, y_df, test_size=0.1, random_state=42
)

# One-hot encode the pool with the already fitted encoder.
# NOTE: X_test / y_test are replaced by this new split, and X_test is left unencoded here.
X_train_validation = encoder.transform(X_train_validation)

In [88]:
# Degree-2 polynomial expansion of the encoded train+validation pool.
polynomial = preprocessing.PolynomialFeatures(degree=2)
polynomial.fit(X_train_validation)
X_train_validation_poly = polynomial.transform(X_train_validation)
X_train_validation_poly.shape

(753, 91)

In [115]:
for model in models:
    scores_r2 = cross_val_score(model, X_train_validation, y_train_validation, scoring='r2', cv=5)
    print(
        f"{model.__class__.__name__}: ",
        scores_r2,
        f"model's bias: {model.intercept_}"
    )

LinearRegression:  [0.3449723  0.49843856 0.42078476 0.43668589 0.4888519 ] model's bias: 31.030862550233625
Ridge:  [0.34671939 0.49765014 0.42090051 0.43616885 0.48906956] model's bias: 31.976091674607847
SGDRegressor:  [0.35298728 0.48676187 0.41771225 0.43456289 0.49024491] model's bias: [0.04613564]


In [116]:
for model in models:
    scores_r2 = cross_val_score(model, X_train_validation_poly, y_train_validation, scoring='r2', cv=5)
    print(
        f"{model.__class__.__name__}: ",
        scores_r2,
        f"model's bias: {model.intercept_}"
    )

LinearRegression:  [0.44262793 0.52748237 0.4797011  0.48500845 0.54020545] model's bias: 31.030862550233625
Ridge:  [0.44184882 0.53391246 0.48057261 0.47823286 0.53987915] model's bias: 31.976091674607847
SGDRegressor:  [0.42212874 0.53788876 0.47374473 0.46660963 0.5316073 ] model's bias: [0.04613564]


**Observations:** The cross-validation scores are at a similar level across the folds, with the exception of one fold; this suggests that the model's performance is not strongly biased by the particular split of the data.