## Train-val-test split

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [6]:
# Load the dataset; the first CSV column is used as the row index.
# The path uses a forward slash so it resolves on every OS (a backslash separator
# only works on Windows and '\c' is not a valid string escape).
coffee_df = pd.read_csv('data/coffee_desk_dataset_final.csv', index_col=0)
coffee_df

Unnamed: 0,brewing_method,roast,pure_arabica,grind,Fermented_closedtank,price_per_kg
0,drip (alternative brewing methods),light,True,beans,False,52.22
1,drip (alternative brewing methods),medium,True,beans,False,31.92
2,drip (alternative brewing methods),light,True,beans,False,39.20
3,drip (alternative brewing methods),light,True,beans,False,39.20
4,drip (alternative brewing methods),dark,True,beans,False,35.20
...,...,...,...,...,...,...
857,drip (alternative brewing methods),light,True,beans,False,73.33
858,espresso,light,False,beans,False,50.00
859,drip (alternative brewing methods),light,True,beans,False,36.00
860,drip (alternative brewing methods),light,True,beans,False,25.00


In [20]:
# Target variable: the 'price_per_kg' column.
y_df = coffee_df['price_per_kg']
# Predictors: every column except the target.
X_df = coffee_df.drop(columns=['price_per_kg'])

In [23]:
# Fixed integer seed so both splits are identical on every run.
# (A boolean is also accepted because bool is an int subclass -- True acts as
# seed 1 -- but it hides the fact that this argument is a seed, not a switch.)
# Step 1: hold out 10% of all rows as the test set.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_df, y_df, test_size=0.1, random_state=1
)
# Step 2: take 10% of the remaining rows as the validation set; the rest is training data.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train_full, y_train_full, test_size=0.1, random_state=1
)


In [24]:
# Show the shape of every partition, one per line: feature sets first, then targets.
print(
    *(part.shape for part in (X_train, X_test, X_validation, y_train, y_test, y_validation)),
    sep="\n",
)

(677, 5)
(84, 5)
(76, 5)
(677,)
(84,)
(76,)


## Encode data

In [18]:
from sklearn.preprocessing import OneHotEncoder

In [25]:
# All predictors are categorical, so every column is one-hot encoded.
# The encoder is fitted on the training split only; handle_unknown="ignore"
# keeps transform from raising on categories that were not seen during fitting.
encoder = OneHotEncoder(handle_unknown="ignore").fit(X_train)
encoder

OneHotEncoder(handle_unknown='ignore')

In [29]:
# Encode all three splits with the encoder fitted on the training data.
# NOTE: the encoded results replace the raw frames under the same names, so
# re-running this cell would feed already-encoded data back into the encoder;
# re-run the split cell first if this cell has to be executed again.
X_train, X_validation, X_test = (
    encoder.transform(split) for split in (X_train, X_validation, X_test)
)

# Regression Models

In [44]:
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn import metrics

# Fit three regressors on the encoded training split.
LR_model = LinearRegression().fit(X_train, y_train)  # ordinary least squares
L2_model = Ridge().fit(X_train, y_train)  # least squares with an L2 penalty (default alpha)
# SGDRegressor shuffles the training data by default, so without a seed its
# coefficients -- and every loss computed from them -- change from run to run.
# Pinning random_state makes the fit reproducible after a kernel restart.
SGD_model = SGDRegressor(random_state=1).fit(X_train, y_train)

In [45]:
def calculate_loss(model, X, y):
    """Return the mean squared error of ``model``'s predictions for ``X`` against ``y``.

    Parameters
    ----------
    model : object exposing a ``predict(X)`` method.
    X : features passed unchanged to ``model.predict``.
    y : true target values that the predictions are compared with.
    """
    predictions = model.predict(X)
    return metrics.mean_squared_error(y, predictions)

In [46]:
# Mean squared error of the linear regression model on each split.
print(
    f"Training loss: {calculate_loss(LR_model, X_train, y_train)}",
    f"Validation loss: {calculate_loss(LR_model, X_validation, y_validation)}",
    f"Test loss: {calculate_loss(LR_model, X_test, y_test)}",
    sep="\n",
)

Training loss: 149.88620582744136
Validation loss: 116.3509686375882
Test loss: 90.94224112878398


In [49]:
# Mean squared error of the ridge (L2) model on each split.
print(
    f"Training loss: {calculate_loss(L2_model, X_train, y_train)}",
    f"Validation loss: {calculate_loss(L2_model, X_validation, y_validation)}",
    f"Test loss: {calculate_loss(L2_model, X_test, y_test)}",
    sep="\n",
)

Training loss: 149.89167872538317
Validation loss: 116.20073507953329
Test loss: 90.95058568716465


In [48]:
# Mean squared error of the SGD regressor on each split.
print(
    f"Training loss: {calculate_loss(SGD_model, X_train, y_train)}",
    f"Validation loss: {calculate_loss(SGD_model, X_validation, y_validation)}",
    f"Test loss: {calculate_loss(SGD_model, X_test, y_test)}",
    sep="\n",
)

Training loss: 150.38050968099427
Validation loss: 113.82347965163098
Test loss: 90.05910398187402
