In [1]:
# The usuals
import os
import sys
import subprocess
import pandas as pd
import numpy as np

# Useful stuff
from joblib import dump, load
from pprint import pprint
import random
from statistics import mode
from datetime import datetime

# PyTorch bits
import torch
from torch import nn
from torch.nn import functional as F

# Autoreload
%load_ext autoreload
%autoreload
%load_ext autoreload

# Pandas header
pd.set_option("display.colheader_justify","left")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# https://drive.google.com/u/0/uc?export=download&confirm=IpWH&id=1vYyJL_IB6KjKCxuk9kg4vIMPGTtoX8Ek

In [3]:
# Ensure the directory is correct... every time.
for i in range(5):
    if not os.getcwd().lower() == subprocess.run("git rev-parse --show-toplevel", stdout=subprocess.PIPE).stdout.decode("utf-8").replace("/","\\").strip().lower():
        os.chdir(".."),
    else:
        break

In [4]:
# Set up sys path environment
if not os.path.abspath(".") in sys.path:
    sys.path.append(os.path.abspath("."))
else:
    sys.path.remove(os.path.abspath("."))
    sys.path.append(os.path.abspath("."))

In [5]:
print(os.getcwd())

c:\Users\CHMAHONE\OneDrive - Schenker AG\Documents\Git Repos\BeerPrediction


In [6]:
data = pd.read_csv("./data/raw/beer_reviews.csv")

In [7]:
print(data.shape)
print(data.columns)
display(data.describe())

(1586614, 13)
Index(['brewery_id', 'brewery_name', 'review_time', 'review_overall',
       'review_aroma', 'review_appearance', 'review_profilename', 'beer_style',
       'review_palate', 'review_taste', 'beer_name', 'beer_abv',
       'beer_beerid'],
      dtype='object')


Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_beerid
count,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1518829.0,1586614.0
mean,3130.099,1224089000.0,3.815581,3.735636,3.841642,3.743701,3.79286,7.042387,21712.79
std,5578.104,76544270.0,0.7206219,0.6976167,0.6160928,0.6822184,0.7319696,2.322526,21818.34
min,1.0,840672000.0,0.0,1.0,0.0,1.0,1.0,0.01,3.0
25%,143.0,1173224000.0,3.5,3.5,3.5,3.5,3.5,5.2,1717.0
50%,429.0,1239203000.0,4.0,4.0,4.0,4.0,4.0,6.5,13906.0
75%,2372.0,1288568000.0,4.5,4.0,4.0,4.0,4.5,8.5,39441.0
max,28003.0,1326285000.0,5.0,5.0,5.0,5.0,5.0,57.7,77317.0


In [8]:
# from pandas_profiling import ProfileReport

# # Create profile report
# profile = ProfileReport(data, title="Profile Report")
# # Export
# profile.to_file("./reports/InitialReport.html")
# # View
# display(profile)


# Set Up Dataset

In [9]:
# Copy the data
data_upd = data.copy()

### Select only the desired feature columns

In [10]:
from src.data.set_data import sel_feat_cols

# Select only the desired feature columns
data_upd = sel_feat_cols(data_upd, ["brewery_name","review_aroma","review_appearance","review_palate","review_taste","beer_style"])

In [11]:
# Check dataset
display(data_upd)

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste,beer_style
0,Vecchio Birraio,2.0,2.5,1.5,1.5,Hefeweizen
1,Vecchio Birraio,2.5,3.0,3.0,3.0,English Strong Ale
2,Vecchio Birraio,2.5,3.0,3.0,3.0,Foreign / Export Stout
3,Vecchio Birraio,3.0,3.5,2.5,3.0,German Pilsener
4,Caldera Brewing Company,4.5,4.0,4.0,4.5,American Double / Imperial IPA
...,...,...,...,...,...,...
1586609,The Defiant Brewing Company,4.0,3.5,4.0,4.0,Pumpkin Ale
1586610,The Defiant Brewing Company,5.0,2.5,2.0,4.0,Pumpkin Ale
1586611,The Defiant Brewing Company,3.5,3.0,3.5,4.0,Pumpkin Ale
1586612,The Defiant Brewing Company,4.5,4.5,4.5,4.5,Pumpkin Ale


### Train the encoders

In [12]:
from src.data.prep_data import make_si
from src.data.prep_data import encode_features, scale_features

In [13]:
# `SimpleImputer` for `NaN` values in the `brewery_name`
data_upd, si = make_si(data_upd, "brewery_name", True)
# data_upd[["brewery_name"]] = si.transform(data_upd[["brewery_name"]])
dump(si, "./models/encoders/si_handle_nan_brewery_name.joblib")

['./models/encoders/si_handle_nan_brewery_name.joblib']

In [14]:
# `OrdinalEncoder` to get numeric versions of `brewery_name`
data_upd, oe = encode_features(data_upd, "brewery_name", "oe", True)
# data_upd[["brewery_name"]] = oe.transform(data_upd[["brewery_name"]])
dump(oe, "./models/encoders/oe_numericify_brewery_name.joblib")

['./models/encoders/oe_numericify_brewery_name.joblib']

In [15]:
# `LabelEncoder` to get numeric versions of `beer_style`
data_upd, le = encode_features(data_upd, "beer_style", "le", True)
# data_upd[["beer_style"]] = le.transform(data_upd[["beer_style"]])
dump(oe, "./models/encoders/le_numericify_beer_style.joblib")

['./models/encoders/le_numericify_beer_style.joblib']

In [16]:
# `StandardScaler` to get them all in the same scale
data_upd, sc = scale_features(data_upd, ["brewery_name","review_aroma","review_appearance","review_palate","review_taste"], True)
dump(sc, "./models/encoders/sc_scale_features.joblib")

['./models/encoders/sc_scale_features.joblib']

In [17]:
display(data_upd.info())
display(data_upd.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 6 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   brewery_name       1586614 non-null  float64
 1   review_aroma       1586614 non-null  float64
 2   review_appearance  1586614 non-null  float64
 3   review_palate      1586614 non-null  float64
 4   review_taste       1586614 non-null  float64
 5   beer_style         1586614 non-null  int32  
dtypes: float64(5), int32(1)
memory usage: 66.6 MB


None

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste,beer_style
0,1.464354,-2.487952,-2.177663,-3.288833,-3.132454,65
1,1.464354,-1.771225,-1.366096,-1.090123,-1.083188,51
2,1.464354,-1.771225,-1.366096,-1.090123,-1.083188,59
3,1.464354,-1.054499,-0.55453,-1.823026,-1.083188,61
4,-0.823949,1.095679,0.257037,0.375684,0.966078,9


### Construct Pipelines

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Import transformers
si = load("./models/encoders/si_handle_nan_brewery_name.joblib")
oe = load("./models/encoders/oe_numericify_brewery_name.joblib")
sc = load("./models/encoders/sc_scale_features.joblib")

# `SimpleImputer` for `NaN` values
nan_transformer = Pipeline(
    steps=[
        # ("simple_imputer", SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="Other"))
        ("simple_imputer", si)
    ]
)

# `OrdinalEncoder` for `brewery_name`
cat_transformer = Pipeline(
    steps=[
        # ("ordinal_encoder", OrdinalEncoder())
        ("ordinal_encoder", oe)
    ]
)

# `StandardScaler` for numeric cols
num_transformer = Pipeline(
    steps=[
        # ("standard_scaler", StandardScaler())
        ("standard_scaler", sc)
    ]
)

# Preprocessor to do everything
preprocessor = ColumnTransformer \
    ( transformers = \
        [ ("nan_cols", nan_transformer, ["brewery_name"])
        , ("cat_cols", cat_transformer, ["brewery_name"])
        , ("num_cols", num_transformer, ["brewery_name","review_aroma","review_appearance","review_palate","review_taste"])
        ]
    )

### Pop the Target

In [19]:
from src.data.set_data import pop_target

# Separate the features from the target
feat, targ = pop_target(data_upd, "beer_style")

In [20]:
# Save the data sets
dump(data, './data/interim/data.joblib')
dump(targ, "./data/interim/targ.joblib")
dump(feat, "./data/interim/feat.joblib")

['./data/interim/feat.joblib']

In [21]:
# Check the sizes
print(targ.shape)
print(feat.shape)
display(targ.head())
display(feat.head())

(1586614, 1)
(1586614, 5)


Unnamed: 0,beer_style
0,65
1,51
2,59
3,61
4,9


Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste
0,1.464354,-2.487952,-2.177663,-3.288833,-3.132454
1,1.464354,-1.771225,-1.366096,-1.090123,-1.083188
2,1.464354,-1.771225,-1.366096,-1.090123,-1.083188
3,1.464354,-1.054499,-0.55453,-1.823026,-1.083188
4,-0.823949,1.095679,0.257037,0.375684,0.966078


In [22]:
from src.data.set_data import split_data

# Split data
feat_trn, feat_tst, targ_trn, targ_tst = split_data(feat, targ, test_size=0.3)
feat_trn, feat_val, targ_trn, targ_val = split_data(feat_trn, targ_trn, test_size=0.3)

In [23]:
from src.utils.misc import get_shape

# Check the data
temp = pd.DataFrame()
new = [{"name": data, "shape": get_shape(eval(data), True)} for data in ["feat_trn", "targ_trn", "feat_val", "targ_val", "feat_tst", "targ_tst"]]
temp = temp.append(new, ignore_index=True)
temp = temp.style.set_properties(**{"text-align":"left"})
display(temp)

Unnamed: 0,name,shape
0,feat_trn,777440 x 5
1,targ_trn,777440 x 1
2,feat_val,333189 x 5
3,targ_val,333189 x 1
4,feat_tst,475985 x 5
5,targ_tst,475985 x 1


In [24]:
# Dump everything
dump(feat_trn, "./data/processed/feat_trn.joblib")
dump(targ_trn, "./data/processed/targ_trn.joblib")
dump(feat_val, "./data/processed/feat_val.joblib")
dump(targ_val, "./data/processed/targ_val.joblib")
dump(feat_tst, "./data/processed/feat_tst.joblib")
dump(targ_tst, "./data/processed/targ_tst.joblib")

['./data/processed/targ_tst.joblib']

In [25]:
# Convert everything to Numpy arrays
feat_trn = feat_trn.to_numpy()
targ_trn = targ_trn.to_numpy()[:,0]
feat_val = feat_val.to_numpy()
targ_val = targ_val.to_numpy()[:,0]
feat_tst = feat_tst.to_numpy()
targ_tst = targ_tst.to_numpy()[:,0]

In [26]:
print(feat_trn.shape, type(feat_trn))
print(targ_trn.shape, type(targ_trn))
print(feat_val.shape, type(feat_val))
print(targ_val.shape, type(targ_val))
print(feat_tst.shape, type(feat_tst))
print(targ_tst.shape, type(targ_tst))

print(targ_trn[:10])

(777440, 5) <class 'numpy.ndarray'>
(777440,) <class 'numpy.ndarray'>
(333189, 5) <class 'numpy.ndarray'>
(333189,) <class 'numpy.ndarray'>
(475985, 5) <class 'numpy.ndarray'>
(475985,) <class 'numpy.ndarray'>
[ 31  14 102  19 102  89  60   2  14  16]


**Steps for transforming the data in production:**

1. Ensure it has parsed all feature columns
1. Handle `NaN` values in the `brewery_name` column
1. Encode the `brewery_name` column using the `OrdinalEncoder`
1. Select only feature columns
1. Run the `StandardScaler` over the full dataset.

# Modelling

### Data Generators

In [27]:
from src.models.pytorch import PyTorchDataset

# Set datasets
data_trn = PyTorchDataset(feat=feat_trn, targ=targ_trn)
data_val = PyTorchDataset(feat=feat_val, targ=targ_val)
data_tst = PyTorchDataset(feat=feat_tst, targ=targ_tst)

### Null Model

In [27]:
from src.models.null import NullModel
from src.models.performance import print_class_perf

# Baseline
baseline = NullModel(target_type="class")
pred_trn = baseline.fit_predict(targ_trn)
print_class_perf(pred_trn, targ_trn, set_name="Baseline", average="weighted")

Accuracy Baseline: 1.0
F1 Baseline: 1.0


In [28]:
print(pred_trn.shape)
print(targ_trn.shape)
print(len(set(targ_trn)))

(777440,)
(777440,)
104


### First Attempt

In [29]:
from src.models.pytorch import Net

# Instantiate model
model = Net(feat_trn.shape[1], len(set(targ_trn)))

In [30]:
from src.models.pytorch import get_device

# Push to device
device = get_device()
model.to(device)

Net(
  (fc1): Linear(in_features=5, out_features=5, bias=True)
  (fc2): Linear(in_features=5, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=20, bias=True)
  (fc4): Linear(in_features=20, out_features=40, bias=True)
  (fc5): Linear(in_features=40, out_features=80, bias=True)
  (fc6): Linear(in_features=80, out_features=100, bias=True)
  (out): Linear(in_features=100, out_features=104, bias=True)
  (softmax): Softmax(dim=1)
)

In [36]:
print(device)

cuda:0


In [38]:
# Instantiate operators
crit = nn.CrossEntropyLoss()
optm = torch.optim.Adam(model.parameters(), lr=0.01)
sche = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optm, mode="min", patience=3)

In [39]:
N_EPOCHS = 10
BATCH_SIZE = 1000

In [40]:
from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(data_trn, model=model, criterion=crit, optimizer=optm, batch_size=BATCH_SIZE, device=device)
    valid_loss, valid_acc = test_classification(data_val, model=model, criterion=crit, batch_size=BATCH_SIZE, device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

Epoch: 0
	(train)	|	Loss: 0.1434	|	Acc: 7.1%
	(valid)	|	Loss: 0.1433	|	Acc: 7.4%
Epoch: 1
	(train)	|	Loss: 0.1445	|	Acc: 3.7%
	(valid)	|	Loss: 0.1446	|	Acc: 3.4%
Epoch: 2
	(train)	|	Loss: 0.1446	|	Acc: 3.4%
	(valid)	|	Loss: 0.1446	|	Acc: 3.4%
Epoch: 3
	(train)	|	Loss: 0.1446	|	Acc: 3.4%
	(valid)	|	Loss: 0.1446	|	Acc: 3.4%
Epoch: 4
	(train)	|	Loss: 0.1446	|	Acc: 3.4%
	(valid)	|	Loss: 0.1446	|	Acc: 3.4%
Epoch: 5
	(train)	|	Loss: 0.1446	|	Acc: 3.4%
	(valid)	|	Loss: 0.1446	|	Acc: 3.4%
Epoch: 6
	(train)	|	Loss: 0.1446	|	Acc: 3.4%
	(valid)	|	Loss: 0.1446	|	Acc: 3.4%
Epoch: 7
	(train)	|	Loss: 0.1446	|	Acc: 3.4%
	(valid)	|	Loss: 0.1446	|	Acc: 3.4%
Epoch: 8
	(train)	|	Loss: 0.1446	|	Acc: 3.4%
	(valid)	|	Loss: 0.1446	|	Acc: 3.4%
Epoch: 9
	(train)	|	Loss: 0.1446	|	Acc: 3.4%
	(valid)	|	Loss: 0.1446	|	Acc: 3.4%
Epoch: 10
	(train)	|	Loss: 0.1446	|	Acc: 3.4%
	(valid)	|	Loss: 0.1446	|	Acc: 3.4%
Epoch: 11
	(train)	|	Loss: 0.1446	|	Acc: 3.5%
	(valid)	|	Loss: 0.1446	|	Acc: 3.4%
Epoch: 12
	(train)	|	Loss:

### Using function to define the model

In [31]:
from src.models.pytorch import model_set

# 
modl = model_set \
    ( first_shape=feat_trn.shape[1]
    , hidden_shapes=[10,20,40,80,100]
    , hidden_acti=nn.ReLU()
    , final_shape=len(set(targ_trn))
    , final_acti=nn.Softmax(dim=1)
    , dropout=0.2
    )

In [32]:
from src.models.pytorch import get_device

# Push to device
modl.to(get_device())

Sequential(
  (shap_frst): Linear(in_features=5, out_features=10, bias=True)
  (acti_frst): ReLU()
  (regl_frst): Dropout(p=0.2, inplace=False)
  (shap_01): Linear(in_features=10, out_features=20, bias=True)
  (acti_01): ReLU()
  (regl_01): Dropout(p=0.2, inplace=False)
  (shap_02): Linear(in_features=20, out_features=40, bias=True)
  (acti_02): ReLU()
  (regl_02): Dropout(p=0.2, inplace=False)
  (shap_03): Linear(in_features=40, out_features=80, bias=True)
  (acti_03): ReLU()
  (regl_03): Dropout(p=0.2, inplace=False)
  (shap_04): Linear(in_features=80, out_features=100, bias=True)
  (acti_04): ReLU()
  (regl_04): Dropout(p=0.2, inplace=False)
  (shap_finl): Linear(in_features=100, out_features=104, bias=True)
  (acti_finl): Softmax(dim=1)
)

In [None]:
# Instantiate operators
crit = nn.CrossEntropyLoss()
optm = torch.optim.Adam(model.parameters(), lr=0.01)
sche = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optm, mode="min", patience=3)

In [None]:
N_EPOCHS = 10
BATCH_SIZE = 1000

In [None]:
from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(data_trn, model=model, criterion=crit, optimizer=optm, batch_size=BATCH_SIZE, device=device)
    valid_loss, valid_acc = test_classification(data_val, model=model, criterion=crit, batch_size=BATCH_SIZE, device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

### Less Complicated Model

In [28]:
from src.models.pytorch import Net

# Instantiate model
modl = Net(feat_trn.shape[1], len(set(targ_trn)))

In [29]:
from src.models.pytorch import get_device

# Push to device
device = get_device()
print(device)
modl.to(device)

cpu


Net(
  (fc1): Linear(in_features=5, out_features=10, bias=True)
  (fc2): Linear(in_features=10, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=100, bias=True)
  (out): Linear(in_features=100, out_features=104, bias=True)
  (softmax): Softmax(dim=1)
)

In [31]:
# Instantiate operators
crit = nn.CrossEntropyLoss()
optm = torch.optim.Adam(modl.parameters(), lr=0.01)
sche = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optm, mode="min", patience=3)

In [32]:
N_EPOCHS = 5
BATCH_SIZE = 1000

In [33]:
from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(data_trn, model=modl, criterion=crit, optimizer=optm, batch_size=BATCH_SIZE, device=device)
    valid_loss, valid_acc = test_classification(data_val, model=modl, criterion=crit, batch_size=BATCH_SIZE, device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

TypeError: step() missing 1 required positional argument: 'metrics'