In [1]:
# The usuals
import os
import sys
import subprocess
import pandas as pd
import numpy as np

# Useful stuff
from joblib import dump, load
from pprint import pprint
import random
from statistics import mode
from datetime import datetime

# PyTorch bits
import torch
from torch import nn
from torch.nn import functional as F

# Autoreload
%load_ext autoreload
%autoreload
%load_ext autoreload

# Pandas header
pd.set_option("display.colheader_justify","right")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# https://drive.google.com/u/0/uc?export=download&confirm=IpWH&id=1vYyJL_IB6KjKCxuk9kg4vIMPGTtoX8Ek

In [3]:
# Ensure the working directory is the repository root... every time.
# Ask git for the top-level directory once. The command is passed as an argument
# list so it runs without a shell on every platform (a plain command string
# without `shell=True` only works on Windows).
repo_root = subprocess.run(
    ["git", "rev-parse", "--show-toplevel"],
    stdout=subprocess.PIPE,
    check=True,  # raise instead of silently wandering up the tree when not in a git repo
).stdout.decode("utf-8").strip()

# `normpath` + `normcase` make the comparison independent of separator style
# (git reports forward slashes) and, on Windows, of letter case.
if os.path.normcase(os.path.normpath(os.getcwd())) != os.path.normcase(os.path.normpath(repo_root)):
    os.chdir(repo_root)

In [4]:
# Set up sys path environment
# Make sure the current directory sits at the end of `sys.path`:
# drop an existing entry first (if there is one), then append it.
project_dir = os.path.abspath(".")
if project_dir in sys.path:
    sys.path.remove(project_dir)
sys.path.append(project_dir)

In [5]:
# Confirm which directory the notebook is now running from
print(os.getcwd())

c:\Users\chris\OneDrive\02 - Education\07 - MDSI\09 - ADSI\07 - GitHub Repo\BeerPrediction


In [6]:
# Load the raw beer reviews CSV (the path is relative to the current working directory)
data = pd.read_csv("./data/raw/beer_reviews.csv")

In [7]:
# Overview of the raw frame: dimensions, column labels, then summary statistics
for overview in (data.shape, data.columns):
    print(overview)
display(data.describe())

(1586614, 13)
Index(['brewery_id', 'brewery_name', 'review_time', 'review_overall',
       'review_aroma', 'review_appearance', 'review_profilename', 'beer_style',
       'review_palate', 'review_taste', 'beer_name', 'beer_abv',
       'beer_beerid'],
      dtype='object')


Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_beerid
count,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1518829.0,1586614.0
mean,3130.099,1224089000.0,3.815581,3.735636,3.841642,3.743701,3.79286,7.042387,21712.79
std,5578.104,76544270.0,0.7206219,0.6976167,0.6160928,0.6822184,0.7319696,2.322526,21818.34
min,1.0,840672000.0,0.0,1.0,0.0,1.0,1.0,0.01,3.0
25%,143.0,1173224000.0,3.5,3.5,3.5,3.5,3.5,5.2,1717.0
50%,429.0,1239203000.0,4.0,4.0,4.0,4.0,4.0,6.5,13906.0
75%,2372.0,1288568000.0,4.5,4.0,4.0,4.0,4.5,8.5,39441.0
max,28003.0,1326285000.0,5.0,5.0,5.0,5.0,5.0,57.7,77317.0


In [8]:
# from pandas_profiling import ProfileReport

# # Create profile report
# profile = ProfileReport(data, title="Profile Report")
# # Export
# profile.to_file("./reports/InitialReport.html")
# # View
# display(profile)


In [9]:
# from src.utils.misc import get_name
from varname import nameof

# Print the frame's dimensions, then the variable's own name as reported by `nameof`
print(data.shape)
print(nameof(data))

(1586614, 13)
data


# Set Up Dataset

In [10]:
# Copy the data so the steps below work on `data_upd` and leave `data` as loaded
data_upd = data.copy()

### Select only the desired feature columns

In [11]:
from src.data.set_data import sel_feat_cols

# Select only the desired feature columns
feature_columns = [
    "brewery_name",
    "review_aroma",
    "review_appearance",
    "review_palate",
    "review_taste",
    "beer_style",
]
data_upd = sel_feat_cols(data_upd, feature_columns)

In [None]:
# Fix NaN values in the `brewery_name` column
# Report how many are missing, replace them with "Other", then report again.
missing_brewery = data_upd["brewery_name"].isna()
print(missing_brewery.sum())
data_upd.loc[missing_brewery, "brewery_name"] = "Other"
print(data_upd["brewery_name"].isna().sum())

In [13]:
# Check dataset (for a frame this large the rendered output is truncated in the middle)
display(data_upd)

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste,beer_style
0,Vecchio Birraio,2.0,2.5,1.5,1.5,Hefeweizen
1,Vecchio Birraio,2.5,3.0,3.0,3.0,English Strong Ale
2,Vecchio Birraio,2.5,3.0,3.0,3.0,Foreign / Export Stout
3,Vecchio Birraio,3.0,3.5,2.5,3.0,German Pilsener
4,Caldera Brewing Company,4.5,4.0,4.0,4.5,American Double / Imperial IPA
...,...,...,...,...,...,...
1586609,The Defiant Brewing Company,4.0,3.5,4.0,4.0,Pumpkin Ale
1586610,The Defiant Brewing Company,5.0,2.5,2.0,4.0,Pumpkin Ale
1586611,The Defiant Brewing Company,3.5,3.0,3.5,4.0,Pumpkin Ale
1586612,The Defiant Brewing Company,4.5,4.5,4.5,4.5,Pumpkin Ale


### Train the encoders

In [None]:
# `SimpleImputer` for `NaN` values in the `brewery_name` column


### Pop the Target

In [14]:
from src.data.set_data import pop_target

# Separate the features from the target:
# `beer_style` becomes `targ`, the remaining columns become `feat`
feat, targ = pop_target(data_upd, "beer_style")

In [15]:
# Save the data sets
# (`feat` is dumped last, so its file path is what the cell echoes as output)
dump(targ, "./data/interim/targ.joblib")
dump(feat, "./data/interim/feat.joblib")

['./data/interim/feat.joblib']

In [16]:
# Check the sizes
for dataset in (targ, feat):
    print(dataset.shape)

(1586614,)
(1586614, 5)


In [17]:
from src.data.set_data import split_data

# Split data
# First hold out 30% of all rows as the test set, then hold out 30% of the
# remaining rows as the validation set (about 49% train / 21% validation /
# 30% test overall).
# NOTE(review): no seed is passed here — confirm `split_data` fixes one
# internally if the split needs to be reproducible.
feat_trn, feat_tst, targ_trn, targ_tst = split_data(feat, targ, test_size=0.3)
feat_trn, feat_val, targ_trn, targ_val = split_data(feat_trn, targ_trn, test_size=0.3)

In [18]:
from src.utils.misc import get_shape

# Check the data
# Map each split's name to the object itself: this avoids `eval` on strings and
# avoids reusing the name `data`, which already refers to the raw frame.
splits = {
    "feat_trn": feat_trn,
    "targ_trn": targ_trn,
    "feat_val": feat_val,
    "targ_val": targ_val,
    "feat_tst": feat_tst,
    "targ_tst": targ_tst,
}
new = [{"name": name, "shape": get_shape(split, True)} for name, split in splits.items()]

# Build the frame in one step; `DataFrame.append` was removed in pandas 2.0.
temp = pd.DataFrame(new)
temp = temp.style.set_properties(**{"text-align":"left"})
display(temp)

Unnamed: 0,name,shape
0,feat_trn,777440 x 5
1,targ_trn,777440
2,feat_val,333189 x 5
3,targ_val,333189
4,feat_tst,475985 x 5
5,targ_tst,475985


In [20]:
from src.data.prep_data import encode_features

# Encode the `brewery_name` feature
# NOTE(review): "oe" presumably selects the ordinal encoder — confirm in `encode_features`.
# NOTE(review): only `feat_trn` is encoded here; the returned `encoder` is kept,
# presumably so `feat_val` / `feat_tst` can be transformed with it later — confirm.
# NOTE(review): this cell overwrites `feat_trn`, so re-running it passes the
# already-encoded frame back into `encode_features`.
feat_trn, encoder = encode_features(feat_trn, ["brewery_name"], "oe")

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste
1008712,3622.0,3.5,4.0,3.5,4.0
1312749,149.0,2.0,2.0,2.0,2.0
118997,2143.0,3.5,3.0,3.5,3.5
523550,4162.0,4.0,4.5,4.5,4.5
271698,2021.0,3.5,4.0,3.5,3.5
...,...,...,...,...,...
1367338,2885.0,3.5,3.0,4.0,3.5
800633,4673.0,4.5,4.5,4.0,4.0
1249000,2231.0,4.0,4.5,4.0,4.0
456096,2934.0,1.5,1.5,1.0,1.0


**Steps for transforming the data in production:**

1. Ensure the incoming data contains all of the required feature columns
1. Select only feature columns
1. Handle `NaN` values in the `brewery_name` column
1. Encode the `brewery_name` column using the `OrdinalEncoder`
1. Run the `StandardScaler` over the full dataset.