# Cellar Tracker MLP Classifier

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.metrics import mean_squared_error

Using TensorFlow backend.


## Data Prep
Read the data directly from data.world into a Pandas data frame

In [5]:
# data.world query endpoint that serves the review data set as CSV.
DATA_URL = 'https://query.data.world/s/r8VVStpwMgTZlScD_VyeGd6JUEX4fm'
df = pd.read_csv(DATA_URL)

### Analysis

If we look at the columns available to us, there are a few to try out for a classifier:
- review/text is the text of the review written by the user
- review/userId or review/userName identify the user in question
- wine/name references the year, vintner, and name of the wine (so it includes wine/year)
- wine/variant is the style of wine
- wine/year is the year of vintage

review/points is the target column for regression (which could be repurposed into classification if so desired)

In [3]:
def assign_grade(review_points):
    """Translate a numeric review score into a letter-grade label.

    Each whole-number score from 100 down to 86 has its own label, in the
    order AAA, AA, A, BBB, BB, B, CCC, CC, C, DDD, DD, D, FFF, FF, F.
    Any value that does not compare equal to one of those scores (lower
    scores, fractional scores, NaN) gets the catch-all label "ZZZ".
    """
    # Index 0 corresponds to a score of 100, index 1 to 99, and so on.
    labels_from_100_down = (
        "AAA", "AA", "A",
        "BBB", "BB", "B",
        "CCC", "CC", "C",
        "DDD", "DD", "D",
        "FFF", "FF", "F",
    )

    for steps_below_100, label in enumerate(labels_from_100_down):
        if review_points == 100 - steps_below_100:
            return label

    return "ZZZ"

In [6]:
# We have to deal with nulls. For simplicity, we'll remove them, but any normal null imputation option is on the table.
# The explicit .copy() makes clean_df an independent frame, so the column assignments below
# write to clean_df itself rather than to a filtered slice of df (the pattern pandas flags
# with SettingWithCopyWarning).
clean_df = df.dropna(subset=['review/points', 'wine/year']).copy()

# Since we're headed towards a neural network, let's go ahead and use LabelEncoder to convert each wine variant into
# a unique ID.  fit_transform will both determine an ID encoding and return the transformed variants
# for us to save back into the data frame.
variant_encoder = LabelEncoder()
clean_df['wine/variantId'] = variant_encoder.fit_transform(clean_df['wine/variant'].values)

# Bucket the numeric score into a letter grade, then encode the grade labels as integer class IDs.
clean_df['Grade'] = clean_df['review/points'].map(assign_grade)

grade_encoder = LabelEncoder()
clean_df['GradeId'] = grade_encoder.fit_transform(clean_df['Grade'].values)


## Shaping

Now that we've settled on what we're going to do, we need to shape the data frame into a format our net can learn from.  Unlike some other algorithms, neural nets require every input to be numeric.  That is why we encoded the wine/variant names as integers.  We need to take that one step further and one-hot encode them.  Keras provides an easy to_categorical function to handle that for us, which will result in 657 columns (1 for each potential variant).  We also want to use year, which we'll use as-is (though one could argue it should be similarly encoded), giving 658 input columns in total, which is the input shape the models below expect.  We need to stack them together into a single Numpy array and then we'll use train_test_split from scikit-learn to break them into train and test sets.  Any validation we need done on our model during training will also come out of the training set.  We'll use the test set after training is complete to truly assess our model's ability to generalize.

In [7]:
#train_df = pd.DataFrame()
def get_features():
    """Assemble the model input matrix from the module-level ``clean_df``.

    Returns:
        A 2-D array whose first column is ``wine/year`` (used as-is, with no
        scaling applied here) followed by the one-hot encoding of
        ``wine/variantId``.
    """
    vintage_year = np.asarray(clean_df['wine/year'].values)

    variant_ids = np.asarray(clean_df['wine/variantId'].values)
    variant_one_hot = np_utils.to_categorical(variant_ids)

    # column_stack turns the 1-D year vector into a single leading column.
    return np.column_stack((vintage_year, variant_one_hot))


def regression_data():
    """Build the (features, target) pair for the regression task.

    Returns:
        Tuple ``(X, y)``: ``X`` is the matrix from ``get_features()`` and
        ``y`` holds the raw ``review/points`` values from ``clean_df``.
    """
    features = get_features()
    target_points = np.asarray(clean_df['review/points'].values)

    return (features, target_points)
    
def classification_data():
    """Build the (features, target) pair for the grade-classification task.

    Returns:
        Tuple ``(X, y)``: ``X`` is the matrix from ``get_features()`` and
        ``y`` is the one-hot encoding of the ``GradeId`` column of
        ``clean_df``.
    """
    features = get_features()

    grade_ids = np.asarray(clean_df['GradeId'].values)
    grade_one_hot = np_utils.to_categorical(grade_ids)

    return (features, grade_one_hot)
    
    

In [8]:
def split(X, y, test_size=.25, random_state=None):
    """Split features and targets into train and test partitions.

    Args:
        X: Feature matrix.
        y: Targets aligned row-for-row with ``X``.
        test_size: Fraction of rows held out for testing (default 0.25,
            the value this notebook has always used).
        random_state: Seed forwarded to ``train_test_split``.  The default
            ``None`` keeps the previous unseeded behaviour; pass an integer
            to get the same split on every run.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``.
    """
    (x_train, x_test, y_train, y_test) = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return (x_train, x_test, y_train, y_test)

In [24]:
def regression_learner(input_dim=658):
    """Build and compile the MLP that predicts review points as a number.

    Args:
        input_dim: Number of feature columns per sample.  Defaults to 658,
            the width previously hard-coded here; pass ``X.shape[1]`` when
            the feature matrix has a different width.

    Returns:
        A compiled Keras ``Model`` with two 64-unit ReLU hidden layers and a
        single linear output unit, trained with Adam on mean squared error.
    """
    inputs = Input(shape=(input_dim,))

    x = Dense(64, activation='relu')(inputs)
    x = Dense(64, activation='relu')(x)
    output = Dense(1)(x) # no activation as we're learning it as regression

    model = Model(inputs = inputs, outputs = output)
    # Accuracy is not meaningful for a continuous target, so report mean
    # absolute error (in review points) alongside the MSE loss instead.
    model.compile(optimizer='adam', loss='mse', metrics = ['mae'])

    return model

def classification_learner(input_dim=658, n_classes=16, learning_rate=.001):
    """Build and compile the MLP that classifies a wine into a grade bucket.

    Args:
        input_dim: Number of feature columns per sample.  Defaults to 658,
            the width previously hard-coded here.
        n_classes: Number of softmax outputs.  Defaults to 16, the value
            previously hard-coded here; it must match the width of the
            one-hot target matrix.
        learning_rate: Step size for the SGD optimizer (default 0.001, as
            before).

    Returns:
        A compiled Keras ``Model`` with 64- and 32-unit ReLU hidden layers
        and a softmax output, trained with SGD on categorical cross-entropy.
    """
    # SGD is imported here because the notebook's import cell does not bring it in.
    from keras.optimizers import SGD
    inputs = Input(shape=(input_dim,))

    x = Dense(64, activation='relu')(inputs)
    x = Dense(32, activation='relu')(x)
    output = Dense(n_classes, activation = 'softmax')(x)

    model = Model(inputs = inputs, outputs = output)
    opt = SGD(lr = learning_rate)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics = ['acc'])

    return model


In [22]:
def run_learner(model, x_train, y_train, batch_size=128, epochs=5, validation_split=.20):
    """Fit ``model`` on the training data and return the fit result.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        x_train: Training features.
        y_train: Training targets.
        batch_size: Samples per gradient update (default 128).
        epochs: Number of passes over the training data (default 5).
        validation_split: Fraction of the training data passed to ``fit``
            as its validation split (default 0.20).

    Returns:
        Whatever ``model.fit`` returns (for Keras, the training history).
    """
    history = model.fit(
        x_train,
        y_train,
        batch_size = batch_size,
        epochs = epochs,
        validation_split = validation_split,
    )

    return history
    


In [14]:
def run_regression():
    """Train the regression model end to end and hand back what was built.

    Builds the regression data, holds out a test split, constructs the
    regression network and fits it on the training portion.

    Returns:
        Tuple ``(model, history, x_test, y_test)``.  Previously nothing was
        returned, so the trained model and the held-out test set were lost
        when the function exited and could not be used for evaluation.
    """
    X, y = regression_data()

    (x_train, x_test, y_train, y_test) = split(X, y)

    model = regression_learner()

    history = run_learner(model, x_train, y_train)

    return (model, history, x_test, y_test)

def run_classification():
    """Train the grade classifier end to end and hand back what was built.

    Builds the classification data, holds out a test split, constructs the
    classification network and fits it on the training portion.

    Returns:
        Tuple ``(model, history, x_test, y_test)``.  Previously nothing was
        returned, so the trained model and the held-out test set were lost
        when the function exited and could not be used for evaluation.
    """
    X, y = classification_data()

    (x_train, x_test, y_train, y_test) = split(X, y)

    model = classification_learner()

    history = run_learner(model, x_train, y_train)

    return (model, history, x_test, y_test)
    

In [23]:
# Run the classification pipeline: build the data, split it, build the model and fit it.
run_classification()

Train on 450257 samples, validate on 112565 samples
Epoch 1/5
Epoch 2/5
  2688/450257 [..............................] - ETA: 40s - loss: 15.9622 - acc: 0.0097 

KeyboardInterrupt: 

In [29]:
# NOTE(review): in the cells shown, y_test is only ever assigned as a local inside
# run_regression / run_classification, so this lookup relies on kernel state from a
# cell that is not in the notebook — TODO restore the cell that defines it.
y_test[0]

91.0

In [30]:
# NOTE(review): y_preds is not assigned in any cell shown here — presumably the
# result of a predict call made in a removed cell; confirm and restore it.
y_preds[0]

array([ 89.90296173], dtype=float32)

In [31]:
# NOTE(review): displays the entire predictions array; y_preds is not defined in
# any visible cell, so this fails on a fresh kernel — TODO confirm its origin.
y_preds

array([[ 89.90296173],
       [ 89.36388397],
       [ 89.07897186],
       ..., 
       [ 88.6235733 ],
       [ 89.71648407],
       [ 88.72880554]], dtype=float32)