In [26]:
import pandas as pd
import numpy as np
import os
import dill
from scipy.stats import uniform, randint

# preprocessing packages
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# pipeline tools
from sklearn.base import TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV, RepeatedKFold
from sklearn.preprocessing import FunctionTransformer

#feature selection
from sklearn.feature_selection import VarianceThreshold, SelectKBest

#models
from sklearn.ensemble import ExtraTreesRegressor

#metrics
from sklearn.metrics import mean_squared_error

In [2]:
#directory locations
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
processed_data_folder = parent_directory + '/data/wunderground-com/processed/'
models_folder = parent_directory + '/models/'

### Load Data

In [3]:
df = pd.read_csv(processed_data_folder + '1678665697.3855994.txt', 
                 sep = '|')

In [4]:
df.head()

Unnamed: 0,Product_Name,Product_Varietal,Product_Origin,Product_Price,Product_Attributes,User_Avg_Rating,User_Rating_Count,Product_Vintage,Critical_Avg_Rating,Critical_Rating_Count,...,Jan_Min_Low_Wind_Speed,Jan_Max_High_Pressure,Jan_Avg_High_Pressure,Jan_Min_High_Pressure,Jan_Avg_Pressure,Jan_Max_Low_Pressure,Jan_Avg_Low_Pressure,Jan_Min_Low_Pressure,Jan_Average_Precipitation,Jan_Total_Precipitation
0,Domaine Taupenot-Merme Nuits-Saint-Georges Les...,Pinot Noir,"Nuits-St-Georges, Cote de Nuits, Cote d'Or, Bu...",199.99,Red Wine,0.0,0,2017,93.0,3,...,0.0,29.8,29.493548,29.1,29.416129,29.8,29.354839,28.7,0.0,0.0
1,Domaine de la Romanee-Conti Echezeaux Grand Cru,Pinot Noir,"Flagey-Echezeaux, Cote de Nuits, Cote d'Or, Bu...",3299.99,Red Wine,0.0,0,2018,95.0,4,...,0.0,29.9,29.384211,28.9,29.252632,29.8,29.157895,28.8,0.0,0.0
2,Lincourt Rancho Santa Rosa Pinot Noir,Pinot Noir,"Sta. Rita Hills, Santa Barbara, Central Coast,...",40.99,Red Wine,4.0,31,2018,90.666667,3,...,0.0,30.3,30.135484,29.9,30.064516,30.2,30.0,29.5,0.087742,2.72
3,Domaine Claude Dugat Gevrey-Chambertin,Pinot Noir,"Gevrey-Chambertin, Cote de Nuits, Cote d'Or, B...",179.99,Red Wine,0.0,0,2020,92.0,3,...,0.0,29.9,29.558065,29.0,29.477419,29.9,29.4,28.8,0.0,0.0
4,Domaine Prieur-Brunet Santenay Maladiere Premi...,Pinot Noir,"Santenay, Cote de Beaune, Cote d'Or, Burgundy,...",65.99,Red Wine,0.0,0,2020,92.333333,3,...,0.0,29.9,29.558065,29.0,29.477419,29.9,29.4,28.8,0.0,0.0


In [5]:
df.columns

Index(['Product_Name', 'Product_Varietal', 'Product_Origin', 'Product_Price',
       'Product_Attributes', 'User_Avg_Rating', 'User_Rating_Count',
       'Product_Vintage', 'Critical_Avg_Rating', 'Critical_Rating_Count',
       ...
       'Jan_Min_Low_Wind_Speed ', 'Jan_Max_High_Pressure',
       'Jan_Avg_High_Pressure', 'Jan_Min_High_Pressure', 'Jan_Avg_Pressure',
       'Jan_Max_Low_Pressure', 'Jan_Avg_Low_Pressure', 'Jan_Min_Low_Pressure',
       'Jan_Average_Precipitation', 'Jan_Total_Precipitation'],
      dtype='object', length=456)

In [6]:
df.shape

(1115, 456)

### Missing Data & Data Type Correction

In [7]:
df.isnull().sum().sum()

0

### Reduce to Input Data and Assign Predictive and Target Feature

In [8]:
# specifying predictive and target features
X = df.drop(columns = ['Product_Name', 'Product_Varietal', 'Product_Origin', 'Product_Price',
       'Product_Attributes', 'User_Avg_Rating', 'User_Rating_Count',
       'Product_Vintage', 'Critical_Avg_Rating', 'Critical_Rating_Count', 'Appellation_Level', 'Weather_Origin'])
y = df[['Critical_Avg_Rating']]

In [9]:
X.columns

Index(['Aug_Max_High_Temperature', 'Aug_Avg_High_Temperature',
       'Aug_Min_High_Temperature', 'Aug_Avg_Temperature',
       'Aug_Max_Low_Temperature', 'Aug_Avg_Low_Temperature',
       'Aug_Min_Low_Temperature', 'Aug_Max_High_ Dew_Point',
       'Aug_Avg_High_ Dew_Point', 'Aug_Min_High_ Dew_Point',
       ...
       'Jan_Min_Low_Wind_Speed ', 'Jan_Max_High_Pressure',
       'Jan_Avg_High_Pressure', 'Jan_Min_High_Pressure', 'Jan_Avg_Pressure',
       'Jan_Max_Low_Pressure', 'Jan_Avg_Low_Pressure', 'Jan_Min_Low_Pressure',
       'Jan_Average_Precipitation', 'Jan_Total_Precipitation'],
      dtype='object', length=444)

In [24]:
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 1 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Critical_Avg_Rating  1115 non-null   float64
dtypes: float64(1)
memory usage: 8.8 KB


In [10]:
# create holdout set to approximate real-world performance
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=123)

### ExtraTrees Classifiers

In [28]:
pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')),
                     ('scaler', StandardScaler()),
                     ('near_zero_variance', VarianceThreshold()),
                     ('k_best', SelectKBest()),
                     ('etc', ExtraTreesRegressor())])

In [29]:
search_space = [{'near_zero_variance__threshold': [0.00, 0.01, 0.05, .10],
                 'k_best__k': list(range(1,26,1)),
                 'etc__n_estimators': [10, 50, 100, 500, 1000, 5000],
                 'etc__max_leaf_nodes': [5, 10, 15, 20, 25, 50], 
                 'etc__max_depth': range(2,21,1)}]

kfold = RepeatedKFold(n_splits=5,
                      n_repeats=10,
                      random_state=123)

etc = RandomizedSearchCV(pipeline,
                         param_distributions = search_space,
                         n_iter = 200,
                         cv = kfold,
                         scoring ='neg_mean_squared_error',
                         n_jobs = 6,
                        random_state = 123)
etc.fit(X_train, y_train)

print("Best parameter (CV score=%0.3f):" % etc.best_score_)
print(etc.best_params_)

  y = column_or_1d(y, warn=True)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Best parameter (CV score=-3.113):
{'near_zero_variance__threshold': 0.0, 'k_best__k': 17, 'etc__n_estimators': 500, 'etc__max_leaf_nodes': 50, 'etc__max_depth': 13}


### Write Pipeline to File

In [30]:
etc_best_pipeline = etc.best_estimator_
with open(models_folder + 'extratrees_best_cv.pkl', 'wb') as f:
    dill.dump(etc_best_pipeline, f)

### Holdout Performance

In [31]:
np.sqrt(mean_squared_error(y_test, etc.predict(X_test)))

1.7572944547304628