In [184]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn.impute import SimpleImputer, KNNImputer
from category_encoders import TargetEncoder
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import Lasso, Ridge, RidgeCV
from sklearn.feature_extraction.text import CountVectorizer

# Task 1 - BOW and Simple Features

In [133]:
# Read data and drop all non-US wines
wine_data_150 = pd.read_csv('wine-reviews/winemag-data_first150k.csv', index_col=0)
wine_data_150 = wine_data_150[wine_data_150['country'] == 'US']

In [134]:
# Remove outliers and sample data, taking half of the dataset
wine_data_150 = wine_data_150.sample(frac=0.5)

Drop the 'country' column as it is the US for all the data given. Also remove 'points' from data as that is what will be predicted/indicative of wine quality.

In [135]:
points = wine_data_150['points']
X = wine_data_150.drop(columns=['country', 'points'])
X.head()

Unnamed: 0,description,designation,price,province,region_1,region_2,variety,winery
89070,"Tough and gritty in texture, with a raisined e...",,22.0,California,Paso Robles,Central Coast,Merlot,Bianchi
42239,There are delicious flavors in this ripe Syrah...,Farráh,18.0,California,Lodi,Central Valley,Syrah,Klinker Brick
77006,This Cabernet Sauvignon has splashes of Merlot...,,25.0,Washington,Columbia Valley (WA),Columbia Valley,Cabernet Sauvignon,Novelty Hill
70425,Laurent and Danielle Montalieu acquired this h...,Hyland Vineyard,45.0,Oregon,Eola-Amity Hills,Willamette Valley,Pinot Noir,Soléna
108346,"Definitely a New World style, this tasty Tempr...",,28.0,Washington,Walla Walla Valley (WA),Columbia Valley,Tempranillo,Trio Vintners


### Impute missing values (all except 'description')

In [136]:
categorical = ['designation', 'province', 'region_1', 'region_2', 'variety', 'winery']
continuous = ['price']
impute_categorical = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
impute_float = KNNImputer(missing_values=np.nan)

In [137]:
X[categorical] = impute_categorical.fit_transform(X[categorical])
X[continuous] = impute_float.fit_transform(X[continuous])

### To Do:
- Visualize features if we think we should (all except 'description')
- Remove outliers 
- Any other features which are correlated and can be removed?

## 1.1 - Create a simpe baseline model with all non-text features

In [138]:
print('Number of different categories in each feature: ')
print(f'Province: {len(np.unique(X.province))}')
print(f'Designation: {len(np.unique(X.designation))}')
print(f'Region 1: {len(np.unique(X.region_1))}')
print(f'Region 2: {len(np.unique(X.region_2))}')
print(f'Variety: {len(np.unique(X.variety))}')
print(f'Winery: {len(np.unique(X.winery))}')

Number of different categories in each feature: 
Province: 23
Designation: 7951
Region 1: 241
Region 2: 18
Variety: 190
Winery: 4051


In [150]:
X_train, X_test, y_train, y_test = train_test_split(X, points, shuffle=True, random_state=0)
cv_strategy = KFold(n_splits=10, shuffle=True, random_state=5)

In [151]:
# Remove the description column
X_train_no_description = X_train.drop(columns=['description'])

In [152]:
# Encode categorical features 
to_target_encode = ['designation', 'winery', 'variety', 'region_1', 'region_2']
to_ohe = ['province']
to_scale = ['price']
# Create column transformer
target_encoding = make_pipeline(TargetEncoder(), preprocessing.StandardScaler())
ohe = make_pipeline(preprocessing.OneHotEncoder(handle_unknown='ignore'))
scale = make_pipeline(preprocessing.StandardScaler())
preprocessing_pipe = make_column_transformer((target_encoding, to_target_encode),
                                            (ohe, to_ohe),
                                            (scale, to_scale),
                                            remainder='passthrough')

In [153]:
X_train_no_description

Unnamed: 0,designation,price,province,region_1,region_2,variety,winery
121553,Preston Vineyard,45.0,California,Napa Valley,Napa,Cabernet Sauvignon,V. Sattui
81528,Reserve,25.0,California,Carneros,Napa-Sonoma,Pinot Noir,Buena Vista
146982,Harazsthy Collection,25.0,California,Carneros,Napa-Sonoma,White Blend,Buena Vista
93104,Reserve,20.0,California,Fiddletown,Sierra Foothills,Zinfandel,Renwood
26114,Reserve,10.0,California,California,California Other,Merlot,Montevina
...,...,...,...,...,...,...,...
119031,Reserve,42.0,California,Napa Valley,Napa,Cabernet Sauvignon,Jax
6469,Reserve,32.0,California,Livermore Valley,Central Coast,Malbec,Cuda Ridge Wines
66499,Unfltered,19.0,California,Anderson Valley,Mendocino/Lake Counties,Pinot Noir,Philo Ridge
204,Reserve,10.0,Washington,Columbia Valley (WA),Columbia Valley,Gewürztraminer,Chateau Ste. Michelle


### NOTE: Im assuming based on the assignment question that this is to be done using all features EXCEPT for 'description'

In [168]:
ridge_pipe = make_pipeline(preprocessing_pipe, Ridge())
param_grid = {'ridge__alpha': np.logspace(-3,3,10)}
ridge_grid = GridSearchCV(ridge_pipe, param_grid, cv=cv_strategy, n_jobs=-1, return_train_score=True)
ridge_grid.fit(X_train_no_description, y_train)
ridge_grid.best_score_

0.4426018677021066

In [169]:
lasso_pipe = make_pipeline(preprocessing_pipe, Lasso())
param_grid = {'lasso__alpha': np.logspace(-3,3,10)}
lasso_grid = GridSearchCV(lasso_pipe, param_grid, cv=cv_strategy, n_jobs=-1, return_train_score=True)
lasso_grid.fit(X_train_no_description, y_train)
lasso_grid.best_score_

0.4406544487318941

## 1.2 - Create a simple text-based model using a bag-of-words approach and a linear model

### NOTE: Im assuming based on the homework question that this is to be done using only the 'description' feature as later on in the question, we have to combine both of them together (text and non-text features)

In [156]:
train_text = []
for x in X_train['description']:
    train_text.append(x)

In [189]:
# This is outside of a pipeline because it was being annoying lol
token_vect = CountVectorizer(token_pattern=r"\b\w[\w’]+\b", 
                             lowercase=True,
                             stop_words='english',
                             min_df=4,
                             max_features=1000)
desc_tokenized = token_vect.fit_transform(X_train['description'])

In [193]:
param_grid = {'alpha': np.logspace(-3,3,10)}
ridge_grid = GridSearchCV(Ridge(), param_grid, cv=cv_strategy, n_jobs=-1, return_train_score=True)
ridge_grid.fit(desc_tokenized, y_train)
ridge_grid.best_score_

0.6847920648408021

In [194]:
param_grid = {'alpha': np.logspace(-3,3,10)}
lasso_grid = GridSearchCV(Lasso(), param_grid, cv=cv_strategy, n_jobs=-1, return_train_score=True)
lasso_grid.fit(desc_tokenized, y_train)
lasso_grid.best_score_

0.6821748595782734