# Wine Quality Prediction 
Francesco Di Salvo<br />
s282418@studenti.polito.it

## Import

In [1]:
from scipy.sparse import hstack
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression,SGDRegressor, Ridge
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, cross_val_score
import scipy

import re
import pandas as pd
from nltk.corpus import stopwords
import spacy

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import text

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

import nltk
nltk.download('wordnet')
nltk.download('stopwods')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Francesco\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Francesco\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Methods and utils

In [2]:
##################################  
## General methods
##################################

def loadData(directory):
    df = pd.read_csv(directory,sep="\t")
    return df

def get_final_csv(ids, y, filename):
    pd.DataFrame(dict(Id = ids,Predicted = y)).to_csv(filename,sep=",",index=False)

    
models = [LinearRegression(),Ridge(), SGDRegressor()]
targets = ["LinearRegression","Ridge","SGDRegressor"]
def evaluateModels(models, targets,X,y):
    
    scores = pd.DataFrame()
    for model,target in zip(models,targets):
        scores[target] = cross_val_score(model, X, y, scoring='r2', cv=3, n_jobs=-1)
        
    return scores



##################################  
## Encoding and Missing values
##################################
def preprocessing(X_d,X_e):
    
    # region_2 has too many duplicates
    # description will be manipulated separately
    X_d = X_d.drop(columns=["region_2","description"])
    X_e = X_e.drop(columns=["region_2","description"])

    # impute country and province with the most common values
    imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    X_d["country"] = imputer.fit_transform(np.array(X_d["country"]).reshape(-1,1))
    X_d["province"] = imputer.fit_transform(np.array(X_d["province"]).reshape(-1,1))
    
    # fill designation with the sentinel value "other"
    X_d = X_d.fillna("other")
    X_e = X_e.fillna("other")
    
    y = X_d.quality
    X_d = X_d.drop(columns=["quality"])
    
    # concat dev and eval for the encoding 
    df = pd.concat([X_d,X_e])
    
    # encode and use the sparse matrix because pandas' df is too heavy
    df_enc = pd.get_dummies(df)
    df_enc_scipy = scipy.sparse.csr_matrix(df_enc.values)
    
    # split and return the encoded values
    return df_enc_scipy[:X_d.shape[0]], y, df_enc_scipy[X_d.shape[0]:]

In [3]:
##################################  
## Document preprocessing
##################################

from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords as sw

# general structure learnt from Lab10
class LemmaTokenizer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        
    def __call__(self, document):
        lemmas = []
        
        for t in word_tokenize(document):
            
            # remove external spaces
            t = t.strip()
            # lowercase
            t = t.lower()
            # remove punctuation
            t = re.sub(r'[^\w\s]','',t)
            # remove numbers
            t = re.sub(r'[\d+]','',t)
            
            lemma = self.lemmatizer.lemmatize(t)
            if len(lemma) > 2:
                lemmas.append(lemma)
    
        return lemmas

def preprocessText(description_train,description_test):
    
    vectorizer = TfidfVectorizer(ngram_range=(1,4), tokenizer=LemmaTokenizer(), stop_words="english")
    
    # fit the TfidfVectorizer on the development set and transform it on both dev and eval
    dev_vec = vectorizer.fit_transform(description_train.description)
    eval_vec = vectorizer.transform(description_test.description)

    return dev_vec, eval_vec

## Load data

In [14]:
# load datasets

X_dev = loadData('Dataset/dev.tsv')
X_eval = loadData('Dataset/eval.tsv')

## Preprocessing β

In [5]:
%%time

####################################################
## Preprocessing - without removing the outliers : β
####################################################

# drop duplicates
X_dev = X_dev.drop_duplicates()

# encode the categorical features and handle np.nan
X_dev_prep, y, X_eval_prep = preprocessing(X_dev,X_eval)

# preprocess the descriptions
dev_vec, eval_vec = preprocessText(X_dev[["description"]].copy(),
                                   X_eval[["description"]].copy())

# concat the encoded df and the tf-idf
X_conc_dev = hstack((X_dev_prep, dev_vec))
X_conc_eval = hstack((X_eval_prep, eval_vec))

Wall time: 12min 20s


## Model evaluation

In [6]:
## for the instructor : you can skip this script

####################################################
## Evaluate β
####################################################

scores = evaluateModels(models,targets,X_conc_dev,y)

np.mean(scores)

LinearRegression    0.744518
Ridge               0.737091
SGDRegressor        0.681359
dtype: float64

## Hyperparameters tuning

In [10]:
def doGridSearch(model,hyperparams,X,y):
    
    gs = GridSearchCV(estimator=model,param_grid=hyperparams,
                      scoring='r2',cv=3, n_jobs=4,verbose=True)

    gs.fit(X, y)
    
    return gs

In [8]:
## for the instructor : you can skip this script

####################################################
## Grid search linear regression
####################################################

hyperparams_LR = {
    'fit_intercept' : [True,False],
    'normalize' : [True,False]
}

gs_lr = doGridSearch(LinearRegression(),hyperparams_LR,X_conc_dev,y)

print(f"Best params:\t{gs_lr.best_params_}")
print(f"Best score:\t{gs_lr.best_score_}")

## y_pred_lr = gs_lr.predict(X_conc_eval)
## get_final_csv(list(X_eval.index),y_pred_lr,"submit-linear-regression.csv")

## 0.880 on the public score
## {'fit_intercept': True, 'normalize': False}

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  12 out of  12 | elapsed: 32.2min finished


Best params:	{'fit_intercept': True, 'normalize': False}
Best score:	0.7445182016806552


In [9]:
## for the instructor : you can skip this script

####################################################
## Grid search Ridge
####################################################

hyperparams_Ridge = {
    'alpha' : [0.01,0.1,1,10]
}

gs_ridge = doGridSearch(Ridge(),hyperparams_Ridge,X_conc_dev,y)
print(f"Best params:\t{gs_ridge.best_params_}")
print(f"Best score:\t{gs_ridge.best_score_}")

## y_pred_sgd = gs_ridge.predict(X_conc_eval)
## get_final_csv(list(X_eval.index),y_pred_sgd,"submit-ridge.csv")

## 0.881 on the public score
## {'alpha': 0.01}

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  12 out of  12 | elapsed:  5.2min finished


Best params:	{'alpha': 0.01}
Best score:	0.745350502404858


In [11]:
## for the instructor : you can skip this script

####################################################
## Grid search SGD Regressor
####################################################

hyperparams_SGD = {
    'loss' : ['squared_loss'],
    'penalty' : ['l1','l2'],
    'alpha' : np.logspace(-5, 0, 6),
    'eta0' : [0.01, 0.1]
}

gs_sgd = doGridSearch(SGDRegressor(max_iter=10000),hyperparams_SGD,X_conc_dev,y)
print(f"Best params:\t{gs_sgd.best_params_}")
print(f"Best score:\t{gs_sgd.best_score_}")

y_pred_sgd = gs_sgd.predict(X_conc_eval)
get_final_csv(list(X_eval.index),y_pred_sgd,"submit-sgd-regressor.csv")

## 0.856 on the public score
## {'alpha': 1e-05, 'eta0': 0.1, 'loss': 'squared_loss', 'penalty': 'l1'}

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 41.7min
[Parallel(n_jobs=4)]: Done  72 out of  72 | elapsed: 43.1min finished


Best params:	{'alpha': 1e-05, 'eta0': 0.1, 'loss': 'squared_loss', 'penalty': 'l1'}
Best score:	0.7449553228087691


## Final prediction 

In [7]:
%%time

####################################################
## Best hyperparameters found so far
####################################################

model = Ridge(alpha=0.01)
model.fit(X_conc_dev,y)
y_pred = model.predict(X_conc_eval)

get_final_csv(list(X_eval.index),y_pred,"submit.csv")

## 0.881 

Wall time: 3min 57s
