# Algorithms blind tasting wines (Soon be finished)

In [1]:
import pandas as pd
import re
import numpy as np

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, StandardScaler, OneHotEncoder, LabelEncoder, LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from string import punctuation


In [2]:
filename = '../DataBase/5_grape_db.xlsx'

a0 = pd.read_excel(filename)
a0.head()

Unnamed: 0,abv,colour,country,description,grape_variety,name,Body
0,13.5,red,USA,this wine has concentrated depth and purity of...,pinot noir,A to Z Pinot Noir 2014,light
1,13.5,red,Argentina,"a delicate, floral wine with soft cherry and s...",pinot noir,Alamos Seleccion Pinot Noir 2016,medium
2,12.5,red,Italy,"a medium-bodied wine, with aromas and flavours...",pinot noir,Alois Lageder Alto Adige Pinot Noir 2014,medium
3,13.5,white,Italy,very fresh aromas and flavours of gooseberry a...,sauvignon blanc,Alois Lageder Terlaner Sauvignon Blanc 2016,medium
4,13.5,red,Argentina,this wine has concentrated aromas and flavours...,cabernet sauvignon,Argento Cabernet Sauvignon 2014,full


In [3]:
result = a0['grape_variety']
limit = 40
## removing varieties that have only one member in the database
counts = nltk.Counter(result)
varieties = [key for key in counts if counts[key] > limit]
data_input = a0[a0['grape_variety'].isin(varieties)].reset_index()

In [4]:
data_input.head()

Unnamed: 0,index,abv,colour,country,description,grape_variety,name,Body
0,0,13.5,red,USA,this wine has concentrated depth and purity of...,pinot noir,A to Z Pinot Noir 2014,light
1,1,13.5,red,Argentina,"a delicate, floral wine with soft cherry and s...",pinot noir,Alamos Seleccion Pinot Noir 2016,medium
2,2,12.5,red,Italy,"a medium-bodied wine, with aromas and flavours...",pinot noir,Alois Lageder Alto Adige Pinot Noir 2014,medium
3,3,13.5,white,Italy,very fresh aromas and flavours of gooseberry a...,sauvignon blanc,Alois Lageder Terlaner Sauvignon Blanc 2016,medium
4,5,13.0,white,Argentina,this lightly oaked wine offers aromas of ripe ...,chardonnay,Argento Chardonnay 2015,medium


In [5]:
# defining stopwords: using the one that comes with nltk + appending it with words seen from the above evaluation
stop_words = stopwords.words('english')
stop_append = ['.', ',', '`', '"', "'", '!', ';', 'wine', 'fruit', '%', 'flavour', 'aromas', 'palate']
stop_words1 = frozenset(stop_words + stop_append)


In [6]:
# list of word types (nouns and adjectives) to leave in the text
defTags = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJS', 'JJR']#, 'RB', 'RBS', 'RBR', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

# functions to determine the type of a word
def is_noun(tag):
    return tag in ['NN', 'NNS', 'NNP', 'NNPS']


def is_verb(tag):
    return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']


def is_adverb(tag):
    return tag in ['RB', 'RBR', 'RBS']


def is_adjective(tag):
    return tag in ['JJ', 'JJR', 'JJS']

# transform tag forms
def penn_to_wn(tag):
    if is_adjective(tag):
        return nltk.stem.wordnet.wordnet.ADJ
    elif is_noun(tag):
        return nltk.stem.wordnet.wordnet.NOUN
    elif is_adverb(tag):
        return nltk.stem.wordnet.wordnet.ADV
    elif is_verb(tag):
        return nltk.stem.wordnet.wordnet.VERB
    return nltk.stem.wordnet.wordnet.NOUN
    
# lemmatizer + tokenizer (+ stemming) class
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        # we define (but not use) a stemming method, uncomment the last line in __call__ to get stemming tooo
        self.stemmer = nltk.stem.SnowballStemmer('english') 
    def __call__(self, doc):
        # pattern for numbers | words of length=2 | punctuations | words of length=1
        pattern = re.compile(r'[0-9]+|\b[\w]{2,2}\b|[%.,_`!"&?\')({~@;:#}+-]+|\b[\w]{1,1}\b')
        # tokenize document
        doc_tok = word_tokenize(doc)
        #filter out patterns from words
        doc_tok = [pattern.sub('', x) for x in doc_tok]
        # get rid of anything with length=1
        doc_tok = [x for x in doc_tok if len(x) > 1]
        # position tagging
        doc_tagged = nltk.pos_tag(doc_tok)
        # selecting nouns and adjectives
        doc_tagged = [(t[0], t[1]) for t in doc_tagged if t[1] in defTags]
        # preparing lemmatization
        doc = [(t[0], penn_to_wn(t[1])) for t in doc_tagged]
        # lemmatization
        doc = [self.wnl.lemmatize(t[0], t[1]) for t in doc]
        # uncomment if you want stemming as well
        #doc = [self.stemmer.stem(x) for x in doc]
        return doc

In [7]:

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None, *parg, **kwarg):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

class MyLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)
    
    def fit(self, x, y=0):
        self.encoder.fit(x)
        return self
    
    def transform(self, x, y=0):
        return self.encoder.transform(x)


In [8]:
body_dummies = pd.get_dummies(data_input['Body'])
colour_dummies = pd.get_dummies(data_input['colour'])
data_input = data_input.merge(body_dummies, left_index=True, right_index=True)
data_input = data_input.merge(colour_dummies, left_index=True, right_index=True)


In [9]:
# split the data into train and test
combined_features = ['Body', 'description', 'full', 'light', 'medium', 'dry', 'red', 'rose', 'white']
target = 'grape_variety'

X_train, X_test, y_train, y_test = train_test_split(data_input[combined_features], data_input[target], 
                                                    test_size=0.33, random_state=42)

In [10]:

full = Pipeline([
                ('selector', NumberSelector(key='full')),
                ])
medium = Pipeline([
                ('selector', NumberSelector(key='medium')),
                ])
light = Pipeline([
                ('selector', NumberSelector(key='light')),
                ])
dry = Pipeline([
                ('selector', NumberSelector(key='dry')),
                ])
red = Pipeline([
                ('selector', NumberSelector(key='red')),
                ])
rose = Pipeline([
                ('selector', NumberSelector(key='rose')),
                ])
white = Pipeline([
                ('selector', NumberSelector(key='white')),
                ])


In [11]:
text = Pipeline([
                ('selector', TextSelector(key='description')),
                ('vectorizer', TfidfVectorizer(ngram_range=(1,1), stop_words=stop_words, analyzer='word', 
                                               norm='l2', tokenizer=LemmaTokenizer()))
                ])

In [12]:
feats = FeatureUnion([('full', full),
                      ('medium', medium),
                      ('light', light),
                      ('dry', dry),
                      ('description', text),
                      ('red', red),
                      ('rose', rose),
                      ('white', white)
                      ])


In [13]:
pipe = Pipeline([('feats', feats),
                 ('clf',RandomForestClassifier(random_state=42))
                 ])
    
pipe.fit(X_train, y_train)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Pipeline(steps=[('feats', FeatureUnion(n_jobs=1,
       transformer_list=[('full', Pipeline(steps=[('selector', NumberSelector(key='full'))])), ('medium', Pipeline(steps=[('selector', NumberSelector(key='medium'))])), ('light', Pipeline(steps=[('selector', NumberSelector(key='light'))])), ('dry', Pipeline(st...stimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False))])

In [14]:
#train stats
preds = pipe.predict(X_train)
print(metrics.accuracy_score(y_train, preds))
print(metrics.classification_report(y_train, preds))
print(metrics.confusion_matrix(y_train, preds))
print(nltk.Counter(y_train))


1.0
                 precision    recall  f1-score   support

     chardonnay       1.00      1.00      1.00        58
     pinot noir       1.00      1.00      1.00        40
sauvignon blanc       1.00      1.00      1.00        55
          syrah       1.00      1.00      1.00        27

    avg / total       1.00      1.00      1.00       180

[[58  0  0  0]
 [ 0 40  0  0]
 [ 0  0 55  0]
 [ 0  0  0 27]]
Counter({'chardonnay': 58, 'sauvignon blanc': 55, 'pinot noir': 40, 'syrah': 27})


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [15]:
# test stats
preds = pipe.predict(X_test)
print(metrics.accuracy_score(y_test, preds))
print(metrics.classification_report(y_test, preds))
print(metrics.confusion_matrix(y_test, preds))
print(nltk.Counter(y_test))


0.7444444444444445
                 precision    recall  f1-score   support

     chardonnay       0.71      0.97      0.82        33
     pinot noir       0.68      0.89      0.77        19
sauvignon blanc       0.83      0.48      0.61        21
          syrah       1.00      0.47      0.64        17

    avg / total       0.79      0.74      0.73        90

[[32  0  1  0]
 [ 2 17  0  0]
 [11  0 10  0]
 [ 0  8  1  8]]
Counter({'chardonnay': 33, 'sauvignon blanc': 21, 'pinot noir': 19, 'syrah': 17})


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [16]:
### stratified training
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=3)
sc_mean=[]
for train, test in skf.split(data_input[combined_features], data_input[target]):
    pipe.fit(data_input.loc[train,combined_features], data_input.loc[train, target])
    preds = pipe.predict(data_input.loc[test,combined_features])
    sc_mean.append(metrics.accuracy_score(data_input.loc[test, target], preds))
    
    print(metrics.accuracy_score(data_input.loc[test, target], preds))
    print(metrics.classification_report(data_input.loc[test, target], preds))
    print(metrics.confusion_matrix(data_input.loc[test, target], preds))
    print(nltk.Counter(data_input.loc[test, target]))
print('Mean: %s' % str(sum(sc_mean)/len(sc_mean)))
    

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.6956521739130435
                 precision    recall  f1-score   support

     chardonnay       0.71      0.97      0.82        31
     pinot noir       0.62      0.50      0.56        20
sauvignon blanc       0.94      0.58      0.71        26
          syrah       0.50      0.60      0.55        15

    avg / total       0.72      0.70      0.69        92

[[30  0  1  0]
 [ 1 10  0  9]
 [11  0 15  0]
 [ 0  6  0  9]]
Counter({'chardonnay': 31, 'sauvignon blanc': 26, 'pinot noir': 20, 'syrah': 15})


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.7888888888888889
                 precision    recall  f1-score   support

     chardonnay       0.81      0.73      0.77        30
     pinot noir       0.77      0.85      0.81        20
sauvignon blanc       0.73      0.88      0.80        25
          syrah       0.91      0.67      0.77        15

    avg / total       0.80      0.79      0.79        90

[[22  0  8  0]
 [ 2 17  0  1]
 [ 3  0 22  0]
 [ 0  5  0 10]]
Counter({'chardonnay': 30, 'sauvignon blanc': 25, 'pinot noir': 20, 'syrah': 15})


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.7613636363636364
                 precision    recall  f1-score   support

     chardonnay       0.72      0.87      0.79        30
     pinot noir       0.75      0.95      0.84        19
sauvignon blanc       0.79      0.60      0.68        25
          syrah       0.89      0.57      0.70        14

    avg / total       0.77      0.76      0.75        88

[[26  0  4  0]
 [ 0 18  0  1]
 [10  0 15  0]
 [ 0  6  0  8]]
Counter({'chardonnay': 30, 'sauvignon blanc': 25, 'pinot noir': 19, 'syrah': 14})
Mean: 0.7486348997218562


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


# Kaggle database

In [17]:
filename = '../DataBase/winemag-data-130k-v2.csv'

kaggle = pd.read_csv(filename, usecols=['description', 'grape_variety'])
kaggle['grape_variety'] = kaggle['grape_variety'].str.lower()
kaggle.head()

Unnamed: 0,description,grape_variety
0,"Aromas include tropical fruit, broom, brimston...",white blend
1,"This is ripe and fruity, a wine that is smooth...",portuguese red
2,"Tart and snappy, the flavors of lime flesh and...",pinot gris
3,"Pineapple rind, lemon pith and orange blossom ...",riesling
4,"Much like the regular bottling from 2012, this...",pinot noir


In [18]:
def shiraz_filter(ss):
    if ss == 'shiraz':
        return 'syrah'
    else:
        return ss

In [31]:
kaggle['grape_variety'] = kaggle.apply(lambda row: shiraz_filter(row['grape_variety']), axis=1)

In [32]:
kaggle_input = kaggle[kaggle['grape_variety'].isin(varieties)].reset_index()

In [36]:
pd.unique(kaggle_input.grape_variety)

array(['pinot noir', 'chardonnay', 'sauvignon blanc', 'syrah'],
      dtype=object)

In [37]:
kaggle_input.shape

(34943, 3)

In [22]:
colour_dict = {'pinot noir': 'red', 'syrah': 'red', 'chardonnay': 'white', 'sauvignon blanc': 'white'}

In [38]:
kaggle_input['colour'] = kaggle_input.apply(lambda row: colour_dict[row['grape_variety']], axis=1)

In [37]:
# split the data into train and test
combined_features = ['description']
target = 'grape_variety'

X_train, X_test, y_train, y_test = train_test_split(kaggle_input[combined_features], kaggle_input[target], 
                                                    test_size=0.33, random_state=42)

In [38]:
text = Pipeline([
                ('selector', TextSelector(key='description')),
                ('vectorizer', TfidfVectorizer(ngram_range=(1,1), stop_words=stop_words, analyzer='word', 
                                               norm='l2', tokenizer=LemmaTokenizer()))
                ])

In [39]:
pipe = Pipeline([('text', text),
                 ('clf',RandomForestClassifier(random_state=42))
                 ])
    
pipe.fit(X_train, y_train)


Pipeline(memory=None,
     steps=[('text', Pipeline(memory=None,
     steps=[('selector', TextSelector(key='description')), ('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_featur...stimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

In [40]:
# test stats
preds = pipe.predict(X_test)
print(metrics.accuracy_score(y_test, preds))
print(metrics.classification_report(y_test, preds))
print(metrics.confusion_matrix(y_test, preds))
print(nltk.Counter(y_test))


0.8415712799167534
                 precision    recall  f1-score   support

     chardonnay       0.81      0.95      0.87      3822
     pinot noir       0.84      0.94      0.89      4378
sauvignon blanc       0.90      0.59      0.71      1635
          syrah       0.92      0.60      0.73      1697

    avg / total       0.85      0.84      0.83     11532

[[3620  109   89    4]
 [ 184 4100   11   83]
 [ 620   49  964    2]
 [  64  603    9 1021]]
Counter({'pinot noir': 4378, 'chardonnay': 3822, 'syrah': 1697, 'sauvignon blanc': 1635})


In [44]:
### stratified training
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=3)
sc_mean=[]
for train, test in skf.split(kaggle_input[combined_features], kaggle_input[target]):
    pipe.fit(kaggle_input.loc[train,combined_features], kaggle_input.loc[train, target])
    preds = pipe.predict(kaggle_input.loc[test,combined_features])
    sc_mean.append(metrics.accuracy_score(kaggle_input.loc[test, target], preds))
    
    print(metrics.accuracy_score(kaggle_input.loc[test, target], preds))
    print(metrics.classification_report(kaggle_input.loc[test, target], preds))
    print(metrics.confusion_matrix(kaggle_input.loc[test, target], preds))
    print(nltk.Counter(kaggle_input.loc[test, target]))
print('Mean: %s' % str(sum(sc_mean)/len(sc_mean)))
    

0.845467032967033
                 precision    recall  f1-score   support

     chardonnay       0.81      0.94      0.87      3914
     pinot noir       0.85      0.94      0.90      4421
sauvignon blanc       0.89      0.58      0.70      1655
          syrah       0.91      0.62      0.73      1658

    avg / total       0.85      0.85      0.84     11648

[[3689  109  112    4]
 [ 148 4175    6   92]
 [ 646   43  964    2]
 [  57  577    4 1020]]
Counter({'pinot noir': 4421, 'chardonnay': 3914, 'syrah': 1658, 'sauvignon blanc': 1655})
0.8390281593406593
                 precision    recall  f1-score   support

     chardonnay       0.80      0.95      0.87      3914
     pinot noir       0.85      0.93      0.89      4421
sauvignon blanc       0.89      0.56      0.68      1655
          syrah       0.91      0.62      0.74      1658

    avg / total       0.85      0.84      0.83     11648

[[3723   90   95    6]
 [ 212 4094   19   96]
 [ 686   44  921    4]
 [  55  563    5 1035

In [None]:
text = Pipeline([
                ('selector', TextSelector(key='description')),
                ('vectorizer', TfidfVectorizer(ngram_range=(1,1), stop_words=stop_words, analyzer='word', 
                                               norm='l2', tokenizer=LemmaTokenizer()))
                ])

In [42]:
pipe = Pipeline([('text', text),
                 ('clf',RandomForestClassifier(random_state=42))
                 ])
    
pipe.fit(kaggle_input[['description']], kaggle_input['grape_variety'])

Pipeline(memory=None,
     steps=[('text', Pipeline(memory=None,
     steps=[('selector', TextSelector(key='description')), ('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_featur...stimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

In [43]:
# test stats
preds = pipe.predict(data_input[['description']])
print(metrics.accuracy_score(data_input['grape_variety'], preds))
print(metrics.classification_report(data_input['grape_variety'], preds))
print(metrics.confusion_matrix(data_input['grape_variety'], preds))
print(nltk.Counter(data_input['grape_variety']))


0.725925925925926
                 precision    recall  f1-score   support

     chardonnay       0.71      0.95      0.81        91
     pinot noir       0.64      0.92      0.76        59
sauvignon blanc       0.89      0.54      0.67        76
          syrah       0.79      0.34      0.48        44

    avg / total       0.76      0.73      0.71       270

[[86  1  4  0]
 [ 1 54  0  4]
 [32  3 41  0]
 [ 2 26  1 15]]
Counter({'chardonnay': 91, 'sauvignon blanc': 76, 'pinot noir': 59, 'syrah': 44})


In [41]:
colour_dummies = pd.get_dummies(kaggle_input['colour'])
kaggle_input = kaggle_input.merge(colour_dummies, left_index=True, right_index=True)


In [43]:
# split the data into train and test
combined_features = ['description', 'white', 'red']
target = 'grape_variety'

X_train, X_test, y_train, y_test = train_test_split(kaggle_input[combined_features], kaggle_input[target], 
                                                    test_size=0.33, random_state=42)

In [44]:
red = Pipeline([
                ('selector', NumberSelector(key='red')),
                ])
white = Pipeline([
                ('selector', NumberSelector(key='white')),
                ])


In [45]:
text = Pipeline([
                ('selector', TextSelector(key='description')),
                ('vectorizer', TfidfVectorizer(ngram_range=(1,1), stop_words=stop_words, analyzer='word', 
                                               norm='l2', tokenizer=LemmaTokenizer()))
                ])

In [46]:
feats = FeatureUnion([('description', text),
                      ('red', red),
                      ('white', white)
                      ])


In [47]:
pipe = Pipeline([('feats', feats),
                 ('clf',RandomForestClassifier(random_state=42))
                 ])
    
pipe.fit(X_train, y_train)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Pipeline(steps=[('feats', FeatureUnion(n_jobs=1,
       transformer_list=[('description', Pipeline(steps=[('selector', TextSelector(key='description')), ('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',...stimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False))])

In [49]:
# test stats
preds = pipe.predict(X_test)
print(metrics.accuracy_score(y_test, preds))
print(metrics.classification_report(y_test, preds))
print(metrics.confusion_matrix(y_test, preds))
print(nltk.Counter(y_test))


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.866718695802983
                 precision    recall  f1-score   support

     chardonnay       0.85      0.98      0.91      3822
     pinot noir       0.86      0.98      0.92      4378
sauvignon blanc       0.92      0.59      0.72      1635
          syrah       0.93      0.59      0.72      1697

    avg / total       0.87      0.87      0.86     11532

[[3735    1   86    0]
 [   4 4296    1   77]
 [ 669    0  966    0]
 [   1  698    0  998]]
Counter({'pinot noir': 4378, 'chardonnay': 3822, 'syrah': 1697, 'sauvignon blanc': 1635})


In [50]:
### stratified training
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=3)
sc_mean=[]
for train, test in skf.split(kaggle_input[combined_features], kaggle_input[target]):
    pipe.fit(kaggle_input.loc[train,combined_features], kaggle_input.loc[train, target])
    preds = pipe.predict(kaggle_input.loc[test,combined_features])
    sc_mean.append(metrics.accuracy_score(kaggle_input.loc[test, target], preds))
    
    print(metrics.accuracy_score(kaggle_input.loc[test, target], preds))
    print(metrics.classification_report(kaggle_input.loc[test, target], preds))
    print(metrics.confusion_matrix(kaggle_input.loc[test, target], preds))
    print(nltk.Counter(kaggle_input.loc[test, target]))
print('Mean: %s' % str(sum(sc_mean)/len(sc_mean)))
    

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.8668440934065934
                 precision    recall  f1-score   support

     chardonnay       0.85      0.97      0.91      3914
     pinot noir       0.87      0.98      0.92      4421
sauvignon blanc       0.89      0.61      0.72      1655
          syrah       0.90      0.60      0.72      1658

    avg / total       0.87      0.87      0.86     11648

[[3793    1  120    0]
 [   4 4312    1  104]
 [ 651    0 1004    0]
 [   2  668    0  988]]
Counter({'pinot noir': 4421, 'chardonnay': 3914, 'syrah': 1658, 'sauvignon blanc': 1655})


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.8638392857142857
                 precision    recall  f1-score   support

     chardonnay       0.85      0.97      0.91      3914
     pinot noir       0.86      0.98      0.92      4421
sauvignon blanc       0.91      0.58      0.71      1655
          syrah       0.90      0.59      0.71      1658

    avg / total       0.87      0.86      0.85     11648

[[3813    3   98    0]
 [   3 4312    0  106]
 [ 690    1  963    1]
 [   0  684    0  974]]
Counter({'pinot noir': 4421, 'chardonnay': 3914, 'syrah': 1658, 'sauvignon blanc': 1655})


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.8676053919464239
                 precision    recall  f1-score   support

     chardonnay       0.84      0.97      0.90      3914
     pinot noir       0.87      0.98      0.92      4421
sauvignon blanc       0.90      0.58      0.71      1654
          syrah       0.92      0.62      0.74      1658

    avg / total       0.87      0.87      0.86     11647

[[3803    4  107    0]
 [   7 4319    2   93]
 [ 691    1  962    0]
 [   5  630    2 1021]]
Counter({'pinot noir': 4421, 'chardonnay': 3914, 'syrah': 1658, 'sauvignon blanc': 1654})
Mean: 0.8660962570224343


Do not be impatient... Updates are coming soon to this page. In the meanwhile please feel free to contact me at [diveki@gmail.com](diveki@gmail.com). You can also fork this project from [my GitHub repository](https://github.com/diveki/WineSommelier) or you can take a sneaky look at [my GitHub Pages website](https://diveki.github.io).

Just as a bonus, the prelude for this report can be found [here](https://diveki.github.io/projects/wine.html).