In [1]:
#importing of dependencies

from collections import Counter

import numpy as np
import pandas as pd

# NOTE(review): `stop_words` is not used in the visible cells, and this module
# path is believed to have been removed in newer scikit-learn releases —
# confirm against the pinned version before relying on it.
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

### Data Preprocessing

In [7]:
# Read the wine-review CSV into a DataFrame and preview the first rows
wine_csv_path = "../output_data/final_wine_data_172k_test.csv"
df = pd.read_csv(wine_csv_path)

df.head()

Unnamed: 0.1,Unnamed: 0,country,description,price,points,variety,winery
0,1,Portugal,"This is ripe and fruity, a wine that is smooth...",15.0,87,Portuguese Red,Quinta dos Avidagos
1,2,US,"Tart and snappy, the flavors of lime flesh and...",14.0,87,Pinot Gris,Rainstorm
2,3,US,"Pineapple rind, lemon pith and orange blossom ...",13.0,87,Riesling,St. Julian
3,4,US,"Much like the regular bottling from 2012, this...",65.0,87,Pinot Noir,Sweet Cheeks
4,5,Spain,Blackberry and raspberry aromas show a typical...,15.0,87,Tempranillo-Merlot,Tandem


In [8]:
# Keep only the reviews whose varietal is among the 20 most frequent ones.
TOP_N_VARIETALS = 20

counter = Counter(df['variety'].tolist())

# Map each retained varietal name to its frequency rank (0 = most common).
top_20_varietals = {
    name: rank
    for rank, (name, _count) in enumerate(counter.most_common(TOP_N_VARIETALS))
}

# Vectorized membership test instead of calling a Python lambda once per row.
df = df[df['variety'].isin(list(top_20_varietals))]

# NOTE(review): the counts shown below list both "Red Blend" and "Red Blends"
# as separate labels — presumably the same varietal spelled two ways; consider
# normalizing labels before ranking. TODO confirm with the data owner.
df['variety'].value_counts()

Pinot Noir                  17489
Chardonnay                  14305
Cabernet Sauvignon          11872
Red Blend                    8466
Bordeaux-style Red Blend     7482
Grüner Veltliner             6613
Riesling                     6597
Sauvignon Blanc              6376
Sauvignon                    5421
Syrah                        5155
Rosé                         5052
Merlot                       3706
Zinfandel                    3389
Malbec                       3309
Nebbiolo                     3303
Sangiovese                   3221
Portuguese Red               3001
Sparkling Blend              2894
White Blend                  2750
Red Blends                   2536
Name: variety, dtype: int64

In [9]:
# Drop all extraneous columns, keeping only the label ("variety")
# and the review text ("description")
df = df.filter(items=["variety", "description"], axis="columns")

df.head()

Unnamed: 0,variety,description
0,Portuguese Red,"This is ripe and fruity, a wine that is smooth..."
2,Riesling,"Pineapple rind, lemon pith and orange blossom ..."
3,Pinot Noir,"Much like the regular bottling from 2012, this..."
9,Cabernet Sauvignon,"Soft, supple plum envelopes an oaky structure ..."
11,Cabernet Sauvignon,"Slightly reduced, this wine offers a chalky, t..."


In [10]:
# Features are the review descriptions; the target is the varietal label
X, y = df['description'], df['variety']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Basic Naive Bayes

In [11]:
%%time

naive_bayes = make_pipeline(
    CountVectorizer(stop_words='english', binary=True),
    MultinomialNB()
)

naive_bayes.fit(X_train, y_train)

print(f'Accuracy: {naive_bayes.score(X_test, y_test)} \n')
print(classification_report(y_test, naive_bayes.predict(X_test)))

Accuracy: 0.6888319505449813 

                          precision    recall  f1-score   support

Bordeaux-style Red Blend       0.63      0.66      0.65      1461
      Cabernet Sauvignon       0.54      0.71      0.61      2407
              Chardonnay       0.78      0.88      0.82      2890
        Grüner Veltliner       1.00      0.76      0.87      1299
                  Malbec       0.41      0.52      0.46       672
                  Merlot       0.59      0.08      0.14       732
                Nebbiolo       0.62      0.83      0.71       630
              Pinot Noir       0.75      0.79      0.77      3558
          Portuguese Red       0.56      0.79      0.66       594
               Red Blend       0.54      0.58      0.56      1674
              Red Blends       0.62      0.04      0.08       518
                Riesling       0.73      0.86      0.79      1315
                    Rosé       0.71      0.77      0.74      1030
              Sangiovese       0.45      0.3

### TF-IDF Logistic Regression

In [12]:
%%time

tfidf_logistic_regression = make_pipeline(
    CountVectorizer(stop_words='english'),
    TfidfTransformer(),
    LogisticRegression(),
    verbose = True
)

tfidf_logistic_regression.fit(X_train, y_train)

print(f'Accuracy: {tfidf_logistic_regression.score(X_test, y_test)} \n')
print(classification_report(y_test, tfidf_logistic_regression.predict(X_test)))

[Pipeline] ... (step 1 of 3) Processing countvectorizer, total=   4.2s
[Pipeline] .. (step 2 of 3) Processing tfidftransformer, total=   0.2s




[Pipeline]  (step 3 of 3) Processing logisticregression, total=  35.4s
Accuracy: 0.7561818773385391 

                          precision    recall  f1-score   support

Bordeaux-style Red Blend       0.71      0.79      0.75      1461
      Cabernet Sauvignon       0.62      0.76      0.68      2407
              Chardonnay       0.78      0.93      0.85      2890
        Grüner Veltliner       0.98      0.92      0.95      1299
                  Malbec       0.67      0.51      0.58       672
                  Merlot       0.74      0.33      0.45       732
                Nebbiolo       0.75      0.81      0.78       630
              Pinot Noir       0.73      0.89      0.80      3558
          Portuguese Red       0.79      0.71      0.75       594
               Red Blend       0.67      0.61      0.64      1674
              Red Blends       0.60      0.15      0.24       518
                Riesling       0.86      0.88      0.87      1315
                    Rosé       0.81    

### TF-IDF SVC Model

In [13]:
%%time

tfidf_svc = make_pipeline(
    CountVectorizer(
        stop_words='english',
    ),
    TfidfTransformer(),
    LinearSVC(),
    verbose = True
)

tfidf_svc.fit(X_train, y_train)

print(f'Accuracy: {tfidf_svc.score(X_test, y_test)} \n')
print(classification_report(y_test, tfidf_svc.predict(X_test)))

[Pipeline] ... (step 1 of 3) Processing countvectorizer, total=   4.0s
[Pipeline] .. (step 2 of 3) Processing tfidftransformer, total=   0.2s
[Pipeline] ......... (step 3 of 3) Processing linearsvc, total=   8.9s
Accuracy: 0.7728160078086872 

                          precision    recall  f1-score   support

Bordeaux-style Red Blend       0.73      0.80      0.76      1461
      Cabernet Sauvignon       0.67      0.74      0.70      2407
              Chardonnay       0.83      0.91      0.86      2890
        Grüner Veltliner       0.97      0.94      0.95      1299
                  Malbec       0.68      0.59      0.63       672
                  Merlot       0.59      0.40      0.48       732
                Nebbiolo       0.74      0.82      0.78       630
              Pinot Noir       0.80      0.86      0.83      3558
          Portuguese Red       0.77      0.78      0.77       594
               Red Blend       0.67      0.63      0.65      1674
              Red Blends     