In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced



### Data Preprocessing

In [2]:
# Load the wine review dataset from the CSV file in the data directory.
# NOTE(review): the preview shows "Unnamed: 0.1" and "Unnamed: 0" columns,
# presumably index columns written out by earlier saves - confirm.
wine_csv_path = "../data/final_wine_data_172k.csv"
df = pd.read_csv(wine_csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,price,points,variety,winery
0,1,Portugal,"This is ripe and fruity, a wine that is smooth...",15.0,87,Portuguese Red,Quinta dos Avidagos
1,2,US,"Tart and snappy, the flavors of lime flesh and...",14.0,87,Pinot Gris,Rainstorm
2,3,US,"Pineapple rind, lemon pith and orange blossom ...",13.0,87,Riesling,St. Julian
3,4,US,"Much like the regular bottling from 2012, this...",65.0,87,Pinot Noir,Sweet Cheeks
4,5,Spain,Blackberry and raspberry aromas show a typical...,15.0,87,Tempranillo-Merlot,Tandem


In [3]:
# Use Counter to find the 20 most frequent wine varietals and keep only the
# reviews that belong to one of them.
counter = Counter(df['variety'].tolist())

# Map each retained varietal to its frequency rank (0 = most common).
top_20_varietals = {
    varietal: rank
    for rank, (varietal, _count) in enumerate(counter.most_common(20))
}

# Vectorised membership test instead of a per-row Python lambda.
df = df[df['variety'].isin(list(top_20_varietals))]

# NOTE(review): the counts list both "Red Blend" and "Red Blends" as separate
# classes - confirm whether these labels should be merged before modelling.
df['variety'].value_counts()

Pinot Noir                  17489
Chardonnay                  14305
Cabernet Sauvignon          11872
Red Blend                    8466
Bordeaux-style Red Blend     7482
Grüner Veltliner             6613
Riesling                     6597
Sauvignon Blanc              6376
Sauvignon                    5421
Syrah                        5155
Rosé                         5052
Merlot                       3706
Zinfandel                    3389
Malbec                       3309
Nebbiolo                     3303
Sangiovese                   3221
Portuguese Red               3001
Sparkling Blend              2894
White Blend                  2750
Red Blends                   2536
Name: variety, dtype: int64

In [4]:
# Filter out all extraneous columns, keeping only the label ("variety") and
# the review text ("description").
df = df.filter(items=["variety", "description"], axis="columns")
df.head()

Unnamed: 0,variety,description
0,Portuguese Red,"This is ripe and fruity, a wine that is smooth..."
2,Riesling,"Pineapple rind, lemon pith and orange blossom ..."
3,Pinot Noir,"Much like the regular bottling from 2012, this..."
9,Cabernet Sauvignon,"Soft, supple plum envelopes an oaky structure ..."
11,Cabernet Sauvignon,"Slightly reduced, this wine offers a chalky, t..."


In [5]:
# Separate the features (review text) from the labels (varietal), then hold
# out 20% of the rows as a test set using a fixed seed.
X = df['description']
y = df['variety']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Basic Naive Bayes

In [6]:
%%time

naive_bayes = make_pipeline(
    CountVectorizer(stop_words='english', binary=True),
    NearMiss(),
    MultinomialNB()
)
naive_bayes.fit(X_train, y_train)

print(f'Accuracy: {naive_bayes.score(X_test, y_test)} \n')
print(classification_report_imbalanced(y_test, naive_bayes.predict(X_test)))

Accuracy: 0.5185863022612657 

                                pre       rec       spe        f1       geo       iba       sup

Bordeaux-style Red Blend       0.67      0.43      0.99      0.53      0.65      0.40      1461
      Cabernet Sauvignon       0.69      0.20      0.99      0.31      0.44      0.18      2407
              Chardonnay       0.90      0.37      0.99      0.52      0.61      0.34      2890
        Grüner Veltliner       0.98      0.84      1.00      0.91      0.92      0.82      1299
                  Malbec       0.29      0.62      0.96      0.40      0.77      0.57       672
                  Merlot       0.26      0.37      0.97      0.30      0.60      0.34       732
                Nebbiolo       0.56      0.87      0.98      0.69      0.93      0.85       630
              Pinot Noir       0.88      0.33      0.99      0.47      0.57      0.30      3558
          Portuguese Red       0.37      0.88      0.96      0.52      0.92      0.84       594
        

### TF-IDF Logistic Regression

In [7]:
%%time

tfidf_logistic_regression = make_pipeline(
    CountVectorizer(stop_words='english'),
    TfidfTransformer(),
    NearMiss(),
    LogisticRegression(),
)

tfidf_logistic_regression.fit(X_train, y_train)

print(f'Accuracy: {tfidf_logistic_regression.score(X_test, y_test)} \n')
print(classification_report_imbalanced(y_test, tfidf_logistic_regression.predict(X_test)))



Accuracy: 0.6155034976411258 

                                pre       rec       spe        f1       geo       iba       sup

Bordeaux-style Red Blend       0.74      0.49      0.99      0.59      0.70      0.46      1461
      Cabernet Sauvignon       0.72      0.40      0.98      0.51      0.63      0.37      2407
              Chardonnay       0.86      0.58      0.99      0.69      0.75      0.55      2890
        Grüner Veltliner       1.00      0.76      1.00      0.87      0.87      0.74      1299
                  Malbec       0.44      0.65      0.98      0.52      0.80      0.62       672
                  Merlot       0.26      0.54      0.95      0.35      0.72      0.50       732
                Nebbiolo       0.72      0.67      0.99      0.69      0.82      0.64       630
              Pinot Noir       0.82      0.52      0.98      0.64      0.71      0.49      3558
          Portuguese Red       0.46      0.88      0.97      0.61      0.93      0.85       594
        

### TF-IDF SVC Model

In [8]:
%%time

tfidf_svc = make_pipeline(
    CountVectorizer(
        stop_words='english',
    ),
    TfidfTransformer(),
    NearMiss(),
    LinearSVC()
)

tfidf_svc.fit(X_train, y_train)

print(f'Accuracy: {tfidf_svc.score(X_test, y_test)} \n')
print(classification_report_imbalanced(y_test, tfidf_svc.predict(X_test)))

Accuracy: 0.6302667968114527 

                                pre       rec       spe        f1       geo       iba       sup

Bordeaux-style Red Blend       0.76      0.52      0.99      0.62      0.72      0.49      1461
      Cabernet Sauvignon       0.71      0.44      0.98      0.54      0.66      0.41      2407
              Chardonnay       0.87      0.58      0.99      0.70      0.76      0.55      2890
        Grüner Veltliner       0.99      0.79      1.00      0.88      0.89      0.78      1299
                  Malbec       0.43      0.67      0.98      0.53      0.81      0.63       672
                  Merlot       0.28      0.57      0.95      0.37      0.74      0.52       732
                Nebbiolo       0.73      0.71      0.99      0.72      0.84      0.69       630
              Pinot Noir       0.81      0.54      0.98      0.65      0.73      0.51      3558
          Portuguese Red       0.50      0.87      0.98      0.64      0.92      0.85       594
        