# Wine Reviews Natural Language Processing

**Prepared by Elizabeth Webster**

*November 2022*

## Overview

## Business Problem

## Dataset

# Data Understanding

In [1]:
# Import the necessary libraries
import pandas as pd
from nltk.corpus import stopwords
import string
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
import nltk
from nltk import FreqDist
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from wordcloud import WordCloud
import warnings
from sklearn.metrics import classification_report
warnings.filterwarnings('ignore')

In [2]:
# Create the DataFrame
df = pd.read_csv('Data/winemag-data-130k-v2.csv.zip', encoding='latin-1', index_col=0)

In [4]:
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",VulkÃ Bianco,87,,Sicily & Sardinia,Etna,,Kerin OâKeefe,@kerinokeefe,Nicosia 2013 VulkÃ Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwineÂ,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwineÂ,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129971 entries, 0 to 129970
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   country                129908 non-null  object 
 1   description            129971 non-null  object 
 2   designation            92506 non-null   object 
 3   points                 129971 non-null  int64  
 4   price                  120975 non-null  float64
 5   province               129908 non-null  object 
 6   region_1               108724 non-null  object 
 7   region_2               50511 non-null   object 
 8   taster_name            103727 non-null  object 
 9   taster_twitter_handle  98758 non-null   object 
 10  title                  129971 non-null  object 
 11  variety                129970 non-null  object 
 12  winery                 129971 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 13.9+ MB


dropping columns that are missing high numbers of values.
dropping taster info and title - should not impact point value.

In [16]:
wine_df = df.drop(['designation', 'region_1', 'region_2', 
                   'taster_name', 'taster_twitter_handle',
                   'title'], axis=1)
wine_df.head()

Unnamed: 0,country,description,points,price,province,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,,Sicily & Sardinia,White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,15.0,Douro,Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,Michigan,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",87,65.0,Oregon,Pinot Noir,Sweet Cheeks


In [17]:
wine_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129971 entries, 0 to 129970
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   country      129908 non-null  object 
 1   description  129971 non-null  object 
 2   points       129971 non-null  int64  
 3   price        120975 non-null  float64
 4   province     129908 non-null  object 
 5   variety      129970 non-null  object 
 6   winery       129971 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 12.9+ MB


**plots for distribution**

# Data Cleaning

will drop NaNs for country (63 - .04%), province (63 - .04%), and variety (1).
will use median or mean for filling NaN prices(8996 - 6% of dataset) (after train test split).

In [21]:
wine_df.dropna(subset=['country', 'province', 'variety'], inplace=True)

In [26]:
wine_df.reset_index(drop=True, inplace=True)

In [27]:
wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129907 entries, 0 to 129906
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   country      129907 non-null  object 
 1   description  129907 non-null  object 
 2   points       129907 non-null  int64  
 3   price        120915 non-null  float64
 4   province     129907 non-null  object 
 5   variety      129907 non-null  object 
 6   winery       129907 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 6.9+ MB


## Train Test Split

In [31]:
# Create data and target dataframes
X = wine_df.drop(['points'], axis=1)
y = wine_df['points']

In [32]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, 
                                                    random_state=42)

In [33]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90934 entries, 32539 to 121958
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   country      90934 non-null  object 
 1   description  90934 non-null  object 
 2   price        84667 non-null  float64
 3   province     90934 non-null  object 
 4   variety      90934 non-null  object 
 5   winery       90934 non-null  object 
dtypes: float64(1), object(5)
memory usage: 4.9+ MB


In [35]:
X_train.price.mean()

35.31688851618694

In [36]:
X_train.price.median()

25.0

In [37]:
X_train['price'].fillna((X_train['price'].mean()), inplace=True)

In [38]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90934 entries, 32539 to 121958
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   country      90934 non-null  object 
 1   description  90934 non-null  object 
 2   price        90934 non-null  float64
 3   province     90934 non-null  object 
 4   variety      90934 non-null  object 
 5   winery       90934 non-null  object 
dtypes: float64(1), object(5)
memory usage: 4.9+ MB


In [39]:
X_train.price.mean()

35.31688851618695

## One Hot Encode Categoricals

In [41]:
categoricals = ['country', 'province', 'variety', 'winery']
X_train_cat = pd.get_dummies(X_train[categoricals], prefix=categoricals, drop_first=True)

In [52]:
X_train_cat

Unnamed: 0,country_Armenia,country_Australia,country_Austria,country_Bosnia and Herzegovina,country_Brazil,country_Bulgaria,country_Canada,country_Chile,country_China,country_Croatia,...,winery_Ãbano,winery_Ãcluse,winery_ÃlevÃ©e Winegrowers,winery_Ãric & JÃ¶el Durand,winery_Ãl Macchione,winery_Ãrale,winery_Ãko,winery_Ãkonomierat Rebholz,winery_Ã Maurice,winery_Å toka
32539,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
91312,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50735,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
82091,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26122,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103694,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Build a Baseline Model

In [54]:
# Vectorize X_train text
tfidf = TfidfVectorizer(max_features=20)
X_train_text_vectorized = tfidf.fit_transform(X_train['description'])
pd.DataFrame.sparse.from_spmatrix(X_train_text_vectorized, 
                                  columns=tfidf.get_feature_names())

Unnamed: 0,acidity,and,aromas,but,finish,flavors,from,fruit,in,is,it,of,on,palate,that,the,this,to,wine,with
0,0.261794,0.000000,0.246307,0.000000,0.000000,0.000000,0.00000,0.231912,0.222482,0.180810,0.189651,0.141615,0.000000,0.250530,0.257448,0.679290,0.000000,0.228462,0.190068,0.000000
1,0.000000,0.623943,0.269130,0.000000,0.000000,0.000000,0.00000,0.253401,0.243098,0.000000,0.207224,0.309474,0.000000,0.000000,0.000000,0.296893,0.156136,0.000000,0.207680,0.347254
2,0.000000,0.498093,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.262859,0.000000,0.617632,0.364161,0.000000,0.000000,0.197508,0.000000,0.000000,0.276318,0.231010
3,0.331833,0.289521,0.312203,0.000000,0.329160,0.252615,0.00000,0.000000,0.000000,0.229184,0.000000,0.179502,0.317508,0.317556,0.000000,0.344410,0.181124,0.000000,0.240918,0.201415
4,0.000000,0.265876,0.000000,0.219857,0.000000,0.154656,0.00000,0.179966,0.172649,0.561244,0.147171,0.329684,0.000000,0.000000,0.199783,0.421709,0.221776,0.000000,0.147495,0.246621
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90929,0.000000,0.508705,0.365706,0.000000,0.000000,0.295906,0.00000,0.000000,0.000000,0.000000,0.000000,0.420527,0.000000,0.000000,0.000000,0.403432,0.212164,0.000000,0.282205,0.235932
90930,0.000000,0.284329,0.000000,0.000000,0.323258,0.248085,0.00000,0.000000,0.553895,0.225074,0.236079,0.528850,0.000000,0.000000,0.000000,0.169117,0.177877,0.000000,0.000000,0.000000
90931,0.000000,0.401472,0.000000,0.000000,0.000000,0.233530,0.33151,0.543498,0.000000,0.211869,0.000000,0.165941,0.293521,0.000000,0.301672,0.318391,0.167441,0.000000,0.000000,0.000000
90932,0.000000,0.298745,0.000000,0.247037,0.000000,0.000000,0.00000,0.202215,0.000000,0.157657,0.330731,0.493923,0.000000,0.000000,0.000000,0.236922,0.124597,0.398414,0.165730,0.415665


**How do I get the axis numbers to stay the same instead of re-indexing???**

In [45]:
#Multinomial Bayes Baseline Model
baseline_model = MultinomialNB()

# Evaluate the model on X_train_vectorized and y_train
baseline_cv = cross_val_score(baseline_model, X_train_text_vectorized, y_train)
baseline_cv.mean()

0.1382431158884812

In [49]:
X_train_vectorized_df = pd.DataFrame(X_train_text_vectorized.toarray(), columns=tfidf.get_feature_names())
preprocessed_X_train = pd.concat([X_train_vectorized_df, X_train_cat], axis=1)
preprocessed_X_train

Unnamed: 0,acidity,and,aromas,but,finish,flavors,from,fruit,in,is,...,winery_Ãbano,winery_Ãcluse,winery_ÃlevÃ©e Winegrowers,winery_Ãric & JÃ¶el Durand,winery_Ãl Macchione,winery_Ãrale,winery_Ãko,winery_Ãkonomierat Rebholz,winery_Ã Maurice,winery_Å toka
0,0.261794,0.000000,0.246307,0.000000,0.00000,0.000000,0.0,0.231912,0.222482,0.180810,...,,,,,,,,,,
1,0.000000,0.623943,0.269130,0.000000,0.00000,0.000000,0.0,0.253401,0.243098,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.498093,0.000000,0.000000,0.00000,0.000000,0.0,0.000000,0.000000,0.262859,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.331833,0.289521,0.312203,0.000000,0.32916,0.252615,0.0,0.000000,0.000000,0.229184,...,,,,,,,,,,
4,0.000000,0.265876,0.000000,0.219857,0.00000,0.154656,0.0,0.179966,0.172649,0.561244,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129898,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
129899,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
129900,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
129902,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
#Multinomial Bayes Baseline Model
baseline_model = MultinomialNB()

# Evaluate the model on X_train_vectorized and y_train
baseline_cv = cross_val_score(baseline_model, preprocessed_X_train, y_train)
baseline_cv.mean()

ValueError: Found input variables with inconsistent numbers of samples: [118260, 90934]

# Natural Language Processing