# Predicting/Classifying Top Wine Regions
### Text Analytics Final Project

In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk import PorterStemmer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

import re

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR

## Reading in Data File

In [2]:
df1 = pd.read_csv('winemag_fin.csv', index_col = 0)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
df = df1

In [4]:
df = df.drop_duplicates()[['country','description','region_1','points','price']].dropna()
df = df.rename(columns = {'region_1': 'region'})  
df

Unnamed: 0,country,description,region,points,price
2,US,"Tart and snappy, the flavors of lime flesh and...",Willamette Valley,87,14.0
3,US,"Pineapple rind, lemon pith and orange blossom ...",Lake Michigan Shore,87,13.0
4,US,"Much like the regular bottling from 2012, this...",Willamette Valley,87,65.0
5,Spain,Blackberry and raspberry aromas show a typical...,Navarra,87,15.0
6,Italy,"Here's a bright, informal red that opens with ...",Vittoria,87,16.0
...,...,...,...,...,...
149634,France,Atypically light in body and reticent on the n...,Alsace,84,15.0
149635,US,A Syrah-Grenache blend that's dry and rustical...,Santa Barbara County,84,15.0
149637,US,"Outside of the vineyard, wines like this are w...",California,84,6.0
149638,Argentina,"Heavy and basic, with melon and pineapple arom...",Uco Valley,84,9.0


### Filtering Dataset to contain only Top Regions

In [5]:
df = df[(df["region"]=='Finger Lakes') | (df["region"]=='Alsace')  | (df["region"]=='Rioja') | (df["region"]=='Willamette Valley') | (df["region"]=='Mendoza') | (df["region"]=='Columbia Valley (WA)') | (df["region"]=='Toscana') | (df["region"]=='Napa Valley')]

In [6]:
df

Unnamed: 0,country,description,region,points,price
2,US,"Tart and snappy, the flavors of lime flesh and...",Willamette Valley,87,14.0
4,US,"Much like the regular bottling from 2012, this...",Willamette Valley,87,65.0
7,France,This dry and restrained wine offers spice in p...,Alsace,87,24.0
9,France,This has great depth of flavor with its fresh ...,Alsace,87,27.0
10,US,"Soft, supple plum envelopes an oaky structure ...",Napa Valley,87,19.0
...,...,...,...,...,...
149617,US,"Bacon, lychee and bubble gum aromas are unexpe...",Finger Lakes,84,13.0
149626,US,"Made in a superripe style, this Chard is jammy...",Napa Valley,84,16.0
149627,France,Opens with spice aromas that seem a bit unusua...,Alsace,84,21.0
149630,US,Tastes kind of soft and thick in jammy cherry ...,Napa Valley,84,27.0


In [7]:
df['region'] = df['region'].copy()
df['reg_num'] = pd.factorize(df['region'])[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['region'] = df['region'].copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['reg_num'] = pd.factorize(df['region'])[0]


In [8]:
df_regions = df
df_regions[['reg_num', 'region']].value_counts()

reg_num  region              
2        Napa Valley             8285
4        Columbia Valley (WA)    7094
3        Mendoza                 4307
0        Willamette Valley       3372
1        Alsace                  2712
7        Rioja                   2578
6        Finger Lakes            2350
5        Toscana                 1990
dtype: int64

### Removing Outliers

In [9]:
Q1 = np.percentile(df['price'], 10,
                   interpolation = 'midpoint')
 
Q3 = np.percentile(df['price'], 90,
                   interpolation = 'midpoint')
IQR = Q3 - Q1
df = df[df.price < (Q3+1.5*IQR)+1]

Q1 = np.percentile(df['points'], 5, interpolation = 'midpoint')#
 
Q3 = np.percentile(df['points'], 95, interpolation = 'midpoint')
IQR = Q3 - Q1
df = df[df.points < (Q3+1.5*IQR)+1]

### Text Preprocessing

In [10]:
remove_non_alphabets = lambda x: re.sub(r'[^a-zA-Z]',' ',x)

tokenize = lambda x: word_tokenize(x)

ps = PorterStemmer()

stem = lambda w: [ ps.stem(x) for x in w ]

lemmatizer = WordNetLemmatizer()
leammtizer = lambda x: [ lemmatizer.lemmatize(word) for word in x ]

remove_stop = lambda x: [w for w in x if w not in stop]

print('Processing : [=', end='')
df['description'] = df['description'].apply(remove_non_alphabets)
print('=', end='')
df['description'] = df['description'].apply(tokenize)
print('=', end='')
stop = text.ENGLISH_STOP_WORDS
df['description'] = df['description'].apply(remove_stop)
print('=', end='')
df['description'] = df['description'].apply(stem)
print('=', end='')
df['description'] = df['description'].apply(leammtizer)
print('=', end='')
df['description'] = df['description'].apply(lambda x: ' '.join(x))
print(']')



### Defining Functions for Modeling

In [11]:
def get_sample(df,x):
    df = df.sample(frac=x, replace=False, random_state=1)
    df = df.reset_index(drop=True)
    return df

def tfidf_vector(df):
    
    tfidf_vectorizer=TfidfVectorizer(min_df=1, 
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=(1,1),
                                     max_features=1000)
    
    tfidf_features = tfidf_vectorizer.fit_transform(df.description)  
    
    df_tfidf = pd.DataFrame(tfidf_features.toarray(),
                      columns=tfidf_vectorizer.get_feature_names())
    df_tfidf 
    
    return df_tfidf

def jointwo(df,df_tfidf):
    df = df.join(df_tfidf)
    df = df.drop(columns=['country','description'])
    df = df.fillna(0)
    return df

def run_random_clf(df):
    X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['reg_num','points',]), df.reg_num, test_size=0.30, random_state=42)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, y_train)
    return clf, X_test, y_test
    
def run_log(df):
    X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['reg_num']), df.reg_num, test_size=0.30, random_state=42)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    return clf, X_test, y_test
    

def run_svm(df):
    X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['reg_num']), df.reg_num, test_size=0.30, random_state=42)
    clf = svm.SVC()
    clf.fit(X_train, y_train)
    return clf, X_test, y_test

def run_mlp(df):
    X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['reg_num']), df.reg_num, test_size=0.30, random_state=42)
    clf = MLPClassifier(random_state=1, max_iter=300)
    clf.fit(X_train, y_train)
    return clf, X_test, y_test

### Preparing Data for Modeling

In [12]:
df_tfidf = tfidf_vector(df.reset_index())



In [13]:
df_tfidf.rename(columns={'price':'pric'},inplace= True)

In [14]:
df_fin = df.reset_index().join(df_tfidf)

In [15]:
cols_to_norm = ['points','price']
df_fin[cols_to_norm] = df_fin[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [16]:
df_fin.head()

Unnamed: 0,index,country,description,region,points,price,reg_num,absolut,abund,accent,...,yet,yield,you,young,youth,zest,zesti,zin,zinfandel,zippi
0,2,US,tart snappi flavor lime flesh rind domin some ...,Willamette Valley,0.35,0.070922,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,US,much like regular bottl come rough tannic rust...,Willamette Valley,0.35,0.432624,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,France,thi dri restrain wine offer spice profus balan...,Alsace,0.35,0.141844,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9,France,thi great depth flavor fresh appl pear fruit t...,Alsace,0.35,0.163121,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10,US,soft suppl plum envelop oaki structur cabernet...,Napa Valley,0.35,0.106383,2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
df_ml = df_fin.iloc[:,4:]

In [18]:
df_ml

Unnamed: 0,points,price,reg_num,absolut,abund,accent,access,accompani,acid,ad,...,yet,yield,you,young,youth,zest,zesti,zin,zinfandel,zippi
0,0.35,0.070922,0,0.0,0.0,0.0,0.000000,0.0,0.135196,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.35,0.432624,0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.35,0.141844,1,0.0,0.0,0.0,0.000000,0.0,0.211171,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.35,0.163121,1,0.0,0.0,0.0,0.000000,0.0,0.191478,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.35,0.106383,2,0.0,0.0,0.0,0.296965,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32176,0.20,0.063830,6,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32177,0.20,0.085106,2,0.0,0.0,0.0,0.000000,0.0,0.120200,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32178,0.20,0.120567,1,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32179,0.20,0.163121,2,0.0,0.0,0.0,0.000000,0.0,0.148817,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Machine Learning

### Multi-Layer Perceptron

In [24]:
clf, X_test, y_test = run_mlp(df_ml)
y_pred = clf.predict(X_test)
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.78      0.81      0.80      1017
           1       0.92      0.92      0.92       833
           2       0.92      0.92      0.92      2386
           3       0.85      0.84      0.84      1270
           4       0.89      0.88      0.88      2123
           5       0.89      0.91      0.90       562
           6       0.92      0.91      0.91       704
           7       0.79      0.77      0.78       760

    accuracy                           0.88      9655
   macro avg       0.87      0.87      0.87      9655
weighted avg       0.88      0.88      0.88      9655



In [25]:
print(metrics.confusion_matrix(y_test,y_pred))

[[ 825   17   31   12  122    2    4    4]
 [  14  764   21    1    9    0   21    3]
 [  49   12 2202   19   59   27   12    6]
 [   7    3   30 1063   21   12    3  131]
 [ 145   11   54    8 1871   10   16    8]
 [   3    4   18    8   11  512    1    5]
 [   7   15   21    6   10    3  641    1]
 [   6    4    5  135   11   10    2  587]]


### Random Forest

In [27]:
clf, X_test, y_test = run_random_clf(df_ml)
y_pred = clf.predict(X_test)
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.66      0.75      1017
           1       0.91      0.85      0.88       833
           2       0.85      0.93      0.89      2386
           3       0.76      0.90      0.82      1270
           4       0.81      0.90      0.85      2123
           5       0.97      0.80      0.88       562
           6       0.92      0.80      0.86       704
           7       0.94      0.62      0.75       760

    accuracy                           0.84      9655
   macro avg       0.88      0.81      0.83      9655
weighted avg       0.85      0.84      0.84      9655



In [38]:
df_regions[['reg_num', 'region']].value_counts().sort_index()

reg_num  region              
0        Willamette Valley       3372
1        Alsace                  2712
2        Napa Valley             8285
3        Mendoza                 4307
4        Columbia Valley (WA)    7094
5        Toscana                 1990
6        Finger Lakes            2350
7        Rioja                   2578
dtype: int64

#### Feature Importance RF

In [33]:
feature_imp = pd.Series(clf.feature_importances_,index=X_test.columns).sort_values(ascending=False)
pd.DataFrame(feature_imp)

Unnamed: 0,0
price,0.036401
aroma,0.030378
palat,0.021807
fruit,0.018710
flavor,0.016169
...,...
ager,0.000054
dramat,0.000053
could,0.000048
should,0.000047


### Logistic Regression

In [23]:
clf, X_test, y_test = run_log(df_ml)
y_pred = clf.predict(X_test)
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.77      0.75      0.76      1017
           1       0.93      0.91      0.92       833
           2       0.91      0.93      0.92      2386
           3       0.79      0.87      0.83      1270
           4       0.85      0.87      0.86      2123
           5       0.96      0.86      0.91       562
           6       0.92      0.90      0.91       704
           7       0.80      0.66      0.72       760

    accuracy                           0.86      9655
   macro avg       0.86      0.84      0.85      9655
weighted avg       0.86      0.86      0.86      9655



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
