In [1]:
#Imports
import numpy as np
import pandas as pd
import string

In [2]:
# Read in wine data pickle
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source -- presumably an earlier step of this
# project wrote it; confirm.
df = pd.read_pickle('./wine_data.pkl')

In [3]:
df.head()

Unnamed: 0,country,description,province,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Sicily & Sardinia,White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Douro,Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",Oregon,Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Michigan,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Oregon,Pinot Noir,Sweet Cheeks


In [4]:
# Discard the 'country' and 'winery' columns
df = df.drop(['country', 'winery'], axis='columns')

#### Create Target Column

In [5]:
# Build the text label "<variety> from <province>" for every row
# (the integer encoding of this label happens in a later cell)
df['target'] = df['variety'] + ' from ' + df['province']

In [6]:
df.head()

Unnamed: 0,description,province,variety,target
0,"Aromas include tropical fruit, broom, brimston...",Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia
1,"This is ripe and fruity, a wine that is smooth...",Douro,Portuguese Red,Portuguese Red from Douro
2,"Tart and snappy, the flavors of lime flesh and...",Oregon,Pinot Gris,Pinot Gris from Oregon
3,"Pineapple rind, lemon pith and orange blossom ...",Michigan,Riesling,Riesling from Michigan
4,"Much like the regular bottling from 2012, this...",Oregon,Pinot Noir,Pinot Noir from Oregon


In [7]:
# Number of distinct Variety + Province targets, before the rarely
# described ones are filtered out in the following cells
df['target'].nunique()

4119

In [8]:
# Per-target non-null counts of every column, most-described targets first
targets_by_count = (
    df.groupby('target')
      .count()
      .sort_values(by='description', ascending=False)
)

In [9]:
# Targets to keep: those with at least 50 descriptions
# (targets with few descriptions may mess up future modelling)
over_50_descs = targets_by_count[targets_by_count['description'].ge(50)]
over_50_descs.shape

(314, 3)

In [10]:
# Turn the 'target' index back into a column and discard the count columns,
# leaving only what the inner join with df needs
over_50_descs = (
    over_50_descs
    .reset_index()
    .drop(columns=['description', 'province', 'variety'])
)


In [11]:
# Keep only the rows of df whose target is listed in over_50_descs
df = pd.merge(df, over_50_descs, how='inner', on='target')

In [12]:
df.head()

Unnamed: 0,description,province,variety,target
0,"Aromas include tropical fruit, broom, brimston...",Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia
1,Delicate aromas recall white flower and citrus...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia
2,Pretty aromas of yellow flower and stone fruit...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia
3,"Part of the extended Calanìca series, this Gri...",Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia
4,"This offers heady aromas of honeysuckle, white...",Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia


In [13]:
df.shape

(109127, 4)

In [14]:
# Use LabelEncoder to create coded target column

from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the target labels, then store each row's integer code
label_target = LabelEncoder().fit(df['target'])
df['target_code'] = label_target.transform(df['target'])

In [15]:
df.head()

Unnamed: 0,description,province,variety,target,target_code
0,"Aromas include tropical fruit, broom, brimston...",Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
1,Delicate aromas recall white flower and citrus...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
2,Pretty aromas of yellow flower and stone fruit...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
3,"Part of the extended Calanìca series, this Gri...",Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
4,"This offers heady aromas of honeysuckle, white...",Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305


## Clean Wine Description Text

In [16]:
# Define function to remove punctuation & numbers
def cleanText(text):
    """Normalize one wine description into lowercase, space-separated words.

    Steps:
      1. Lowercase the text.
      2. Delete apostrophes so contractions become single words
         ("isn't" -> "isnt"). Both the ASCII apostrophe and the typographic
         single quotes (U+2018, U+2019) are handled.
      3. Replace em dashes, en dashes and every character of
         ``string.punctuation`` with a space.
      4. Delete ASCII digits.
      5. Drop words of two characters or fewer.

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    str
        The cleaned description; an empty string if no word survives.
    """
    # Characters that separate words: ASCII punctuation plus em/en dashes
    table = {ord(ch): ' ' for ch in string.punctuation + '\u2014\u2013'}

    # Characters that vanish entirely. This update runs second on purpose so
    # the ASCII apostrophe (also part of string.punctuation) is deleted rather
    # than replaced by a space, keeping contractions in one piece.
    table.update({ord(ch): None for ch in "'\u2018\u2019" + string.digits})

    text = text.lower().translate(table)

    # Keep only words longer than two characters
    return ' '.join(word for word in text.split() if len(word) > 2)

In [17]:
# Run cleanText over every description
df['description'] = df['description'].map(cleanText)

In [18]:
df.head()

Unnamed: 0,description,province,variety,target,target_code
0,aromas include tropical fruit broom brimstone ...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
1,delicate aromas recall white flower and citrus...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
2,pretty aromas yellow flower and stone fruit le...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
3,part the extended calanìca series this grillo ...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
4,this offers heady aromas honeysuckle white sto...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305


## Tokenize & Lemmatize Wine Description Text

In [19]:
from nltk.stem.wordnet import WordNetLemmatizer

def tokenizeLemmatizeText(text):
    """Split text on whitespace and lemmatize each token.

    Every token is lemmatized as a verb (pos="v") and the result is then
    lemmatized as a noun (pos="n").

    Parameters
    ----------
    text : str
        Text to tokenize; tokens are whatever ``str.split()`` yields.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    # One lemmatizer serves the whole description; creating a new instance
    # for every word was loop-invariant work.
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(lemmatizer.lemmatize(word, pos="v"), pos="n")
        for word in text.split()
    ]

In [20]:
# Replace every description with its list of lemmatized tokens
df['description'] = df['description'].map(tokenizeLemmatizeText)

In [21]:
df.head()

Unnamed: 0,description,province,variety,target,target_code
0,"[aroma, include, tropical, fruit, broom, brims...",Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
1,"[delicate, aroma, recall, white, flower, and, ...",Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
2,"[pretty, aroma, yellow, flower, and, stone, fr...",Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
3,"[part, the, extend, calanìca, series, this, gr...",Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
4,"[this, offer, heady, aroma, honeysuckle, white...",Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305


In [22]:
from nltk.corpus import stopwords 

def removeStopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.
    stop_words : collection of str, optional
        Words to remove. When omitted, NLTK's English stop-word list is used.

    Returns
    -------
    list of str
        The tokens not found in ``stop_words``, in their original order.
    """
    if stop_words is None:
        # Load the NLTK list once and remember it on the function object, so
        # applying this function row by row does not rebuild the set each time.
        stop_words = getattr(removeStopwords, '_english_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            removeStopwords._english_stop_words = stop_words
    return [word for word in text if word not in stop_words]

In [23]:
# Filter the stop words out of every token list
df['description'] = df['description'].map(removeStopwords)

In [24]:
# Turn each token list back into one space-separated string
df['description'] = df['description'].apply(' '.join)

In [25]:
#df.to_pickle('./tokenized_wine_data_w_target.pkl')

In [26]:
df.head(10)

Unnamed: 0,description,province,variety,target,target_code
0,aroma include tropical fruit broom brimstone d...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
1,delicate aroma recall white flower citrus pala...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
2,pretty aroma yellow flower stone fruit lead no...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
3,part extend calanìca series grillo viognier bl...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
4,offer heady aroma honeysuckle white stone frui...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
5,edèlmio sophisticate toasty blend carricante c...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
6,blend catarratto chardonnay show tiny touch ox...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
7,blend incrocio manzoni vermentino open bold fl...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
8,one simple pleasure southern italy corvo stead...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305
9,make nasco vermentino offer delicate aroma mar...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,305


#### Pickle cleaned dataframe

In [27]:
# Write the cleaned frame, including the target columns, to disk
df.to_pickle(path='./clean_wine_data_w_target.pkl')