In [3]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import nltk

In [4]:
# Load the wine data CSV into a pandas DataFrame.
df = pd.read_csv('../output_data/final_wine_data_172k_test.csv')

In [5]:
# Preview the first five rows of the DataFrame.
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,price,points,variety,winery
0,1,Portugal,"This is ripe and fruity, a wine that is smooth...",15.0,87,Portuguese Red,Quinta dos Avidagos
1,2,US,"Tart and snappy, the flavors of lime flesh and...",14.0,87,Pinot Gris,Rainstorm
2,3,US,"Pineapple rind, lemon pith and orange blossom ...",13.0,87,Riesling,St. Julian
3,4,US,"Much like the regular bottling from 2012, this...",65.0,87,Pinot Noir,Sweet Cheeks
4,5,Spain,Blackberry and raspberry aromas show a typical...,15.0,87,Tempranillo-Merlot,Tandem


In [6]:
# Show the characters that the string module treats as punctuation.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# Remove punctuation

In [7]:
# Translation table that deletes every character in string.punctuation.
# Built once here so it is not rebuilt for every row.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punct(text):
    """Return ``text`` with every character in ``string.punctuation`` removed.

    Punctuation is deleted, not replaced by a space, so "black-cherry"
    becomes "blackcherry" and "here's" becomes "heres".

    Parameters
    ----------
    text : str
        The raw string to clean.

    Returns
    -------
    str
        ``text`` without ASCII punctuation; every other character is kept.
    """
    # str.translate removes the characters in a single pass instead of
    # testing each character against the punctuation string in Python.
    return text.translate(_PUNCT_TABLE)

# Strip punctuation from every description and keep the result in a new column.
df['descrip_clean'] = df['description'].apply(remove_punct)

df.head()

Unnamed: 0.1,Unnamed: 0,country,description,price,points,variety,winery,descrip_clean
0,1,Portugal,"This is ripe and fruity, a wine that is smooth...",15.0,87,Portuguese Red,Quinta dos Avidagos,This is ripe and fruity a wine that is smooth ...
1,2,US,"Tart and snappy, the flavors of lime flesh and...",14.0,87,Pinot Gris,Rainstorm,Tart and snappy the flavors of lime flesh and ...
2,3,US,"Pineapple rind, lemon pith and orange blossom ...",13.0,87,Riesling,St. Julian,Pineapple rind lemon pith and orange blossom s...
3,4,US,"Much like the regular bottling from 2012, this...",65.0,87,Pinot Noir,Sweet Cheeks,Much like the regular bottling from 2012 this ...
4,5,Spain,Blackberry and raspberry aromas show a typical...,15.0,87,Tempranillo-Merlot,Tandem,Blackberry and raspberry aromas show a typical...


# Tokenization

In [8]:
import re

# Function to tokenize words
def tokenize(text):
    """Split ``text`` into a list of word tokens.

    A token is a maximal run of regex word characters (letters, digits and
    underscore). Any other character acts as a separator and is discarded.

    Parameters
    ----------
    text : str
        The string to split.

    Returns
    -------
    list of str
        The tokens in order of appearance. Empty input, or input made up
        only of separators, gives an empty list rather than [''].
    """
    # r'\w+' matches one or more word characters. Collecting the matches
    # (instead of splitting on \W+) never yields empty-string tokens when the
    # text is empty or starts/ends with a separator. The raw string avoids the
    # invalid-escape warning that a plain '\W+' literal triggers.
    return re.findall(r'\w+', text)

# Lower-case each cleaned description before splitting it, so that "Tart" and
# "tart" end up as the same token (string comparison is case-sensitive).
df['descript_tokenized'] = df['descrip_clean'].map(
    lambda description: tokenize(description.lower())
)

df.head()

Unnamed: 0.1,Unnamed: 0,country,description,price,points,variety,winery,descrip_clean,descript_tokenized
0,1,Portugal,"This is ripe and fruity, a wine that is smooth...",15.0,87,Portuguese Red,Quinta dos Avidagos,This is ripe and fruity a wine that is smooth ...,"[this, is, ripe, and, fruity, a, wine, that, i..."
1,2,US,"Tart and snappy, the flavors of lime flesh and...",14.0,87,Pinot Gris,Rainstorm,Tart and snappy the flavors of lime flesh and ...,"[tart, and, snappy, the, flavors, of, lime, fl..."
2,3,US,"Pineapple rind, lemon pith and orange blossom ...",13.0,87,Riesling,St. Julian,Pineapple rind lemon pith and orange blossom s...,"[pineapple, rind, lemon, pith, and, orange, bl..."
3,4,US,"Much like the regular bottling from 2012, this...",65.0,87,Pinot Noir,Sweet Cheeks,Much like the regular bottling from 2012 this ...,"[much, like, the, regular, bottling, from, 201..."
4,5,Spain,Blackberry and raspberry aromas show a typical...,15.0,87,Tempranillo-Merlot,Tandem,Blackberry and raspberry aromas show a typical...,"[blackberry, and, raspberry, aromas, show, a, ..."


In [9]:
# Inspect the tokenized column: each row holds the list of tokens for one description.
print(df["descript_tokenized"])

0         [this, is, ripe, and, fruity, a, wine, that, i...
1         [tart, and, snappy, the, flavors, of, lime, fl...
2         [pineapple, rind, lemon, pith, and, orange, bl...
3         [much, like, the, regular, bottling, from, 201...
4         [blackberry, and, raspberry, aromas, show, a, ...
5         [heres, a, bright, informal, red, that, opens,...
6         [this, dry, and, restrained, wine, offers, spi...
7         [savory, dried, thyme, notes, accent, sunnier,...
8         [this, has, great, depth, of, flavor, with, it...
9         [soft, supple, plum, envelopes, an, oaky, stru...
10        [this, is, a, dry, wine, very, spicy, with, a,...
11        [slightly, reduced, this, wine, offers, a, cha...
12        [building, on, 150, years, and, six, generatio...
13        [zesty, orange, peels, and, apple, notes, abou...
14        [baked, plum, molasses, balsamic, vinegar, and...
15        [raw, blackcherry, aromas, are, direct, and, s...
16        [desiccated, blackberry, leath

# Remove stopwords

In [11]:
# Fetch the NLTK stopword corpus before the word list below is read.
nltk.download('stopwords')

# English stopword list; remove_stopwords filters tokens against it.
stopword = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/erictonian/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
# Function to remove stopwords
def remove_stopwords(tokenized_list, stopwords=None):
    """Return the tokens of ``tokenized_list`` that are not stopwords.

    Parameters
    ----------
    tokenized_list : iterable of str
        Tokens for one document.
    stopwords : iterable of str, optional
        Words to drop. Defaults to the notebook-level ``stopword`` list.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.

    NOTE(review): tokens reaching this function have already had punctuation
    stripped by remove_punct, so a contraction such as "heres" or "dont"
    cannot match a stopword entry that still contains an apostrophe.
    Confirm whether that is intended.
    """
    if stopwords is None:
        stopwords = stopword
    # Membership tests against a list scan the whole list for every token;
    # a set makes each lookup constant-time. Sets are used as given.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = set(stopwords)
    return [word for word in tokenized_list if word not in stopwords]

# Drop stopwords from every token list and keep the result in a new column.
df['discript_nostop'] = df['descript_tokenized'].apply(remove_stopwords)

df.head()

Unnamed: 0.1,Unnamed: 0,country,description,price,points,variety,winery,descrip_clean,descript_tokenized,discript_nostop
0,1,Portugal,"This is ripe and fruity, a wine that is smooth...",15.0,87,Portuguese Red,Quinta dos Avidagos,This is ripe and fruity a wine that is smooth ...,"[this, is, ripe, and, fruity, a, wine, that, i...","[ripe, fruity, wine, smooth, still, structured..."
1,2,US,"Tart and snappy, the flavors of lime flesh and...",14.0,87,Pinot Gris,Rainstorm,Tart and snappy the flavors of lime flesh and ...,"[tart, and, snappy, the, flavors, of, lime, fl...","[tart, snappy, flavors, lime, flesh, rind, dom..."
2,3,US,"Pineapple rind, lemon pith and orange blossom ...",13.0,87,Riesling,St. Julian,Pineapple rind lemon pith and orange blossom s...,"[pineapple, rind, lemon, pith, and, orange, bl...","[pineapple, rind, lemon, pith, orange, blossom..."
3,4,US,"Much like the regular bottling from 2012, this...",65.0,87,Pinot Noir,Sweet Cheeks,Much like the regular bottling from 2012 this ...,"[much, like, the, regular, bottling, from, 201...","[much, like, regular, bottling, 2012, comes, a..."
4,5,Spain,Blackberry and raspberry aromas show a typical...,15.0,87,Tempranillo-Merlot,Tandem,Blackberry and raspberry aromas show a typical...,"[blackberry, and, raspberry, aromas, show, a, ...","[blackberry, raspberry, aromas, show, typical,..."


# Preprocessing Data: Using a Lemmatizer

In [15]:
# Fetch the WordNet corpus, then create the lemmatizer.
nltk.download('wordnet')
# NOTE: `wn` is a WordNetLemmatizer instance; lemmatizing() calls wn.lemmatize on each token.
wn = nltk.WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/erictonian/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [16]:
def lemmatizing(tokenized_text):
    """Lemmatize every token with the notebook-level ``wn`` lemmatizer.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens for one document.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    lemmas = []
    for token in tokenized_text:
        lemmas.append(wn.lemmatize(token))
    return lemmas

# Lemmatize the stopword-free tokens and keep the result in a new column.
df['descript_lemmatized'] = df['discript_nostop'].apply(lemmatizing)

df.head()

Unnamed: 0.1,Unnamed: 0,country,description,price,points,variety,winery,descrip_clean,descript_tokenized,discript_nostop,descript_lemmatized
0,1,Portugal,"This is ripe and fruity, a wine that is smooth...",15.0,87,Portuguese Red,Quinta dos Avidagos,This is ripe and fruity a wine that is smooth ...,"[this, is, ripe, and, fruity, a, wine, that, i...","[ripe, fruity, wine, smooth, still, structured...","[ripe, fruity, wine, smooth, still, structured..."
1,2,US,"Tart and snappy, the flavors of lime flesh and...",14.0,87,Pinot Gris,Rainstorm,Tart and snappy the flavors of lime flesh and ...,"[tart, and, snappy, the, flavors, of, lime, fl...","[tart, snappy, flavors, lime, flesh, rind, dom...","[tart, snappy, flavor, lime, flesh, rind, domi..."
2,3,US,"Pineapple rind, lemon pith and orange blossom ...",13.0,87,Riesling,St. Julian,Pineapple rind lemon pith and orange blossom s...,"[pineapple, rind, lemon, pith, and, orange, bl...","[pineapple, rind, lemon, pith, orange, blossom...","[pineapple, rind, lemon, pith, orange, blossom..."
3,4,US,"Much like the regular bottling from 2012, this...",65.0,87,Pinot Noir,Sweet Cheeks,Much like the regular bottling from 2012 this ...,"[much, like, the, regular, bottling, from, 201...","[much, like, regular, bottling, 2012, comes, a...","[much, like, regular, bottling, 2012, come, ac..."
4,5,Spain,Blackberry and raspberry aromas show a typical...,15.0,87,Tempranillo-Merlot,Tandem,Blackberry and raspberry aromas show a typical...,"[blackberry, and, raspberry, aromas, show, a, ...","[blackberry, raspberry, aromas, show, typical,...","[blackberry, raspberry, aroma, show, typical, ..."


In [19]:
# Keep only the lemmatized text and the original metadata columns; the
# intermediate text columns are dropped to reduce the size of the frame.
df = df.loc[:, [
    'country',
    'descript_lemmatized',
    'price',
    'points',
    'variety',
    'winery',
]]
df.head()

Unnamed: 0,country,descript_lemmatized,price,points,variety,winery
0,Portugal,"[ripe, fruity, wine, smooth, still, structured...",15.0,87,Portuguese Red,Quinta dos Avidagos
1,US,"[tart, snappy, flavor, lime, flesh, rind, domi...",14.0,87,Pinot Gris,Rainstorm
2,US,"[pineapple, rind, lemon, pith, orange, blossom...",13.0,87,Riesling,St. Julian
3,US,"[much, like, regular, bottling, 2012, come, ac...",65.0,87,Pinot Noir,Sweet Cheeks
4,Spain,"[blackberry, raspberry, aroma, show, typical, ...",15.0,87,Tempranillo-Merlot,Tandem


# Saving the cleaned data

In [20]:
# Write the cleaned DataFrame to CSV without the index column.
# NOTE(review): descript_lemmatized holds Python lists, which to_csv writes as
# their string form (e.g. "['ripe', 'fruity']"). A later read_csv returns
# strings, not lists, so confirm that downstream code parses them.
df.to_csv("../output_data/final_wine_data_172k_clean_test.csv", index = False)