# Get and preprocess test data.

In [34]:
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

Test data from: https://data.world/chrismjones/winereviews

In [16]:
# Load the raw wine reviews, keeping only US wines and the four columns
# this notebook works with.
df = pd.read_csv('test_data.csv')
df = df.loc[df['country'] == 'US', ['points', 'description', 'variety', 'price']]

# Shuffle every row, then keep (at most) the first 3000 as the working subset.
# random_state pins the shuffle so that "Restart Kernel -> Run All" picks the
# same reviews every time; 23 is the seed this notebook already uses for t-SNE.
shuffled_df = df.sample(frac=1, random_state=23)
random_subset = shuffled_df.head(3000)
random_subset

Unnamed: 0,points,description,variety,price
97312,87,"With lilting lemon, grapefruit and pretty pine...",Sauvignon Blanc,16.0
46818,83,Scents of black coffee and ripe black fruit me...,Baco Noir,28.0
60314,87,Shows brisk acidity for this warmish appellati...,Chardonnay,24.0
62748,87,"From organic grapes, this fresh, slightly swee...",Chenin Blanc,16.0
7110,84,"Outspoken, delineated notes of candied lemon a...",Symphony,38.0
...,...,...,...,...
42926,93,Lots to admire in this Cab. It's bone dry and ...,Cabernet Sauvignon,75.0
137643,94,"A terrific Zinfandel, with a polished, pretty ...",Zinfandel,35.0
37805,87,Made half in stainless steel and half in neutr...,Riesling,19.0
135063,89,The vineyard is on the lower slopes of Atlas P...,Cabernet Sauvignon,75.0


In [17]:
# Replace the shuffled source-row labels with a fresh 0..n-1 index.
# drop=True discards the old labels instead of keeping them as a new column.
random_subset = random_subset.reset_index(drop=True)
random_subset

Unnamed: 0,points,description,variety,price
0,87,"With lilting lemon, grapefruit and pretty pine...",Sauvignon Blanc,16.0
1,83,Scents of black coffee and ripe black fruit me...,Baco Noir,28.0
2,87,Shows brisk acidity for this warmish appellati...,Chardonnay,24.0
3,87,"From organic grapes, this fresh, slightly swee...",Chenin Blanc,16.0
4,84,"Outspoken, delineated notes of candied lemon a...",Symphony,38.0
...,...,...,...,...
2995,93,Lots to admire in this Cab. It's bone dry and ...,Cabernet Sauvignon,75.0
2996,94,"A terrific Zinfandel, with a polished, pretty ...",Zinfandel,35.0
2997,87,Made half in stainless steel and half in neutr...,Riesling,19.0
2998,89,The vineyard is on the lower slopes of Atlas P...,Cabernet Sauvignon,75.0


In [20]:
# Binary target column: 1 when the review scored 90 points or more, else 0.
random_subset['superior_rating'] = random_subset['points'].ge(90).astype(int)

In [45]:
# Encode variety names.
# Each known variety gets a two-letter code, stored in a new 'variety_c' column.
names2code = {
    "Champagne Blend": "Cd",
    "Bordeaux-style Red Blend": "BR",
    "Rosé": "Re",
    "Chardonnay": "Cy",
    "Syrah": "Sh",
    "Red Blend": "Rd",
    "Gamay": "Gy",
    "Gewürztraminer": "Gr",
    "Chenin Blanc": "Cc",
    "Alsace white blend": "Ad",
    "Tannat": "Tt",
    "White Blend": "Wd",
    "Rhône-style Red Blend": "RR",
    "Muscat": "Mt",
    "Bordeaux-style White Blend": "BW",
    "Pinot Noir": "Pr",
    "Malbec": "Mc",
    "Merlot": "Mr",
    "Sauvignon Blanc": "Sc",
    "Viognier": "Vr",
    "Pinot Gris": "Ps",
    "Cabernet Franc": "CF",
    "Riesling": "Rg",
    "Tannat-Cabernet": "TC",
    "Marsanne": "Me",
    "Pinot Blanc": "Pc",
    "Cabernet Sauvignon": "Cn",
    "Rhône-style White Blend": "RW",
    "Grenache": "Ge",
    "Sparkling Blend": "Sd",
    "Malbec-Merlot": "MM",
    "Sylvaner": "Sr",
    "Melon": "Mn",
    "Chenin Blanc-Chardonnay": "CB",
    "Petit Manseng": "Pg",
}

# dict.get returns None for any variety that has no entry in the table above.
random_subset['variety_c'] = random_subset['variety'].map(names2code.get)

# dropna() without a subset removes every row that has a missing value in ANY
# column: rows whose variety could not be encoded, and also rows with a gap in
# another column (e.g. a missing price, if the data contains any).
# NOTE(review): confirm that dropping on all columns is intended.
# The surviving rows keep their original index labels, so the index has gaps.
random_subset = random_subset.dropna()
random_subset

Unnamed: 0,points,description,variety,price,superior_rating,tfidf_tsne_1,Rich,sweetness,acidity,tannin,alcohol,body,variety_c
0,87,"With lilting lemon, grapefruit and pretty pine...",Sauvignon Blanc,16.0,0,-42.146626,0,0.571,0.571,0.500,0.500,0.429,Sc
2,87,Shows brisk acidity for this warmish appellati...,Chardonnay,24.0,0,-39.968876,0,0.400,0.600,0.500,0.400,0.400,Cy
3,87,"From organic grapes, this fresh, slightly swee...",Chenin Blanc,16.0,0,-0.592675,0,0.600,0.400,0.500,0.500,0.400,Cc
6,86,Patz & Hall has captured the cool-climate esse...,Chardonnay,39.0,0,18.442337,0,0.375,0.625,0.500,0.312,0.500,Cy
7,89,"A Bordeaux blend, smooth and ultrachocolaty in...",Bordeaux-style Red Blend,40.0,0,-5.307372,0,0.583,0.333,0.417,0.500,0.500,BR
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2994,86,"A little too soft and sweet for balance, with ...",Chardonnay,32.0,0,-45.948177,0,0.500,0.417,0.417,0.500,0.500,Cy
2995,93,Lots to admire in this Cab. It's bone dry and ...,Cabernet Sauvignon,75.0,1,-13.696779,1,0.444,0.389,0.500,0.444,0.556,Cn
2997,87,Made half in stainless steel and half in neutr...,Riesling,19.0,0,-26.913145,0,0.438,0.438,0.500,0.438,0.500,Rg
2998,89,The vineyard is on the lower slopes of Atlas P...,Cabernet Sauvignon,75.0,0,-31.955271,0,0.643,0.286,0.429,0.571,0.500,Cn


In [46]:
from nlp import clean, remove_stopwords, eng_filter, lemmatize, get_character_scores

In [32]:
# Clean text.
# Each description goes through the four helpers imported from the local `nlp`
# module, one full pass over the column per stage, in this order:
# clean -> remove_stopwords -> eng_filter -> lemmatize.
description_processed = (
    random_subset["description"]
    .apply(clean)
    .apply(remove_stopwords)
    .apply(eng_filter)
    .apply(lemmatize)
)

In [35]:
# Get (less accurate) TF-IDF t-SNE components.
# Unigram TF-IDF vectors of the processed reviews are reduced to two t-SNE
# dimensions; only the first coordinate is stored as a feature column.
vectorizer_tfidf = TfidfVectorizer(ngram_range=(1, 1))

# fit_transform learns the vocabulary and vectorises in one call; .toarray()
# turns the sparse result into a dense (n_reviews, vocabulary_size) array.
review_tfidf = vectorizer_tfidf.fit_transform(description_processed).toarray()

# Fixed random_state keeps the embedding repeatable for a given input.
tsne2 = TSNE(n_components=2, random_state=23)
tfidf_reduced2 = tsne2.fit_transform(review_tfidf)  # -> (n_reviews, 2)

# The array is assigned by position, so its rows must still be in the same
# order as random_subset's rows.
random_subset['tfidf_tsne_1'] = tfidf_reduced2[:, 0]

In [37]:
# Get Rich.
# 1 when "rich" is found in the processed review, otherwise 0.
# NOTE(review): if the processed text is a str this is a substring test (it
# would also hit words that merely contain "rich"); if it is a token list it
# is an exact-token test -- confirm what lemmatize returns.
random_subset['Rich'] = description_processed.map(
    lambda text: 1 if "rich" in text else 0
)

In [40]:
# Get characteristics.
# get_character_scores (from the local `nlp` module) is called once per
# processed review; each result becomes one row of the score frame.
#
# The frame reuses description_processed's index so every score row keeps the
# label of the review it was computed from. Built without it, the frame gets a
# fresh 0..n-1 index, and an index-aligned concat with random_subset then pairs
# scores with the wrong reviews as soon as random_subset's index has gaps
# (for example after rows were dropped).
df_char_scores = pd.DataFrame(
    [get_character_scores(text) for text in description_processed],
    index=description_processed.index,
)
df_char_scores

Unnamed: 0,sweetness,acidity,tannin,alcohol,body
0,0.571,0.571,0.500,0.500,0.429
1,0.583,0.583,0.500,0.417,0.333
2,0.400,0.600,0.500,0.400,0.400
3,0.600,0.400,0.500,0.500,0.400
4,0.500,0.500,0.500,0.500,0.417
...,...,...,...,...,...
2995,0.444,0.389,0.500,0.444,0.556
2996,0.625,0.438,0.562,0.562,0.625
2997,0.438,0.438,0.500,0.438,0.500
2998,0.643,0.286,0.429,0.571,0.500


In [41]:
# Append the characteristic-score columns to the working frame.
# Score columns left over from an earlier run of this cell are dropped first,
# so re-running replaces them instead of appending duplicate column names
# (with duplicates, a later selection such as random_subset[['tannin']] would
# return several columns). On a first run nothing is dropped.
# NOTE: concat(axis=1) aligns rows on index labels, not on position, so
# df_char_scores must carry the same index labels as random_subset.
random_subset = pd.concat(
    [
        random_subset.drop(columns=df_char_scores.columns, errors="ignore"),
        df_char_scores,
    ],
    axis=1,
)

In [48]:
# Columns that make up the final test set. The encoded 'variety_c' column and
# the 'sweetness' / 'acidity' scores are not part of this selection.
test_set_columns = [
    'variety', 'price', 'superior_rating', 'tfidf_tsne_1',
    'Rich', 'tannin', 'alcohol', 'body',
]
test_set = random_subset[test_set_columns]
# Export is left disabled; uncomment to write the processed test set to disk:
# test_set.to_csv("test_data_processed.csv", index=False)