In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load the cleaned scrape from its gzip-compressed pickle and preview the
# first three rows.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
cleaned_dataset_path = '../data/cleaned_dataset.gzip'
scraped = pd.read_pickle(cleaned_dataset_path, compression='gzip')
scraped.head(n=3)

Unnamed: 0,name,description,price,category_1,category_2,origin,wine_size_value,wine_size_units,wine_abv
0,1000 Stories Bourbon Barrel Aged Batch Blue Ca...,"This is a very special, limited release of 100...",18.99,Red Wine,Carignan,"from Mendocino, California",750,ML,14.7
0,1000 Stories Bourbon Barrel Aged Chardonnay 2018,The 2018 Chardonnay reflects the classic Calif...,17.99,White Wine,Chardonnay,from California,750,ML,14.0
0,1000 Stories Bourbon Barrel Aged Gold Rush Red...,The California Gold Rush was a period of coura...,17.99,Red Wine,Other Red Blends,from California,750,ML,14.9


In [4]:
# Swap every literal backslash-n sequence for a single space. The raw-string
# regex below matches the two characters "\" + "n", not a newline character.
# NOTE(review): real newline characters, if any are present, are left as-is.
scraped = scraped.replace(to_replace=r'\\n', value=' ', regex=True)

In [5]:
# Constant marker columns: every row receives the same literal string, so the
# markers can be interleaved with the real data columns below.
marker_columns = {
    'start_token': "<|startoftext|>",
    'end_token': "<|endoftext|>",
    'prompt_token': "[prompt]",
    'response_token': "[response]",
    'category_1_token': "[category_1]",
    'category_2_token': "[category_2]",
    'origin_token': "[origin]",
    'description_token': "[description]",
}

# Final left-to-right column layout: each marker sits directly before the
# field it labels. Columns of `scraped` not listed here are dropped.
column_order = [
    'start_token',
    'prompt_token',
    'name',
    'response_token',
    'category_1_token',
    'category_1',
    'category_2_token',
    'category_2',
    'origin_token',
    'origin',
    'description_token',
    'description',
    'end_token',
]

# `assign` returns a new frame, so `scraped` itself is left unmodified.
gpt2_txt = scraped.assign(**marker_columns)[column_order]

In [6]:
# Preview the first three rows to check the marker/field column layout.
gpt2_txt.head(n=3)

Unnamed: 0,start_token,prompt_token,name,response_token,category_1_token,category_1,category_2_token,category_2,origin_token,origin,description_token,description,end_token
0,<|startoftext|>,[prompt],1000 Stories Bourbon Barrel Aged Batch Blue Ca...,[response],[category_1],Red Wine,[category_2],Carignan,[origin],"from Mendocino, California",[description],"This is a very special, limited release of 100...",<|endoftext|>
0,<|startoftext|>,[prompt],1000 Stories Bourbon Barrel Aged Chardonnay 2018,[response],[category_1],White Wine,[category_2],Chardonnay,[origin],from California,[description],The 2018 Chardonnay reflects the classic Calif...,<|endoftext|>
0,<|startoftext|>,[prompt],1000 Stories Bourbon Barrel Aged Gold Rush Red...,[response],[category_1],Red Wine,[category_2],Other Red Blends,[origin],from California,[description],The California Gold Rush was a period of coura...,<|endoftext|>


# Quality cleanup

In [8]:
# Drop every row whose name or description mentions "gift" (case-insensitive),
# printing the frame shape before filtering and after each filter so the
# number of removed rows is visible.
print(gpt2_txt.shape)
gpt2_txt_clean = gpt2_txt
for text_column in ('name', 'description'):
    # regex=False: 'gift' is a plain substring, not a pattern.
    # na=False: a missing value counts as "does not mention gift", so the mask
    # stays strictly boolean and the `~` inversion cannot fail on NaN.
    mentions_gift = (
        gpt2_txt_clean[text_column]
        .str.lower()
        .str.contains('gift', regex=False, na=False)
    )
    gpt2_txt_clean = gpt2_txt_clean[~mentions_gift]
    print(gpt2_txt_clean.shape)


(123516, 13)
(123516, 13)
(123450, 13)


In [9]:
# Preview the first five rows of the filtered frame.
gpt2_txt_clean.head(n=5)

Unnamed: 0,start_token,prompt_token,name,response_token,category_1_token,category_1,category_2_token,category_2,origin_token,origin,description_token,description,end_token
0,<|startoftext|>,[prompt],1000 Stories Bourbon Barrel Aged Batch Blue Ca...,[response],[category_1],Red Wine,[category_2],Carignan,[origin],"from Mendocino, California",[description],"This is a very special, limited release of 100...",<|endoftext|>
0,<|startoftext|>,[prompt],1000 Stories Bourbon Barrel Aged Chardonnay 2018,[response],[category_1],White Wine,[category_2],Chardonnay,[origin],from California,[description],The 2018 Chardonnay reflects the classic Calif...,<|endoftext|>
0,<|startoftext|>,[prompt],1000 Stories Bourbon Barrel Aged Gold Rush Red...,[response],[category_1],Red Wine,[category_2],Other Red Blends,[origin],from California,[description],The California Gold Rush was a period of coura...,<|endoftext|>
0,<|startoftext|>,[prompt],1000 Stories Bourbon Barrel Aged Prospectors' ...,[response],[category_1],Red Wine,[category_2],Cabernet Sauvignon,[origin],from California,[description],The result is a signature 1000 Stories wine: l...,<|endoftext|>
0,<|startoftext|>,[prompt],1000 Stories Bourbon Barrel Aged Zinfandel 2013,[response],[category_1],Red Wine,[category_2],Zinfandel,[origin],"from North Coast, California",[description],"The wine has a deep, rich purple color. An int...",<|endoftext|>


In [28]:
# Show the 60 most frequent `origin` strings (value_counts sorts by
# descending count).
gpt2_txt_clean['origin'].value_counts().head(60)

 from Napa Valley, California                                      10343
 from California                                                    6484
 from Sonoma County, California                                     5282
 from Central Coast, California                                     4134
 from Columbia Valley, Washington                                   3267
 from Willamette Valley, Oregon                                     2771
 from Russian River, Sonoma County, California                      2759
 from Tuscany, Italy                                                2428
 from Burgundy, France                                              2424
 from Australia                                                     2195
 from Chile                                                         2057
 from South Africa                                                  2008
 from Spain                                                         1870
 from Rioja, Spain                                 

# Data split and save

In [9]:
# Hold out 20% of the cleaned rows for testing; the fixed random_state makes
# the split reproducible across runs.
train, test = train_test_split(
    gpt2_txt_clean,
    test_size=0.2,
    random_state=0,
)
# Print both shapes, one per line, as a sanity check on the split sizes.
print(train.shape, test.shape, sep='\n')

(98760, 13)
(24690, 13)


In [10]:
# Write the full cleaned set plus the train and test splits as
# space-separated text, one record per row, with no header and no index.
# NOTE(review): with sep=' ' and pandas' default minimal quoting, any field
# that contains a space is wrapped in double quotes in the output file;
# confirm that this is what the downstream consumer expects.
# NOTE(review): these paths are relative to the working directory, while the
# input was read from '../data/' - confirm the intended output location.
output_frames = {
    "data/scraped/name_desc_nlp_ready.txt": gpt2_txt_clean,
    "data/scraped/name_desc_nlp_ready_train.txt": train,
    "data/scraped/name_desc_nlp_ready_test.txt": test,
}

for output_path, frame in output_frames.items():
    frame.to_csv(output_path, sep=' ', index=False, header=None)