# Preprocessing - Wine.com

In [1]:
#imports
import numpy as np
import pandas as pd
import os
import re
import string
import time

In [2]:
# Locate the wine-com data folders under the parent of the working directory.
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)

# Both folders keep their trailing '/' because later cells append file names
# to them by plain string concatenation.
wine_com_folder = f'{parent_directory}/data/wine-com/'
raw_folder = f'{wine_com_folder}raw/'
processed_folder = f'{wine_com_folder}processed/'

### Load Data

In [3]:
# Field names, in the order the eight '|'-separated values appear on a line.
column_names = [
    'Product_Name',
    'Product_Varietal',
    'Product_Origin',
    'Product_Price',
    'Product_Attributes',
    'Critical_Reviews',
    'User_Avg_Rating',
    'User_Rating_Count',
]

data = {}

# With newline='\r\n' only a '\r\n' sequence terminates a line when reading.
with open(raw_folder + '1678665697.3855994.txt', newline='\r\n') as file:
    # The first line is consumed and set aside; it is not parsed into a row.
    header = next(file)
    data['lines'] = file.readlines()

data['data'] = []

for line in data['lines']:
    values = line.strip().split('|')
    # Lines that do not split into exactly eight fields are skipped.
    if len(values) != len(column_names):
        continue
    data['data'].append(dict(zip(column_names, values)))

In [4]:
# Build the working DataFrame from the list of parsed row dicts.
df = pd.DataFrame(data['data'])

In [5]:
# Show the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,Product_Name,Product_Varietal,Product_Origin,Product_Price,Product_Attributes,Critical_Reviews,User_Avg_Rating,User_Rating_Count
0,Clos Amador Cava Tendre Rose,Sparkling Rosé,"Cava, Spain",12,"Sparkling & Champagne,",92,4.3,64
1,Herdade de Sao Miguel Reserva 2019,Other Red Blends,"Alentejo, Portugal",22,"Red Wine,",94,4.2,5
2,Proyecto Salvaje del Moncayo Garnacha 2020,Grenache,"Navarra, Spain",12,"Red Wine,",92 91,4.6,21
3,La Lecciaia Orvieto Classico 2021,Other White Blends,"Umbria, Italy",12,"White Wine,",90,5.0,23
4,Laurent-Perrier Grand Siecle No. 25,Non-Vintage Sparkling Wine,"Champagne, France",250,"Sparkling & Champagne,Collectible,",99 97 97 96 95 95,4.7,6


### Data Preprocessing

In [6]:
def parse_vintage(text):
    """Extract the vintage year from a product name.

    The previous implementation removed every non-digit character, which
    glued unrelated digit runs together (e.g. '19 Crimes Red Blend 2020'
    became '192020'). This version looks for standalone four-digit tokens
    starting with 19 or 20 and returns the last one found.

    Parameters
    ----------
    text : str
        Product name, e.g. 'Herdade de Sao Miguel Reserva 2019'.

    Returns
    -------
    str
        The four-digit year as a string, or '' when the name has no such
        token (the same "no vintage" marker the digit-stripping version
        produced for names without digits).
    """
    # \b on both sides keeps the match from being part of a longer number.
    years = re.findall(r'\b(?:19|20)\d{2}\b', text)
    return years[-1] if years else ''

# Derive the vintage column element-wise from the product names.
df['Product_Vintage'] = df['Product_Name'].map(parse_vintage)

In [7]:
def clean_product_name(text):
    """Remove the vintage year from a product name and tidy whitespace.

    The previous implementation deleted every digit run, which damaged
    names that legitimately contain numbers ('19 Crimes', 'No. 25') and
    left a dangling space where the year used to be. This version removes
    only the last standalone four-digit 19xx/20xx token and collapses the
    remaining whitespace.

    Parameters
    ----------
    text : str
        Product name, possibly containing a vintage year.

    Returns
    -------
    str
        The name without its vintage token, with runs of whitespace
        collapsed to single spaces and no leading/trailing whitespace.

    Notes
    -----
    Calling this again on its own output removes a further year-like
    token if the name happens to contain one, so apply it once per name.
    """
    year_matches = list(re.finditer(r'\b(?:19|20)\d{2}\b', text))
    if year_matches:
        # Cut out only the last year token; other numbers stay in the name.
        last = year_matches[-1]
        text = text[:last.start()] + text[last.end():]
    # split()/join collapses the gap left by the removed token.
    return ' '.join(text.split())

# Overwrite the product names with their cleaned form, element by element.
df['Product_Name'] = df['Product_Name'].map(clean_product_name)

In [8]:
def add_price_fractional(text):
    """Append '.99' to a whole-number price string.

    The unconditional append was not idempotent: running the notebook
    cell twice turned '12' into '12.99.99'. A price that already has a
    decimal point is now returned unchanged.

    Parameters
    ----------
    text : str
        Price as text, e.g. '12'.

    Returns
    -------
    str
        The price with a '.99' fractional part, e.g. '12.99'. Strings that
        already contain '.' are returned as-is.
    """
    if '.' in text:
        # Already has a fractional part (e.g. the cell was re-run).
        return text
    return text + '.99'

# Rewrite each price string with its fractional part attached.
df['Product_Price'] = df['Product_Price'].map(add_price_fractional)

In [9]:
def parse_family(text):
    """Return the part of `text` that precedes its first comma.

    When `text` contains no comma the whole string is returned.
    """
    first_attribute, _, _ = text.partition(',')
    return first_attribute

# Keep only the first comma-separated attribute of each product.
df['Product_Attributes'] = df['Product_Attributes'].map(parse_family)

In [10]:
def average_critical_scores(text):
    """Average the whitespace-separated integer scores in `text`.

    The empty case is now handled with an explicit check instead of a
    blanket ``except Exception`` around the division, and the regex trim
    was dropped because ``str.split()`` already ignores leading and
    trailing whitespace.

    Parameters
    ----------
    text : str
        Scores separated by whitespace, e.g. '92 91'.

    Returns
    -------
    float or None
        The arithmetic mean of the scores, or None when `text` contains
        no scores.

    Raises
    ------
    ValueError
        If a token cannot be converted with ``int()``.
    """
    scores = [int(score) for score in text.split()]
    if not scores:
        # No critic scores for this product.
        return None
    return sum(scores) / len(scores)

# Compute the mean critic score for every product's review string.
df['Critical_Avg_Rating'] = df['Critical_Reviews'].map(average_critical_scores)

In [11]:
def count_critical_scores(text):
    """Count the whitespace-separated scores contained in `text`.

    Returns 0 for an empty or whitespace-only string.
    """
    # split() with no argument already ignores surrounding whitespace.
    return len(text.split())

# Record how many critic scores each product has.
df['Critical_Rating_Count'] = df['Critical_Reviews'].map(count_critical_scores)

In [12]:
def product_origin_specificity(text):
    """Return how many comma-separated parts `text` is made of.

    A string without commas (including the empty string) counts as one part.
    """
    # n commas always delimit n + 1 parts.
    return text.count(',') + 1

# Store the number of comma-separated origin parts for every product.
df['Appellation_Level'] = df['Product_Origin'].map(product_origin_specificity)

### Data Trimming

In [13]:
# The raw review string is no longer needed once the average and count
# columns have been derived from it.
df = df.drop('Critical_Reviews', axis='columns')

In [14]:
# Empty vintage strings become 0 so the whole column can be cast to float.
vintage_as_float = df['Product_Vintage'].replace('', 0).astype(float)
df = df.assign(Product_Vintage=vintage_as_float)

# Keep only rows whose vintage is later than 1950; this also removes the
# rows that had no vintage (stored as 0 above).
df = df[df['Product_Vintage'] > 1950]

# The surviving vintages are stored as integers.
df = df.assign(Product_Vintage=df['Product_Vintage'].astype(int))

In [15]:
# Keep rows whose appellation level is greater than 1 and less than 4.
appellation_level = df['Appellation_Level']
df = df[(appellation_level > 1) & (appellation_level < 4)]

In [16]:
# Cast the rating counts to float, then keep only products with at least
# one user rating.
df = df.assign(User_Rating_Count=df['User_Rating_Count'].astype(float))
df = df[df['User_Rating_Count'].gt(0)]

### Data Review

In [17]:
# Show the first rows after preprocessing and trimming.
df.head()

Unnamed: 0,Product_Name,Product_Varietal,Product_Origin,Product_Price,Product_Attributes,User_Avg_Rating,User_Rating_Count,Product_Vintage,Critical_Avg_Rating,Critical_Rating_Count,Appellation_Level
1,Herdade de Sao Miguel Reserva,Other Red Blends,"Alentejo, Portugal",22.99,Red Wine,4.2,5.0,2019,94.0,1,2
2,Proyecto Salvaje del Moncayo Garnacha,Grenache,"Navarra, Spain",12.99,Red Wine,4.6,21.0,2020,91.5,2,2
3,La Lecciaia Orvieto Classico,Other White Blends,"Umbria, Italy",12.99,White Wine,5.0,23.0,2021,90.0,1,2
5,Howard Park Flint Rock Shiraz,Syrah/Shiraz,"Great Southern, Western Australia, Australia",26.99,Red Wine,4.4,5.0,2019,94.0,1,3
6,Scott Harvey Mountain Selection Zinfandel,Zinfandel,"Amador, Sierra Foothills, California",29.99,Red Wine,4.3,43.0,2019,91.5,2,3


In [18]:
# Print the column dtypes and non-null counts of the trimmed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2881 entries, 1 to 14141
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Product_Name           2881 non-null   object 
 1   Product_Varietal       2881 non-null   object 
 2   Product_Origin         2881 non-null   object 
 3   Product_Price          2881 non-null   object 
 4   Product_Attributes     2881 non-null   object 
 5   User_Avg_Rating        2881 non-null   object 
 6   User_Rating_Count      2881 non-null   float64
 7   Product_Vintage        2881 non-null   int32  
 8   Critical_Avg_Rating    2004 non-null   float64
 9   Critical_Rating_Count  2881 non-null   int64  
 10  Appellation_Level      2881 non-null   int64  
dtypes: float64(2), int32(1), int64(2), object(6)
memory usage: 258.8+ KB


### Clean & Write Data

In [19]:
# Create the output folder if it is missing; otherwise the write below fails
# on a checkout where data/wine-com/processed/ has not been created yet.
os.makedirs(processed_folder, exist_ok=True)

# Write the processed frame as a '|'-separated file without the index column,
# under the same file name as the raw input.
df.to_csv(processed_folder + '1678665697.3855994.txt',
          sep='|',
          index=False)