# Preprocessing - Wine.com

In [1]:
#imports
import numpy as np
import pandas as pd
import os
import re
import string
from sklearn.base import TransformerMixin
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
#resolve the wine.com data folders relative to the parent of the working directory
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
#raw scraper output is read from here
raw_folder = f'{parent_directory}/data/wine-com/raw/'
#cleaned, pipe-delimited files are written here
processed_folder = f'{parent_directory}/data/wine-com/processed/'

### Load Data

In [3]:
#instantiate data structures to hold the initial raw data and the parsed critical reviews
data = dict()
review_data = dict()
#column layouts are passed to the DataFrame constructors explicitly so that a file yielding no
#product rows (or no reviews) still produces frames that carry the 'product_url' merge key
product_columns = ['product_url', 'product_name', 'product_variety', 'product_origin',
                   'product_family', 'user_avg_rating', 'user_rating_count', 'winemaker_description']
review_columns = ['product_url', 'reviewer_name', 'reviewer_rating', 'reviewer_text']
#list the processed folder once up front instead of once per raw file
already_processed = set(os.listdir(processed_folder))
#iterate through wine.com raw data folder
for filename in os.listdir(raw_folder):
    #skip files that have already been processed
    if filename in already_processed:
        continue
    print(f'{filename} loading')
    data[filename] = dict()
    review_data[filename] = dict()
    #open file and read its lines into the raw dictionary; records are terminated by '\r\n'
    with open(raw_folder + filename, newline='\r\n') as file:
        #discard the header line; the default keeps an empty file from raising StopIteration
        header = next(file, None)
        data[filename]['lines'] = file.readlines()
    #create lists to hold parsed records
    data[filename]['data'] = []
    review_data[filename]['data'] = []
    #iterate through records to parse raw data
    for line in data[filename]['lines']:
        values = line.strip().split('|')
        #while redundancy is built into the scraper to ensure that the correct format is achieved,
        #this prevents rogue html from causing a bad data load
        if len(values) != 9:
            continue
        #the first eight fields describe the product; the ninth ('critical_reviews') is parsed below
        data[filename]['data'].append(dict(zip(product_columns, values)))

        #parse critical reviews - ';' separates reviews, ',' separates name, rating and text
        for review in values[8].split(';'):
            #maxsplit=2 keeps any further commas inside the review text
            fields = review.split(',', 2)
            #a product without critical reviews yields an empty entry; skip anything that lacks
            #all three fields instead of failing on the unpack below
            if len(fields) != 3:
                continue
            reviewer_name, reviewer_rating, reviewer_text = fields
            review_data[filename]['data'].append({
                'product_url': values[0],
                'reviewer_name': reviewer_name,
                'reviewer_rating': reviewer_rating,
                'reviewer_text': reviewer_text
            })

    # Convert the lists of dictionaries to Pandas DataFrames
    data[filename]['unmerged_df'] = pd.DataFrame(data[filename]['data'], columns=product_columns)
    review_data[filename]['df'] = pd.DataFrame(review_data[filename]['data'], columns=review_columns)

    #perform sql-style left join of review data onto main df; a product with no parsed review
    #keeps its row, with missing values in the three reviewer columns
    data[filename]['merged_df'] = pd.merge(data[filename]['unmerged_df'],
                                           review_data[filename]['df'],
                                           on='product_url',
                                           how='left')

1677353356.1461694.txt loading


In [4]:
#preview the first file loaded by this run; the name is looked up from `data` because a hard-coded
#scrape filename raises KeyError whenever that file was not loaded (e.g. it is already processed)
preview_name = next(iter(data), None)
#evaluates to None (no cell output) when nothing was loaded
data[preview_name]['merged_df'].head() if preview_name is not None else None

Unnamed: 0,product_url,product_name,product_variety,product_origin,product_family,user_avg_rating,user_rating_count,winemaker_description,reviewer_name,reviewer_rating,reviewer_text
0,https://www.wine.com/product/proyecto-salvaje-...,Proyecto Salvaje del Moncayo Garnacha 2020,Grenache,"from Navarra, Spain",Red Wine,4.8,19,It is a bright burgundy wine with medium-depth...,Decanter,92,This is part of the Proyecto Garnachas de Espa...
1,https://www.wine.com/product/proyecto-salvaje-...,Proyecto Salvaje del Moncayo Garnacha 2020,Grenache,"from Navarra, Spain",Red Wine,4.8,19,It is a bright burgundy wine with medium-depth...,Wilfred Wong of Wine.com,91,COMMENTARY: The 2020 Proyecto Garnachas Salvaj...
2,https://www.wine.com/product/domaine-du-terme-...,Domaine du Terme Gigondas 2019,Rhone Red Blends,"from Gigondas, Rhone, France",Red Wine,4.0,17,,Wine & Spirits,96,Spectacular Gigondas this wine’s red-cherry f...
3,https://www.wine.com/product/domaine-du-terme-...,Domaine du Terme Gigondas 2019,Rhone Red Blends,"from Gigondas, Rhone, France",Red Wine,4.0,17,,Decanter,94,Straight from the first sniff it's clear this ...
4,https://www.wine.com/product/scott-harvey-moun...,Scott Harvey Mountain Selection Zinfandel 2019,Zinfandel,"from Amador, Sierra Foothills, California",Red Wine,4.3,39,Fruit forward rich full flavors expressing bo...,Wine Enthusiast,93,This fresh-smelling full-bodied and flavor-pa...


In [7]:
#summarise every file loaded by this run; a hard-coded scrape filename raises KeyError whenever
#that file was not loaded (e.g. it is already processed), and nothing is printed if `data` is empty
for loaded_name in data:
    data[loaded_name]['merged_df'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1516 entries, 0 to 1515
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   product_url            1516 non-null   object
 1   product_name           1516 non-null   object
 2   product_variety        1516 non-null   object
 3   product_origin         1516 non-null   object
 4   product_family         1516 non-null   object
 5   user_avg_rating        1516 non-null   object
 6   user_rating_count      1516 non-null   object
 7   winemaker_description  1516 non-null   object
 8   reviewer_name          1516 non-null   object
 9   reviewer_rating        1516 non-null   object
 10  reviewer_text          1516 non-null   object
dtypes: object(11)
memory usage: 142.1+ KB


### Data Preprocessing

In [5]:
# a text preprocessor class is created to manage preprocessing of text fields
# inheriting from TransformerMixin of Sklearn, this should allow for tying into large sklearn pipeline
class preprocess_text(TransformerMixin):
    """Stateless text cleaner: lowercase, strip non-word characters, drop English stopwords.

    `fit` learns nothing, so `transform` can be called directly on any iterable of records.
    NOTE(review): relies on NLTK's English stopword list and tokenizer data being installed.
    """

    # one or more consecutive non-word characters (punctuation, whitespace, symbols);
    # compiled once here instead of once per record
    _non_word = re.compile(r'\W+')

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        # not read by transform(); kept so existing attribute access keeps working
        self.punctuation = set(string.punctuation)

    @staticmethod
    def _as_text(record):
        """Return `record` as a string, mapping missing values (None/NaN/NA) to ''."""
        if isinstance(record, str):
            return record
        # is_scalar guards pd.isna, which returns an array for list-like input
        if pd.api.types.is_scalar(record) and pd.isna(record):
            return ''
        return str(record)

    def transform(self, X):
        """Clean every record of `X` and return the results as a list of strings.

        Parameters
        ----------
        X : iterable
            Text records. Missing values become empty strings; other non-string
            values are converted with `str()` before cleaning.

        Returns
        -------
        list of str
            One cleaned, space-joined string per input record, in input order.
        """
        cleaned_records = []
        for record in X:
            # convert text to lowercase
            text = self._as_text(record).lower()
            # replace each run of non-word characters (incl. punctuation) with a single space
            text = self._non_word.sub(' ', text)
            # tokenize text into individual words
            words = word_tokenize(text)
            # remove stopwords
            kept_words = [word for word in words if word not in self.stop_words]
            # join words
            cleaned_records.append(' '.join(kept_words))
        return cleaned_records

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns self."""
        return self

### Write Cleaned Data

In [6]:
processor = preprocess_text()

#DataFrame.to_csv renamed `line_terminator` to `lineterminator` in pandas 1.5 and removed the old
#spelling in 2.0, so pick the keyword that the installed pandas accepts
pandas_major, pandas_minor = map(int, re.match(r'(\d+)\.(\d+)', pd.__version__).groups())
terminator_kwarg = 'lineterminator' if (pandas_major, pandas_minor) >= (1, 5) else 'line_terminator'

for filename in data:
    merged_df = data[filename]['merged_df']
    #clean the critic review text in place on the merged frame
    merged_df['reviewer_text'] = processor.transform(merged_df['reviewer_text'])
    #write under the raw file's name so that the loader recognises the file as processed
    merged_df.to_csv(processed_folder + filename,
                     sep = '|',
                     index=False,
                     **{terminator_kwarg: '\r\n'})