# Preprocessing - Wine.com

### Summary
The code contained within this file aims to reconcile and process data scraped from Wine.com.

This implementation observes the following considerations:
1. Given that scrapes can and will fail, this script assumes that all files present in the raw folder are to be processed and combined into a master record. Redundant records and processing are prevented through the use of a site map that tracks successfully parsed and written pages.
2. Ideally, this data would be stored in a database with a minimum of two tables: product info and critical reviews. However, as this analysis is predominantly interested in review data, a SQL-style left join has been deemed adequate despite its effect on the overall file size. This is a case of not letting perfect get in the way of good enough, i.e. having an "agile" mindset.
3. The preprocess_text.py file contains the `preprocess_text` class, to be used in a pipeline when processing novel data.

In [5]:
#imports
import numpy as np
import pandas as pd
import os
import re
import string
import time
from sklearn.base import TransformerMixin
from nltk.corpus import stopwords

KeyboardInterrupt: 

In [None]:
import nltk
#only the stopwords corpus is used by this notebook; naming it explicitly avoids the
#interactive downloader that a bare nltk.download() opens
nltk.download('stopwords')

In [2]:
#locate the raw and processed data folders, which live under the parent of the working directory
current_directory = os.getcwd()
parent_directory = os.path.split(current_directory)[0]
raw_folder = f'{parent_directory}/data/wine-com/raw/'
processed_folder = f'{parent_directory}/data/wine-com/processed/'

### Load Data

In [5]:
#instantiate data structures to hold the initial raw data and the critical reviews
data = dict()
review_data = dict()
#column layouts are declared up front so that a file yielding no valid rows or no reviews
#still produces dataframes containing the 'product_url' key required by the merge below
product_columns = ['product_url',
                   'product_name',
                   'product_variety',
                   'product_origin',
                   'product_family',
                   'user_avg_rating',
                   'user_rating_count',
                   'winemaker_description']
review_columns = ['product_url', 'reviewer_name', 'reviewer_rating', 'reviewer_text']
#list the processed folder once instead of once per raw file
processed_files = set(os.listdir(processed_folder))
#iterate through wine.com raw data folder
for filename in os.listdir(raw_folder):
    #determine if file has already been processed - if true skip it
    if filename in processed_files:
        continue
    print(f'{filename} loading')
    data[filename] = dict()
    review_data[filename] = dict()
    #open file and read its lines into the raw dictionary
    #newline='\r\n' makes '\r\n' the only record separator, so a bare '\n' inside a field does not split a record
    with open(raw_folder + filename, newline='\r\n') as file:
        #discard the header row; the default prevents StopIteration on an empty file
        next(file, None)
        data[filename]['lines'] = file.readlines()
    #create lists to hold parsed records
    data[filename]['data'] = []
    review_data[filename]['data'] = []
    #iterate through records to parse raw data
    for line in data[filename]['lines']:
        values = line.strip().split('|')
        #while redundancy is built into the scraper to ensure that the correct format is achieved,
        #this prevents rogue html from causing a bad data load
        if len(values) != 9:
            continue
        #the first eight fields describe the product; the ninth ('critical_reviews') is parsed below
        data[filename]['data'].append(dict(zip(product_columns, values[:8])))

        #parse critical reviews - stored as ';' separated 'name,rating,text' entries
        for review in values[8].split(';'):
            #maxsplit=2 keeps commas inside the review text from breaking the record apart,
            #which previously caused such reviews to be silently dropped
            parts = review.split(',', 2)
            if len(parts) != 3:
                #empty or malformed entry (e.g. a product without critical reviews)
                continue
            reviewer_name, reviewer_rating, reviewer_text = parts
            review_data[filename]['data'].append({
                'product_url': values[0],
                'reviewer_name': reviewer_name,
                'reviewer_rating': reviewer_rating,
                'reviewer_text': reviewer_text
            })

    # Convert the lists of dictionaries to Pandas DataFrames
    data[filename]['unmerged_df'] = pd.DataFrame(data[filename]['data'], columns=product_columns)
    review_data[filename]['df'] = pd.DataFrame(review_data[filename]['data'], columns=review_columns)

    #perform sql-style left join of review data onto main df
    data[filename]['merged_df'] = pd.merge(data[filename]['unmerged_df'],
                                           review_data[filename]['df'],
                                           on='product_url',
                                           how='left')

1677386048.5362737.txt loading
1677417882.7459548.txt loading


### Data Preprocessing

In [19]:
# a text preprocessor class is created to manage preprocessing of text fields
# inheriting from TransformerMixin of Sklearn, this should allow for tying into large sklearn pipeline
class preprocess_text(TransformerMixin):
    """Normalise free-text columns: lowercase, strip punctuation, drop English stopwords.

    transform accepts a pandas Series or DataFrame and returns a new object of the
    same kind; any other input type is returned unchanged.
    """

    def __init__(self):
        # a set gives constant-time membership tests while filtering tokens
        self.stop_words = set(stopwords.words('english'))

    def _clean_series(self, series):
        """Return a cleaned copy of a single text Series."""
        # standardize type and case
        # NOTE: astype(str) stringifies missing values, so NaN becomes the token 'nan'
        series = series.astype(str).str.lower()
        # replace every run of non-word characters (punctuation, whitespace) with one space
        series = series.str.replace(r'\W+', ' ', regex=True)
        # tokenize text into individual words
        tokens = series.str.split()
        # remove stopwords and join the remaining words back into a single string
        return tokens.apply(
            lambda words: ' '.join(word for word in words if word not in self.stop_words)
        )

    def transform(self, X):
        """Clean a Series, or every column of a DataFrame, without modifying the input."""
        if isinstance(X, pd.Series):
            return self._clean_series(X)
        if isinstance(X, pd.DataFrame):
            # work on a copy so the caller's dataframe is left untouched
            X = X.copy()
            for col in X.columns:
                X[col] = self._clean_series(X[col])
            return X
        # unsupported input types pass through unchanged
        return X

    def fit(self, X, y=None):
        """No fitting is required; present for sklearn pipeline compatibility."""
        return self

### Clean & Write Data

In [20]:
processor = preprocess_text()

#all files loaded in this run are appended to a single, timestamp-named output file
output_path = processed_folder + f'{time.time()}.txt'

#pandas 1.5 renamed to_csv's 'line_terminator' keyword to 'lineterminator' and pandas 2.0
#removed the old spelling, so pick the keyword supported by the installed version
pandas_version = tuple(int(part) for part in pd.__version__.split('.')[:2])
terminator_kwarg = 'lineterminator' if pandas_version >= (1, 5) else 'line_terminator'

for filename in data:
    merged_df = data[filename]['merged_df']
    #clean the two free-text columns
    merged_df['winemaker_description'] = processor.transform(merged_df['winemaker_description'])
    merged_df['reviewer_text'] = processor.transform(merged_df['reviewer_text'])
    #append to the output file, writing the header only when the file does not exist yet
    merged_df.to_csv(output_path,
                     mode='a',
                     header=not os.path.exists(output_path),
                     sep='|',
                     index=False,
                     **{terminator_kwarg: '\r\n'})