# Preprocessing

### Summary
The code in this notebook and its sister file "preprocessing.py" is intended to perform the bulk of the preprocessing required for NLP modeling tasks. Given that web scraping can result in large dumps of data, this analysis opts to use spaCy to support data cleaning and parsing efforts.

In [8]:
#imports
import numpy as np
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
import os
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin

# Module-level spaCy pipeline loaded with every default component enabled.
# NOTE(review): the cells visible here never reference `nlp` (preprocessText
# loads its own instance) - confirm whether this load is still needed.
nlp = spacy.load('en_core_web_sm')

In [9]:
# Directory locations.
# Data folders are resolved relative to the parent of the current working
# directory. Both folder strings keep their trailing slash because later
# cells build file paths by appending a filename directly.
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
raw_folder = f'{parent_directory}/data/wine-com/raw/'
processed_folder = f'{parent_directory}/data/wine-com/processed/'

### Load Data

In [10]:
# Load every raw scrape file into file_data[filename], which holds:
#   'lines': the raw text lines that follow the header row
#   'data':  a list of row dicts, one per well-formed line
#   'df':    the same rows as a pandas DataFrame
file_data = dict()

# Field order of each pipe-delimited record.
field_names = ['url', 'name', 'variety', 'origin', 'type', 'description', 'reviews']

for filename in os.listdir(raw_folder):
    # os.listdir also returns sub-directories, which cannot be opened as text.
    if not os.path.isfile(raw_folder + filename):
        continue

    print(filename)
    file_data[filename] = dict()
    # Explicit encoding so decoding does not depend on the platform default.
    # NOTE(review): assumes the scraped files are UTF-8 - confirm with the scraper.
    with open(raw_folder + filename, encoding='utf-8') as file:
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        header = next(file, '')
        file_data[filename]['lines'] = file.readlines()

    file_data[filename]['data'] = []

    for line in file_data[filename]['lines']:
        values = line.strip().split('|')
        # Lines that do not split into exactly one value per field are dropped.
        if len(values) == len(field_names):
            file_data[filename]['data'].append(dict(zip(field_names, values)))

    # Convert the list of dictionaries to a Pandas DataFrame. Passing the
    # columns explicitly keeps the schema intact even when no line parsed,
    # so later column lookups do not raise KeyError.
    file_data[filename]['df'] = pd.DataFrame(file_data[filename]['data'],
                                             columns=field_names)

1676752918.122732.txt


## Data Preprocessing

In [19]:
class preprocessText(TransformerMixin):
    """Transformer that strips punctuation tokens from raw text with spaCy.

    The spaCy pipeline is loaded once per instance with the parser and NER
    components disabled.
    """

    def __init__(self):
        self.nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return one cleaned string per text in X.

        Each text is tokenized by the spaCy pipeline, punctuation tokens are
        removed, and the remaining token texts are joined with single spaces.
        """
        return [
            " ".join(tok.text for tok in parsed if not tok.is_punct)
            for parsed in self.nlp.pipe(X, disable=["parser", "ner"])
        ]

In [20]:
# Clean the free-text columns of every loaded file and write the result,
# pipe-delimited and without the index, to the processed folder under the
# same filename.
processor = preprocessText()

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(processed_folder, exist_ok=True)

for filename, entry in file_data.items():
    df = entry['df']
    for column in ('description', 'reviews'):
        # A frame built from zero parsed rows may have no columns at all;
        # skip the column instead of raising KeyError.
        if column in df.columns:
            df[column] = processor.transform(df[column])
    df.to_csv(processed_folder + filename, sep='|', index=False)