In this notebook the data will be pre-processed before fitting the model. 

This includes:
* Fixing the Clothing ID that is duplicated across Classes.
* Removing articles with a low number of reviews where the recommendation probability is not clear yet.
* Adding a column with the recommendation probability (our modelling target).
* Pre-computing the NLP features for each review from its title and text. 
    * This saves time when fitting the model, especially during cross-validation (CV). 
    * We can still dynamically select which NLP columns to use in the pipeline.


Import requirements

In [10]:
import os
import re
from pathlib import Path

import pandas as pd
import spacy

In [11]:
# One-off setup, to be run from a shell when the model is not installed yet:
#   python -m spacy download en_core_web_sm
spacy_model_name = "en_core_web_sm"
nlp = spacy.load(spacy_model_name)

Load data

In [12]:
# The raw reviews sit in the "data" folder that is a sibling of the current
# working directory.
data_dir = Path.cwd().parent / "data"
file_path = data_dir / "reviews.csv"
df = pd.read_csv(file_path)

Update duplicated Clothing ID

In [13]:
# Clothing IDs that are listed under more than one Class Name.
clothing_class_counts = df.groupby('Clothing ID')['Class Name'].nunique()
multiple_class_ids = clothing_class_counts[clothing_class_counts > 1]

# First ID not used by any article. It is advanced by hand after each
# re-assignment instead of re-scanning the whole column on every iteration.
next_id = df['Clothing ID'].max() + 1

# 'Division Name' is never modified below, so the mask can be computed once.
is_general_division = df['Division Name'] == "General"

for duplicated_id in multiple_class_ids.index:
    # Rows of the duplicated ID that belong to the "General" division are moved
    # to a brand-new ID; the remaining rows keep the original one.
    # NOTE(review): assumes the conflicting classes differ by division - confirm.
    rows_to_move = (df['Clothing ID'] == duplicated_id) & is_general_division
    if rows_to_move.any():
        df.loc[rows_to_move, 'Clothing ID'] = next_id
        next_id += 1

Remove article IDs with a low number of reviews

In [14]:
# Keep only the rows whose Clothing ID occurs at least `min_reviews` times.
min_reviews = 10
num_reviews = df['Clothing ID'].value_counts()
has_enough_reviews = num_reviews.ge(min_reviews)
kept_ids = num_reviews.index[has_enough_reviews.values]
df = df.loc[df['Clothing ID'].isin(kept_ids)]

Pre-process NLP

In [15]:
# Lower-case words used as sentiment markers. `process_text` compares the
# lower-cased lemma of a token against these lists.
positive_sentiment = ["great", "beautiful", "cute", "nice", "lovely", "comfortable", "gorgeous", "perfect",
                      "pretty", "flattering", "good", "comfy", "amazing", "adorable", "stylish", "cozy", "fun",
                      "perfection", "awesome"]
# "disapointing" (sic) is kept so that reviews containing the same typo still
# match; the correctly spelled form was missing and is added next to it.
negative_sentiment = ["poor", "meh", "boxy", "scratchy", "shapeless", "odd", "thin", "weird", "disapointing",
                      "disappointing", "awkward", "bummer", "unflattering", "bad", "strange", "itchy",
                      "cheaply", "cheap", "stiff", "minus"]
mixed_sentiment = ["but", "yet", "or"]


def process_text(input_field: str, input_text: str) -> dict:
    """Compute punctuation, part-of-speech and sentiment counts for one text.

    Args:
        input_field: Name of the column the text comes from. It is lower-cased
            and its spaces are replaced by underscores to build the prefix of
            every key in the result.
        input_text: Text to analyse. ``None`` and NaN are treated as an empty
            string; any other non-string value is converted with ``str``.

    Returns:
        A dict of 20 integer counts whose keys are ``<prefix>_char_*``,
        ``<prefix>_nlp_*`` and ``<prefix>_sentiment_*``.
    """
    prefix = input_field.lower().replace(" ", "_")

    if not isinstance(input_text, str):
        # NaN is the only float that is not equal to itself.
        is_missing = input_text is None or (
            isinstance(input_text, float) and input_text != input_text
        )
        input_text = "" if is_missing else str(input_text)

    doc = nlp(input_text)

    # One counter per part-of-speech tag that is reported in the result.
    pos_counts = {'PROPN': 0, 'ADJ': 0, 'VERB': 0, 'NOUN': 0, 'CCONJ': 0, 'NUM': 0, 'SYM': 0}
    positive_count = 0
    negative_count = 0
    mixed_count = 0

    for token in doc:
        pos = token.pos_
        if pos in pos_counts:
            pos_counts[pos] += 1

        # Sentiment words are only looked up for tokens carrying these tags.
        if pos in ('ADJ', 'PROPN', 'CCONJ'):
            lemma = token.lemma_.lower()
            if lemma in positive_sentiment:
                positive_count += 1
            elif lemma in negative_sentiment:
                negative_count += 1
            elif lemma in mixed_sentiment:
                mixed_count += 1

    # Every run of two or more dots is one ellipsis. Adding count('...') and
    # count('..') counted a plain "..." twice.
    num_ellipsis = len(re.findall(r'\.{2,}', input_text))

    # An empty document has no sentences; skip the sentence iterator for it.
    num_sentences = len(list(doc.sents)) if len(doc) else 0

    return {
        # special characters
        prefix + '_char_point': input_text.count('.'),
        prefix + '_char_comma': input_text.count(','),
        prefix + '_char_semicolon': input_text.count(';'),
        prefix + '_char_quotes': input_text.count('"') + input_text.count("'"),
        prefix + '_char_exclamation': input_text.count('!'),
        prefix + '_char_question': input_text.count('?'),
        prefix + '_char_hashtag': input_text.count('#'),
        prefix + '_char_ellipsis': num_ellipsis,
        # NLP
        prefix + '_nlp_tokens': len(doc),
        prefix + '_nlp_sentences': num_sentences,
        # The key spelling "pronoums" is kept because it becomes a column name.
        # NOTE(review): this counts PROPN (proper-noun) tags - confirm whether
        # PRON (pronoun) was intended.
        prefix + '_nlp_pronoums': pos_counts['PROPN'],
        prefix + '_nlp_adjectives': pos_counts['ADJ'],
        prefix + '_nlp_verbs': pos_counts['VERB'],
        prefix + '_nlp_nouns': pos_counts['NOUN'],
        prefix + '_nlp_conjunctions': pos_counts['CCONJ'],
        prefix + '_nlp_numbers': pos_counts['NUM'],
        prefix + '_nlp_symbols': pos_counts['SYM'],
        # Sentiment
        prefix + '_sentiment_positive': positive_count,
        prefix + '_sentiment_negative': negative_count,
        prefix + '_sentiment_mixed': mixed_count,
    }


In [16]:
# The feature dicts of a column are collected first and turned into a frame in
# a single step; `.apply(pd.Series)` would build one Series per review instead.
for column in ["Title", "Review Text"]:
    # Missing texts become empty strings so that every review gets a feature row.
    texts = df[column].fillna("")
    # `process_text` derives the key prefix from the column name itself.
    feature_rows = [process_text(column, text) for text in texts]
    text_features = pd.DataFrame(feature_rows, index=df.index)
    # Both frames share the same index, so this is a row-by-row alignment.
    df = df.join(text_features)

# The raw texts are no longer needed once their features are stored.
df = df.drop(columns=["Title", "Review Text"])

Export CSV file

In [17]:
# Save the processed frame as CSV in the sibling "data" folder, leaving the
# index out of the file.
output_dir = Path.cwd().parent / "data"
file_path = output_dir / "reviews_processed.csv"
df.to_csv(file_path, index=False)