In [2]:
import os
# Move the working directory one level up; the relative 'data/...' paths and the
# `src` import used below are resolved from there (presumably the project root).
# NOTE: the target is relative, so re-running this cell in the same kernel moves
# up one more level each time.
os.chdir('../')

In [3]:
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import re
import unicodedata
import string
from collections import Counter

from wordcloud import WordCloud

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import CountVectorizer

import spacy

from textblob import TextBlob

from symspellpy.symspellpy import SymSpell, Verbosity

import emoji

from src.llm_preprocessing import generate_whitelist

In [3]:
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_colwidth', None)

In [4]:
# nltk.download("punkt")
# nltk.download("wordnet")
# nltk.download("stopwords")
# nltk.download('omw-1.4')
# nltk.download('vader_lexicon')

In [5]:
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = 'data/dictionaries/frequency_dictionary_en_82_765.txt'  # included in the repo

# load_dictionary reports a missing file by returning False instead of raising.
# Fail loudly here: with an empty dictionary every later lookup returns no
# suggestion and the typo-correction step would silently do nothing.
dictionary_loaded = sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
if not dictionary_loaded:
    raise FileNotFoundError(f"SymSpell dictionary could not be loaded: {dictionary_path}")
dictionary_loaded

True

In [6]:
df_pretreatment = pd.read_csv('data/processed/final_label.csv')
df = df_pretreatment.copy()

In [7]:
df.head(5)

Unnamed: 0,text,target
0,"judging from previous posts this used to be a good place , but not any longer .",0.0
1,"we , there were four of us , arrived at noon - the place was empty - and the staff acted like we were imposing on them and they were very rude .",0.0
2,"they never brought us complimentary noodles , ignored repeated requests for sugar , and threw our dishes on the table .",0.0
3,the food was lousy - too sweet or too salty and the portions tiny .,0.0
4,"after all that , they complained to me about the small tip .",0.0


# Text processing pipeline

## Clean text

### Check for emojis

In [8]:
def contains_emoji(text):
    """Return True when `text` contains at least one emoji.

    Uses the emoji package's own scanner instead of testing each character
    against ``emoji.EMOJI_DATA``: emoji made of several code points (flags,
    keycaps) have no single character that is a key of that table, so a
    per-character lookup misses them.

    Non-string values (e.g. NaN from a missing cell) are reported as False
    rather than raising.
    """
    if not isinstance(text, str):
        return False
    return emoji.emoji_count(text) > 0

In [9]:
df['has_emoji'] = df['text'].apply(contains_emoji)

# See how many have emojis
print(df['has_emoji'].value_counts())

has_emoji
False    587
Name: count, dtype: int64


There are no emojis in the training data, but emoji handling is kept in the pipeline in case the test set contains them.

### Check for numbers

In [10]:
# Flag rows whose text contains at least one digit.
# na=False maps missing text to False so the column stays strictly boolean and
# can be used directly as a mask (a NaN in a boolean mask raises on indexing).
df['has_number'] = df['text'].str.contains(r"\d", na=False)
print(df['has_number'].value_counts())

has_number
False    560
True      27
Name: count, dtype: int64


In [11]:
df[df['has_number']].head(3)

Unnamed: 0,text,target,has_emoji,has_number
16,"went on a 3 day oyster binge , with fish bringing up the closing , and i am so glad this was the place it o trip ended , because it was so great !",1.0,False,True
27,we went around 9 : 30 on a friday and it had died down a bit by then so the service was great !,1.0,False,True
51,"$ 6 and there is much tasty food , all of it fresh and continually refilled .",1.0,False,True


In [12]:
# Flag texts where a number from 0 to 10 directly follows a rating phrase
# ("give it a", "rated", "score", ...) or opens the text.
# na=False maps missing text to False so the column can be used as a boolean mask.
df['has_rating_number'] = df['text'].str.contains(r"\b(?:give it a|rated?|score|rate|i give|i give this|its a|it's a|i rate|rating|^)(?:\s+)?(?:10|[0-9])\b", flags=re.IGNORECASE, na=False)

In [13]:
df[df['has_rating_number']]

Unnamed: 0,text,target,has_emoji,has_number,has_rating_number
414,but overall i give it a 10,1.0,False,True,True
415,10,1.0,False,True,True


Most numbers carry no useful signal, but some of them are ratings. The feature has_rating_number captures these by matching a number from 0 to 10 that follows words commonly associated with a rating (or that opens the text).

### Clean text function

In [14]:
def clean_text(text):
    """Normalise raw review text for the bag-of-words pipeline.

    Steps, in order: lower-case, replace emojis via ``emoji.demojize``, then
    apply three regex substitutions (drop digit runs, drop every character
    that is not a word character, whitespace or '!', collapse whitespace
    runs) and strip the ends.
    """
    normalized = emoji.demojize(text.lower())
    substitutions = (
        (r"\d+", ""),        # remove digits
        (r"[^\w\s!]", ""),   # keep only words, spaces, and !
        (r"\s+", " "),       # collapse whitespace runs into one space
    )
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()

In [15]:
df['text_v1'] = df['text'].apply(clean_text)

## Remove typos

The idea is to correct typos using the SymSpell frequency dictionary. Some food names such as mizu, confit, foie gras, etc. can be wrongly replaced. To evaluate this, a new column is created listing all the corrections made in each text.

In [16]:
# Tally of (original_word, replacement) pairs; correct_typos increments it on
# every substitution it makes.
correction_counter = Counter()
# Words that correct_typos must leave untouched. Only key membership is tested
# there (`word in blocklist`), so the mapped values (including 'wa' -> 'was')
# are never applied.
blocklist = {'the': 'the', '!': '!', 's': 's', 'n': 'n', 'wa': 'was'}

In [17]:
def correct_typos(text, blocklist=None, whitelist=None):
    """Spell-correct `text` word by word with the module-level SymSpell instance.

    Parameters
    ----------
    text : str
        Whitespace-separated text (typically the output of ``clean_text``).
    blocklist : iterable of str, optional
        Words that must never be corrected. Only membership is checked, so a
        dict is accepted and its values are ignored. Defaults to
        'the', '!', 's', 'n', 'wa'.
    whitelist : iterable of str, optional
        Additional words to keep as-is (e.g. food names). Defaults to none.

    Returns
    -------
    tuple
        ``(corrected_text, corrections)`` where ``corrections`` is the
        original spelling of every replaced word joined with "; ", or None
        when nothing was replaced.

    Side effects
    ------------
    Increments the module-level ``correction_counter`` under the key
    ``(original_word, replacement)`` for each substitution.
    """
    # None sentinels replace the former mutable default arguments.
    if blocklist is None:
        blocklist = ('the', '!', 's', 'n', 'wa')

    # A single set per call gives O(1) membership tests per word instead of
    # scanning the whitelist list for every token.
    protected = set(blocklist)
    protected.update(whitelist or ())

    corrected_words = []
    corrections_in_row = []

    for word in text.split():
        # Protected words and very short tokens are never sent to SymSpell.
        if len(word) <= 2 or word in protected:
            corrected_words.append(word)
            continue

        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        # No suggestion means the word is kept unchanged.
        best = suggestions[0].term if suggestions else word

        if best != word:
            correction_counter[(word, best)] += 1
            corrections_in_row.append(word)  # log the original word
        corrected_words.append(best)

    corrected_text = " ".join(corrected_words)
    corrections = "; ".join(corrections_in_row) if corrections_in_row else None
    return corrected_text, corrections


In [18]:
# Run the typo correction once per text (default blocklist, no whitelist) and
# split the (corrected_text, corrections) tuples into two columns. Plain lists
# avoid building a throw-away pd.Series for every row.
typo_results = [correct_typos(text) for text in df['text_v1']]
df['corrected_text'] = [corrected for corrected, _ in typo_results]
df['corrections'] = [replaced for _, replaced in typo_results]

In [19]:
df[['corrected_text', 'corrections']].loc[~df['corrections'].isna()].head()

Unnamed: 0,corrected_text,corrections
8,the duck conflict is always amazing and the foil gas terrine with figs was out of this world,confit; foie; gras
12,live asked a cart attendant for a lotus leaf wrapped rice and she replied back rice and just walked away,ive
13,i had to ask her three times before she finally came back with the dish live requested,ive
14,chow fun was dry pork she mai was more than usually greasy and had to share a table with loud and rude family,shu
17,service was device oysters where a sensual as they come and the price ca n t be beat ! ! !,devine


### Creating a custom whitelist using llm

Using the corrections column, the idea is to build a whitelist that prevents wrong corrections. To do that, a GPT-4 model is called to evaluate each correction and say whether it is adequate. The whitelist is built from those results and then hardcoded, to avoid having to run the LLM every time.

In [20]:
# Keep only the rows where at least one word was replaced.
df_corrections = df.loc[df['corrections'].notna(), ["corrected_text", "corrections"]]
# Disabled LLM step: when enabled it stores the model's output in a 'result'
# column, which apply_whitelist reads.
# results = generate_whitelist(df_corrections, 'corrected_text', model_name='gpt4')
# df_corrections['result'] = results

In [21]:
def apply_whitelist(df):
    """Collect the words whose flag equals 1 into a sorted whitelist.

    For every row, the 'corrections' string is split on ';' (blank pieces are
    dropped, the rest stripped) and paired positionally with the entries of
    the row's 'result' value; a word is kept when its paired flag equals 1.

    Returns the unique kept words as a sorted list.
    """
    approved = {
        word
        for _, row in df.iterrows()
        for word, flag in zip(
            (piece.strip() for piece in row['corrections'].split(';') if piece.strip()),
            row['result'],
        )
        if flag == 1
    }
    return sorted(approved)

In [22]:
# whitelist = apply_whitelist(df_corrections)

In [23]:
whitelist = ['branzino', 'carozza', 'confit', 'congee', 'cozy', 'dal', 'facie', 'favorite', 'fixe', 'flavoring', 'foie', 'frites', 'gras', 'grazie', 'gulab', 'jamun', 'kha', 'maitre', 'medicore', 'mizu', 'mom', 'msg', 'neighborhood', 'noir', 'nyc', 'overpack', 'porcini', 'prima', 'prixe', 'russe', 'shabu', 'svc', 'tartare', 'theater', 'tristate', 'uncourteous', 'volare', 'wks']

### Final typo corrections

With the whitelist, a new column with the fine-processed text is created.

In [24]:
# Re-run the correction with the curated whitelist and keep only the corrected
# text (element 0 of the returned tuple); no per-row pd.Series is needed for that.
# Note: correct_typos also increments correction_counter again on this pass.
df['text_v2'] = df['text_v1'].apply(lambda x: correct_typos(x, blocklist, whitelist)[0])

In [25]:
# Drop the exploratory helper columns. Rebinding instead of inplace=True, plus
# errors='ignore', keeps the cell re-runnable: a second run no longer raises
# KeyError because the columns are already gone.
df = df.drop(columns=['corrected_text', 'corrections'], errors='ignore')

## Lemmatize and stopwords

Lemmatization and stopword removal will be evaluated in more detail in the EDA phase. spaCy was used instead of NLTK because the NLTK lemmatizer was turning 'us' into 'u'. Some custom stopwords were added.

In [26]:
# Load spaCy's 'en_core_web_sm' pipeline once; spacy_lemmatize reuses this instance.
nlp = spacy.load('en_core_web_sm')

def spacy_lemmatize(text):
    """Return `text` with each spaCy token replaced by its lemma, joined by single spaces."""
    lemmas = [token.lemma_ for token in nlp(text)]
    return " ".join(lemmas)

In [27]:
# Extra tokens removed on top of NLTK's English stopword list.
custom_stopwords = {'I', 'i', 'n', 'wa', 's'}
# Final stopword set consulted by remove_stopwords.
stop_words = set(stopwords.words("english")) | custom_stopwords

def remove_stopwords(text):
    """Drop every whitespace-separated token of `text` that is in `stop_words`.

    Matching is exact (case-sensitive); the surviving tokens are re-joined
    with single spaces.
    """
    kept = (token for token in text.split() if token not in stop_words)
    return " ".join(kept)

# Full treatment pipeline

Incorporating all steps in a pipeline

In [28]:
def full_preprocess(text):
    """Run the whole text pipeline on one raw string.

    Order: clean_text -> correct_typos (with the module-level `blocklist` and
    `whitelist`, keeping only the corrected text) -> spacy_lemmatize ->
    remove_stopwords.
    """
    cleaned = clean_text(text)
    corrected = correct_typos(cleaned, blocklist, whitelist)[0]
    lemmatized = spacy_lemmatize(corrected)
    return remove_stopwords(lemmatized)

In [29]:
df

Unnamed: 0,text,target,has_emoji,has_number,has_rating_number,text_v1,text_v2
0,"judging from previous posts this used to be a good place , but not any longer .",0.0,False,False,False,judging from previous posts this used to be a good place but not any longer,judging from previous posts this used to be a good place but not any longer
1,"we , there were four of us , arrived at noon - the place was empty - and the staff acted like we were imposing on them and they were very rude .",0.0,False,False,False,we there were four of us arrived at noon the place was empty and the staff acted like we were imposing on them and they were very rude,we there were four of us arrived at noon the place was empty and the staff acted like we were imposing on them and they were very rude
2,"they never brought us complimentary noodles , ignored repeated requests for sugar , and threw our dishes on the table .",0.0,False,False,False,they never brought us complimentary noodles ignored repeated requests for sugar and threw our dishes on the table,they never brought us complimentary noodles ignored repeated requests for sugar and threw our dishes on the table
3,the food was lousy - too sweet or too salty and the portions tiny .,0.0,False,False,False,the food was lousy too sweet or too salty and the portions tiny,the food was lousy too sweet or too salty and the portions tiny
4,"after all that , they complained to me about the small tip .",0.0,False,False,False,after all that they complained to me about the small tip,after all that they complained to me about the small tip
...,...,...,...,...,...,...,...
582,mizu is the japenese find in grammercy .,1.0,False,False,False,mizu is the japenese find in grammercy,mizu is the japanese find in gramercy
583,"while their kitchen food is delicious , their sushi is out of this world .",1.0,False,False,False,while their kitchen food is delicious their sushi is out of this world,while their kitchen food is delicious their sushi is out of this world
584,mizu is home to creative and unique rolls not to found anywhere else .,1.0,False,False,False,mizu is home to creative and unique rolls not to found anywhere else,mizu is home to creative and unique rolls not to found anywhere else
585,"not only is the cuisine the best around , the service has always been attentive and charming .",1.0,False,False,False,not only is the cuisine the best around the service has always been attentive and charming,not only is the cuisine the best around the service has always been attentive and charming


In [30]:
df = df_pretreatment.copy()

In [31]:
# Rebuild the frame from the raw data so only the final features are exported.
df = df_pretreatment.copy()

# Same two numeric flags as in the exploration cells above.
digit_pattern = r"\d"
rating_pattern = r"\b(?:give it a|rated?|score|rate|i give|i give this|its a|it's a|i rate|rating|^)(?:\s+)?(?:10|[0-9])\b"
raw_text = df['text']
df['has_number'] = raw_text.str.contains(digit_pattern)
df['has_rating_number'] = raw_text.str.contains(rating_pattern, flags=re.IGNORECASE)

# Fully processed text: clean -> typo correction -> lemmatize -> stopword removal.
df['final_text'] = raw_text.apply(full_preprocess)

In [32]:
def light_clean_text_for_bert(text):
    """Minimal cleaning: demojize, collapse whitespace runs to one space, strip.

    Case, digits and punctuation are left untouched.
    """
    demojized = emoji.demojize(text)
    return re.sub(r"\s+", " ", demojized).strip()

In [33]:
df['bert_text'] = df['text'].apply(light_clean_text_for_bert)

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587 entries, 0 to 586
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   text               587 non-null    object 
 1   target             587 non-null    float64
 2   has_number         587 non-null    bool   
 3   has_rating_number  587 non-null    bool   
 4   final_text         587 non-null    object 
 5   bert_text          587 non-null    object 
dtypes: bool(2), float64(1), object(3)
memory usage: 19.6+ KB


In [35]:
df.to_csv('data/processed/preprocessed_train.csv', index=False)