# Cleaning Pipeline

First, we load the collected data, in this case from [this file](../../data/final_combined_reviews.csv).

In [None]:
import pandas as pd

# Path to the combined reviews CSV, resolved relative to the working directory.
# NOTE(review): the markdown link above this cell points to ../../data/..., while this
# path uses ../data/ — confirm which one is correct for where the notebook is run.
flybondi_data = '../data/final_combined_reviews.csv'
# Load the raw reviews; the following cells derive df_cleaned from this frame.
df = pd.read_csv(flybondi_data)


## Initialization Process

We remove unused columns (this step can be extended to cover more cases).

In [None]:
# Flag the junk columns whose header starts with 'Unnamed' ...
unnamed_mask = df.columns.str.startswith('Unnamed')

# ... keep everything else, and discard rows that are exact duplicates.
df_cleaned = df.loc[:, ~unnamed_mask].drop_duplicates()



We normalize the ratings to integers and lowercase the reviewer names so that they can be used as unique identifiers.

In [None]:
# Extract the first number found in each raw rating. The column is cast to str
# first so that non-string values can be searched as well.
parsed_rating = (
    df_cleaned['rating']
    .astype(str)
    .str.extract(r'(\d+\.?\d*)', expand=False)
    .astype(float)
)

# Swap the raw column for the parsed one (it ends up as the last column).
# Ratings without any number default to 1; the int cast truncates decimals.
df_cleaned = df_cleaned.drop(columns=['rating'])
df_cleaned['rating'] = parsed_rating.fillna(1).astype(int)

# Lowercased names are used later as the reviewer identifier.
df_cleaned['name'] = df_cleaned['name'].str.lower()

# Preview the rows from position 1300 onwards.
df_cleaned.iloc[1300:]


Some reviews have titles. In this pipeline we prepend each title to its review text so that the title is treated as part of the review.

In [None]:
# Missing titles/bodies are treated as empty strings.
titles = df_cleaned['review_title'].fillna('')
bodies = df_cleaned['review_text'].fillna('')

# Only insert the '. ' separator when there actually is a title.
separators = titles.map(lambda title: '. ' if title != '' else '')

# "<title>. <text>", lowercased, becomes the single review column.
df_cleaned['review'] = (titles + separators + bodies).str.lower()

# The two source columns are no longer needed.
df_cleaned = df_cleaned.drop(columns=['review_title', 'review_text'])

df_cleaned.tail()


Some reviews may be repeated due to scraping inconsistencies. They can be detected by comparing the lowercased names, so we deduplicate by name and keep only the longest review for each one. The same name may also appear on several different reviews; keeping a single review per name avoids biasing the dataset towards those reviewers.

In [None]:
import pandas as pd

def longest_review(group):
    """Return the row of ``group`` whose 'review' text is the longest.

    ``group`` is a DataFrame with a string 'review' column. When several
    rows tie for the maximum length, the first one is returned.
    """
    review_lengths = group['review'].str.len()
    longest_label = review_lengths.idxmax()
    return group.loc[longest_label]

# Keep one row per reviewer name: the row holding that name's longest review.
#
# The row labels are looked up with a grouped idxmax and then selected with .loc,
# instead of groupby(...).apply(longest_review). This picks the same row as
# longest_review (first row on ties, result ordered by name, rows with a missing
# name dropped), but it does not depend on apply() passing the grouping column to
# the callback (deprecated in recent pandas) and it keeps the original column dtypes.
review_lengths = df_cleaned['review'].str.len()
longest_labels = review_lengths.groupby(df_cleaned['name']).idxmax()
df_cleaned = df_cleaned.loc[longest_labels]

# Renumber the rows 0..n-1 after the selection.
df_cleaned.reset_index(drop=True, inplace=True)

df_cleaned.shape


We format likes as integers, using 0 when a review received no likes.

In [None]:
# A missing like count means the review received no likes: store 0, as an integer.
df_cleaned['likes'] = df_cleaned['likes'].fillna(0).astype(int)

df_cleaned



We create a "relevance score" that will be used to weight the importance of each review, since some reviewers can be considered more important than others in terms of reviewing experience. This is a normalized score ranging from 0 to 1.

In [None]:
import re

def _parse_count(raw_number):
    """Turn a matched count such as '1.234' or '1,234' into the int 1234.

    Both '.' and ',' are treated as digit-grouping separators, since these
    counts are whole numbers.
    """
    return int(raw_number.replace('.', '').replace(',', ''))


def parse_experience(experience):
    """Parse a reviewer "experience" string into numeric features.

    Args:
        experience: Text such as "Local Guide · 1.234 reseñas · 56 fotos".
            Missing values (NaN/None) and any other non-string input are
            treated as "no information".

    Returns:
        A tuple ``(resenas, fotos, local_guide)``: the number of reviews
        given, the number of photos, and 1/0 for whether the text contains
        'Local Guide'. Anything not found in the text is reported as 0.
    """
    resenas = 0
    fotos = 0
    local_guide = 0

    # NaN, None and other non-text values carry nothing to parse.
    if not isinstance(experience, str):
        return resenas, fotos, local_guide

    if 'Local Guide' in experience:
        local_guide = 1

    # Accept the singular forms too ("1 reseña", "1 opinión") and counts with
    # any number of grouping separators ("1.234", "1,234").
    resenas_match = re.search(
        r'(\d+(?:[.,]\d+)*)\s+(?:reseñas?|opinión|opiniones)\b', experience
    )
    if resenas_match:
        resenas = _parse_count(resenas_match.group(1))

    fotos_match = re.search(r'(\d+(?:[.,]\d+)*)\s+fotos?\b', experience)
    if fotos_match:
        fotos = _parse_count(fotos_match.group(1))

    return resenas, fotos, local_guide

# Parse every 'experience' string; wrapping the tuple in a Series makes apply()
# return one column per parsed value.
experience_features = df_cleaned['experience'].apply(
    lambda raw_experience: pd.Series(parse_experience(raw_experience))
)

# Attach the parsed values as three new feature columns.
df_cleaned[['given_reviews', 'pictures', 'local_guide']] = experience_features

# Mid-cell expression: evaluated, but not displayed by the notebook.
df_cleaned[['given_reviews', 'pictures', 'local_guide']]

# The raw text column is not needed once its features are extracted.
df_cleaned = df_cleaned.drop(columns=['experience'])
df_cleaned


In [None]:
def calculate_relevance(row, W_l=0.3, W_r=0.5, W_p=0.005, W_lg=0.5):
    """Return the weighted, un-normalized relevance score of one review row.

    Args:
        row: Mapping/Series with the keys 'likes', 'given_reviews',
            'pictures' and 'local_guide'.
        W_l: Weight applied to the likes of the review.
        W_r: Weight applied to the number of reviews given by the reviewer.
        W_p: Weight applied to the number of pictures of the reviewer.
        W_lg: Weight applied to the Local Guide flag.

    Returns:
        The sum of the four weighted terms.
    """
    likes_term = W_l * row['likes']
    reviews_term = W_r * row['given_reviews']
    pictures_term = W_p * row['pictures']
    guide_term = W_lg * row['local_guide']
    return likes_term + reviews_term + pictures_term + guide_term

# Raw weighted score for every review (default weights of calculate_relevance).
df_cleaned['relevance_score'] = df_cleaned.apply(calculate_relevance, axis=1)

# Min-max normalization to the [0, 1] range.
score_min = df_cleaned['relevance_score'].min()
score_range = df_cleaned['relevance_score'].max() - score_min

if score_range > 0:
    df_cleaned['relevance_score_normalized'] = (
        df_cleaned['relevance_score'] - score_min
    ) / score_range
else:
    # Every score is identical (or there is nothing to compare): dividing by
    # the zero range would fill the column with NaN, so use 0.0 instead.
    df_cleaned['relevance_score_normalized'] = 0.0

df_cleaned


Detect the language of each review so that it can later be translated to English using the Google Translate API.

In [None]:
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Fix the detector seed so that repeated runs are meant to give the same result.
DetectorFactory.seed = 0


def detect_language(text):
    """Return the language code detected for ``text`` (e.g. 'en', 'es').

    Returns the string 'unknown' when langdetect raises LangDetectException,
    i.e. when it cannot detect a language for the given text.
    """
    try:
        language_code = detect(text)
    except LangDetectException:
        language_code = 'unknown'
    return language_code

# Tag every review with its detected language code ('unknown' on failure).
df_cleaned['language'] = df_cleaned['review'].map(detect_language)

# Mid-cell expression: evaluated, but not displayed by the notebook.
df_cleaned[['review', 'language']]

# Snapshot of the data right before the translation step.
output_file_translated_reviews = '../data/cleaning_pipeline/before_translated.csv'
df_cleaned.to_csv(output_file_translated_reviews, index=False)


Translate each review to English.

In [None]:
from googletrans import Translator
from googletrans import LANGUAGES

# Single translator client shared by every call below.
translator = Translator()


def translate_to_spanish(text, src_lang):
    """Translate ``text`` from ``src_lang`` into English.

    NOTE: despite its name, this function translates *to English*
    (``dest='en'``); the name is kept so that existing callers keep working.

    This is best-effort: if anything goes wrong during the translation, the
    problem is printed and the original ``text`` is returned unchanged.
    """
    try:
        translated_text = translator.translate(text, src=src_lang, dest='en').text
    except Exception as e:
        print("could not translate: ", text)
        print(f"Error translating: {e}")
        translated_text = text
    return translated_text

def translate_non_spanish(text, lang):
    """Translate ``text`` unless it is already English or of unknown language.

    NOTE: despite its name, the check is against English ('en'), not Spanish.
    Texts tagged 'en' or 'unknown' are returned untouched; everything else is
    passed to translate_to_spanish with ``lang`` as the source language.
    """
    if lang in ('en', 'unknown'):
        return text
    return translate_to_spanish(text, lang)

# Translate row by row; English and 'unknown' reviews are copied through as-is.
translated_reviews = df_cleaned.apply(
    lambda review_row: translate_non_spanish(review_row['review'], review_row['language']),
    axis=1,
)
df_cleaned['review_translated'] = translated_reviews

# Mid-cell expression: evaluated, but not displayed by the notebook.
df_cleaned[['review', 'language', 'review_translated']]

# Save the frame in case we want to check the translations or the original reviews.
output_file_translated_reviews = '../data/cleaning_pipeline/after_translated.csv'
df_cleaned.to_csv(output_file_translated_reviews, index=False)


We visualize as part of a fun experiment the distribution of languages in our dataset.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# To plot from a saved dataframe instead of the in-memory one, load it first:
# df_cleaned = pd.read_csv('your_file.csv')

# The language column is called 'language' until the checkpoint cell below renames
# it to 'source_language'. Accept either name so this cell works both in a
# top-to-bottom run and when it is re-run after the checkpoint.
language_column = 'source_language' if 'source_language' in df_cleaned.columns else 'language'
language_counts = df_cleaned[language_column].value_counts()

# Show the most frequent languages individually and group the rest.
top_n = 3
top_languages = language_counts.iloc[:top_n]  # top 3 languages
others_count = language_counts.iloc[top_n:].sum()  # sum of all remaining languages

language_counts_modified = top_languages
if others_count > 0:
    # Only add the "Others" slice when there is something in it; an empty slice
    # would just add a 0.0% label to the chart.
    others_series = pd.Series([others_count], index=['Others'])
    language_counts_modified = pd.concat([top_languages, others_series])

plt.figure(figsize=(10, 6))
plt.pie(language_counts_modified, labels=language_counts_modified.index, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of Source Languages')
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()


## Checkpoint 1 

We consider this a relevant checkpoint, since this part takes a while to execute. Here we use the Stanza lemmatizer to obtain a short, normalized version of each review, without stopwords, emojis, or anything else that will not be analyzed further. So first we remove the parts that are of no importance to us.

In [None]:
from nltk.corpus import stopwords
import nltk
import string
import re
import pandas as pd



# Reload the translated reviews saved by the translation step.
# NOTE(review): that step writes to '../data/cleaning_pipeline/after_translated.csv',
# while this path starts with '../../data/' — confirm both point to the same file.
flybondi_data = '../../data/cleaning_pipeline/after_translated.csv'
df_cleaned = pd.read_csv(flybondi_data)

# Keep only the rows that have some translated text (not missing, not blank).
# The explicit copy makes the result an independent frame, so the column
# assignments done in the following cells do not trigger SettingWithCopyWarning.
translated = df_cleaned['review_translated']
has_text = translated.notna() & (translated.astype(str).str.strip() != '')
df_cleaned = df_cleaned[has_text].copy()

# English stopword list from NLTK (downloaded on first use).
nltk.download('stopwords')
english_stopwords = set(stopwords.words('english'))

# ASCII punctuation plus the Spanish opening marks. '¿' is included alongside '¡'
# so that reviews left untranslated do not keep it glued to the first word.
punctuation = string.punctuation + '¡¿'

# Compiled once instead of on every call. The ranges are limited to emoji and
# symbol blocks: a single wide range from U+24C2 to U+1F251 would also match
# regular text such as Chinese, Japanese and Korean characters.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters, flags
    "\U000025A0-\U000025FF"  # geometric shapes
    "\U00002600-\U000027BF"  # miscellaneous symbols & dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols & arrows (e.g. stars)
    "\U000024C2"  # circled M
    "\U0000FE0F"  # variation selector that follows many emojis
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only the characters of the emoji/symbol ranges above are deleted; letters
    of any script, digits, punctuation and whitespace are left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)

def preprocess_text(text):
    """Normalize a review: lowercase, no emojis, no punctuation, no stopwords.

    Missing values (NaN/None) give an empty string. Otherwise the text is
    lowercased, emojis are removed, every punctuation character is replaced
    by a space, and the remaining words that are not English stopwords are
    joined back with single spaces.
    """
    if pd.isna(text):
        return ''

    without_emojis = remove_emojis(text.lower())

    # Replace (rather than delete) punctuation so adjacent words stay separated.
    without_punctuation = re.sub(f"[{re.escape(punctuation)}]", " ", without_emojis)

    kept_words = (
        token for token in without_punctuation.split()
        if token not in english_stopwords
    )
    return ' '.join(kept_words)

# Make sure every translated review is a string, then build its cleaned version.
df_cleaned['review_translated'] = df_cleaned['review_translated'].astype(str)
df_cleaned['review_processed'] = df_cleaned['review_translated'].apply(preprocess_text)

# Mid-cell expression: evaluated, but not displayed by the notebook.
df_cleaned[['review_translated', 'review_processed']]

# Drop the intermediate columns first, so that their names are free to be reused ...
obsolete_columns = [
    'review_translated', 'relevance_score', 'review',
    'given_reviews', 'pictures', 'local_guide', 'likes',
]
df_cleaned = df_cleaned.drop(columns=obsolete_columns)

# ... then give the surviving columns their final names.
df_cleaned = df_cleaned.rename(columns={
    'language': 'source_language',
    'review_processed': 'review',
    'relevance_score_normalized': 'relevance_score',
})

df_cleaned


And finally we lemmatize.

In [None]:
import stanza

# Alias, not a copy: the changes made through df are also visible in df_cleaned.
df = df_cleaned

# Fetch the English models and build the pipeline used by lemmatize_english.
stanza.download('en')
nlp = stanza.Pipeline('en')


def lemmatize_english(text):
    """Return ``text`` with every word replaced by its lemma, space-separated.

    The text is run through the English Stanza pipeline and the lemmas of all
    words of all sentences are joined with single spaces. If a word has no
    lemma (``None``), its original text is used instead, so that the join
    cannot fail with a TypeError.
    """
    doc = nlp(text)
    lemmas = [
        word.lemma if word.lemma is not None else word.text
        for sentence in doc.sentences
        for word in sentence.words
    ]
    return ' '.join(lemmas)


# Replace every cleaned review by its lemmatized form.
df['review'] = df['review'].apply(lemmatize_english)

# Final output of the cleaning pipeline.
output_file_lemmatized_reviews = '../../data/en_cleaned_with_lemmatized_reviews.csv'
df.to_csv(output_file_lemmatized_reviews, index=False)


In [None]:
# Print the final dataframe (with the lemmatized reviews) for a quick visual check.
print(df)
