Cargamos el archivo de reviews scrapeadas.

In [1]:
import pandas as pd

flybondi_data = '../data/final_combined_reviews.csv'
df = pd.read_csv(flybondi_data)

## Empezamos a limpiar.

1. Remover columnas inútiles y filas duplicadas.

In [2]:
# remove the unnamed columns which are trash
df_cleaned = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# remove duplicates
df_cleaned = df_cleaned.drop_duplicates()

Limpiamos ratings para que tenga unico formato.

In [3]:
df_cleaned['rating_cleaned'] = df_cleaned['rating'].astype(str).str.extract(r'(\d+\.?\d*)').astype(float)
df_cleaned = df_cleaned.drop(columns=['rating'])
df_cleaned = df_cleaned.rename(columns={'rating_cleaned': 'rating'})
df_cleaned['rating'] = df_cleaned['rating'].fillna(1.0)

df_cleaned.iloc[1300:]


Unnamed: 0,name,experience,review_text,likes,review_title,rating
1300,Silvia Elena P,,Excelente el servicio. Tripulación súper atent...,,Altamente recomendable. Equipo amable y eficie...,5.0
1301,Martina B,,"Todo un desastre, el vuelo de ida se atraso 3 ...",,Peor experiencia,1.0
1302,carolina c,,Que les puedo decir que no sepamos: es una aer...,,HDP AIRLINES,1.0
1303,Jime Ache 🤩,,Quiero destacar que todo fue impecable en el v...,,Todo fue un 10,5.0
1304,Cami D,,Desastre. Nos cambiaron el horario de vuelo mu...,,Desastre,1.0
...,...,...,...,...,...,...
2357,Pablo Romero,1 opinión,Si quieren saber el significado de las palabra...,,SERVICIO NEFASTO FLYBONDI,1.0
2358,Aurélien C,6 opiniones,"Compré un primer boleto, el cheque se cerró co...",,Estafa total,1.0
2359,Marcos Medvescig,1 opinión,La peor experience. Me cambiarion el horario d...,,La peor basura voladora del mundo,1.0
2360,Silvia Elena Perez Sbarbatti,1 opinión,La empresa cumplió con las condiciones pautada...,,"Excelente la puntualidad, la atención y el sev...",5.0


Concatenamos titulos con reviews.

In [4]:
df_cleaned['review'] = df_cleaned['review_title'].fillna('') + '. ' + df_cleaned['review_text'].fillna('')
df_cleaned = df_cleaned.drop(columns=['review_title', 'review_text'])
df_cleaned['review'] = df_cleaned['review'].str.lower()
df_cleaned.tail()


Unnamed: 0,name,experience,likes,rating,review
2357,Pablo Romero,1 opinión,,1.0,servicio nefasto flybondi. si quieren saber el...
2358,Aurélien C,6 opiniones,,1.0,"estafa total. compré un primer boleto, el cheq..."
2359,Marcos Medvescig,1 opinión,,1.0,la peor basura voladora del mundo. la peor exp...
2360,Silvia Elena Perez Sbarbatti,1 opinión,,5.0,"excelente la puntualidad, la atención y el sev..."
2361,Nazarena Sebastianelli,2 opiniones,,3.0,realmente lo pensaría dos veces antes…. realme...


Removemos reviews que tengan nombre repetido.

In [5]:
import pandas as pd

def longest_review(group):
    return group.loc[group['review'].str.len().idxmax()]

df_cleaned = df_cleaned.groupby('name', group_keys=False).apply(longest_review)

df_cleaned.reset_index(drop=True, inplace=True)


df_cleaned.head()

df_cleaned.shape

  df_cleaned = df_cleaned.groupby('name', group_keys=False).apply(longest_review)


(1902, 5)

Limpiamos "Likes"

In [6]:
df_cleaned['likes'] = df_cleaned['likes'].fillna(0)
df_cleaned['likes'] = df_cleaned['likes'].astype(int)

df_cleaned

Unnamed: 0,name,experience,likes,rating,review
0,22fortinero,,0,1.0,malisima. malisimo..la peor.\nvuando fuimos pe...
1,23russellv,,0,2.0,one of the worst we have flown. we have flown ...
2,4family,,0,1.0,terrible service. flight was first delayed 25 ...
3,5travellers602013,,0,1.0,rubbish low cost airline. bought 6 tickets via...
4,885David_R885,,0,1.0,not refunding cancelled flights. not refunding...
...,...,...,...,...,...
1897,Валерия Шульга,1 reseña,3,1.0,. el vuelo fue reprogramado sin informarnos al...
1898,Вика Мегалис,3 reseñas,2,1.0,. me gustaría decirles a todos: nunca compren ...
1899,Дарья Венедиктова,1 reseña,5,1.0,". esta empresa ya hace un año que no viene, fu..."
1900,יסמין י,,0,1.0,dont buy here!!!! scam. it's completely ridicu...


Creamos una función que transforma la experience y los likes en un único formato.

In [7]:
import re

def parse_experience(experience):
    resenas = 0
    fotos = 0
    local_guide = 0

    if pd.isna(experience):
        return resenas, fotos, local_guide

    if 'Local Guide' in experience:
        local_guide = 1

    resenas_match = re.search(r'(\d+[\.,]?\d*) (reseñas|opinión|opiniones)', experience)
    if resenas_match:
        resenas = int(resenas_match.group(1).replace('.', '').replace(',', '.'))

    fotos_match = re.search(r'(\d+[\.,]?\d*) fotos', experience)
    if fotos_match:
        fotos = int(fotos_match.group(1).replace('.', '').replace(',', '.'))

    return resenas, fotos, local_guide

df_cleaned[['given_reviews', 'pictures', 'local_guide']] = df_cleaned['experience'].apply(
    lambda x: pd.Series(parse_experience(x))
)

df_cleaned[['given_reviews', 'pictures', 'local_guide']]
df_cleaned = df_cleaned.drop(columns=['experience'])


In [8]:
def calculate_relevance(row, W_l=0.3, W_r=0.5, W_p=0.005, W_lg=0.5):
    relevance = (
        W_l * row['likes'] +
        W_r * row['given_reviews'] +
        W_p * row['pictures'] +
        W_lg * row['local_guide']
    )
    return relevance

df_cleaned['relevance_score'] = df_cleaned.apply(calculate_relevance, axis=1)
df_cleaned['relevance_score_normalized'] = (df_cleaned['relevance_score'] - df_cleaned['relevance_score'].min()) / (df_cleaned['relevance_score'].max() - df_cleaned['relevance_score'].min())

df_cleaned[['likes', 'given_reviews', 'pictures', 'local_guide', 'relevance_score', 'relevance_score_normalized']]

Unnamed: 0,likes,given_reviews,pictures,local_guide,relevance_score,relevance_score_normalized
0,0,0,0,0,0.00,0.000000
1,0,0,0,0,0.00,0.000000
2,0,0,0,0,0.00,0.000000
3,0,0,0,0,0.00,0.000000
4,0,0,0,0,0.00,0.000000
...,...,...,...,...,...,...
1897,3,0,0,0,0.90,0.001745
1898,2,3,0,0,2.10,0.004072
1899,5,0,0,0,1.50,0.002909
1900,0,0,0,0,0.00,0.000000


In [9]:
df_cleaned

Unnamed: 0,name,likes,rating,review,given_reviews,pictures,local_guide,relevance_score,relevance_score_normalized
0,22fortinero,0,1.0,malisima. malisimo..la peor.\nvuando fuimos pe...,0,0,0,0.00,0.000000
1,23russellv,0,2.0,one of the worst we have flown. we have flown ...,0,0,0,0.00,0.000000
2,4family,0,1.0,terrible service. flight was first delayed 25 ...,0,0,0,0.00,0.000000
3,5travellers602013,0,1.0,rubbish low cost airline. bought 6 tickets via...,0,0,0,0.00,0.000000
4,885David_R885,0,1.0,not refunding cancelled flights. not refunding...,0,0,0,0.00,0.000000
...,...,...,...,...,...,...,...,...,...
1897,Валерия Шульга,3,1.0,. el vuelo fue reprogramado sin informarnos al...,0,0,0,0.90,0.001745
1898,Вика Мегалис,2,1.0,. me gustaría decirles a todos: nunca compren ...,3,0,0,2.10,0.004072
1899,Дарья Венедиктова,5,1.0,". esta empresa ya hace un año que no viene, fu...",0,0,0,1.50,0.002909
1900,יסמין י,0,1.0,dont buy here!!!! scam. it's completely ridicu...,0,0,0,0.00,0.000000


In [10]:
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 0

def detect_language(text):
    try:
        return detect(text)  # Returns a language code (e.g., 'en', 'es')
    except LangDetectException:
        return 'unknown'  # Handle cases where language detection fails

df_cleaned['language'] = df_cleaned['review'].apply(detect_language)

# Display the DataFrame with the new 'language' column
df_cleaned[['review', 'language']]


Unnamed: 0,review,language
0,malisima. malisimo..la peor.\nvuando fuimos pe...,es
1,one of the worst we have flown. we have flown ...,en
2,terrible service. flight was first delayed 25 ...,en
3,rubbish low cost airline. bought 6 tickets via...,en
4,not refunding cancelled flights. not refunding...,en
...,...,...
1897,. el vuelo fue reprogramado sin informarnos al...,es
1898,. me gustaría decirles a todos: nunca compren ...,es
1899,". esta empresa ya hace un año que no viene, fu...",es
1900,dont buy here!!!! scam. it's completely ridicu...,en


In [11]:
from googletrans import Translator
from googletrans import LANGUAGES

translator = Translator()

def translate_to_spanish(text, src_lang):
    try:
        translation = translator.translate(text, src=src_lang, dest='es')  # 'es' for Spanish
        return translation.text
    except Exception as e:
        print("could not translate: ", text)
        print(f"Error translating: {e}")
        return text

def translate_non_spanish(text, lang):
    if lang != 'es' and lang != 'unknown':
        return translate_to_spanish(text, lang)
    return text

df_cleaned['review_translated'] = df_cleaned.apply(
    lambda row: translate_non_spanish(row['review'], row['language']),
    axis=1
)

df_cleaned[['review', 'language', 'review_translated']]

# Optionally, save the updated DataFrame
output_file_translated_reviews = '../data/cleaned_with_translated_non_es_reviews.csv'
df_cleaned.to_csv(output_file_translated_reviews, index=False)


In [12]:
from nltk.corpus import stopwords
import string
import re

flybondi_data = '../data/cleaned_with_translated_non_es_reviews.csv'
df_cleaned = pd.read_csv(flybondi_data)

spanish_stopwords = set(stopwords.words('spanish'))
punctuation = string.punctuation + '¡'

def remove_emojis(text):
    # Regular expression pattern to match emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"  # other symbols
        "\U000024C2-\U0001F251"  # enclosed characters
        "]+", flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove emojis
    text = remove_emojis(text)
    # Remove punctuation by translating all punctuation characters to None
    text = text.translate(str.maketrans('', '', punctuation))
    # Remove Spanish stopwords by iterating through the words
    for stopword in spanish_stopwords:
        text = text.replace(f" {stopword} ", " ")  # Replace only whole words
    return text

df_cleaned['review_processed'] = df_cleaned['review_translated'].apply(preprocess_text)

df_cleaned[['review_translated', 'review_processed']]
# remove review_translated column
df_cleaned = df_cleaned.drop(columns=['review_translated', 'language', 'relevance_score', 'review', 'given_reviews', 'pictures', 'local_guide', 'likes'])
#rename review_processed to review
df_cleaned = df_cleaned.rename(columns={'review_processed': 'review'})
df_cleaned = df_cleaned.rename(columns={'relevance_score_normalized': 'relevance_score'})
df_cleaned

Unnamed: 0,name,rating,relevance_score,review
0,22fortinero,1.0,0.000000,malisima malisimola peor\nvuando perdi dia hot...
1,23russellv,2.0,0.000000,uno peores voladohemos volado cientos aerolíne...
2,4family,1.0,0.000000,terrible servicioel vuelo retrasó primera vez ...
3,5travellers602013,1.0,0.000000,basco aerolínea bajo costocompré 6 boletos tra...
4,885David_R885,1.0,0.000000,no reembolsar vuelos canceladosno reembolsar v...
...,...,...,...,...
1897,Валерия Шульга,1.0,0.001745,vuelo reprogramado informarnos respecto pagam...
1898,Вика Мегалис,1.0,0.004072,gustaría decirles nunca compren mosca flybond...
1899,Дарья Венедиктова,1.0,0.002909,empresa hace año viene pérdida tiempo venir
1900,יסמין י,1.0,0.000000,estafaes completamente ridículo vuelo retrasó ...


# Lemmatizador
Ahora que tenemos un dataset con todas las reviews limpias y traducidas procedemos a lemmatizar el texto.

In [13]:
import stanza

df = df_cleaned

stanza.download('es')
nlp = stanza.Pipeline('es')

def lemmatize_spanish(text):
    doc = nlp(text)

    return ' '.join([word.lemma for sent in doc.sentences for word in sent.words])

# Apply the lemmatization function only to Spanish reviews
df['review'] = df.apply(
    lambda row: lemmatize_spanish(row['review']),
    axis=1
)

# Optionally, save the updated DataFrame with lemmatized reviews
output_file_lemmatized_reviews = '../data/cleaned_with_lemmatized_reviews.csv'
df.to_csv(output_file_lemmatized_reviews, index=False)


  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 51.9MB/s]                    
2024-10-18 21:08:58 INFO: Downloaded file to /home/joaquin/stanza_resources/resources.json
2024-10-18 21:08:58 INFO: Downloading default packages for language: es (Spanish) ...
2024-10-18 21:09:00 INFO: File exists: /home/joaquin/stanza_resources/es/default.zip
2024-10-18 21:09:06 INFO: Finished downloading models and saved to /home/joaquin/stanza_resources
2024-10-18 21:09:06 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 42.5MB/s]                    
2024-10-18 21:09:06 INFO: Downloaded file to /home/joaquin/stanza_resources/resource