Cargamos el archivo de reviews scrapeadas.

In [1]:
import pandas as pd

flybondi_data = '../data/final_combined_reviews.csv'
df = pd.read_csv(flybondi_data)

FileNotFoundError: [Errno 2] No such file or directory: '../data/final_combined_reviews.csv'

## Empezamos a limpiar.

1. Remover columnas inútiles y filas duplicadas.

In [2]:
# remove the unnamed columns which are trash
df_cleaned = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# remove duplicates
df_cleaned = df_cleaned.drop_duplicates()

Limpiamos ratings para que tenga unico formato (int), pasamos nombres a lower para id.

In [None]:
# Extraer solo los números de la columna 'rating' y convertir a float
df_cleaned['rating_cleaned'] = df_cleaned['rating'].astype(str).str.extract(r'(\d+\.?\d*)').astype(float)

df_cleaned['rating_cleaned'] = df_cleaned['rating_cleaned'].fillna(1).astype(int)

# Eliminar la columna original 'rating' y renombrar la nueva columna a 'rating'
df_cleaned = df_cleaned.drop(columns=['rating'])
df_cleaned = df_cleaned.rename(columns={'rating_cleaned': 'rating'})

# Convertir a minúsculas la columna 'name'
df_cleaned['name'] = df_cleaned['name'].str.lower()

# Mostrar las filas a partir de la 1300
df_cleaned.iloc[1300:]


Concatenamos titulos con reviews.

In [None]:
# Concatenar solo si 'review_title' no está vacío
df_cleaned['review'] = df_cleaned['review_title'].fillna('') + \
                       df_cleaned['review_title'].apply(lambda x: '. ' if pd.notna(x) and x != '' else '') + \
                       df_cleaned['review_text'].fillna('')

# Eliminar las columnas 'review_title' y 'review_text'
df_cleaned = df_cleaned.drop(columns=['review_title', 'review_text'])

# Convertir todo a minúsculas
df_cleaned['review'] = df_cleaned['review'].str.lower()

# Mostrar las últimas filas
df_cleaned.tail()


Removemos reviews que tengan nombre repetido dejando el review más largo ya que esto fue error de scrapping.

In [None]:
import pandas as pd

# Función para obtener la fila con la reseña más larga dentro de cada grupo
def longest_review(group):
    # Asegurarte de que 'name' esté en el resultado
    return group.loc[group['review'].str.len().idxmax()]
# Aplicar la función longest_review al grupo, sin que 'name' desaparezca
df_cleaned = df_cleaned.groupby('name', group_keys=False).apply(longest_review)

# Restablecer el índice para tener el DataFrame limpio
df_cleaned.reset_index(drop=True, inplace=True)

# Verificar que 'name' esté presente
df_cleaned.shape


Formateamos "likes" para que sea int y los que sean Nan sean 0.

In [None]:
df_cleaned['likes'] = df_cleaned['likes'].fillna(0)
df_cleaned['likes'] = df_cleaned['likes'].astype(int)

df_cleaned

Creamos una función que transforma la experience y los likes columnas para transformar después en un único valor.

In [None]:
import re

def parse_experience(experience):
    resenas = 0
    fotos = 0
    local_guide = 0

    if pd.isna(experience):
        return resenas, fotos, local_guide

    if 'Local Guide' in experience:
        local_guide = 1

    resenas_match = re.search(r'(\d+[\.,]?\d*) (reseñas|opinión|opiniones)', experience)
    if resenas_match:
        resenas = int(resenas_match.group(1).replace('.', '').replace(',', '.'))

    fotos_match = re.search(r'(\d+[\.,]?\d*) fotos', experience)
    if fotos_match:
        fotos = int(fotos_match.group(1).replace('.', '').replace(',', '.'))

    return resenas, fotos, local_guide

df_cleaned[['given_reviews', 'pictures', 'local_guide']] = df_cleaned['experience'].apply(
    lambda x: pd.Series(parse_experience(x))
)

df_cleaned[['given_reviews', 'pictures', 'local_guide']]
df_cleaned = df_cleaned.drop(columns=['experience'])
df_cleaned


In [None]:
def calculate_relevance(row, W_l=0.3, W_r=0.5, W_p=0.005, W_lg=0.5):
    relevance = (
        W_l * row['likes'] +
        W_r * row['given_reviews'] +
        W_p * row['pictures'] +
        W_lg * row['local_guide']
    )
    return relevance

df_cleaned['relevance_score'] = df_cleaned.apply(calculate_relevance, axis=1)
df_cleaned['relevance_score_normalized'] = (df_cleaned['relevance_score'] - df_cleaned['relevance_score'].min()) / (df_cleaned['relevance_score'].max() - df_cleaned['relevance_score'].min())

df_cleaned

In [None]:
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 0

def detect_language(text):
    try:
        return detect(text)  # Returns a language code (e.g., 'en', 'es')
    except LangDetectException:
        return 'unknown'  # Handle cases where language detection fails

df_cleaned['language'] = df_cleaned['review'].apply(detect_language)

# Display the DataFrame with the new 'language' column
df_cleaned[['review', 'language']]

output_file_translated_reviews = '../data/cleaning_pipeline/before_translated.csv'
df_cleaned.to_csv(output_file_translated_reviews, index=False)

Vemos que porcentajes de reviews hay escritos en cada idiomas.

In [None]:
from googletrans import Translator
from googletrans import LANGUAGES

translator = Translator()

def translate_to_spanish(text, src_lang):
    try:
        translation = translator.translate(text, src=src_lang, dest='en')
        return translation.text
    except Exception as e:
        print("could not translate: ", text)
        print(f"Error translating: {e}")
        return text

def translate_non_spanish(text, lang):
    if lang != 'en' and lang != 'unknown':
        return translate_to_spanish(text, lang)
    return text

df_cleaned['review_translated'] = df_cleaned.apply(
    lambda row: translate_non_spanish(row['review'], row['language']),
    axis=1
)

df_cleaned[['review', 'language', 'review_translated']]

# Optionally, save the updated DataFrame
output_file_translated_reviews = '../data/cleaning_pipeline/after_translated.csv'
df_cleaned.to_csv(output_file_translated_reviews, index=False)


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Simular el DataFrame con datos de ejemplo
# df_cleaned = pd.read_csv('your_file.csv')  # Si ya tienes el archivo cargado

# Contar la ocurrencia de cada idioma
language_counts = df_cleaned['source_language'].value_counts()

# Separar los 4 idiomas más comunes y agrupar el resto en "Others"
top_languages = language_counts[:3]  # Top 4 languages
others_count = language_counts[3:].sum()  # Suma del resto de los idiomas
others_series = pd.Series([others_count], index=['Others'])

# Usar pd.concat en lugar de append
language_counts_modified = pd.concat([top_languages, others_series])

# Crear el gráfico de torta
plt.figure(figsize=(10, 6))
plt.pie(language_counts_modified, labels=language_counts_modified.index, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of Source Languages')
plt.axis('equal')  # Mantener aspecto igual para que la torta sea circular
plt.show()


## Checkpoint 1 

desde acá se puede ejecutar sin ejecutar lo previo si se cuenta con un archivo traducido en cleaning_pipeline. Es posible que sea necesario volver a ejecutar el import de algunas librerías.

In [2]:
from nltk.corpus import stopwords
import nltk
import string
import re
import pandas as pd



# Cargar los datos
flybondi_data = '../../data/cleaning_pipeline/after_translated.csv'
df_cleaned = pd.read_csv(flybondi_data)

# Filtrar las filas donde 'review_translated' es nulo o está vacío
df_cleaned = df_cleaned[~(df_cleaned['review_translated'].isna() | (df_cleaned['review_translated'].astype(str).str.strip() == ''))]

# Cambiar a stopwords en inglés
nltk.download('stopwords')
english_stopwords = set(stopwords.words('english'))

# Definir la puntuación, pero en vez de eliminarla, la reemplazamos con un espacio
punctuation = string.punctuation + '¡'

def remove_emojis(text):
    # Expresión regular para detectar emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticones
        "\U0001F300-\U0001F5FF"  # símbolos y pictogramas
        "\U0001F680-\U0001F6FF"  # símbolos de transporte y mapas
        "\U0001F1E0-\U0001F1FF"  # banderas
        "\U00002702-\U000027B0"  # otros símbolos
        "\U000024C2-\U0001F251"  # caracteres cerrados
        "]+", flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

def preprocess_text(text):
    # Asegurar que el texto no sea nulo o NaN
    if pd.isna(text):
        return ''

    # Convertir a minúsculas
    text = text.lower()

    # Eliminar emojis
    text = remove_emojis(text)

    # Reemplazar puntuación con espacios en lugar de simplemente eliminarla
    text = re.sub(f"[{re.escape(punctuation)}]", " ", text)

    # Eliminar stopwords en inglés
    text_words = text.split()
    text = ' '.join([word for word in text_words if word not in english_stopwords])

    return text

# Convertir todos los valores en 'review_translated' a string, y manejar NaN
df_cleaned['review_translated'] = df_cleaned['review_translated'].astype(str)

# Aplicar el preprocesamiento a la columna 'review_translated'
df_cleaned['review_processed'] = df_cleaned['review_translated'].apply(preprocess_text)

# Mostrar las columnas procesadas
df_cleaned[['review_translated', 'review_processed']]

# Eliminar la columna 'review_translated'
df_cleaned = df_cleaned.rename(columns={'language': 'source_language'})
df_cleaned = df_cleaned.drop(columns=['review_translated', 'relevance_score', 'review', 'given_reviews', 'pictures', 'local_guide', 'likes'])

# Renombrar 'review_processed' a 'review'
df_cleaned = df_cleaned.rename(columns={'review_processed': 'review'})
df_cleaned = df_cleaned.rename(columns={'relevance_score_normalized': 'relevance_score'})

df_cleaned


[nltk_data] Downloading package stopwords to /home/jgirod/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,name,rating,relevance_score,source_language,review
0,22fortinero,1,0.000000,es,terrible terrible worst vuando lost hotel day ...
1,23russellv,2,0.000000,en,one worst flown flown hundreds airlines possib...
2,4family,1,0.000000,en,terrible service flight first delayed 25 min s...
3,5travellers602013,1,0.000000,en,rubbish low cost airline bought 6 tickets via ...
4,885david_r885,1,0.000000,en,refunding cancelled flights refunding canceled...
...,...,...,...,...,...
1888,валерия шульга,1,0.001745,es,flight reprogrammed without informing us paid ...
1889,вика мегалис,1,0.004072,es,would like tell everyone never buy flybondi fl...
1890,дарья венедиктова,1,0.002909,es,company come year ago waste time coming
1891,יסמין י,1,0.000000,en,dont buy scam completely ridiculous flight del...


# Lemmatizador
Ahora que tenemos un dataset con todas las reviews limpias y traducidas procedemos a lemmatizar el texto.

In [3]:
import stanza

df = df_cleaned

# Descargar el modelo de inglés
stanza.download('en')
nlp = stanza.Pipeline('en')

def lemmatize_english(text):
    doc = nlp(text)
    return ' '.join([word.lemma for sent in doc.sentences for word in sent.words])


# Aplicar la función de lematización a las reviews en inglés
df['review'] = df['review'].apply(lemmatize_english)

# Guardar el DataFrame actualizado con las reviews lematizadas
output_file_lemmatized_reviews = '../../data/en_cleaned_with_lemmatized_reviews.csv'
df.to_csv(output_file_lemmatized_reviews, index=False)


  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 45.4MB/s]                    
2024-10-20 20:51:41 INFO: Downloaded file to /home/jgirod/stanza_resources/resources.json
2024-10-20 20:51:41 INFO: Downloading default packages for language: en (English) ...
2024-10-20 20:51:42 INFO: File exists: /home/jgirod/stanza_resources/en/default.zip
2024-10-20 20:51:45 INFO: Finished downloading models and saved to /home/jgirod/stanza_resources
2024-10-20 20:51:45 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 30.7MB/s]                    
2024-10-20 20:51:45 INFO: Downloaded file to /home/jgirod/stanza_resources/resources.js

In [1]:
print(df)

NameError: name 'df' is not defined