In [17]:
import pandas as pd
import os

In [18]:
# Carga de archivo en cualquier formato
def cargar_dataset(archivo):
    """Carga archivos en formato CSV, XLSX, HTML o JSON a un DataFrame."""
    extension = os.path.splitext(archivo)[1].lower()
    if extension == '.csv':
        return pd.read_csv(archivo)
    elif extension == '.xlsx':
        return pd.read_excel(archivo)
    elif extension == '.json':
        return pd.read_json(archivo)
    elif extension == '.html':
        return pd.read_html(archivo)[0]  # Obtiene la primera tabla de la lista
    else:
        raise ValueError(f"Formato de archivo no soportado: {extension}")

In [19]:
# Archivo CSV de mi ciudad
sicilia_data= pd.read_csv("Sicily.csv").head()
pd.options.display.max_rows = 200
sicilia_data

Unnamed: 0.1,Unnamed: 0,name,description,neighborhood_overview,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,...,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_communication.1,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,reviews_per_month
0,0,SEASIDE - Historic Antique Residence,WINEYARD WITH COLOURS OF THE SEASIDE.<br /><br...,,Yumi Adriano,13/05/09,"Sicily, Italy","Hi, this is Yumi, I am a Japanese, I speak Ja...",,,...,4.69,4.66,4.86,4.83,4.72,4.72,4.59,10,10,0.18
1,1,BnB center near station with balcony and break...,Near the Central Station and the beginning of ...,"The neighborhood is in a residential area, cen...",Giuseppe,05/03/10,"Agrigento, Italy","Di carattere tranquillo e dinamico, sposato, c...",within an hour,100%,...,4.8,4.87,4.89,4.85,4.81,4.81,4.85,3,0,2.48
2,2,Etna-sea a panoramic villa,"Our country house, swathed in the greenery of ...","Contact with nature, relaxation, remarkable pa...",Serena,24/06/10,"Piedimonte Etneo, Italy",.,within a few hours,100%,...,4.79,4.82,4.83,4.89,4.81,4.81,4.76,5,5,0.9
3,3,Two steps from the sea,Very nice flat in front of the beach with a wo...,,Gaspare,05/08/10,"Trapani, Italy",Managing parents' houses in Trapani and in San...,within an hour,100%,...,4.68,4.78,4.57,4.97,4.94,4.94,4.64,2,2,0.49
4,4,Villa Verdemare,Wonderful villa that can easily accommodate up...,,Gaspare,05/08/10,"Trapani, Italy",Managing parents' houses in Trapani and in San...,within an hour,100%,...,4.85,4.82,4.69,4.99,4.92,4.92,4.72,2,2,0.46


In [20]:
sicilia_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 51 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Unnamed: 0                                   5 non-null      int64  
 1   name                                         5 non-null      object 
 2   description                                  5 non-null      object 
 3   neighborhood_overview                        2 non-null      object 
 4   host_name                                    5 non-null      object 
 5   host_since                                   5 non-null      object 
 6   host_location                                5 non-null      object 
 7   host_about                                   5 non-null      object 
 8   host_response_time                           4 non-null      object 
 9   host_response_rate                           4 non-null      object 
 10  host_a

In [21]:
dataframe = sicilia_data

In [22]:
def cuenta_valores_nulos(dataframe):
    # valores nulos por columna
    valores_nulos_cols = dataframe.isnull().sum()
    # valores nulos por dataframe
    valores_nulos_df = dataframe.isnull().sum().sum()

    return("Valores nulos por columna", valores_nulos_cols,
           "Valores nulos por dataframe", valores_nulos_df)

In [23]:
# LLamamos a la función "cuenta_valores_nulos" y guardamos el retorno
cuenta_valores_nulos(sicilia_data)

('Valores nulos por columna',
 Unnamed: 0                                     0
 name                                           0
 description                                    0
 neighborhood_overview                          3
 host_name                                      0
 host_since                                     0
 host_location                                  0
 host_about                                     0
 host_response_time                             1
 host_response_rate                             1
 host_acceptance_rate                           0
 host_is_superhost                              0
 host_listings_count                            0
 host_total_listings_count                      0
 host_has_profile_pic                           0
 host_identity_verified                         0
 neighbourhood                                  3
 latitude                                       0
 longitude                                      0
 property_type      

In [24]:
def sustitucion_promedio(dataframe):
    cuantitativas_con_nulos = dataframe.select_dtypes(include=['float64', 'int64', 'int'])
    cualitativas = dataframe.select_dtypes(include=['object', 'datetime', 'category'])
    cuantitativas = cuantitativas_con_nulos.fillna(round(cuantitativas_con_nulos.mean(), 1))
    datos_sin_nulos = pd.concat([cuantitativas, cualitativas], axis=1)
    return datos_sin_nulos

In [25]:
def sustitucion_mediana(dataframe):
    cuantitativas_con_nulos = dataframe.select_dtypes(include=['float64', 'int64', 'int'])
    cualitativas = dataframe.select_dtypes(include=['object', 'datetime', 'category'])
    cuantitativas = cuantitativas_con_nulos.fillna(cuantitativas_con_nulos.median())
    datos_sin_nulos = pd.concat([cuantitativas, cualitativas], axis=1)
    return datos_sin_nulos

In [26]:
def sustitucion_ffill(dataframe):
    return dataframe.fillna(method='ffill')

In [27]:
def sustitucion_bfill(dataframe):
    return dataframe.fillna(method='bfill')

In [28]:
def sustitucion_string(dataframe, value):
    return dataframe.fillna(value)

In [29]:
def sustitucion_constante(dataframe, constant):
    return dataframe.fillna(constant)

In [32]:
def identify_nulls(df):
    """Identifica valores nulos por columna y en todo el dataframe."""
    nulls_by_column = df.isnull().sum()
    total_nulls = df.isnull().sum().sum()
    return {'nulos_por_columna': nulls_by_column, 'total_nulos': total_nulls}

In [33]:
cuenta_valores_nulos(sicilia_data)

('Valores nulos por columna',
 Unnamed: 0                                     0
 name                                           0
 description                                    0
 neighborhood_overview                          3
 host_name                                      0
 host_since                                     0
 host_location                                  0
 host_about                                     0
 host_response_time                             1
 host_response_rate                             1
 host_acceptance_rate                           0
 host_is_superhost                              0
 host_listings_count                            0
 host_total_listings_count                      0
 host_has_profile_pic                           0
 host_identity_verified                         0
 neighbourhood                                  3
 latitude                                       0
 longitude                                      0
 property_type      