In [4]:
import numpy as np
import pandas as pd

In [37]:
df = pd.read_csv("dataset_1.csv", index_col=0)

In [38]:
def remove_negative_values(df, column):
    df[column]=df[column].apply(lambda x: np.nan if x <0 else x)
    return df

In [39]:
def remove_outlines_with_zscore(df, column, threshold =2):
    column_mean= df[column].mean()
    column_std= df[column].std()
    df[column] = df[column].mask(((df[column]-column_mean) / column_std).abs() > threshold)
    return df

In [40]:
def map_column_values(df, column, mapping_dict):
    df[column] = df[column].apply(lambda value: mapping_dict.get(value, value))
    return df

In [41]:
def fill_na_in_column(df, column, fill_value):
    df[column].fillna(fill_value, inplace=True)
    return df

In [42]:
def preprocess_data(df):
    education_mapping={
        "Bachelors": "Bachelor",
        "mastre": "Master",
        "pHd": "PhD",
        "no education": "None"
    }
    gender_mapping = {
        "m": "M",
        "f": "F"
    }
    return(df.pipe(remove_negative_values, "Edad")
           .pipe(remove_negative_values, "Ingresos")
           .pipe(remove_negative_values, "Hijos")
           .pipe(remove_outlines_with_zscore, "Edad")
           .pipe(remove_outlines_with_zscore, "Ingresos")
           .pipe(remove_outlines_with_zscore, "Altura")
           .pipe(remove_outlines_with_zscore, "Hijos")
           .pipe(map_column_values, "Nivel_Educación", education_mapping)
           .pipe(map_column_values, "Género", gender_mapping)
           .pipe(fill_na_in_column, "Ciudad", "Desconocido")
           .pipe(fill_na_in_column, "Nivel_Educación", "Desconocido")
           .pipe(fill_na_in_column, "Género", "Desconocido")
           .pipe(fill_na_in_column, "Edad", df["Edad"].median())
           .pipe(fill_na_in_column, "Hijos", df["Hijos"].median())
           .pipe(fill_na_in_column, "Ingresos", df["Ingresos"].mean())
           .pipe(fill_na_in_column, "Altura", df["Altura"].mean())
           )


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Edad             100000 non-null  int64  
 1   Género           90000 non-null   object 
 2   Ingresos         100000 non-null  int64  
 3   Altura           100000 non-null  float64
 4   Ciudad           90000 non-null   object 
 5   Nivel_Educación  77563 non-null   object 
 6   Hijos            100000 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 6.1+ MB


In [44]:
df = preprocess_data(df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(fill_value, inplace=True)


In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Edad             100000 non-null  float64
 1   Género           100000 non-null  object 
 2   Ingresos         100000 non-null  float64
 3   Altura           100000 non-null  float64
 4   Ciudad           100000 non-null  object 
 5   Nivel_Educación  100000 non-null  object 
 6   Hijos            100000 non-null  float64
dtypes: float64(4), object(3)
memory usage: 6.1+ MB
