In [11]:
import pandas as pd
ruta = '../data/raw/housing.csv' 
df_original = pd.read_csv(ruta)
df_trabajo = df_original.copy()


In [12]:
mediana = df_trabajo['total_bedrooms'].median()
df_trabajo['total_bedrooms'].fillna(mediana, inplace=True) 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_trabajo['total_bedrooms'].fillna(mediana, inplace=True)


In [13]:
print(f"Nulos restantes en total_bedrooms: {df_trabajo['total_bedrooms'].isnull().sum()}")

Nulos restantes en total_bedrooms: 0


In [14]:
df_trabajo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


aplicar one-hot-encoding y agregar al dataset de trabajo

In [15]:
dummies = pd.get_dummies(df_trabajo['ocean_proximity'], prefix='ocean', dtype=int)
df_trabajo = pd.concat([df_trabajo, dummies], axis=1)

In [16]:
df_trabajo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
 10  ocean_<1H OCEAN     20640 non-null  int64  
 11  ocean_INLAND        20640 non-null  int64  
 12  ocean_ISLAND        20640 non-null  int64  
 13  ocean_NEAR BAY      20640 non-null  int64  
 14  ocean_NEAR OCEAN    20640 non-null  int64  
dtypes: float64(9), int64(5), object(1)
memory usage: 2.4+

Eliminar la columna antigua y dejar las codificadas 

In [17]:
df_trabajo = df_trabajo.drop('ocean_proximity', axis=1)

In [18]:
df_trabajo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_<1H OCEAN     20640 non-null  int64  
 10  ocean_INLAND        20640 non-null  int64  
 11  ocean_ISLAND        20640 non-null  int64  
 12  ocean_NEAR BAY      20640 non-null  int64  
 13  ocean_NEAR OCEAN    20640 non-null  int64  
dtypes: float64(9), int64(5)
memory usage: 2.2 MB


In [20]:
df_trabajo.filter(like='ocean_').head()

Unnamed: 0,ocean_<1H OCEAN,ocean_INLAND,ocean_ISLAND,ocean_NEAR BAY,ocean_NEAR OCEAN
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,1,0


Guardamos un dataset con los datos limpios para utilizarlo

In [21]:
output_path ='../data/raw/housing_limpio.csv'
df_trabajo.to_csv(output_path, index=False)