In [11]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [12]:
# Load the raw development dataset (path is relative to the notebook's folder).
raw_csv_path = r"../data/raw/casas_dev.csv"
df = pd.read_csv(raw_csv_path)

# Preview a reproducible random sample of five rows.
df.sample(n=5, random_state=42)


Unnamed: 0,area,area_units,is_house,has_pool,age,price,lat,lon,rooms
70,872.0,sqft,0,0,11.0,630.0,40.718433,-74.008699,2.0
827,1076.0,sqft,0,0,7.0,772.0,40.707037,-73.998823,3.0
231,137.0,m2,0,0,10.0,214.0,-34.550223,-58.391268,4.0
588,775.0,sqft,1,1,10.0,751.0,40.726223,-74.043929,2.0
39,154.0,m2,1,0,16.0,273.0,-34.563525,-58.347365,4.0


In [13]:
# Sample 5 rows that have at least one NaN value in any feature.
row_has_nan = df.isna().any(axis=1)
nan_rows = df.loc[row_has_nan].sample(n=5, random_state=42)
# Keep the index labels of the sampled rows.
nan_rows_index = nan_rows.index
nan_rows, nan_rows_index

(      area area_units  is_house  has_pool  age  price        lat        lon  \
 874   98.0         m2         0         0  NaN  232.0 -34.591521 -58.389672   
 784  119.0         m2         1         1  NaN  226.0 -34.650660 -58.376238   
 122  102.0         m2         0         0  7.0  193.0 -34.615997 -58.372730   
 107   93.0         m2         1         0  7.0  184.0 -34.674335 -58.391795   
 343  431.0       sqft         0         0  2.0  324.0  40.703436 -74.002579   
 
      rooms  
 874    3.0  
 784    4.0  
 122    NaN  
 107    NaN  
 343    NaN  ,
 Index([874, 784, 122, 107, 343], dtype='int64'))

In [14]:
# Convert areas expressed in sqft to m2 (factor used here: 1 sqft = 0.092903 m2),
# then relabel the unit of the converted rows.
is_sqft = df['area_units'] == 'sqft'
df.loc[is_sqft, 'area'] = df.loc[is_sqft, 'area'] * 0.092903
df.loc[is_sqft, 'area_units'] = 'm2'

df.sample(5, random_state=42), df.shape

(           area area_units  is_house  has_pool   age  price        lat  \
 70    81.011416         m2         0         0  11.0  630.0  40.718433   
 827   99.963628         m2         0         0   7.0  772.0  40.707037   
 231  137.000000         m2         0         0  10.0  214.0 -34.550223   
 588   71.999825         m2         1         1  10.0  751.0  40.726223   
 39   154.000000         m2         1         0  16.0  273.0 -34.563525   
 
            lon  rooms  
 70  -74.008699    2.0  
 827 -73.998823    3.0  
 231 -58.391268    4.0  
 588 -74.043929    2.0  
 39  -58.347365    4.0  ,
 (900, 9))

In [15]:
# Identify the columns that contain at least one NaN, so that rows can then be
# grouped according to which of those columns they are missing.

nan_features = [column for column, n_missing in df.isnull().sum().items() if n_missing > 0]


nan_features



['age', 'rooms']

In [16]:

# Boolean masks for the two columns that contain NaNs.
age_is_nan = df['age'].isnull()
rooms_is_nan = df['rooms'].isnull()

# Split the rows into four disjoint groups by their missing-value pattern.
has_nan_age = df[age_is_nan & ~rooms_is_nan]            # only age missing
has_nan_rooms = df[rooms_is_nan & ~age_is_nan]          # only rooms missing
has_nan_age_and_rooms = df[age_is_nan & rooms_is_nan]   # both missing
has_all_features = df[~age_is_nan & ~rooms_is_nan]      # neither missing

has_nan_age.shape, has_nan_rooms.shape, has_nan_age_and_rooms.shape, has_all_features.shape


((117, 9), (59, 9), (13, 9), (711, 9))

In [17]:
from models import LinearRegressor

#TODO: Investigate why the model only predicts an age of 10.
#TODO: Round rooms and age to the nearest integer.

def train_model_for_feature(X, y, test_size=0.2):
    """Create a ``LinearRegressor``, split ``(X, y)`` and fit it on the train part.

    Parameters
    ----------
    X
        Feature data, handed unchanged to ``LinearRegressor.split_data``.
    y
        Target data, handed unchanged to ``LinearRegressor.split_data``.
    test_size : float, default 0.2
        Forwarded as the ``test_size`` argument of ``split_data``. The default
        keeps the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(model, X_test, y_test)``: the model after ``fit`` was called on the
        train split, plus the test split returned by ``split_data`` so the
        caller can score the model.
    """
    # create the model
    model = LinearRegressor()

    # split the data
    X_train, X_test, y_train, y_test = model.split_data(X, y, test_size=test_size)

    # train the model on the train portion only
    model.fit(X_train, y_train)

    return model, X_test, y_test



# Model for rows missing only `age`: regress age on rooms.
age_model, X_age_test, y_age_test = train_model_for_feature(
    X=has_all_features[['rooms']],
    y=has_all_features['age'],
)

# Model for rows missing only `rooms`: regress rooms on age.
rooms_model, X_rooms_test, y_rooms_test = train_model_for_feature(
    X=has_all_features[['age']],
    y=has_all_features['rooms'],
)

# Model for rows missing both: regress (age, rooms) jointly on area.
age_and_room_model, X_multi_test, y_multi_test = train_model_for_feature(
    X=has_all_features[['area']],
    y=has_all_features[['age', 'rooms']],
)




In [18]:


# Score each model on the test split returned when it was trained.
# NOTE(review): only the age score passes round=True, while the imputation step
# rounds the predictions of all three models — confirm this is intended.
age_mse = age_model.mse_score(X_age_test, y_age_test, round=True)
rooms_mse = rooms_model.mse_score(X_rooms_test, y_rooms_test)
multi_mse = age_and_room_model.mse_score(X_multi_test, y_multi_test)

mse_report = (
    ("Mean Squared Error for Age Model:", age_mse),
    ("Mean Squared Error for Rooms Model:", rooms_mse),
    ("Mean Squared Error for Multi Model:", multi_mse),
)
for message, score in mse_report:
    print(message, score)

Mean Squared Error for Age Model: 17.718309859154928
Mean Squared Error for Rooms Model: 0.8827650263854401
Mean Squared Error for Multi Model: 8.993013181356662


In [19]:
# Predict the missing values and round them to the nearest integer.
#
# The masks are computed once, BEFORE anything is written to `df`, so that each
# row is routed to exactly one model and no model is asked to predict from a
# NaN input. (Recomputing the masks after every write only works if `predict`
# happens to propagate NaN for the rows that miss both columns.)
age_missing = df['age'].isnull()
rooms_missing = df['rooms'].isnull()

only_age_missing = age_missing & ~rooms_missing
only_rooms_missing = rooms_missing & ~age_missing
both_missing = age_missing & rooms_missing

# The `.any()` guards skip prediction on an empty selection, which keeps the
# cell safe to re-run once nothing is left to impute.
if only_age_missing.any():
    # age is predicted from the observed number of rooms
    df.loc[only_age_missing, 'age'] = np.round(
        age_model.predict(df.loc[only_age_missing, ['rooms']])
    )

if only_rooms_missing.any():
    # rooms is predicted from the observed age
    df.loc[only_rooms_missing, 'rooms'] = np.round(
        rooms_model.predict(df.loc[only_rooms_missing, ['age']])
    )

if both_missing.any():
    # neither column is available, so both are predicted from the area
    df.loc[both_missing, ['age', 'rooms']] = np.round(
        age_and_room_model.predict(df.loc[both_missing, ['area']])
    )

missing_values = df.isnull().sum()

if missing_values.sum() > 0:
    print("There are still missing values in the DataFrame:", missing_values)
else:
    print("All missing values have been successfully filled.")

# write the imputed DataFrame to the processed-data folder as CSV
df.to_csv(r"../data/processed/casas_dev.csv", index=False)

All missing values have been successfully filled.
