In [103]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [104]:
# Load the raw casas_dev dataset and preview 5 reproducibly sampled rows.
RAW_CSV_PATH = r"../data/raw/casas_dev.csv"
df = pd.read_csv(RAW_CSV_PATH)

df.sample(n=5, random_state=42)


Unnamed: 0,area,area_units,is_house,has_pool,age,price,lat,lon,rooms
70,872.0,sqft,0,0,11.0,630.0,40.718433,-74.008699,2.0
827,1076.0,sqft,0,0,7.0,772.0,40.707037,-73.998823,3.0
231,137.0,m2,0,0,10.0,214.0,-34.550223,-58.391268,4.0
588,775.0,sqft,1,1,10.0,751.0,40.726223,-74.043929,2.0
39,154.0,m2,1,0,16.0,273.0,-34.563525,-58.347365,4.0


In [105]:
# Preview 5 reproducibly sampled rows that have at least one NaN feature.
row_has_nan = df.isna().any(axis=1)
nan_rows = df.loc[row_has_nan].sample(n=5, random_state=42)

nan_rows


Unnamed: 0,area,area_units,is_house,has_pool,age,price,lat,lon,rooms
874,98.0,m2,0,0,,232.0,-34.591521,-58.389672,3.0
784,119.0,m2,1,1,,226.0,-34.65066,-58.376238,4.0
122,102.0,m2,0,0,7.0,193.0,-34.615997,-58.37273,
107,93.0,m2,1,0,7.0,184.0,-34.674335,-58.391795,
343,431.0,sqft,0,0,2.0,324.0,40.703436,-74.002579,


In [106]:
# Express every area in square metres so the column uses a single unit.
SQFT_TO_M2 = 0.092903  # conversion factor applied to areas given in sqft
is_sqft = df['area_units'] == 'sqft'

# Scale only the sqft rows, then relabel them; the mask is taken once so both
# assignments act on exactly the same rows.
df.loc[is_sqft, 'area'] = df.loc[is_sqft, 'area'] * SQFT_TO_M2
df.loc[is_sqft, 'area_units'] = 'm2'

df.sample(5, random_state=42), df.shape

(           area area_units  is_house  has_pool   age  price        lat  \
 70    81.011416         m2         0         0  11.0  630.0  40.718433   
 827   99.963628         m2         0         0   7.0  772.0  40.707037   
 231  137.000000         m2         0         0  10.0  214.0 -34.550223   
 588   71.999825         m2         1         1  10.0  751.0  40.726223   
 39   154.000000         m2         1         0  16.0  273.0 -34.563525   
 
            lon  rooms  
 70  -74.008699    2.0  
 827 -73.998823    3.0  
 231 -58.391268    4.0  
 588 -74.043929    2.0  
 39  -58.347365    4.0  ,
 (900, 9))

In [107]:
from utils import get_nan_features

# Ask the project helper which features of `df` have missing values; the
# result drives the per-feature split and imputation further down.
nan_features = get_nan_features(df)

nan_features



['age', 'rooms']

In [108]:
from utils import split_by_nan_features

# Partition the rows by which of the NaN-prone features are missing.
# The helper returns four frames, unpacked here in this order:
# age missing, rooms missing, both missing, nothing missing.
(
    has_nan_age,
    has_nan_rooms,
    has_nan_age_and_rooms,
    has_all_features,
) = split_by_nan_features(df, nan_features)

# Show the shape of each partition, in the same order as the unpacking above.
tuple(
    part.shape
    for part in (has_nan_age, has_nan_rooms, has_nan_age_and_rooms, has_all_features)
)


((117, 9), (59, 9), (13, 9), (711, 9))

In [None]:
from models import LinearRegressor, PolinomialRegressor
from utils import train_model_for_feature

# TODO: Find out why the age model only ever predicts an age of 10.
# TODO: round rooms and age to the nearest int

# One shared seed for every call so all three hold-out sets (and therefore the
# MSE values computed from them) are reproducible on Restart & Run All.
# Previously only the age model passed a seed.
# NOTE(review): `seed` presumably controls the train/test split inside
# train_model_for_feature -- confirm against utils.
SEED = 42

# All models are fitted only on rows where every feature is present.

# age <- rooms (PolinomialRegressor with its degree set to 1)
age_model, X_age_test, y_age_test = train_model_for_feature(
    PolinomialRegressor.change_degree(1),
    has_all_features[['rooms']],
    has_all_features['age'],
    seed=SEED,
)

# rooms <- age
rooms_model, X_rooms_test, y_rooms_test = train_model_for_feature(
    LinearRegressor,
    has_all_features[['age']],
    has_all_features['rooms'],
    seed=SEED,
)

# (age, rooms) <- area, for rows where both age and rooms are missing
age_and_room_model, X_multi_test, y_multi_test = train_model_for_feature(
    LinearRegressor,
    has_all_features[['area']],
    has_all_features[['age', 'rooms']],
    seed=SEED,
)





In [110]:


# Score each imputation model on its own held-out test split.
# NOTE(review): only the age model is scored with round=True, although the
# imputation step rounds the predictions of all three models -- confirm
# whether the other two scores should be rounded as well.
age_mse = age_model.mse_score(X_age_test, y_age_test, round=True)
rooms_mse = rooms_model.mse_score(X_rooms_test, y_rooms_test)
multi_mse = age_and_room_model.mse_score(X_multi_test, y_multi_test)

# Report the three scores, one per line.
mse_report = (
    ("Mean Squared Error for Age Model:", age_mse),
    ("Mean Squared Error for Rooms Model:", rooms_mse),
    ("Mean Squared Error for Multi Model:", multi_mse),
)
for message, score in mse_report:
    print(message, score)

Mean Squared Error for Age Model: 17.718309859154928
Mean Squared Error for Rooms Model: 0.8827650263854409
Mean Squared Error for Multi Model: 8.993013181356668


In [111]:
# Predict the missing values and round each prediction to the nearest integer.
#
# The three row groups are computed once, BEFORE anything is filled, and are
# mutually exclusive. Each single-feature model is therefore only applied to
# rows where its predictor is present, and rows missing both age and rooms go
# straight to the area-based model. Recomputing the masks after each fill (as
# was done before) fed NaN predictors into the single-feature models and only
# worked as long as NaN propagated through predict() and np.round().
age_missing = df['age'].isnull()
rooms_missing = df['rooms'].isnull()

only_age_missing = age_missing & ~rooms_missing
only_rooms_missing = rooms_missing & ~age_missing
both_missing = age_missing & rooms_missing

# Skip empty groups so predict() is never called on a zero-row frame.
if only_age_missing.any():
    df.loc[only_age_missing, 'age'] = np.round(
        age_model.predict(df.loc[only_age_missing, ['rooms']])
    )
if only_rooms_missing.any():
    df.loc[only_rooms_missing, 'rooms'] = np.round(
        rooms_model.predict(df.loc[only_rooms_missing, ['age']])
    )
if both_missing.any():
    df.loc[both_missing, ['age', 'rooms']] = np.round(
        age_and_room_model.predict(df.loc[both_missing, ['area']])
    )

# Per-column count of values that are still missing after imputation.
missing_values = df.isnull().sum()

if missing_values.sum() > 0:
    print("There are still missing values in the DataFrame:", missing_values)
else:
    print("All missing values have been successfully filled.")

# Persist the imputed frame (without the index column) as the processed dataset.
df.to_csv(r"../data/processed/casas_dev.csv", index=False)

df.shape


All missing values have been successfully filled.


(900, 9)