# **<center> Modeling: Kaggle Competition </center>**
![Immune](https://i.imgur.com/0TSSaqL.png)  

In [46]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [47]:
import os
import pandas as pd
import json
import pickle
import optuna
import xgboost as xgb
from xgboost  import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from paths import RAW_DIR, PROCESSED_DIR, SUBMISSIONS_DIR, MODELS_DIR
from functions import compute_metrics, plot_errors
from transformers import ExtractDataNeighborhood, FillNA, RemoveOutliersTransformer
import warnings
from datetime import datetime

warnings.filterwarnings('ignore')

raw_train_df = pd.read_csv(os.path.join(RAW_DIR, 'train.csv'))
raw_predict_df = pd.read_csv(os.path.join(RAW_DIR, 'test.csv'))

In [48]:
raw_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17448 entries, 0 to 17447
Data columns (total 56 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            17448 non-null  int64  
 1   title                         17448 non-null  object 
 2   subtitle                      17448 non-null  object 
 3   sq_mt_built                   17348 non-null  float64
 4   sq_mt_useful                  6575 non-null   float64
 5   n_rooms                       17448 non-null  int64  
 6   n_bathrooms                   17434 non-null  float64
 7   n_floors                      1163 non-null   float64
 8   sq_mt_allotment               1146 non-null   float64
 9   latitude                      0 non-null      float64
 10  longitude                     0 non-null      float64
 11  raw_address                   13056 non-null  object 
 12  is_exact_address_hidden       17448 non-null  bool   
 13  s

In [49]:
# Split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(raw_train_df, raw_train_df['buy_price_by_area'], test_size=0.2, random_state=42)

In [50]:
# Features for one-hot encoding
categorical_columns = ['floor', 'house_type_id', 'energy_certificate']

# Features for filling NA with False
fill_na_false_columns = [
    'has_lift', 
    'is_new_development', 
    'has_central_heating', 
    'has_ac', 
    'has_parking', 
    'is_exterior', 
    'has_garden', 
    'has_pool', 
    'has_terrace', 
    'has_balcony', 
    'has_storage_room', 
    'has_green_zones', 
    'has_fitted_wardrobes']

# Features for filling NA with True
fill_na_true_columns = [
    'is_parking_included_in_price',
    'is_renewal_needed'
]

# Numerical features
numerical_columns = ['sq_mt_built', 'sq_mt_useful', 'n_rooms', 'n_bathrooms']

numeric_etl = ColumnTransformer(
    transformers=[
        ('remove_outliers', RemoveOutliersTransformer(['rent_price']), ['rent_price']),
        ('mean_price', ExtractDataNeighborhood(data='numerical'), ['neighborhood_id']),
        ('_', 'passthrough', numerical_columns),
    ],
    remainder='drop')


numeric_transformer = Pipeline(steps=[
    ('etl', numeric_etl),
    ('scaler', StandardScaler())  # Standardize numerical features
])

categorical_etl = ColumnTransformer(
    transformers=[
        ('loc_info', ExtractDataNeighborhood(data='categorical'), ['neighborhood_id']),
        ('cat_bool', FillNA(fill_na_false_columns, type_fill='False'), fill_na_false_columns),
        ('cat_bool_2', FillNA(fill_na_true_columns, type_fill='True'), fill_na_true_columns),
        ('_', 'passthrough', categorical_columns)
    ],
    remainder='drop')

categorical_transformer = Pipeline(steps=[
    ('etl', categorical_etl),
    ('onehot', OneHotEncoder(drop='if_binary'))
])

etl_pipeline = FeatureUnion(
    transformer_list=[
        ('numeric', numeric_transformer),
        ('categorical', categorical_transformer)
    ])


In [51]:
etl_pipeline.fit(X_train, y_train)

# Transform the data
train_transformed = pd.DataFrame.sparse.from_spmatrix(etl_pipeline.transform(X_train), columns=etl_pipeline.get_feature_names_out())
train_transformed['buy_price_by_area'] = y_train.values
test_transformed = pd.DataFrame.sparse.from_spmatrix(etl_pipeline.transform(X_test), columns=etl_pipeline.get_feature_names_out())
test_transformed['buy_price_by_area'] = y_test.values

# Save in csv format
train_transformed.to_csv(os.path.join(PROCESSED_DIR, 'train.csv'), index=False)
test_transformed.to_csv(os.path.join(PROCESSED_DIR, 'test.csv'), index=False)


# # Object columns to categorical to avoid problems with xgboost
# object_cols = X_train_transformed.select_dtypes(include=['object']).columns
# X_train_transformed[object_cols] = X_train_transformed[object_cols].astype('category')
# X_test_transformed[object_cols] = X_test_transformed[object_cols].astype('category')

# X_train_transformed.info()

# # # Save in parquet format
# pd.DataFrame(X_train_transformed).to_parquet(os.path.join(PROCESSED_DIR, 'train.parquet'))
# pd.DataFrame(X_test_transformed).to_parquet(os.path.join(PROCESSED_DIR, 'test.parquet'))

In [71]:
# Import the data and create train, validation and test sets
train_df = pd.read_csv(os.path.join(PROCESSED_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(PROCESSED_DIR, 'test.csv'))

X_train = train_df.drop('buy_price_by_area', axis=1)
y_train = train_df['buy_price_by_area']

X_test = test_df.drop('buy_price_by_area', axis=1)
y_test = test_df['buy_price_by_area']

def objective(trial):
    """Function to optimize the hyperparameters of the XGBoost model using Optuna"""
    
    # Define the hyperparameters to optimize
    params = {
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'importance_type': trial.suggest_categorical('importance_type', ['gain']),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'n_jobs': trial.suggest_int('n_jobs', -1, -1),
        'objective': trial.suggest_categorical('objective', ['reg:squarederror']),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 1, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 1, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.001, 1, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'random_state': trial.suggest_int('random_state', 1234, 1234),
        'eval_metric': trial.suggest_categorical('eval_metric', ['mape'])
    }

    dtrain = xgb.DMatrix(X_train, label=y_train)
    
    # xgbost cross validation
    results = xgb.cv(params, 
                     dtrain,
                     nfold=20,
                     metrics="mape",
                     seed=1234,
                     maximize=False)
                     
    print(f"{results}")

    return results['test-mape-mean'].mean()

# Optimize the hyperparameters
study = optuna.create_study(direction='minimize')

study.optimize(objective, n_trials=100, show_progress_bar=True)

# Get the best hyperparameters
best_params = study.best_params

# Create the XGBoost model with the best hyperparameters
model = XGBRegressor(**best_params)

# Fit the model
model.fit(X_train, y_train)

[I 2023-09-14 19:53:25,485] A new study created in memory with name: no-name-a68349ef-9543-45e7-b654-6ed7802462e5


  0%|          | 0/100 [00:00<?, ?it/s]

   train-mape-mean  train-mape-std  test-mape-mean  test-mape-std
0         0.441906        0.000787        0.442010       0.014162
1         0.422954        0.000882        0.423080       0.013694
2         0.405824        0.000853        0.406058       0.013299
3         0.386588        0.000801        0.386880       0.012893
4         0.368779        0.000758        0.369159       0.012370
5         0.352415        0.000697        0.352742       0.012003
6         0.339388        0.000683        0.339769       0.011584
7         0.327396        0.000737        0.327876       0.011343
8         0.315380        0.000681        0.315930       0.011213
9         0.305098        0.000723        0.305704       0.011089
[I 2023-09-14 19:53:27,026] Trial 0 finished with value: 0.3669207145332849 and parameters: {'booster': 'gbtree', 'importance_type': 'gain', 'learning_rate': 0.06817250842562333, 'max_depth': 4, 'n_estimators': 137, 'n_jobs': -1, 'objective': 'reg:squarederror', 'reg_alpha'

KeyboardInterrupt: 

In [69]:
# Save the model in pickle file
model_name = 'xgboost_model_' + datetime.now().strftime("%Y%m%d-%H%M%S") + '.pkl'
model_path = os.path.join(MODELS_DIR, model_name)

pickle.dump(model, open(model_path, 'wb'))

model

In [70]:
# Make predictions on the test set
y_train_pred = model.predict(X_train)
y_pred = model.predict(X_test)

# Eval Model 
metrics = compute_metrics(model, y_test, y_pred, y_train, y_train_pred)


Train/Test split results:
XGBRegressor r2 is 0.865
XGBRegressor mean_squared_error is 492398.970
XGBRegressor mean_absolute_error is 383.432
XGBRegressor mape test is 0.090
XGBRegressor mape train is 0.048


In [41]:
results = xgb.cv(params, X_train, num_boost_round=10, nfold=3, early_stopping_rounds=10, metrics="mape", as_pandas=True, seed=1234)


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_sparse(data):
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_categorical_dtype(dtype):
  if is

-0.07665604623278431

In [32]:
# Save metrics and model parameters in json format
metrics_names = ('r2', 'mse', 'mae', 'mape_test', 'mape_train')

metrics_dict = {metric_name: metric_value for metric_name, metric_value in zip(metrics_names, metrics)}

to_save = {
    'model_fit_params': best_params,
    'model_params': model.get_params(),
    'metrics': metrics_dict
    }

with open(os.path.join(MODELS_DIR, f'metrics_{model.__class__.__name__}_{round(metrics_dict["mape_test"], 4)}_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.json'), 'w') as f:
    json.dump(to_save, f)

In [33]:
#plot_errors(model, 'mape')

In [34]:
pd.DataFrame(zip(model.get_booster().feature_names ,model.feature_importances_)).sort_values(1,ascending=False).head(10)

Unnamed: 0,0,1
1,numeric__mean_price__neighborhood_id,0.125971
106,categorical__loc_info__neighborhood_id_102,0.110615
151,categorical__loc_info__district_id_21,0.095982
146,categorical__loc_info__district_id_15,0.073596
152,categorical__cat_bool__has_lift_True,0.054764
153,categorical__cat_bool__is_new_development_True,0.042387
188,categorical_____house_type_id_HouseType 2: Cas...,0.032145
0,numeric__remove_outliers__rent_price,0.018304
174,categorical_____floor_8,0.017875
166,categorical__cat_bool_2__is_renewal_needed_True,0.012991


## KAGGLE SUBMISSION

In [35]:
def eval_best_model(final_model, valid_df):

    Id_aux = valid_df[['id']]

    X_valid = etl_pipeline.transform(valid_df)
    y_valid_pred = final_model.predict(X_valid)

    submission = pd.DataFrame({'id': Id_aux['id'],
                               'buy_price_by_area': y_valid_pred})
    return (submission)

In [36]:
submission = eval_best_model(model, raw_predict_df)

submission.to_csv(os.path.join(SUBMISSIONS_DIR, f'submission_{model.__class__.__name__}_{round(metrics_dict["mape_test"], 4)}_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.csv'), index=False)