## Instructions {-}

- This is the template for the code and report on the Prediction Problem assignments.

- Your code in steps 1, 3, 4, and 5 will be executed sequentially, and must produce the RMSE / accuracy claimed on Kaggle.

- Your code in step 2 will also be executed, and must produce the optimal hyperparameter values used to train the model.

## Read data

In [41]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import warnings
import re

import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score,train_test_split, KFold, GridSearchCV, ParameterGrid, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix, mean_squared_error, r2_score, mean_squared_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from lightgbm import LGBMRegressor, LGBMClassifier

import os
os.environ["OMP_NUM_THREADS"] = "1"

In [4]:
raw_train = pd.read_csv('../Datasets/train_regression.csv')
raw_test = pd.read_csv('../Datasets/test_regression.csv')

## 1) Data pre-processing

Put the data pre-processing code. You don't need to explain it. You may use the same code from last quarter.

In [5]:
# Create copies of the raw datasets
train = raw_train.copy()
test = raw_test.copy()

# Clean 'price' column: remove '$' and ',' characters, and convert to float
train['price'] = train['price'].str.replace(',', '').str.replace('$', '', regex=False).astype(float)

In [6]:
# Convert 'host_acceptance_rate' and 'host_response_rate' columns to float and scale by dividing by 100
train['acceptance_rate'] = train['host_acceptance_rate'].str.replace('%', '').astype(float) / 100
train['response_rate'] = train['host_response_rate'].str.replace('%', '').astype(float) / 100

test['acceptance_rate'] = test['host_acceptance_rate'].str.replace('%', '').astype(float) / 100
test['response_rate'] = test['host_response_rate'].str.replace('%', '').astype(float) / 100

# Drop unnecessary columns
train.drop(columns=['host_acceptance_rate', 'host_response_rate'], inplace=True)
test.drop(columns=['host_acceptance_rate', 'host_response_rate'], inplace=True)


# Extract numeric values from 'bathrooms_text' column and convert to float
train['bathrooms_num'] = train['bathrooms_text'].str.extract(r'(\d+)').astype(float)
test['bathrooms_num'] = test['bathrooms_text'].str.extract(r'(\d+)').astype(float)

# Fill missing values in 'bathrooms_num' where 'Half-bath' is mentioned in 'bathrooms_text' with 0.5
train.loc[train['bathrooms_text'].str.contains('Half-bath', case=False, na=False) & train['bathrooms_num'].isna(), 'bathrooms_num'] = 0.5
test.loc[test['bathrooms_text'].str.contains('Half-bath', case=False, na=False) & test['bathrooms_num'].isna(), 'bathrooms_num'] = 0.5


In [7]:
# # Convert date columns to datetime format
# def strip_date(row):
#     if isinstance(row, str):
#         row = datetime.strptime(row, '%Y-%m-%d').date()
#     return row

# # Apply date conversion to train dataset
# train['host_since'] = train['host_since'].apply(strip_date)
# train['first_review'] = train['first_review'].apply(strip_date)
# train['last_review'] = train['last_review'].apply(strip_date)

# # Apply date conversion to test dataset
# test['host_since'] = test['host_since'].apply(strip_date)
# test['first_review'] = test['first_review'].apply(strip_date)
# test['last_review'] = test['last_review'].apply(strip_date)

# # ----- #

# # Calculate months since various dates for train dataset
# train['host_since_in_months'] = round(((datetime.now().date() - train['host_since']).dt.days) / 30, 2)
# train['first_review_in_months'] = round(((datetime.now().date() - train['first_review']).dt.days) / 30, 2)
# train['last_review_in_months'] = round(((datetime.now().date() - train['last_review']).dt.days) / 30, 2)

# # Calculate months since various dates for test dataset
# test['host_since_in_months'] = round(((datetime.now().date() - test['host_since']).dt.days) / 30,  2)
# test['first_review_in_months'] = round(((datetime.now().date() - test['first_review']).dt.days) / 30, 2)
# test['last_review_in_months'] = round(((datetime.now().date() - test['last_review']).dt.days) / 30, 2)


# Convert the relevant columns to datetime
train['host_since'] = pd.to_datetime(train['host_since'])
train['first_review'] = pd.to_datetime(train['first_review'])
train['last_review'] = pd.to_datetime(train['last_review'])
test['host_since'] = pd.to_datetime(test['host_since'])
test['first_review'] = pd.to_datetime(test['first_review'])
test['last_review'] = pd.to_datetime(test['last_review'])

# Calculate months since various dates for train dataset
now = datetime.now()
train['host_since_in_months'] = round((now - train['host_since']).dt.days / 30, 2)
train['first_review_in_months'] = round((now - train['first_review']).dt.days / 30, 2)
train['last_review_in_months'] = round((now - train['last_review']).dt.days / 30, 2)
test['host_since_in_months'] = round((now - test['host_since']).dt.days / 30, 2)
test['first_review_in_months'] = round((now - test['first_review']).dt.days / 30, 2)
test['last_review_in_months'] = round((now - test['last_review']).dt.days / 30, 2)


try:
    # Because the review values are extremely collinear, calculate average review scores and fill missing values with 0
    train['review_scores_avg'] = np.mean(train[['review_scores_accuracy', 'review_scores_checkin', 'review_scores_communication', 'review_scores_rating', 'review_scores_value', 'review_scores_location', 'review_scores_cleanliness']], axis=1)
    test['review_scores_avg'] = np.mean(test[['review_scores_accuracy', 'review_scores_checkin', 'review_scores_communication', 'review_scores_rating', 'review_scores_value', 'review_scores_location', 'review_scores_cleanliness']], axis=1)
    
    train.drop(columns=['review_scores_accuracy', 'review_scores_checkin', 'review_scores_communication', 'review_scores_rating', 'review_scores_value', 'review_scores_location', 'review_scores_cleanliness'], inplace=True)
    test.drop(columns=['review_scores_accuracy', 'review_scores_checkin', 'review_scores_communication', 'review_scores_rating', 'review_scores_value', 'review_scores_location', 'review_scores_cleanliness'], inplace=True)

except: pass
    
# train['review_scores_avg'] = train['review_scores_avg'].fillna(value=0)
# test['review_scores_avg'] = test['review_scores_avg'].fillna(value=0)

In [8]:
## Identify outliers in 'price' and 'minimum_nights'

# top and bottom 0.04% of price
lower_val = np.percentile(train[['price']], 0.01)
upper_val = np.percentile(train[['price']], 99.99)
outliers_idx_price = list(train[(train['price'] >= upper_val) | (train['price'] <= lower_val)].index)
print("Price outliers:", list(outliers_idx_price))
outliers_idx = outliers_idx_price

# # top 0.1% of minimum_nights
# # upper_lim = np.percentile(train[['minimum_nights']], 99.9)
# outliers_idx_nights = []  # list(train[train['minimum_nights'] >= upper_lim].index)
# outliers_idx = list(outliers_idx_price) + list(outliers_idx_nights)
# # print("Min nights outliers:", list(train[train['minimum_nights'] >= upper_lim].index))


print(f"\n{len(train.iloc[outliers_idx, :]['price'])} observations dropped\n")
# train.loc[outliers_idx_price, :]['price'].sort_values()

Price outliers: [3129, 4865]

2 observations dropped



In [9]:
# Create a dictionary for response time category conversions
response_time_dict = {'within an hour': 1, 'within a few hours': 12, 'within a day': 24, 'a few days or more': 72}

def replace_response_time(row):
    if pd.notna(row):
        return response_time_dict.get(row)
    else:
        return None

train['response_time'] = train['host_response_time'].apply(replace_response_time)
test['response_time'] = test['host_response_time'].apply(replace_response_time)

In [10]:
train_clean = train.drop(outliers_idx).reset_index(drop=True)
test_clean = test.copy()

Clean Transform

In [11]:
def clean_vars(row):
    # Check if 'shared' is in 'bathrooms_text' to identify shared bathrooms
    if 'shared' in str(row['bathrooms_text']):
        row['bathrooms_shared'] = "t"
        
    # Check if 'bathrooms_text' is empty and 'room_type' is 'Shared' to identify shared bathrooms
    elif pd.isna(row['bathrooms_text']):
        if 'Shared' in row['room_type']:
            row['bathrooms_shared'] = "t"              
        else:
            row['bathrooms_shared'] = "f"
    else: 
        row['bathrooms_shared'] = "f"
        
    # Convert 'Hotel room' room type to 'Private room'
    if row.loc['room_type'] == 'Hotel room':
        row['room_type'] = 'Private room'
        
    return row

# Apply the function to clean variables to train and test datasets
train_clean = train_clean.apply(clean_vars, axis=1)
test_clean = test_clean.apply(clean_vars, axis=1)


clean neighbourhoods

In [12]:
# Group small occurrences into 'Other'
neighbourhood_counts = train_clean['neighbourhood_cleansed'].value_counts()

other_hoods = [i for i in neighbourhood_counts.index if neighbourhood_counts[i] < 50]

test_only_hoods = [i for i in test_clean['neighbourhood_cleansed'].unique() 
                   if i not in neighbourhood_counts 
                   and i != 'Other']
    

In [13]:
# Create DataFrame with unique neighbourhoods
hood_df = pd.DataFrame(index=train_clean['neighbourhood_cleansed'].unique())

# Compute mean and standard deviation for each neighbourhood
grouped = train_clean.groupby('neighbourhood_cleansed')['price']
all_mean = grouped.mean()
all_std = grouped.std()

# Add mean and std to DataFrame
hood_df['mean_price'] = all_mean
hood_df['std_price'] = all_std

# Merge with counts
hood_df = hood_df.merge(neighbourhood_counts, left_index=True, right_index=True)
hood_df.rename(columns={'neighbourhood_cleansed': 'count'}, inplace=True)


# Get the 10th percentile of standard deviations
std_90 = np.percentile(hood_df.dropna(how='any')['std_price'], 10)


# Filter DataFrame
filtered_df = hood_df[((hood_df['std_price'] < std_90) | (hood_df['count'] > 100)) & (hood_df['count'] > 20)]

keep_hoods = filtered_df.index.tolist()


In [14]:
# if neighbourhood has small std or more than 100 but no neighbourhoods with less than 20
def clean_hoods(row):
    if row.loc['neighbourhood_cleansed'] not in keep_hoods:
        row['neighbourhood_grouped'] = 'Other'
        
    else:    
        row['neighbourhood_grouped'] = row.loc['neighbourhood_cleansed']
        
    return row

train_clean = train_clean.apply(clean_hoods, axis=1)
test_clean = test_clean.apply(clean_hoods, axis=1)

Clean property type

In [15]:
words_to_remove = ['place', 'room', 'private', 'shared', 'entire', ' in', ' room', ' private', ' shared', ' entire', ' in',]

# remove filler and unnecessary words from property
def remove_words(text):
    text=text.lower()
    for word in words_to_remove:
        word = word.lower()
        text = text.replace(word, '')
    return text.strip()


train_clean['property_type'] = train_clean['property_type'].apply(remove_words)
test_clean['property_type'] = test_clean['property_type'].apply(remove_words)


In [16]:
# identify value counts and make a list of neighbourhoods with more than 10
property_counts = train_clean['property_type'].value_counts()
keep = [i for i in property_counts.index if property_counts[i] >= 5]

def clean_property(row):
    if row not in keep or row == "":
        row = 'Other'
      
    return row


train_clean['property_type_cleansed'] = train_clean['property_type'].apply(clean_property)
test_clean['property_type_cleansed'] = test_clean['property_type'].apply(clean_property)

train_filter_2 = train_clean.copy()
test_filter_2 = test_clean.copy()


In [17]:
train_filter_2.drop(columns=['host_id', 'host_since', 'first_review', 'last_review', 'neighbourhood_cleansed', 'property_type', 'bathrooms_text'], inplace=True)
test_filter_2.drop(columns=['host_id', 'host_since', 'first_review', 'last_review', 'neighbourhood_cleansed', 'property_type', 'bathrooms_text'], inplace=True)

number of verifications

In [18]:
try:
    train_filter_2['host_verifications'] = train_filter_2['host_verifications'].apply(ast.literal_eval)
except: pass

try:
    test_filter_2['host_verifications'] = test_filter_2['host_verifications'].apply(ast.literal_eval)
except: pass

In [19]:
train_filter_2['num_verifications'] = train_filter_2['host_verifications'].apply(len)
test_filter_2['num_verifications'] = test_filter_2['host_verifications'].apply(len)

In [20]:
def split_vers(df):
    def update_verification(row):
        ver_phone = 't' if 'phone' in row['host_verifications'] else 'f'
        ver_email = 't' if 'email' in row['host_verifications'] else 'f'
        ver_work_email = 't' if 'work_email' in row['host_verifications'] else 'f'
        return pd.Series({'ver_phone': ver_phone, 'ver_email': ver_email, 'ver_work_email': ver_work_email})

    df[['ver_phone', 'ver_email', 'ver_work_email']] = df.apply(update_verification, axis=1)

    return df


train_filter_2 = split_vers(train_filter_2).drop('host_verifications', axis=1)
test_filter_2 = split_vers(test_filter_2).drop('host_verifications', axis=1)

group small occurances into other

In [21]:
host_hood_counts = train_filter_2['host_neighbourhood'].value_counts()
keep_host_hood = host_hood_counts[host_hood_counts >= 5].index

train_filter_2['host_neighbourhood'] = train_filter_2['host_neighbourhood'].apply(lambda x: 'Other' if x not in keep_host_hood else x)
test_filter_2['host_neighbourhood'] = test_filter_2['host_neighbourhood'].apply(lambda x: 'Other' if x not in keep_host_hood else x)
# train_final[['host_neighbourhood']].value_counts()
# test_final[['host_neighbourhood']].value_counts()

# ----- #

host_loc_counts = train_filter_2['host_location'].value_counts()
keep_host_loc = host_loc_counts[host_loc_counts >= 10].index

train_filter_2['host_location'] = train_filter_2['host_location'].apply(lambda x: 'Other' if x not in keep_host_loc else x)
test_filter_2['host_location'] = test_filter_2['host_location'].apply(lambda x: 'Other' if x not in keep_host_loc else x)
# train_final['host_location'].value_counts()
# test_final['host_location'].value_counts()

Columns with missing Values

In [22]:
# Create a temporary dataframe to manipulate
train_filter_temp = train_filter_2.dropna(subset=['host_is_superhost']).copy()
test_filter_temp = test_filter_2.dropna(subset=['host_is_superhost']).copy()

# Change t/f to numeric 1/0
train_filter_temp['host_is_superhost'] = train_filter_temp['host_is_superhost'].astype(str).map({'f': 0, 't': 1})
test_filter_temp['host_is_superhost'] = test_filter_temp['host_is_superhost'].astype(str).map({'f': 0, 't': 1})

# Create model
superhost_model = smf.logit(formula="host_is_superhost ~ calculated_host_listings_count*number_of_reviews_ltm + response_rate", data=train_filter_temp).fit()

# Predict all values 
impute_superhost_train = (superhost_model.predict(train_filter_temp) > 0.5).map({False: 'f', True: 't'})
impute_superhost_test = (superhost_model.predict(test_filter_temp) > 0.5).map({False: 'f', True: 't'})

# fill na's with coordinating value from model imputation
train_filter_2['host_is_superhost'] = train_filter_2['host_is_superhost'].fillna(impute_superhost_train)
test_filter_2['host_is_superhost'] = test_filter_2['host_is_superhost'].fillna(impute_superhost_test)


Optimization terminated successfully.
         Current function value: 0.586738
         Iterations 8


In [23]:
train_filter_temp_accept = train_filter_2.dropna(subset=['acceptance_rate']).copy()
test_filter_temp_accept = test_filter_2.dropna(subset=['acceptance_rate']).copy()

# create model to impute acceptance rate
acceptance_model = smf.logit(formula="acceptance_rate ~ calculated_host_listings_count + accommodates", data=train_filter_temp_accept).fit()

# fill in missing values with the predictions from the model
train_filter_2['acceptance_rate'] = train_filter_2['acceptance_rate'].fillna(acceptance_model.predict(train_filter_temp_accept))
test_filter_2['response_rate'] = test_filter_2['acceptance_rate'].fillna(acceptance_model.predict(test_filter_temp_accept))


# ----- #

train_filter_temp_resp = train_filter_2.dropna(subset=['response_rate']).copy()
test_filter_temp_resp = test_filter_2.dropna(subset=['response_rate']).copy()

# Create model to impute response rate
response_model = smf.logit(formula="response_rate ~ accommodates", data=train_filter_temp_resp).fit()

# fill in missing values with the predictions from the model
train_filter_2['acceptance_rate'] = train_filter_2['response_rate'].fillna(response_model.predict(train_filter_temp_resp))
test_filter_2['response_rate'] = test_filter_2['response_rate'].fillna(response_model.predict(test_filter_temp_resp))


Optimization terminated successfully.
         Current function value: 0.192000
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.067403
         Iterations 8


naive imputation

In [24]:
# Fill in remaining missing values with median for numerical columns
train_filter_2.fillna(train_filter_2.median(numeric_only=True), inplace=True)
test_filter_2.fillna(test_filter_2.median(numeric_only=True), inplace=True)

#### Predictor Selection with VIF

In [25]:
import statsmodels.api as sm

non_numeric_columns = train_filter_2.select_dtypes(exclude=[np.number]).columns
data_numeric = train_filter_2.drop(columns=non_numeric_columns)

X = data_numeric.drop(columns=['price', 'id'])
y = data_numeric.price

vif = pd.DataFrame()
vif["Predictor"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

vif[vif['VIF'] >= 5].sort_values('VIF', ascending=False)

  vif = 1. / (1. - r_squared_i)


Unnamed: 0,Predictor,VIF
27,response_rate,inf
26,acceptance_rate,inf
13,maximum_nights_avg_ntm,414524300000.0
10,minimum_maximum_nights,220397400000.0
11,maximum_maximum_nights,43112940000.0
2,latitude,650650.1
3,longitude,647896.4
21,calculated_host_listings_count,36158.6
22,calculated_host_listings_count_entire_homes,36006.03
32,review_scores_avg,275.709


In [26]:
train_filter_3 = train_filter_2.drop(columns=['maximum_nights_avg_ntm', 'minimum_nights_avg_ntm', 'calculated_host_listings_count', 'availability_60', 'host_listings_count', 'response_rate']) #'minimum_minimum_nights'])  #'maximum_maximum_nights', 'minimum_maximum_nights'])
test_filter_3 = test_filter_2.drop(columns=['maximum_nights_avg_ntm', 'minimum_nights_avg_ntm', 'calculated_host_listings_count', 'availability_60', 'host_listings_count', 'response_rate']) #'minimum_minimum_nights'])  # 'maximum_maximum_nights', 'minimum_maximum_nights'])

In [27]:
non_numeric_columns = train_filter_3.select_dtypes(exclude=[np.number]).columns
data_numeric = train_filter_3.drop(columns=non_numeric_columns)

X = data_numeric.drop(columns=['price', 'id', 'latitude', 'longitude'])
y = data_numeric.price

# Calculate VIF for each predictor
vif = pd.DataFrame()
vif["Predictor"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

vif.sort_values('VIF', ascending=False)

Unnamed: 0,Predictor,VIF
24,review_scores_avg,163.148824
19,acceptance_rate,146.026229
10,availability_90,22.721694
3,minimum_nights,17.498259
26,num_verifications,16.830611
5,minimum_minimum_nights,16.352001
15,calculated_host_listings_count_entire_homes,15.226515
9,availability_30,13.995721
18,reviews_per_month,11.946063
1,accommodates,11.541626


In [28]:
# train_filter_4 = train_filter_3.drop(columns=['calculated_host_listings_count'])#, 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms'])
# test_filter_4 = test_filter_3.drop(columns=['calculated_host_listings_count'])#, 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms'])

# train_filter_4 = train_filter_3.drop(columns=['host_listings_count', 'availability_90', 'availability_60', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms'])
# test_filter_4 = test_filter_3.drop(columns=['host_listings_count', 'availability_90', 'availability_60',  'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms'])

In [29]:
# non_numeric_columns = train_filter_4.select_dtypes(exclude=[np.number]).columns
# data_numeric = train_filter_4.drop(columns=non_numeric_columns)

# X = data_numeric.drop(columns=['price', 'id', 'latitude', 'longitude'])
# y = data_numeric.price

# vif = pd.DataFrame()
# vif["Predictor"] = X.columns
# vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# vif.sort_values('VIF', ascending=False)

In [30]:
# train_filter_5 = train_filter_4.drop(columns=['response_rate', 'host_listings_count', 'availability_60', 'minimum_minimum_nights', 'maximum_minimum_nights'])
# test_filter_5 = test_filter_4.drop(columns=['response_rate', 'host_listings_count',  'availability_60', 'minimum_minimum_nights', 'maximum_minimum_nights'])

# # train_filter_5 = train_filter_4.drop(columns=['acceptance_rate', 'response_rate', 'minimum_minimum_nights', 'maximum_minimum_nights'])
# # test_filter_5 = test_filter_4.drop(columns=['acceptance_rate', 'response_rate', 'minimum_minimum_nights', 'maximum_minimum_nights'])


In [31]:
# non_numeric_columns = train_filter_5.select_dtypes(exclude=[np.number]).columns
# data_numeric = train_filter_5.drop(columns=non_numeric_columns)

# X = data_numeric.drop(columns=['price', 'id', 'latitude', 'longitude'])
# y = data_numeric.price

# vif = pd.DataFrame()
# vif["Predictor"] = X.columns
# vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# vif.sort_values('VIF', ascending=False)

In [32]:
# train_filter_6 = train_filter_5.drop(columns=['accommodates', 'minimum_nights_avg_ntm', 'num_verifications', 'reviews_per_month'])
# test_filter_6 = test_filter_5.drop(columns=['accommodates', 'minimum_nights_avg_ntm', 'num_verifications', 'reviews_per_month'])

# non_numeric_columns = train_filter_6.select_dtypes(exclude=[np.number]).columns
# data_numeric = train_filter_6.drop(columns=non_numeric_columns)

# X = data_numeric.drop(columns=['price', 'id', 'latitude', 'longitude'])
# y = data_numeric.price

# vif = pd.DataFrame()
# vif["Predictor"] = X.columns
# vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# vif.sort_values('VIF', ascending=False)


## 2) Hyperparameter tuning

### How many attempts did it take you to tune the model hyperparameters?

### Which tuning method did you use (grid search / Bayes search / etc.)?

### What challenges did you face while tuning the hyperparameters, and what actions did you take to address those challenges?

### How many hours did you spend on hyperparameter tuning?

**Paste the hyperparameter tuning code below. You must show at least one hyperparameter tuning procedure.**

In [33]:
#Hyperparameter tuning code

**Paste the optimal hyperparameter values below.**

## 3) Model

Using the optimal model hyperparameters, train the model, and paste the code below.

In [34]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate, GridSearchCV, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingRegressor
import time


In [35]:
numeric_columns = train_filter_3.select_dtypes(include=['number']).drop(columns=['price', 'id']).columns

X_train = train_filter_3.drop(columns=['price', 'id'])
X_test = test_filter_3.drop(columns=['id'])

X_train_num = X_train[numeric_columns]
y_train = train_filter_3.price

sc = StandardScaler()
sc.fit(X_train_num)

X_train_scaled = sc.transform(X_train[numeric_columns])
X_test_scaled = sc.transform(X_test[numeric_columns])

X_train_num_scaled = pd.DataFrame(X_train_scaled, columns=numeric_columns)
X_test_num_scaled = pd.DataFrame(X_test_scaled, columns=numeric_columns)

In [36]:
train_testing = train_filter_3.drop(columns=['price']) 
test_testing = test_filter_3

train_testing_cat = train_testing.select_dtypes(exclude=['number'])
test_testing_cat = test_testing.select_dtypes(exclude=['number'])

In [37]:
enc = OneHotEncoder(drop='if_binary', handle_unknown='ignore')
enc.fit(train_testing_cat)

drop_enc = enc.transform(train_testing_cat)
drop_enc_test = enc.transform(test_testing_cat)

train_encoded_df = pd.DataFrame(drop_enc.toarray(), columns=enc.get_feature_names_out(train_testing_cat.columns))
test_encoded_df = pd.DataFrame(drop_enc_test.toarray(), columns=enc.get_feature_names_out(test_testing_cat.columns))

keep_cols = [col for col in train_encoded_df.columns if 'Other' not in col]
keep_cols_test = [col for col in test_encoded_df.columns if 'Other' not in col]

train_encoded_df = train_encoded_df[keep_cols]
test_encoded_df = test_encoded_df[keep_cols_test]

def clean_feature_names(columns):
    return [re.sub(r'[^\w]', '_', col) for col in columns]

train_encoded_df.columns = clean_feature_names(train_encoded_df.columns)
test_encoded_df.columns = clean_feature_names(test_encoded_df.columns)

X_train_final = pd.concat([X_train_num_scaled, train_encoded_df], axis=1)
X_test_final = pd.concat([X_test_num_scaled, test_encoded_df], axis=1)

### Model Creation

In [41]:
base_model = DecisionTreeRegressor(random_state=1)
ada_model = AdaBoostRegressor(estimator=base_model, random_state=1)

In [None]:
coarse_params = {
    'estimator__max_depth': range(35, 82, 5), 
    'n_estimators': range(175, 330, 25),
    'learning_rate': [0.1, 0.5, 1, 10]
}

coarse_cv = KFold(n_splits=3, shuffle=True, random_state=1)
gscv_coarse_model = GridSearchCV(ada_model, coarse_params, cv=coarse_cv, scoring='neg_root_mean_squared_error', verbose=3, n_jobs=int(os.getenv("SLURM_NPROCS", 1)))
gscv_coarse_model.fit(X_train_final, y_train)

gscv_coarse_model.best_params_

Fitting 3 folds for each of 280 candidates, totalling 840 fits


In [None]:
ada_model_w_lr = AdaBoostRegressor(estimator=base_model, learning_rate=1, random_state=1)

mid_params = {
    'estimator__max_depth': range(15, 26, 2), 
    'n_estimators': range(50, 120, 10)
}

mid_cv = KFold(n_splits=3, shuffle=True, random_state=1)
gscv_mid_model = GridSearchCV(ada_model_w_lr, mid_params, cv=mid_cv, scoring='neg_root_mean_squared_error', verbose=2, n_jobs=int(os.getenv("SLURM_NPROCS", 1)))
gscv_mid_model.fit(X_train_final, y_train)

gscv_mid_model.best_params_

In [None]:
fine_params = {
    'estimator__max_depth': range(19, 24, 1), 
    'n_estimators': range(60, 81, 2), 
    'learning_rate': list(np.linspace(0.1, 1, 4))+list(np.linspace(2, 10, 4))
}

fine_cv = KFold(n_splits=3, shuffle=True, random_state=1)
gscv_fine_model = GridSearchCV(ada_model, fine_params, cv=fine_cv, scoring='neg_root_mean_squared_error', verbose=2, n_jobs=int(os.getenv("SLURM_NPROCS", 1)))
gscv_fine_model.fit(X_train_final, y_train)

gscv_fine_model.best_params_

In [None]:
repeat_params = {
    'estimator__max_depth': range(15, 19, 1), 
    'n_estimators': range(72, 75, 1), 
    'learning_rate': [0.75, 1.0, 1.5, 1.75]
}

repeat_cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=1)
gscv_repeat_model = GridSearchCV(ada_model, repeat_params, cv=repeat_cv, scoring='neg_root_mean_squared_error', verbose=2, n_jobs=int(os.getenv("SLURM_NPROCS", 1)))
gscv_repeat_model.fit(X_train_final, y_train)

gscv_repeat_model.best_params_

In [None]:
# y_preds_test = gscv_repeat_model.predict(X_test_final)

#### Gradient Boosting

In [None]:
start_time = time.time()
model = GradientBoostingRegressor(random_state=1, loss='huber')
grid = {
        'n_estimators': [1000, 1200, 1400, 1800, 2000, 3000, 4000, 6000],
        'max_leaf_nodes': [4], #, 5, 6, 7, 8],
        'learning_rate': [0.1],
        'subsample': [1.0]# [0.25, 0.35, 0.4, 0.5, 0.55]         
}

cv = KFold(n_splits=3, shuffle=True, random_state=1)
rscv_coarse_grad = GridSearchCV(estimator=model, param_grid=grid, n_jobs=int(os.getenv("SLURM_NPROCS", 1)), cv=cv, verbose=True, scoring='neg_root_mean_squared_error').fit(X_train_final, y_train)

print(rscv_coarse_grad.best_params_)
print("Time taken = ",(time.time()-start_time)/60," minutes")

In [None]:
start_time = time.time()
model = GradientBoostingRegressor(random_state=1, loss='huber')
grid = {
        'n_estimators': np.arange(1000, 1400, 50),
        'max_leaf_nodes': [4, 5, 6, 7, 8],
        'learning_rate': [1.0],
        'subsample': [1.0]
}
print(grid, "\n")

cv = KFold(n_splits=3, shuffle=True, random_state=1)
rscv_coarse_grad_2 = GridSearchCV(estimator=model, param_grid=grid, n_jobs=int(os.getenv("SLURM_NPROCS", 1)), cv=cv, verbose=True, scoring='neg_root_mean_squared_error').fit(X_train_final, y_train)

print(rscv_coarse_grad_2.best_params_)
print("Time taken = ",(time.time()-start_time)/60," minutes")

In [None]:
start_time = time.time()
model = GradientBoostingRegressor(random_state=1, loss='huber')
grid = {
        'n_estimators': [1000, 1050, 1100, 1150, 1200, 1300],
        'max_leaf_nodes': [4, 5, 6, 7],
        'learning_rate': [0.01, 0.1, 1.0],
        'subsample': [0.25, 0.5, 0.75, 1.0]
}
print(grid, "\n")

cv = KFold(n_splits=3, shuffle=True, random_state=1)
rscv_coarse_grad_3 = GridSearchCV(estimator=model, param_grid=grid, n_jobs=int(os.getenv("SLURM_NPROCS", 1)), cv=cv, verbose=True, scoring='neg_root_mean_squared_error').fit(X_train_final, y_train)

print(rscv_coarse_grad_3.best_params_)
print("Time taken = ",(time.time()-start_time)/60," minutes")

# {'learning_rate': 0.05, 'max_leaf_nodes': 6, 'n_estimators': 1150, 'subsample': 0.5}

In [None]:
start_time = time.time()
model = GradientBoostingRegressor(random_state=1, loss='huber')

grid_fine = {
        'n_estimators': [800, 850, 900, 950],
        'max_leaf_nodes': [5, 6, 7, 8],
        'learning_rate': [0.05, 0.1, 0.15],
        'subsample': [0.5, 0.6, 0.7]
}

print(grid_fine, "\n")
cv = KFold(n_splits=3, shuffle=True, random_state=1)
gscv_grad = GridSearchCV(estimator=model, param_grid=grid_fine, n_jobs=int(os.getenv("SLURM_NPROCS", 1)), cv=cv, verbose=True, scoring='neg_root_mean_squared_error').fit(X_train_final, y_train)

print(gscv_grad.best_params_)
print("Time taken = ",(time.time()-start_time)/60," minutes")

In [None]:
start_time = time.time()
model = GradientBoostingRegressor(random_state=1, loss='huber')

grid_fine = {
        'n_estimators': np.arange(850, 951, 20),
        'max_leaf_nodes': [7],
        'learning_rate': [0.1],
        'subsample': [0.6]
}

print(grid_fine, "\n")
cv = KFold(n_splits=5, shuffle=True, random_state=1)
gscv_grad = GridSearchCV(estimator=model, param_grid=grid_fine, n_jobs=int(os.getenv("SLURM_NPROCS", 1)), cv=cv, verbose=True, scoring='neg_root_mean_squared_error').fit(X_train_final, y_train)

print(gscv_grad.best_params_)
print("Time taken = ",(time.time()-start_time)/60," minutes")

In [None]:
start_time = time.time()
model = GradientBoostingRegressor(random_state=1, loss='huber')

grid_fine = {
        'n_estimators': np.arange(920, 951, 10),
        'max_leaf_nodes': [6, 7, 8, 9],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.5, 0.6, 0.7]
}

print(grid_fine, "\n")
cv = KFold(n_splits=5, shuffle=True, random_state=1)
gscv_grad = GridSearchCV(estimator=model, param_grid=grid_fine, n_jobs=int(os.getenv("SLURM_NPROCS", 1)), cv=cv, verbose=True, scoring='neg_root_mean_squared_error').fit(X_train_final, y_train)

print(gscv_grad.best_params_)
print("Time taken = ",(time.time()-start_time)/60," minutes")

In [None]:
gscv_grad_impotance = pd.DataFrame({"feature":gscv_grad.best_estimator_.feature_names_in_, "importances":gscv_grad.best_estimator_.feature_importances_}).sort_values('importances', ascending=False).reset_index(drop=True)
gscv_grad_impotance

### XGBoost

In [38]:
model =  XGBRegressor(random_state = 1,
                     objective = 'reg:absoluteerror')

grid = {
    'n_estimators': np.arange(100, 2001, 100).astype(int), # Same idea as Gradient Boosting
    'max_depth': [4, 8], # Same idea as Gradient Boosting
    'subsample': [0.8], # Same idea as Gradient Boosting
    'learning_rate': [0.1], # Same idea as Gradient Boosting

    # XGBoost hyperparams
    'reg_lambda':[0.01, 0.1], # Try different orders of magnitude - maybe 0.001
    'gamma': [0, 0.1, 1], # Always try 0 and a couple of orders of magnitude (starting with only 0 can be useful)

    'colsample_bytree': [0.5, 0.75, 1.0] # You can just keep it out as well.
}

start_time = time.time()
cv = KFold(n_splits=3, shuffle=True, random_state=1)
gscv_XGB = GridSearchCV(estimator=model, param_grid=grid, n_jobs=int(os.getenv("SLURM_NPROCS", 1)), cv=cv, verbose=True, scoring='neg_root_mean_squared_error').fit(X_train_final, y_train)

print(gscv_XGB.best_params_)
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 3 folds for each of 720 candidates, totalling 2160 fits
{'colsample_bytree': 1.0, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300, 'reg_lambda': 0.1, 'subsample': 0.8}
Time taken =  17.524174976348878  minutes


In [40]:
model =  XGBRegressor(random_state = 1,
                     objective = 'reg:absoluteerror')

grid = {
    'n_estimators': np.arange(50, 201, 50).astype(int),
    'max_depth': [2, 4, 6], 
    'subsample': [0.4, 0.6, 0.8, 1.0], 
    'learning_rate': [0.01, 0.1, 1.0],

    'reg_lambda': [0.1, 1.0],
    'gamma': [0, 0.1, 1.0] # ,
#     'colsample_bytree': [1.0]
}

start_time = time.time()
cv = KFold(n_splits=3, shuffle=True, random_state=1)
gscv_XGB_2 = GridSearchCV(estimator=model, param_grid=grid, n_jobs=int(os.getenv("SLURM_NPROCS", 1)), cv=cv, verbose=True, scoring='neg_root_mean_squared_error').fit(X_train_final, y_train)

print(-gscv_XGB_2.best_score_, gscv_XGB_2.best_params_)
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 3 folds for each of 864 candidates, totalling 2592 fits
190613.4254326333 {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 50, 'reg_lambda': 0.1, 'subsample': 0.4}
Time taken =  0.8195272803306579  minutes


In [41]:
model =  XGBRegressor(random_state = 1,
                     objective = 'reg:absoluteerror')

grid = {
    'n_estimators': np.arange(20, 101, 20).astype(int),
    'max_depth': [2, 4, 6], 
    'subsample': [0.2, 0.4, 0.6], 
    'learning_rate': [0.001, 0.01, 0.1],

    'reg_lambda': [0.01, 0.1, 1.0],
    'gamma': [0, 0.001, 0.1] # ,
#     'colsample_bytree': [1.0]
}

start_time = time.time()
cv = KFold(n_splits=3, shuffle=True, random_state=1)
gscv_XGB_3 = GridSearchCV(estimator=model, param_grid=grid, n_jobs=int(os.getenv("SLURM_NPROCS", 1)), cv=cv, verbose=True, scoring='neg_root_mean_squared_error').fit(X_train_final, y_train)

print(-gscv_XGB_3.best_score_, gscv_XGB_3.best_params_)
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 3 folds for each of 1215 candidates, totalling 3645 fits
190613.4254326333 {'gamma': 0, 'learning_rate': 0.001, 'max_depth': 2, 'n_estimators': 20, 'reg_lambda': 0.01, 'subsample': 0.2}
Time taken =  1.4043469349543254  minutes


In [43]:
model =  XGBRegressor(random_state = 1,
                     objective = 'reg:absoluteerror')

grid = {
    'n_estimators': np.arange(5, 31, 5).astype(int),
    'max_depth': [2, 3, 4], 
    'subsample': [0.1, 0.2, 0.4], 
    'learning_rate': [0.0001, 0.001, 0.01],

    'reg_lambda': [0.001, 0.01, 0.1],
    'gamma': [0, 0.001],
    'colsample_bytree': [0.5, 1.0]
}

start_time = time.time()
cv = KFold(n_splits=3, shuffle=True, random_state=1)
gscv_XGB_4 = GridSearchCV(estimator=model, param_grid=grid, n_jobs=int(os.getenv("SLURM_NPROCS", 1)), cv=cv, verbose=True, scoring='neg_root_mean_squared_error').fit(X_train_final, y_train)

print(-gscv_XGB_4.best_score_, gscv_XGB_4.best_params_)
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 3 folds for each of 1944 candidates, totalling 5832 fits
190613.4254326333 {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.0001, 'max_depth': 2, 'n_estimators': 5, 'reg_lambda': 0.001, 'subsample': 0.1}
Time taken =  1.0070114215215047  minutes


In [58]:
model =  XGBRegressor(random_state = 1)

grid = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [3, 5, 7], 
    'subsample': [0.7, 0.8, 0.9], 
    'learning_rate': [0.01, 0.1, 0.2],
    'reg_lambda': [0.01, 0.1, 1.0],
    'gamma': [0, 0.1, 0.5]
#     'colsample_bytree': [0.7, 0.8, 1.0]
}


grid
start_time = time.time()
cv = KFold(n_splits=3, shuffle=True, random_state=1)
gscv_XGB_4 = GridSearchCV(estimator=model, param_grid=grid, n_jobs=int(os.getenv("SLURM_NPROCS", 1)), cv=cv, verbose=True, scoring='neg_root_mean_squared_error').fit(X_train_final, y_train)

print(-gscv_XGB_4.best_score_, gscv_XGB_4.best_params_)
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 3 folds for each of 729 candidates, totalling 2187 fits
140.08638927969818 {'gamma': 0, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'reg_lambda': 0.01, 'subsample': 0.7}
Time taken =  176.08242254257203  minutes


In [53]:
model =  XGBRegressor(random_state = 1)

grid = {
    'n_estimators': np.arange(25, 101, 25),
    'max_depth': [3, 5], 
    'subsample': [0.5, 0.7, 0.8], 
    'learning_rate': [0.2],
    'reg_lambda': [0.01],
    'gamma': [0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}


grid
start_time = time.time()
cv = KFold(n_splits=3, shuffle=True, random_state=1)
gscv_XGB_4 = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, verbose=True, scoring='neg_root_mean_squared_error').fit(X_train_final, y_train)

print(-gscv_XGB_4.best_score_, gscv_XGB_4.best_params_)
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 3 folds for each of 72 candidates, totalling 216 fits
140.08638927969818 {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'reg_lambda': 0.01, 'subsample': 0.7}
Time taken =  4.99863965511322  minutes


In [38]:
model =  XGBRegressor(random_state = 1)

grid = {
    'n_estimators': np.arange(450, 2001, 100),
    'max_leaves': [40, 45, 50], 
    'subsample': [0.3, 0.5, 0.6], 
    'learning_rate': [0.01, 0.1],
    'reg_lambda': [10, 100],
    'gamma': [0],
    'colsample_bytree': [1.0]
}

grid
start_time = time.time()
cv = KFold(n_splits=3, shuffle=True, random_state=1)
gscv_XGB_5 = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, verbose=True, scoring='neg_root_mean_squared_error').fit(X_train_final, y_train)

print(-gscv_XGB_5.best_score_, gscv_XGB_5.best_params_)
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 3 folds for each of 576 candidates, totalling 1728 fits
128.66901496892066 {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_leaves': 45, 'n_estimators': 1150, 'reg_lambda': 100, 'subsample': 0.3}
Time taken =  9.120327126979827  minutes


In [42]:
model =  XGBRegressor(random_state = 1)

grid = {
    'n_estimators': np.arange(1050, 1251, 20),
    'max_leaves': [45, 46, 47, 48, 49], 
    'subsample': [0.1, 0.2, 0.3], 
    'learning_rate': [0.1],
    'reg_lambda': [100],
    'gamma': [0],
    'colsample_bytree': [1.0]
}

grid
start_time = time.time()
cv = KFold(n_splits=5, shuffle=True, random_state=1)
gscv_XGB_6 = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, verbose=True, scoring='neg_root_mean_squared_error').fit(X_train_final, y_train)

print(-gscv_XGB_6.best_score_, gscv_XGB_6.best_params_)
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 5 folds for each of 165 candidates, totalling 825 fits
129.04364439567183 {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_leaves': 47, 'n_estimators': 1190, 'reg_lambda': 100, 'subsample': 0.2}
Time taken =  4.817241732279459  minutes


In [46]:
best_XGB_model = XGBRegressor(gamma=0, learning_rate=0.2, max_depth=3, n_estimators=100, reg_lambda=0.01, subsample=0.7, random_state=1).fit(X_train_final, y_train)

### CatBoost

### LightGBM

In [53]:
light_model = LGBMRegressor(random_state=1, num_threads=1, verbose=-1)

grid = {
    'n_estimators': np.arange(80, 151, 20),
    'learning_rate': [0.001, 0.1, 1.0],
    'max_depth': np.arange(6, 15, 2),
    'subsample': [0.2, 0.4, 0.6], 
    'colsample_bytree': [0.5, 1.0],
    'reg_lambda': [0.1, 1.0]
}
grid

light_cv = KFold(n_splits=3, shuffle=True, random_state=1)
light_gscv = GridSearchCV(estimator=light_model, param_grid=grid, cv=light_cv, n_jobs=int(os.getenv("SLURM_NPROCS", 1)), scoring='neg_root_mean_squared_error').fit(X_train_final, y_train)

print("Best: %f using %s" % (-light_gscv.best_score_, light_gscv.best_params_))

Best: 138.052145 using {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 120, 'reg_lambda': 0.1, 'subsample': 0.2}


In [58]:
light_model = LGBMRegressor(random_state=1, num_threads=1, verbose=-1)

grid = {
    'n_estimators': np.arange(100, 141, 8),
    'learning_rate': [0.01, 0.05, 0.1, 0.5],
    'max_depth': np.arange(6, 15, 2),
    'subsample': [0.1, 0.2, 0.3], 
    'colsample_bytree': [0.3, 0.5, 0.8],
    'reg_lambda': [0.01, 0.05, 0.1, 0.5]
}
grid

light_cv = KFold(n_splits=3, shuffle=True, random_state=1)
light_gscv_2 = GridSearchCV(estimator=light_model, param_grid=grid, cv=light_cv, n_jobs=int(os.getenv("SLURM_NPROCS", 1)), scoring='neg_root_mean_squared_error', verbose=1).fit(X_train_final, y_train)

print("Best: %f using %s" % (-light_gscv_2.best_score_, light_gscv_2.best_params_))

Fitting 3 folds for each of 4320 candidates, totalling 12960 fits
Best: 137.861744 using {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 116, 'reg_lambda': 0.1, 'subsample': 0.1}


In [72]:
light_model = LGBMRegressor(random_state=1, num_threads=1, verbose=-1)

grid = {
    'n_estimators': np.arange(110, 128, 2),
    'learning_rate': [0.05, 0.1, 0.15],
    'max_depth': np.arange(8, 13, 1),
    'subsample': [0.05, 0.1, 0.15], 
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7],
    'reg_lambda': [0.05, 0.1, 0.15]
}
grid

light_rep_cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)
light_gscv_3 = GridSearchCV(estimator=light_model, param_grid=grid, cv=light_rep_cv, n_jobs=int(os.getenv("SLURM_NPROCS", 1)), scoring='neg_root_mean_squared_error', verbose=1).fit(X_train_final, y_train)

print("Best: %f using %s" % (-light_gscv_3.best_score_, light_gscv_3.best_params_))

Fitting 15 folds for each of 4860 candidates, totalling 72900 fits
Best: 134.049464 using {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 11, 'n_estimators': 124, 'reg_lambda': 0.05, 'subsample': 0.05}


Light - RandomForest

In [74]:
light_model = LGBMRegressor(boosting='rf', random_state=1, num_threads=1, verbose=-1)

grid = {
    'n_estimators': np.arange(80, 151, 20),
    'learning_rate': [0.001, 0.1, 1.0],
    'max_depth': np.arange(6, 15, 2),
    'subsample': [0.2, 0.4, 0.6], 
    'colsample_bytree': [0.5, 1.0],
    'reg_lambda': [0.1, 1.0]
}
grid

light_cv = KFold(n_splits=3, shuffle=True, random_state=1)
light_rf_gscv = GridSearchCV(estimator=light_model, param_grid=grid, cv=light_cv, n_jobs=int(os.getenv("SLURM_NPROCS", 1)), scoring='neg_root_mean_squared_error', verbose=1).fit(X_train_final, y_train)

print("Best: %f using %s" % (-light_rf_gscv.best_score_, light_rf_gscv.best_params_))

Fitting 3 folds for each of 720 candidates, totalling 2160 fits


[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->baggin

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->baggin

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->baggin

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->baggin

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->baggin

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->baggin

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->baggin

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->baggin

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->baggin

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->baggin

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->baggin

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /home/conda/feedstock_root/build_artifacts/lightgbm_1706274201058/work/src/boosting/rf.hpp, line 36 .

[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->baggin

Best: 149.329848 using {'colsample_bytree': 0.5, 'learning_rate': 0.001, 'max_depth': 12, 'n_estimators': 140, 'reg_lambda': 0.1, 'subsample': 0.2}


In [42]:
# Define the LightGBM model
light_model = LGBMRegressor(boosting_type='rf', random_state=1, num_threads=1, verbose=-1)

# Define the parameter grid
grid = {
    'n_estimators': np.arange(80, 281, 50),
    'learning_rate': [0.1],
    'max_depth': np.arange(5, 21, 5),
    'subsample': [0.2],
    'colsample_bytree': [0.5],
    'reg_lambda': [0.1],
    'bagging_fraction': [0.5, 0.7, 0.9],
    'feature_fraction': [0.5, 0.7, 0.9]
}

# Define the cross-validation strategy
light_cv = KFold(n_splits=3, shuffle=True, random_state=1)

# Perform the grid search
light_rf_gscv_2 = GridSearchCV(
    estimator=light_model, 
    param_grid=grid, 
    cv=light_cv, 
    n_jobs=-1, 
    scoring='neg_root_mean_squared_error', 
    verbose=1
).fit(X_train_final, y_train)

# Print the best score and parameters
print("Best: %f using %s" % (-light_rf_gscv_2.best_score_, light_rf_gscv_2.best_params_))

Fitting 3 folds for each of 180 candidates, totalling 540 fits
Best: 148.995932 using {'bagging_fraction': 0.5, 'colsample_bytree': 0.5, 'feature_fraction': 0.7, 'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 230, 'reg_lambda': 0.1, 'subsample': 0.2}


In [49]:
# Define the LightGBM model
light_model = LGBMRegressor(boosting_type='rf', random_state=1, num_threads=1, verbose=-1)

# Define the parameter grid
grid = {
    'n_estimators': np.arange(200, 451, 50),
    'learning_rate': [0.001, 0.01],
    'max_depth': np.arange(10, 16, 5),
    'subsample': [0.2],
    'colsample_bytree': [0.5, 1.0],
    'reg_lambda': [0.01, 0.1, 1.0],
    'bagging_fraction': [0.5, 0.7],
    'feature_fraction': [0.5, 0.7]
}

# Define the cross-validation strategy
light_cv = KFold(n_splits=3, shuffle=True, random_state=1)

# Perform the grid search
light_rf_gscv_2 = GridSearchCV(
    estimator=light_model, 
    param_grid=grid, 
    cv=light_cv, 
    n_jobs=-1, 
    scoring='neg_root_mean_squared_error', 
    verbose=1
).fit(X_train_final, y_train)

# Print the best score and parameters
print("Best: %f using %s" % (-light_rf_gscv_2.best_score_, light_rf_gscv_2.best_params_))

Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Best: 148.995107 using {'bagging_fraction': 0.5, 'colsample_bytree': 0.5, 'feature_fraction': 0.7, 'learning_rate': 0.001, 'max_depth': 15, 'n_estimators': 200, 'reg_lambda': 0.1, 'subsample': 0.2}


In [52]:
# Define the LightGBM model
light_model = LGBMRegressor(boosting_type='rf', random_state=1, num_threads=1, verbose=-1)

# Define the parameter grid
grid = {
    'n_estimators': np.arange(150, 301, 50),
    'learning_rate': [1e-5, 1e-4, 0.001],
    'max_depth': np.arange(10, 21, 5),
    'subsample': [0.2, 0.6],
    'colsample_bytree': [0.5, 1],
    'reg_lambda': [0.01, 0.1, 1.0],
    'bagging_fraction': [0.3, 0.5],
    'feature_fraction': [0.3, 0.5]
}

# Define the cross-validation strategy
light_cv = KFold(n_splits=3, shuffle=True, random_state=1)

# Perform the grid search
light_rf_gscv_2 = GridSearchCV(
    estimator=light_model, 
    param_grid=grid, 
    cv=light_cv, 
    n_jobs=-1, 
    scoring='neg_root_mean_squared_error', 
    verbose=1
).fit(X_train_final, y_train)

# Print the best score and parameters
print("Best: %f using %s" % (-light_rf_gscv_2.best_score_, light_rf_gscv_2.best_params_))

Fitting 3 folds for each of 1728 candidates, totalling 5184 fits
Best: 149.290974 using {'bagging_fraction': 0.3, 'colsample_bytree': 0.5, 'feature_fraction': 0.5, 'learning_rate': 1e-05, 'max_depth': 15, 'n_estimators': 300, 'reg_lambda': 0.1, 'subsample': 0.2}


In [46]:
y_preds_test = light_rf_gscv_2.predict(X_test_final)

## 4) Put any ad-hoc steps for further improving model accuracy

In [47]:
y_preds = y_preds_test*1.05

## 5) Export the predictions in the format required to submit on Kaggle

In [48]:
predicted_values = pd.DataFrame(y_preds, columns=['predicted'])

# add listing id to the predicted values dataframe and set the index to the id value
predicted_values = predicted_values.merge(test_filter_2['id'], left_index=True, right_index=True).set_index('id').rename(columns={0:'predicted'})
predicted_values

# predicted_values.to_csv('pred_csvs/boost_reg_model_light_23.csv')