In [102]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GroupKFold, TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import Lasso, LogisticRegression, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error, r2_score
from sklearn.inspection import permutation_importance
import ast
import shap
from scipy.stats import uniform
import optuna
from sklearn.model_selection import cross_val_score
import os
import json
import shutil

## **Introduction:**

1.   What is Leave-One-Out (LOO)? Provide limitations and strengths.
     * LOO is the extreme version of Cross-Validation where the number of folds ($k$) equals the number of samples ($n$) in the dataset. For every iteration, the model trains on all data points except one, and tests on that single excluded point. This process repeats $n$ times.
2. How do Grid Search, Randomized Grid Search, and Bayesian optimization work?
    * Grid Search:
Mechanism: It performs an exhaustive search over a manually specified subset of the hyperparameter space. It trains a model for every possible combination of parameters defined in a "grid."
Pros/Cons: Guaranteed to find the best combination within the grid, but computationally expensive and inefficient if the grid is too fine or has too many dimensions.
   * Randomized Grid Search:
Mechanism: Instead of checking every combination, it selects a fixed number of random combinations from the specified hyperparameter distributions.
Pros/Cons: Faster than Grid Search and statistically likely to find a near-optimal solution. It is particularly effective when some hyperparameters are more important than others, as it explores more unique values for each parameter.
    * Bayesian Optimization (e.g., Optuna/Hyperopt):
Mechanism: It builds a probabilistic model (a "surrogate") of the objective function. It uses past evaluation results to predict which hyperparameters are likely to yield better performance. It balances exploration (trying uncertain regions) and exploitation (refining known good regions).
Pros/Cons: The most efficient method. It converges to optimal parameters faster because it "learns" from previous iterations rather than searching blindly.
3. Explain classification of feature selection methods.
    * Filter Methods: Select features based on statistical scores (like correlation) before training the model. They are fast and independent of the machine learning algorithm.
    * Wrapper Methods: Select features by training a model on different subsets of features and evaluating performance. They are computationally expensive but accurate because they consider feature interactions.
    * Embedded Methods: Perform feature selection during the model training process (e.g., as part of the loss function). They balance the speed of filters and the accuracy of wrappers.

In [34]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Download the data from kaggle:

In [None]:
# Install kaggle CLI if needed
if shutil.which('kaggle') is None:
    %pip install -q kaggle 

kaggle_json_path = os.path.expanduser("~/.kaggle/kaggle.json")

# Prompt for credentials
user = input("Kaggle username: ").strip()
key = input("Kaggle API key: ").strip()

os.makedirs(os.path.dirname(kaggle_json_path), exist_ok=True)
with open(kaggle_json_path, "w") as f:
    json.dump({"username": user, "key": key}, f)
    
# Secure the file so the Kaggle API doesn't complain
os.chmod(kaggle_json_path, 0o600)

# Download the competition dataset into ./data
!kaggle competitions download -c two-sigma-connect-rental-listing-inquiries -p ./data

# Unzip the downloaded file and remove the zip archive to save space
!unzip -q -o ./data/two-sigma-connect-rental-listing-inquiries.zip -d ./data
!rm ./data/two-sigma-connect-rental-listing-inquiries.zip

! unzip ./data/train.json.zip -d ./data/
! unzip ./data/test.json.zip -d ./data/

print("Dataset downloaded and unzipped successfully!")

# **Introduction — do all the preprocessing from the previous lesson:**

In [35]:
train_data_full = pd.read_json('./data/train.json')
train_data_full = train_data_full.sort_values(by='created')
train_data_full

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address,interest_level
111817,1.0,1,2a21319016fe50100e0b8ebb5a4f9cf0,2016-04-01 22:12:41,X-LARGE Flex 1BR Loft! ~~ PRIME Greenwich Vill...,Astor Place,"[Doorman, Elevator, Laundry In Building]",40.7302,6811957,-73.9924,f07272f8ceb99db4c1a7cbbd9ae7b75b,[https://photos.renthop.com/2/6811957_3dad56e8...,3195,1 Astor Place,high
117995,1.0,0,104bfeddd65a0890b071c3a09cf81704,2016-04-01 22:56:00,"This Enormous Studio Features: Harwood Floors,...",East 54th Street,"[Cats Allowed, Dogs Allowed, No Fee, Laundry I...",40.7576,6811965,-73.9677,3b630ec9cb6eee53b92cfac7f42e3bf4,[https://photos.renthop.com/2/6811965_b8f942e6...,2000,230 East 54th Street,medium
114617,2.0,3,8775706158cbc96d12dd441d42e11deb,2016-04-01 22:57:15,--- East 31st St & Lexington Avenue --- This S...,East 31st St & Lexington Avenue,"[Common Outdoor Space, Cats Allowed, Private O...",40.7388,6811966,-73.9851,3b630ec9cb6eee53b92cfac7f42e3bf4,[https://photos.renthop.com/2/6811966_8b83c24d...,5850,105 Lexington Avenue,high
117474,1.0,1,bc4e62116277654d4df66ab77a1152f8,2016-04-01 23:26:07,Reduced Fee!! Priced To Rent!\rLarge Newly Upd...,West End Ave,"[Common Outdoor Space, Cats Allowed, Dogs Allo...",40.7939,6811973,-73.9738,262471a6a678adb458e879b092b23b4b,[https://photos.renthop.com/2/6811973_c87c8f6d...,2745,700 West End Ave,medium
103891,1.0,1,18c5b031bad8cef779efa7e2398a42a3,2016-04-02 00:48:13,Phenomenal deal of the century!! This spacious...,E 88th street,"[Cats Allowed, Dogs Allowed, Doorman, Elevator...",40.7784,6811975,-73.9491,7c5e4fc025b70c6540d6b0e06716b9dd,[https://photos.renthop.com/2/6811975_370cb787...,2400,401 E 88th street,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,1.0,2,b2cc0f022ed6e0909559b6f4ea2c93df,2016-06-29 17:47:34,"Newly renovated, clean, quiet & very bright 2...",At Thompson St,[],40.7261,7234386,-74.0014,0bafd514443193d057e8a60e45cb7ea6,[https://photos.renthop.com/2/7234386_4c24ffa2...,3045,132 Thompson St #20,medium
26349,1.0,1,8c86e6c7f2fca184602f484168e2c9a6,2016-06-29 17:56:12,Historic conversion in Greenpoint with awesome...,"100 Dupont St, Brooklyn, NY 11222","[Cats Allowed, Dogs Allowed, Fitness Center, L...",40.7358,7234391,-73.9560,6c2a16187e6855c132bb496b875a4ef7,[https://photos.renthop.com/2/7234391_e3cb9f08...,2648,100 Dupont St,low
622,1.0,1,a61b2882bb7a66c691471523811b19b8,2016-06-29 18:14:48,"Large, sunny one bedroom apartment with a sepa...",West 45th St. and 8th Ave.,"[Cats Allowed, Dogs Allowed, Elevator, Laundry...",40.7597,7234398,-73.9890,634f618895493a04f7722f113a89947b,[https://photos.renthop.com/2/7234398_60419693...,2650,305 West 45th St.,low
34914,1.0,3,2d4de807c29963b5aac44d4300e35dc9,2016-06-29 18:30:41,LOOK NO FURTHER!!\r\rMassive three bedroom apa...,williamsburg,"[Laundry In Unit, Cats Allowed, Dogs Allowed, ...",40.7156,7234403,-73.9540,699c325b818541f314b691b76f3238d7,[https://photos.renthop.com/2/7234403_27a683b8...,3200,32 Havemeyer Street,medium


In [36]:
target_encoded = train_data_full['interest_level']
target_encoded = target_encoded.map({'low':0, 'medium':1, 'high':2})
target_encoded

Unnamed: 0,interest_level
111817,2
117995,1
114617,2
117474,1
103891,1
...,...
335,1
26349,0
622,0
34914,1


In [37]:
building_id = train_data_full['building_id']
building_id

Unnamed: 0,building_id
111817,2a21319016fe50100e0b8ebb5a4f9cf0
117995,104bfeddd65a0890b071c3a09cf81704
114617,8775706158cbc96d12dd441d42e11deb
117474,bc4e62116277654d4df66ab77a1152f8
103891,18c5b031bad8cef779efa7e2398a42a3
...,...
335,b2cc0f022ed6e0909559b6f4ea2c93df
26349,8c86e6c7f2fca184602f484168e2c9a6
622,a61b2882bb7a66c691471523811b19b8
34914,2d4de807c29963b5aac44d4300e35dc9


In [38]:
date_field = train_data_full['created'].astype('datetime64[ns]').sort_values()
date_field

Unnamed: 0,created
111817,2016-04-01 22:12:41
117995,2016-04-01 22:56:00
114617,2016-04-01 22:57:15
117474,2016-04-01 23:26:07
103891,2016-04-02 00:48:13
...,...
335,2016-06-29 17:47:34
26349,2016-06-29 17:56:12
622,2016-06-29 18:14:48
34914,2016-06-29 18:30:41


In [39]:
train_data_full['features'].map(type).value_counts()

Unnamed: 0_level_0,count
features,Unnamed: 1_level_1
<class 'list'>,49352


In [40]:
train_data_feat_exploded = train_data_full['features'].explode()
train_data_feat_exploded = train_data_feat_exploded.str.replace(
    r'[^\w]',
    '',
    regex=True,
    ).str.lower()
train_data_feat_clean = train_data_feat_exploded.groupby(level=0).agg(list)
train_data_feat_clean

Unnamed: 0,features
4,"[diningroom, prewar, laundryinbuilding, dishwa..."
6,"[doorman, elevator, laundryinbuilding, dishwas..."
9,"[doorman, elevator, laundryinbuilding, laundry..."
10,[nan]
15,"[doorman, elevator, fitnesscenter, laundryinbu..."
...,...
124000,"[elevator, dishwasher, hardwoodfloors]"
124002,"[commonoutdoorspace, catsallowed, dogsallowed,..."
124004,"[diningroom, elevator, prewar, laundryinbuildi..."
124008,"[prewar, laundryinunit, dishwasher, nofee, out..."


In [41]:
feature_list = ['Elevator', 'HardwoodFloors', 'CatsAllowed', 'DogsAllowed', 'Doorman', 'Dishwasher', 'NoFee', 'FitnessCenter', 'PreWar', 'LaundryinUnit', 'RoofDeck', 'OutdoorSpace', 'DiningRoom', 'HighSpeedInternet', 'Balcony', 'SwimmingPool', 'LaundryInBuilding', 'NewConstruction', 'Terrace']
train_data_encoded = pd.DataFrame(train_data_full['created'].astype('datetime64[ns]'))

for a_feature in feature_list:
  col_name = a_feature
  a_feature = a_feature.lower()
  train_data_encoded[col_name] = [1 if a_feature in row else 0 for row in train_data_feat_clean]

train_data_encoded = train_data_encoded.drop('created', axis=1)
train_data_encoded

Unnamed: 0,Elevator,HardwoodFloors,CatsAllowed,DogsAllowed,Doorman,Dishwasher,NoFee,FitnessCenter,PreWar,LaundryinUnit,RoofDeck,OutdoorSpace,DiningRoom,HighSpeedInternet,Balcony,SwimmingPool,LaundryInBuilding,NewConstruction,Terrace
111817,0,1,1,1,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0
117995,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0
114617,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0
117474,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
103891,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
26349,1,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
622,1,1,1,1,0,1,1,0,1,1,0,0,1,0,0,0,1,0,0
34914,0,0,0,0,0,1,1,0,1,1,0,1,0,0,0,0,0,0,0


# **Implement the next methods:**

In [42]:
def split_random_2_parts(data, test_size, rand_seed):
  train, test = train_test_split(data, test_size=test_size, random_state=rand_seed)

  return train, test

def split_random_3_parts(data, test_size, validation_size, rand_seed):
  relative_validation_size = validation_size / (1 - test_size)

  temp_train, test = train_test_split(data, test_size=test_size, random_state=rand_seed)
  train, validation = train_test_split(temp_train, test_size=relative_validation_size, random_state=rand_seed)

  return train, validation, test

def split_date_2_parts(data, date_field: pd.Series, date_split: np.datetime64):
    data = data.copy()

    data['created'] = date_field

    data = data.sort_values(by='created')

    date_split = np.datetime64(date_split)
    train = data[data['created'] < date_split].copy()
    test = data[data['created'] >= date_split].copy()

    train = train.drop(columns='created')
    test = test.drop(columns='created')

    return train, test

def split_date_3_parts(data, date_field: pd.Series, validation_date: np.datetime64, test_date: np.datetime64):
    data = data.copy()
    data = pd.DataFrame(data)

    data['created'] = date_field

    data = data.sort_values(by='created')

    val_date = np.datetime64(validation_date)
    test_date = np.datetime64(test_date)

    train = data[data['created'] < val_date].copy()
    validation = data[(data['created'] >= val_date) & (data['created'] < test_date)].copy()
    test = data[data['created'] >= test_date].copy()

    train = train.drop(columns='created')
    validation = validation.drop(columns='created')
    test = test.drop(columns='created')

    return train, validation, test

# **Implement the next cross-validation methods:**

In [43]:
def custom_kfold(data, k):
  data = pd.DataFrame(data)
  indices_len = len(data.index)
  indices = np.arange(indices_len)
  np.random.shuffle(indices)

  fold_size = indices_len // k
  remainder = indices_len % k

  start = 0
  result = []

  for i in range(k):
    current_fold_size = fold_size + (1 if i < remainder else 0)
    end = start + current_fold_size

    test_indices = indices[start:end]
    train_indices = np.setdiff1d(indices, test_indices)

    test_row = data.index[test_indices].to_numpy()
    train_row = data.index[train_indices].to_numpy()

    start = end
    result.append([train_row, test_row])

  return result

def custom_grouped_kfold(data, k, group_field):
  data = pd.DataFrame(data)
  indices_len = len(data.index)
  indices = np.arange(indices_len)

  groups_unique = group_field.unique()
  np.random.shuffle(groups_unique)

  groups_uniq_len = len(groups_unique)

  groups_per_fold = groups_uniq_len // k
  remainder = groups_uniq_len % k

  start = 0
  result = []

  for i in range(k):
    current_fold_size = groups_per_fold + (1 if i < remainder else 0)
    end = start + current_fold_size

    test_groups = groups_unique[start:end]
    train_groups = np.setdiff1d(groups_unique, test_groups)

    is_test_indices = np.isin(group_field, test_groups)

    test_indices = indices[is_test_indices]
    train_indices = indices[~is_test_indices]

    test_row = data.index[test_indices]
    train_row = data.index[train_indices]

    result.append([train_row, test_row])
    start = end

  return result

def custom_stratified_kfold(data, k, stratify_field):
  data = pd.DataFrame(data)
  indices_len = len(data.index)
  indices = np.arange(indices_len)

  stratify_field_uniq = stratify_field.unique()
  result = [[] for i in range(k)]

  for target in stratify_field_uniq:
    is_target_indices = np.isin(stratify_field, target)
    target_indices = indices[is_target_indices]

    np.random.shuffle(target_indices)

    fold_size = len(target_indices) // k
    remainder = len(target_indices) % k

    start = 0
    for i in range(k):
      current_fold_size = fold_size + (1 if i < remainder else 0)
      end = start + current_fold_size

      crnt_target_indices = target_indices[start:end]
      result[i].extend(crnt_target_indices)
      start = end

  final_result = []
  for i in range(k):
    test_indices = result[i]
    train_indices = np.setdiff1d(indices, test_indices)

    test_rows = data.index[test_indices]
    train_rows = data.index[train_indices]

    final_result.append([train_rows, test_rows])

  return final_result

def custom_time_series_split(data, k, date_field):
  data = pd.DataFrame(data)
  data[date_field] = pd.to_datetime(data[date_field])

  indices_len = len(data.index)
  indices = np.arange(indices_len)

  original_indices = data.sort_values(by=date_field).index

  fold_size = indices_len // (k + 1)
  remainder = indices_len % (k + 1)

  train_start = 0
  train_end = fold_size + (1 if remainder != 0 else 0)
  result = []
  for i in range(k):
    current_fold_size = fold_size + (1 if (i+1) < remainder else 0)
    test_end = train_end + current_fold_size

    test_indices = indices[train_end:test_end]
    train_indices = indices[train_start:train_end]

    test_rows = original_indices[test_indices]
    train_rows = original_indices[train_indices]

    result.append([train_rows, test_rows])
    train_end = test_end

  return result


In [44]:
kfold_indices = custom_kfold(data=train_data_full, k=10)
print(f'K-fold implementation:\n{kfold_indices}')

group_kfold_indices = custom_grouped_kfold(data=train_data_full, k=10, group_field=train_data_full['building_id'])
print(f'Group K-fold implementation:\n{group_kfold_indices}')

stratified_kfold_indices = custom_stratified_kfold(train_data_full, 10, train_data_full['interest_level'])
print(f'Stratified K-fold implementation:\n{stratified_kfold_indices}')

time_series_split_indices = custom_time_series_split(data=train_data_full, k=10, date_field='created')
print(f'Time Series Split implementation:\n{time_series_split_indices}')

K-fold implementation:
[[array([111817, 117995, 114617, ...,    622,  34914,  28466]), array([  7008, 108476,   6093, ...,  27060,  85015,  41080])], [array([117995, 117474, 103891, ...,  26349,  34914,  28466]), array([ 75305,  64161,  64994, ..., 105058,  39202,  57918])], [array([111817, 117995, 114617, ...,    622,  34914,  28466]), array([44468,  5216,    42, ..., 95277, 75183, 26076])], [array([111817, 117995, 114617, ...,    622,  34914,  28466]), array([  9408,  46679,   2752, ...,  15878,  13028, 108170])], [array([111817, 117995, 114617, ...,    335,  26349,    622]), array([ 86315,  22722,  85828, ..., 111666,  21605,  46842])], [array([111817, 117995, 114617, ...,    622,  34914,  28466]), array([ 19468, 103348,  75189, ...,  33785,  94433,  80833])], [array([111817, 117995, 114617, ...,    622,  34914,  28466]), array([88332, 31334,  8154, ..., 95419, 57199, 17051])], [array([111817, 117995, 114617, ...,    622,  34914,  28466]), array([ 87268,  11105,  68451, ..., 123109,

In [45]:
k = 5
X = train_data_encoded
y = target_encoded
groups = building_id.to_numpy().flatten()

kf = KFold(n_splits=k, shuffle=True, random_state=21)
print('\n\n---KFold:---\n')
for i, (train_indices, test_indices) in enumerate(kf.split(X)):
  print(f'Train data fold {i+1}:\n{X.iloc[train_indices]}\n')
  print(f'Test data fold {i+1}:\n{X.iloc[test_indices]}\n\n')

print('\n\n---Stratify kfold:---\n')
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=21)
for i, (train_indices, test_indices) in enumerate(skf.split(X, y)):
  print(f'Train data fold {i+1}:\n{X.iloc[train_indices]}\n')
  print(f'Test data fold {i+1}:\n{X.iloc[test_indices]}\n\n')

gkf = GroupKFold(n_splits=k)
print('\n\n---Grouped kfold:---\n')
for i, (train_indices, test_indices) in enumerate(gkf.split(X, groups=groups)):
  print(f'Train data fold {i+1}:\n{X.iloc[train_indices]}\n')
  print(f'Test data fold {i+1}:\n{X.iloc[test_indices]}\n\n')

tss = TimeSeriesSplit(n_splits=k)
print('\n\n---Time series split:---\n')
for i, (train_indices, test_indices) in enumerate(tss.split(X)):
  print(f'Train data fold {i+1}:\n{X.iloc[train_indices]}\n')
  print(f'Test data fold {i+1}:\n{X.iloc[test_indices]}\n\n')




---KFold:---

Train data fold 1:
        Elevator  HardwoodFloors  CatsAllowed  DogsAllowed  Doorman  \
111817         0               1            1            1        0   
117995         1               1            0            0        1   
117474         0               0            0            0        0   
103891         1               0            0            0        1   
115303         1               1            0            0        1   
...          ...             ...          ...          ...      ...   
32668          1               1            0            0        1   
36879          0               1            0            0        0   
12674          0               0            1            1        0   
335            1               1            0            0        0   
622            1               1            1            1        0   

        Dishwasher  NoFee  FitnessCenter  PreWar  LaundryinUnit  RoofDeck  \
111817           1      0          

In [46]:
n = len(date_field) - 1
val_date = date_field.iloc[int(n*0.60)]
test_date = date_field.iloc[int(n*0.80)]

X_train, X_validation, X_test = split_date_3_parts(data=X, date_field=date_field, validation_date=val_date, test_date=test_date)
y_train, y_validation, y_test = split_date_3_parts(data=y, date_field=date_field, validation_date=val_date, test_date=test_date)
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_validation)
X_test_scaled = scaler.transform(X_test)


In [47]:
pd.DataFrame(X_train_scaled)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,-1.063621,1.054577,1.035412,1.099789,-0.857279,1.176542,-0.757846,-0.608805,1.905683,-0.483105,-0.407403,-0.340665,2.915916,-0.309801,-0.259267,-0.240735,1.299665,-0.238529,-0.220425
1,0.940185,1.054577,-0.965799,-0.909265,1.166482,1.176542,1.319530,-0.608805,-0.524746,-0.483105,-0.407403,-0.340665,-0.342945,-0.309801,-0.259267,-0.240735,1.299665,-0.238529,-0.220425
2,0.940185,1.054577,-0.965799,-0.909265,1.166482,1.176542,-0.757846,-0.608805,-0.524746,2.069944,-0.407403,-0.340665,-0.342945,-0.309801,-0.259267,-0.240735,1.299665,-0.238529,-0.220425
3,-1.063621,-0.948247,-0.965799,-0.909265,-0.857279,-0.849949,-0.757846,-0.608805,-0.524746,-0.483105,-0.407403,-0.340665,-0.342945,-0.309801,-0.259267,-0.240735,-0.769429,-0.238529,-0.220425
4,0.940185,-0.948247,-0.965799,-0.909265,1.166482,-0.849949,-0.757846,1.642563,-0.524746,-0.483105,-0.407403,-0.340665,-0.342945,-0.309801,-0.259267,-0.240735,1.299665,-0.238529,-0.220425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29605,-1.063621,-0.948247,1.035412,1.099789,1.166482,-0.849949,-0.757846,-0.608805,1.905683,-0.483105,-0.407403,-0.340665,-0.342945,-0.309801,-0.259267,-0.240735,-0.769429,-0.238529,-0.220425
29606,0.940185,-0.948247,1.035412,1.099789,1.166482,-0.849949,-0.757846,-0.608805,1.905683,-0.483105,-0.407403,-0.340665,-0.342945,-0.309801,-0.259267,-0.240735,-0.769429,-0.238529,-0.220425
29607,0.940185,-0.948247,1.035412,1.099789,1.166482,-0.849949,-0.757846,1.642563,-0.524746,-0.483105,2.454569,-0.340665,-0.342945,-0.309801,-0.259267,-0.240735,-0.769429,-0.238529,-0.220425
29608,0.940185,1.054577,1.035412,1.099789,1.166482,1.176542,-0.757846,-0.608805,-0.524746,-0.483105,-0.407403,2.935432,2.915916,-0.309801,-0.259267,-0.240735,1.299665,-0.238529,-0.220425


In [48]:
X_train

Unnamed: 0,Elevator,HardwoodFloors,CatsAllowed,DogsAllowed,Doorman,Dishwasher,NoFee,FitnessCenter,PreWar,LaundryinUnit,RoofDeck,OutdoorSpace,DiningRoom,HighSpeedInternet,Balcony,SwimmingPool,LaundryInBuilding,NewConstruction,Terrace
111817,0,1,1,1,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0
117995,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0
114617,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0
117474,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
103891,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48865,0,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
45194,1,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
74765,1,0,1,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0
60726,1,1,1,1,1,1,0,0,0,0,0,1,1,0,0,0,1,0,0


In [49]:
y_train

Unnamed: 0,interest_level
111817,2
117995,1
114617,2
117474,1
103891,1
...,...
48865,1
45194,1
74765,0
60726,1


In [50]:
lasso = Lasso(alpha=0.00001)

lasso.fit(X_train_scaled, y_train)

val_predict = lasso.predict(pd.DataFrame(X_val_scaled))
print(val_predict)

test_predict = lasso.predict(pd.DataFrame(X_test_scaled))
print(test_predict)

[0.38368703 0.38886074 0.37966573 ... 0.39867475 0.40797646 0.39991764]
[0.39001893 0.36529227 0.39096026 ... 0.37809367 0.36560621 0.39299774]


In [51]:
nan_ratio = X_train.isna().mean()
nan_ratio

Unnamed: 0,0
Elevator,0.0
HardwoodFloors,0.0
CatsAllowed,0.0
DogsAllowed,0.0
Doorman,0.0
Dishwasher,0.0
NoFee,0.0
FitnessCenter,0.0
PreWar,0.0
LaundryinUnit,0.0


In [52]:
y_train_for_corr = pd.Series(y_train['interest_level'])

correlations = X_train.corrwith(y_train_for_corr)
correlations

Unnamed: 0,0
Elevator,0.000409
HardwoodFloors,-0.000347
CatsAllowed,0.003482
DogsAllowed,0.003459
Doorman,0.004081
Dishwasher,-0.003225
NoFee,-0.004621
FitnessCenter,-0.00209
PreWar,-0.010196
LaundryinUnit,-0.002528


In [53]:
top10_features = correlations.abs().sort_values(ascending=False).head(10)
top10_features

Unnamed: 0,0
SwimmingPool,0.011333
PreWar,0.010196
Terrace,0.009912
DiningRoom,0.006093
OutdoorSpace,0.005338
NoFee,0.004621
Doorman,0.004081
CatsAllowed,0.003482
DogsAllowed,0.003459
Dishwasher,0.003225


In [54]:
top_10_list = top10_features.index.tolist()
X_train_top10 = X_train[top_10_list]
X_val_top10 = X_validation[top_10_list]

model_simple = LogisticRegression(solver='liblinear', random_state=42)
model_simple.fit(X_train_top10, y_train.values.ravel())

# 3. Predict & Score
val_preds = model_simple.predict(X_val_top10)
print(f"Accuracy with Top 10 Correlation Features: {accuracy_score(y_validation, val_preds):.4f}")

Accuracy with Top 10 Correlation Features: 0.6967


In [55]:
model_perm = LogisticRegression(solver='liblinear', random_state=21, class_weight='balanced', C=100)

model_perm.fit(X_train, y_train.values.ravel())

result = permutation_importance(
    model_perm,
    X_validation,
    y_validation.values.ravel(),
    n_repeats=5,
    random_state=21,
    n_jobs=-1,
    scoring='neg_log_loss'
)

In [86]:
perm_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': result.importances_mean
})
perm_df.sort_values(by='importance', ascending=False).head(10)

Unnamed: 0,feature,importance
2,CatsAllowed,0.000528
15,SwimmingPool,0.0003
11,OutdoorSpace,0.0002
3,DogsAllowed,0.000183
4,Doorman,0.000158
18,Terrace,0.000127
5,Dishwasher,0.000115
10,RoofDeck,6.4e-05
1,HardwoodFloors,3.8e-05
12,DiningRoom,2.2e-05


In [57]:
# 1. Look at what the model is actually predicting
print("Unique predictions:", np.unique(val_preds))
print("Prediction counts:", pd.Series(val_preds).value_counts())

Unique predictions: [0]
Prediction counts: 0    9870
Name: count, dtype: int64


In [61]:
masker = shap.maskers.Independent(data=X_train)
explainer = shap.LinearExplainer(model=model_perm, masker=masker)

In [62]:
shap_values = explainer(X_validation)

In [75]:
shap_values.values

array([[[-0.01234586,  0.01713357, -0.00573714],
        [ 0.0020677 ,  0.00511664, -0.01886958],
        [ 0.0125901 , -0.03898489,  0.05440329],
        ...,
        [-0.01292857,  0.01961422, -0.01010007],
        [ 0.00150419,  0.00019064, -0.00457002],
        [ 0.00231074, -0.0002684 , -0.00596463]],

       [[ 0.00857933, -0.01190638,  0.00398683],
        [ 0.0020677 ,  0.00511664, -0.01886958],
        [ 0.0125901 , -0.03898489,  0.05440329],
        ...,
        [ 0.01939285, -0.02942133,  0.0151501 ],
        [ 0.00150419,  0.00019064, -0.00457002],
        [ 0.00231074, -0.0002684 , -0.00596463]],

       [[ 0.00857933, -0.01190638,  0.00398683],
        [ 0.0020677 ,  0.00511664, -0.01886958],
        [-0.0107249 ,  0.03320935, -0.04634354],
        ...,
        [-0.01292857,  0.01961422, -0.01010007],
        [ 0.00150419,  0.00019064, -0.00457002],
        [ 0.00231074, -0.0002684 , -0.00596463]],

       ...,

       [[-0.01234586,  0.01713357, -0.00573714],
        [-0

In [83]:
shap_importance = np.abs(shap_values.values).mean(axis=(0, 2))
top10_shap = pd.DataFrame(
    shap_importance,
    X_validation.columns
    ).sort_values(by=0, ascending=False).head(10)

top10_shap

Unnamed: 0,0
DogsAllowed,0.037289
CatsAllowed,0.032534
Doorman,0.020605
FitnessCenter,0.020604
SwimmingPool,0.019484
LaundryInBuilding,0.016953
NoFee,0.016673
PreWar,0.016274
OutdoorSpace,0.016232
Dishwasher,0.012186


In [93]:
elastic = ElasticNet(random_state=21)

param_grid = {
    'alpha': [0.00001, 0.001, 0.1, 1, 10],
    'l1_ratio': [0.01, 0.1, 0.3, 0.6, 1.0]
}

grid_search = GridSearchCV(
    estimator=elastic,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

print("--- Starting Grid Search ---")
grid_search.fit(X_train_scaled, y_train.values.ravel())

--- Starting Grid Search ---
Fitting 3 folds for each of 25 candidates, totalling 75 fits


In [94]:
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score (Negative MSE): {grid_search.best_score_:.4f}")

# Save the best model for later comparison
best_grid_model = grid_search.best_estimator_
best_grid_model

Best Parameters: {'alpha': 0.1, 'l1_ratio': 0.1}
Best Score (Negative MSE): -0.3977


In [98]:
param_dist = {
    'alpha': uniform(0, 1),
    'l1_ratio': uniform(0, 1)
}

random_search = RandomizedSearchCV(
    estimator=elastic,
    param_distributions=param_dist,
    n_iter=100,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=21
)

print("--- Starting Randomized Search ---")
random_search.fit(X_train_scaled, y_train.values.ravel())

--- Starting Randomized Search ---
Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [99]:
print(f"Grid Best: {grid_search.best_score_:.4f}")
print(f"Random Best: {random_search.best_score_:.4f}")
print(f"Random Parameters: {random_search.best_params_}")

Grid Best: -0.3977
Random Best: -0.3977
Random Parameters: {'alpha': np.float64(0.08833486468591223), 'l1_ratio': np.float64(0.08828575481211265)}


In [103]:
# 1. Define the Mission (The Objective Function)
def objective(trial):
    # A. Ask Optuna for suggestions
    # "Pick an alpha between 0.0 and 1.0"
    alpha = trial.suggest_float('alpha', 0.0, 1.0)

    # "Pick an l1_ratio between 0.0 and 1.0"
    l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0)

    # B. Setup the Model with these specific suggestions
    model = ElasticNet(
        alpha=alpha,
        l1_ratio=l1_ratio,
        random_state=42
    )

    # C. Run the Test (Cross Validation)
    # We use 'neg_mean_squared_error' again (Higher is better, close to 0)
    scores = cross_val_score(
        model,
        X_train_scaled,
        y_train,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )

    # D. Return the average score (The Report)
    return scores.mean()

# 2. Create the Study (The Manager)
# direction='maximize' because we want the score close to 0 (e.g., -0.3 is better than -100)
study = optuna.create_study(direction='maximize')

# 3. Start the Optimization
print("--- Starting Optuna Search ---")
study.optimize(objective, n_trials=50) # Run 50 experiments

# 4. Results
print(f"Best Score: {study.best_value:.4f}")
print(f"Best Parameters: {study.best_params}")

[I 2026-01-31 08:53:01,386] A new study created in memory with name: no-name-0e6c21a4-eb24-4ee4-a15b-d244b42b5413


--- Starting Optuna Search ---


[I 2026-01-31 08:53:08,946] Trial 0 finished with value: -0.3976536052779343 and parameters: {'alpha': 0.41116062866688163, 'l1_ratio': 0.8008342665927842}. Best is trial 0 with value: -0.3976536052779343.
[I 2026-01-31 08:53:09,292] Trial 1 finished with value: -0.39766194511379144 and parameters: {'alpha': 0.015703645474060113, 'l1_ratio': 0.25709476786418184}. Best is trial 0 with value: -0.3976536052779343.
[I 2026-01-31 08:53:09,608] Trial 2 finished with value: -0.3976536052779343 and parameters: {'alpha': 0.33411016734622445, 'l1_ratio': 0.69123807481314}. Best is trial 0 with value: -0.3976536052779343.
[I 2026-01-31 08:53:09,887] Trial 3 finished with value: -0.3976536052779343 and parameters: {'alpha': 0.47494972111684675, 'l1_ratio': 0.17404101981491427}. Best is trial 0 with value: -0.3976536052779343.
[I 2026-01-31 08:53:10,135] Trial 4 finished with value: -0.3976536052779343 and parameters: {'alpha': 0.6446388580478625, 'l1_ratio': 0.6430584772009663}. Best is trial 0 wi

Best Score: -0.3976
Best Parameters: {'alpha': 0.3314418535242491, 'l1_ratio': 0.01364356874006184}


## Final training the data:

In [107]:
scaler = StandardScaler()

X_train_final = pd.concat([X_train, X_validation], axis=0)
X_train_final_scaled = scaler.fit_transform(X_train_final)

y_train_final = pd.concat([y_train, y_validation], axis=0)

X_test_scaled = scaler.transform(X_test)
y_test = y_test

model = ElasticNet(alpha=0.3314, l1_ratio=0.01364)
model.fit(X_train_final_scaled, y_train_final.values.ravel())

In [127]:
y_predict = model.predict(X_test_scaled)
final_error = mean_squared_error(y_test, y_predict)
print(f'Final MSE result: {final_error}')

Final MSE result: 0.3731936061647032
