In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error
from sklearn.model_selection import GridSearchCV
import os
import json
import shutil

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

## 1. Derive an analytical solution to the regression problem. Use a vector form of the equation.
  ${\beta }=(X^{T}X)^{-1}X^{T}Y$
## 2. What changes in the solution when L1 and L2 regularizations are added to the loss function.
L2 added: $w = (X^T X + \lambda I)^{-1} X^T y$

L1 added: $L_{Lasso}(w) = ||Xw - y||^2 + \lambda \sum |w_i|$
## 3. Explain why L1 regularization is often used to select features. Why are there many weights equal to 0 after the model is fit?
L2 (Ridge) penalizes based on the square of the weight ($w^2$). As $w$ gets close to zero, the penalty becomes tiny ($0.01^2 = 0.0001$). The "force" pushing it toward zero disappears, so the weight hovers just above zero.

L1 (Lasso) penalizes based on the absolute value ($|w|$). The gradient is constant ($\pm \lambda$). Even if $w$ is tiny (0.00001), Lasso pushes it toward zero with the exact same force as if it were huge. This pushes the weight all the way to 0.
## 4. Explain how you can use the same models (Linear regression, Ridge, etc.) but make it possible to fit nonlinear dependencies.
We can model nonlinear relationships (curves) using linear regression by changing the Data, not the Model. This is called Basis Expansion or Feature Engineering.

Original Input: $[x]$Transformed Input: $[x, x^2]$

### Download the dataset from kaggle: 

In [None]:
# Install kaggle CLI if needed
if shutil.which('kaggle') is None:
    %pip install -q kaggle 

kaggle_json_path = os.path.expanduser("~/.kaggle/kaggle.json")

# Prompt for credentials
user = input("Kaggle username: ").strip()
key = input("Kaggle API key: ").strip()

os.makedirs(os.path.dirname(kaggle_json_path), exist_ok=True)
with open(kaggle_json_path, "w") as f:
    json.dump({"username": user, "key": key}, f)
    
# Secure the file so the Kaggle API doesn't complain
os.chmod(kaggle_json_path, 0o600)

# Download the competition dataset into ./data
!kaggle competitions download -c two-sigma-connect-rental-listing-inquiries -p ./data

# Unzip the downloaded file and remove the zip archive to save space
!unzip -q -o ./data/two-sigma-connect-rental-listing-inquiries.zip -d ./data
!rm ./data/two-sigma-connect-rental-listing-inquiries.zip

print("Dataset downloaded and unzipped successfully!")

In [None]:
! unzip ./data/train.json.zip -d ./data/
! unzip ./data/test.json.zip -d ./data/

### Read the data:

In [3]:
train_json_local = './data/train.json'
test_json_local = './data/test.json'

In [4]:
# train_data = pd.read_json(train_json_drive)
train_data = pd.read_json(train_json_local)
train_data

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address,interest_level
4,1.0,1,8579a0b0d54db803821a35a4a615e97a,2016-06-16 05:55:27,Spacious 1 Bedroom 1 Bathroom in Williamsburg!...,145 Borinquen Place,"[Dining Room, Pre-War, Laundry in Building, Di...",40.7108,7170325,-73.9539,a10db4590843d78c784171a107bdacb4,[https://photos.renthop.com/2/7170325_3bb5ac84...,2400,145 Borinquen Place,medium
6,1.0,2,b8e75fc949a6cd8225b455648a951712,2016-06-01 05:44:33,BRAND NEW GUT RENOVATED TRUE 2 BEDROOMFind you...,East 44th,"[Doorman, Elevator, Laundry in Building, Dishw...",40.7513,7092344,-73.9722,955db33477af4f40004820b4aed804a0,[https://photos.renthop.com/2/7092344_7663c19a...,3800,230 East 44th,low
9,1.0,2,cd759a988b8f23924b5a2058d5ab2b49,2016-06-14 15:19:59,**FLEX 2 BEDROOM WITH FULL PRESSURIZED WALL**L...,East 56th Street,"[Doorman, Elevator, Laundry in Building, Laund...",40.7575,7158677,-73.9625,c8b10a317b766204f08e613cef4ce7a0,[https://photos.renthop.com/2/7158677_c897a134...,3495,405 East 56th Street,medium
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,medium
15,1.0,0,bfb9405149bfff42a92980b594c28234,2016-06-28 03:50:23,Over-sized Studio w abundant closets. Availabl...,East 34th Street,"[Doorman, Elevator, Fitness Center, Laundry in...",40.7439,7225292,-73.9743,2c3b41f588fbb5234d8a1e885a436cfa,[https://photos.renthop.com/2/7225292_901f1984...,2795,340 East 34th Street,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124000,1.0,3,92bbbf38baadfde0576fc496bd41749c,2016-04-05 03:58:33,There is 700 square feet of recently renovated...,W 171 Street,"[Elevator, Dishwasher, Hardwood Floors]",40.8433,6824800,-73.9396,a61e21da3ba18c7a3d54cfdcc247e1f8,[https://photos.renthop.com/2/6824800_0682be16...,2800,620 W 171 Street,low
124002,1.0,2,5565db9b7cba3603834c4aa6f2950960,2016-04-02 02:25:31,"2 bedroom apartment with updated kitchen, rece...",Broadway,"[Common Outdoor Space, Cats Allowed, Dogs Allo...",40.8198,6813268,-73.9578,8f90e5e10e8a2d7cf997f016d89230eb,[https://photos.renthop.com/2/6813268_1e6fcc32...,2395,3333 Broadway,medium
124004,1.0,1,67997a128056ee1ed7d046bbb856e3c7,2016-04-26 05:42:03,No Brokers Fee * Never Lived 1 Bedroom 1 Bathr...,210 Brighton 15th St,"[Dining Room, Elevator, Pre-War, Laundry in Bu...",40.5765,6927093,-73.9554,a10db4590843d78c784171a107bdacb4,[https://photos.renthop.com/2/6927093_93a52104...,1850,210 Brighton 15th St,medium
124008,1.0,2,3c0574a740154806c18bdf1fddd3d966,2016-04-19 02:47:33,Wonderful Bright Chelsea 2 Bedroom apartment o...,West 21st Street,"[Pre-War, Laundry in Unit, Dishwasher, No Fee,...",40.7448,6892816,-74.0017,c3cd45f4381ac371507090e9ffabea80,[https://photos.renthop.com/2/6892816_1a8d087a...,4195,350 West 21st Street,medium


In [5]:
# test_data = pd.read_json(test_json_drive)
test_data = pd.read_json(test_json_local)
test_data

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address
0,1.0,1,79780be1514f645d7e6be99a3de696c5,2016-06-11 05:29:41,Large with awesome terrace--accessible via bed...,Suffolk Street,"[Elevator, Laundry in Building, Laundry in Uni...",40.7185,7142618,-73.9865,b1b1852c416d78d7765d746cb1b8921f,[https://photos.renthop.com/2/7142618_1c45a2c8...,2950,99 Suffolk Street
1,1.0,2,0,2016-06-24 06:36:34,Prime Soho - between Bleecker and Houston - Ne...,Thompson Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7278,7210040,-74.0000,d0b5648017832b2427eeb9956d966a14,[https://photos.renthop.com/2/7210040_d824cc71...,2850,176 Thompson Street
2,1.0,0,0,2016-06-17 01:23:39,Spacious studio in Prime Location. Cleanbuildi...,Sullivan Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7260,7174566,-74.0026,e6472c7237327dd3903b3d6f6a94515a,[https://photos.renthop.com/2/7174566_ba3a35c5...,2295,115 Sullivan Street
3,1.0,2,f9c826104b91d868e69bd25746448c0c,2016-06-21 05:06:02,For immediate access call Bryan.<br /><br />Bo...,Jones Street,"[Hardwood Floors, Dogs Allowed, Cats Allowed]",40.7321,7191391,-74.0028,41735645e0f8f13993c42894023f8e58,[https://photos.renthop.com/2/7191391_8c2f2d49...,2900,23 Jones Street
5,1.0,1,81062936e12ee5fa6cd2b965698e17d5,2016-06-16 07:24:27,Beautiful TRUE 1 bedroom in a luxury building ...,Exchange Place,"[Roof Deck, Doorman, Elevator, Fitness Center,...",40.7054,7171695,-74.0095,a742cf7dd3b2627d83417bc3a1b3ec96,[https://photos.renthop.com/2/7171695_089ffee2...,3254,20 Exchange Place
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124003,1.0,1,bd863d28a6b119ac3bc72d5f27b07f24,2016-04-26 16:09:55,BRAND NEW TO MARKET 1BDR \r107TH AND LEXINGTON...,150 EAST 107TH STREET,[],40.7925,6928108,-73.9454,453d46f8113e1f2c730c2ee5a4469c71,[https://photos.renthop.com/2/6928108_231eb983...,1700,158 EAST 107TH STREET
124005,1.0,2,9174b75c0cd978eb0e5aa93afbad754b,2016-04-21 05:06:19,Convertible 2BR apartment features a brand new...,E 33rd St.,"[Doorman, Elevator, Laundry in Building, Dishw...",40.7456,6906674,-73.9797,2983e45f7e0ad87d677dacd13e362785,[https://photos.renthop.com/2/6906674_9fe899a8...,4195,141 E 33rd St.
124006,1.0,0,0,2016-04-20 01:31:52,"Let's get you in to see this $2,400/mo, recent...",Lexington Avenue,"[Dogs Allowed, Cats Allowed]",40.7416,6897967,-73.9829,e6472c7237327dd3903b3d6f6a94515a,[],2400,95 Lexington Avenue
124007,2.0,2,c90c010e5505365676538e64d02aa1e0,2016-04-08 02:26:45,CooperCooper.com :: Web ID #171357; Access 100...,Park Avenue,"[Doorman, Elevator, Cats Allowed, Dogs Allowed]",40.7485,6842183,-73.9800,6e5c10246156ae5bdcd9b487ca99d96a,[https://photos.renthop.com/2/6842183_b1fe51f4...,6895,41 Park Avenue


## **Training data preparation:**

In [6]:
train_features_names = train_data['features']
train_features_names

4         [Dining Room, Pre-War, Laundry in Building, Di...
6         [Doorman, Elevator, Laundry in Building, Dishw...
9         [Doorman, Elevator, Laundry in Building, Laund...
10                                                       []
15        [Doorman, Elevator, Fitness Center, Laundry in...
                                ...                        
124000              [Elevator, Dishwasher, Hardwood Floors]
124002    [Common Outdoor Space, Cats Allowed, Dogs Allo...
124004    [Dining Room, Elevator, Pre-War, Laundry in Bu...
124008    [Pre-War, Laundry in Unit, Dishwasher, No Fee,...
124009    [Dining Room, Elevator, Laundry in Building, D...
Name: features, Length: 49352, dtype: object

In [7]:
cleaned_features_names = (train_features_names
                    .astype(str)
                    .str.replace(r'[\[\]\*\'\"]', '', regex=True)
).str.split(',')

exploded_list = cleaned_features_names.explode().str.strip()
train_features_names = exploded_list.groupby(level=0).agg(list)
train_features_names

4         [Dining Room, Pre-War, Laundry in Building, Di...
6         [Doorman, Elevator, Laundry in Building, Dishw...
9         [Doorman, Elevator, Laundry in Building, Laund...
10                                                       []
15        [Doorman, Elevator, Fitness Center, Laundry in...
                                ...                        
124000              [Elevator, Dishwasher, Hardwood Floors]
124002    [Common Outdoor Space, Cats Allowed, Dogs Allo...
124004    [Dining Room, Elevator, Pre-War, Laundry in Bu...
124008    [Pre-War, Laundry in Unit, Dishwasher, No Fee,...
124009    [Dining Room, Elevator, Laundry in Building, D...
Name: features, Length: 49352, dtype: object

In [8]:
single_list_features_names = [item for sublist in train_features_names for item in sublist]
single_list_features_names[:30]

['Dining Room',
 'Pre-War',
 'Laundry in Building',
 'Dishwasher',
 'Hardwood Floors',
 'Dogs Allowed',
 'Cats Allowed',
 'Doorman',
 'Elevator',
 'Laundry in Building',
 'Dishwasher',
 'Hardwood Floors',
 'No Fee',
 'Doorman',
 'Elevator',
 'Laundry in Building',
 'Laundry in Unit',
 'Dishwasher',
 'Hardwood Floors',
 '',
 'Doorman',
 'Elevator',
 'Fitness Center',
 'Laundry in Building',
 'Doorman',
 'Elevator',
 'Loft',
 'Dishwasher',
 'Hardwood Floors',
 'No Fee',
 'Fireplace',
 'Laundry in Unit',
 'Dishwasher',
 'Hardwood Floors',
 'No Fee',
 'Elevator',
 'Laundry in Building',
 'Dishwasher',
 'Hardwood Floors',
 'No Fee',
 'Hardwood Floors',
 'Cats Allowed',
 'Dogs Allowed',
 'Doorman',
 'Elevator',
 'Laundry in Building',
 'Dogs Allowed',
 'Cats Allowed',
 'Roof Deck',
 'Doorman',
 'Elevator',
 'Fitness Center',
 'Pre-War',
 'Laundry in Building',
 'High Speed Internet',
 'Dishwasher',
 'Hardwood Floors',
 'No Fee',
 'Dogs Allowed',
 'Cats Allowed',
 'Swimming Pool',
 'Roof Deck

In [9]:
num_of_uniques_items = len(set(single_list_features_names))
num_of_uniques_items

1555

In [10]:
counted_features_names = Counter(single_list_features_names)
top_20_features = counted_features_names.most_common(20)
top_20_features

[('Elevator', 25915),
 ('Cats Allowed', 23540),
 ('Hardwood Floors', 23527),
 ('Dogs Allowed', 22035),
 ('Doorman', 20898),
 ('Dishwasher', 20426),
 ('No Fee', 18062),
 ('Laundry in Building', 16344),
 ('Fitness Center', 13252),
 ('Pre-War', 9148),
 ('Laundry in Unit', 8738),
 ('Roof Deck', 6542),
 ('Outdoor Space', 5268),
 ('Dining Room', 5136),
 ('High Speed Internet', 4299),
 ('', 3218),
 ('Balcony', 2992),
 ('Swimming Pool', 2730),
 ('Laundry In Building', 2593),
 ('New Construction', 2559)]

In [12]:
train_features = pd.DataFrame(train_features_names)

# for feature_tuple in top_20_features:
#   feature_name = feature_tuple[0]
#   train_features[feature_name] = train_features['features'].apply(
#       lambda x: 1 if feature_name in x else 0
#       )

for feature_tuple in top_20_features:
    feature_name = feature_tuple[0]
    
    # .str.contains is MUCH faster than .apply(lambda)
    # .astype(int) converts True/False to 1/0
    train_features[feature_name] = train_features['features'].str.contains(
        feature_name, regex=False, na=False
    ).astype(int)

In [13]:
train_features['bathrooms'] = train_data['bathrooms']
train_features['bedrooms'] = train_data['bedrooms']
train_features

Unnamed: 0,features,Elevator,Cats Allowed,Hardwood Floors,Dogs Allowed,Doorman,Dishwasher,No Fee,Laundry in Building,Fitness Center,...,Outdoor Space,Dining Room,High Speed Internet,Unnamed: 15,Balcony,Swimming Pool,Laundry In Building,New Construction,bathrooms,bedrooms
4,"[Dining Room, Pre-War, Laundry in Building, Di...",0,1,1,1,0,1,0,1,0,...,0,1,0,0,0,0,0,0,1.0,1
6,"[Doorman, Elevator, Laundry in Building, Dishw...",1,0,1,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,1.0,2
9,"[Doorman, Elevator, Laundry in Building, Laund...",1,0,1,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,1.0,2
10,[],0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1.5,3
15,"[Doorman, Elevator, Fitness Center, Laundry in...",1,0,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124000,"[Elevator, Dishwasher, Hardwood Floors]",1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1.0,3
124002,"[Common Outdoor Space, Cats Allowed, Dogs Allo...",1,1,0,1,1,0,1,0,0,...,0,0,0,0,0,0,1,0,1.0,2
124004,"[Dining Room, Elevator, Pre-War, Laundry in Bu...",1,1,1,1,0,1,1,1,0,...,0,1,0,0,0,0,0,0,1.0,1
124008,"[Pre-War, Laundry in Unit, Dishwasher, No Fee,...",0,0,0,0,0,1,1,0,0,...,1,0,0,0,0,0,0,0,1.0,2


In [14]:
basic_features = ['bathrooms', 'bedrooms']
feature_list = [feature_tuple[0] for feature_tuple in top_20_features]
feature_list = feature_list + basic_features
feature_list

['Elevator',
 'Cats Allowed',
 'Hardwood Floors',
 'Dogs Allowed',
 'Doorman',
 'Dishwasher',
 'No Fee',
 'Laundry in Building',
 'Fitness Center',
 'Pre-War',
 'Laundry in Unit',
 'Roof Deck',
 'Outdoor Space',
 'Dining Room',
 'High Speed Internet',
 '',
 'Balcony',
 'Swimming Pool',
 'Laundry In Building',
 'New Construction',
 'bathrooms',
 'bedrooms']

## **Testing data preparation:**

In [15]:
test_features_names = test_data['features']
test_features_names

0         [Elevator, Laundry in Building, Laundry in Uni...
1                     [Pre-War, Dogs Allowed, Cats Allowed]
2                     [Pre-War, Dogs Allowed, Cats Allowed]
3             [Hardwood Floors, Dogs Allowed, Cats Allowed]
5         [Roof Deck, Doorman, Elevator, Fitness Center,...
                                ...                        
124003                                                   []
124005    [Doorman, Elevator, Laundry in Building, Dishw...
124006                         [Dogs Allowed, Cats Allowed]
124007      [Doorman, Elevator, Cats Allowed, Dogs Allowed]
124010    [Garden/Patio, Laundry in Unit, Dishwasher, Ha...
Name: features, Length: 74659, dtype: object

In [16]:
cleaned_features_names_test = (test_features_names
                    .astype(str)
                    .str.replace(r'[\[\]\*\'\"]', '', regex=True)
).str.split(',')

exploded_list = cleaned_features_names_test.explode().str.strip()
test_features_names = exploded_list.groupby(level=0).agg(list)
test_features_names

0         [Elevator, Laundry in Building, Laundry in Uni...
1                     [Pre-War, Dogs Allowed, Cats Allowed]
2                     [Pre-War, Dogs Allowed, Cats Allowed]
3             [Hardwood Floors, Dogs Allowed, Cats Allowed]
5         [Roof Deck, Doorman, Elevator, Fitness Center,...
                                ...                        
124003                                                   []
124005    [Doorman, Elevator, Laundry in Building, Dishw...
124006                         [Dogs Allowed, Cats Allowed]
124007      [Doorman, Elevator, Cats Allowed, Dogs Allowed]
124010    [Garden/Patio, Laundry in Unit, Dishwasher, Ha...
Name: features, Length: 74659, dtype: object

In [17]:
single_list_features_names_test = [item for sublist in test_features_names for item in sublist]
single_list_features_names_test[:30]

['Elevator',
 'Laundry in Building',
 'Laundry in Unit',
 'Dishwasher',
 'Hardwood Floors',
 'Outdoor Space',
 'Pre-War',
 'Dogs Allowed',
 'Cats Allowed',
 'Pre-War',
 'Dogs Allowed',
 'Cats Allowed',
 'Hardwood Floors',
 'Dogs Allowed',
 'Cats Allowed',
 'Roof Deck',
 'Doorman',
 'Elevator',
 'Fitness Center',
 'Pre-War',
 'Laundry in Building',
 'High Speed Internet',
 'Wheelchair Access',
 'Dogs Allowed',
 'Cats Allowed',
 'Cats Allowed',
 'Dogs Allowed',
 'No Fee',
 'Doorman',
 'Elevator',
 'Fitness Center',
 'Laundry In Building',
 'Swimming Pool',
 'Roof Deck',
 'Laundry in Building',
 'Dishwasher',
 'Hardwood Floors',
 'Elevator',
 'Cats Allowed',
 'Dogs Allowed',
 'Doorman',
 'Elevator',
 'Laundry in Building',
 'Hardwood Floors',
 'No Fee',
 'No Fee',
 'Fitness Center',
 'Cats Allowed',
 'Dogs Allowed',
 '',
 'Swimming Pool',
 'Roof Deck',
 'Doorman',
 'Elevator',
 'Fitness Center',
 'Pre-War',
 'Laundry in Building',
 'Laundry in Unit',
 'Dishwasher',
 'Hardwood Floors',
 'N

In [18]:
counted_features_names_test = Counter(single_list_features_names_test)
top_20_features_test = counted_features_names_test.most_common(20)
top_20_features_test

[('Elevator', 39041),
 ('Cats Allowed', 35654),
 ('Hardwood Floors', 35544),
 ('Dogs Allowed', 33172),
 ('Doorman', 31430),
 ('Dishwasher', 30639),
 ('No Fee', 27356),
 ('Laundry in Building', 24612),
 ('Fitness Center', 20155),
 ('Pre-War', 13960),
 ('Laundry in Unit', 13142),
 ('Roof Deck', 9900),
 ('Outdoor Space', 8136),
 ('Dining Room', 7673),
 ('High Speed Internet', 6323),
 ('', 4917),
 ('Balcony', 4647),
 ('Swimming Pool', 4422),
 ('Laundry In Building', 3910),
 ('New Construction', 3807)]

In [19]:
test_features = pd.DataFrame(test_features_names)

for feature_tuple in top_20_features_test:
  feature_name = feature_tuple[0]
  test_features[feature_name] = test_features['features'].apply(
      lambda x: 1 if feature_name in x else 0
      )
test_features['bathrooms'] = test_data['bathrooms']
test_features['bedrooms'] = test_data['bedrooms']
test_features

Unnamed: 0,features,Elevator,Cats Allowed,Hardwood Floors,Dogs Allowed,Doorman,Dishwasher,No Fee,Laundry in Building,Fitness Center,...,Outdoor Space,Dining Room,High Speed Internet,Unnamed: 15,Balcony,Swimming Pool,Laundry In Building,New Construction,bathrooms,bedrooms
0,"[Elevator, Laundry in Building, Laundry in Uni...",1,0,1,0,0,1,0,1,0,...,1,0,0,0,0,0,0,0,1.0,1
1,"[Pre-War, Dogs Allowed, Cats Allowed]",0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.0,2
2,"[Pre-War, Dogs Allowed, Cats Allowed]",0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.0,0
3,"[Hardwood Floors, Dogs Allowed, Cats Allowed]",0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.0,2
5,"[Roof Deck, Doorman, Elevator, Fitness Center,...",1,1,0,1,1,0,0,1,1,...,0,0,1,0,0,0,0,0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124003,[],0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1.0,1
124005,"[Doorman, Elevator, Laundry in Building, Dishw...",1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,1.0,2
124006,"[Dogs Allowed, Cats Allowed]",0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.0,0
124007,"[Doorman, Elevator, Cats Allowed, Dogs Allowed]",1,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,2.0,2


In [20]:
basic_features = ['bathrooms', 'bedrooms']
feature_list_test = [feature_tuple[0] for feature_tuple in top_20_features_test]
feature_list_test = feature_list_test + basic_features
feature_list_test

['Elevator',
 'Cats Allowed',
 'Hardwood Floors',
 'Dogs Allowed',
 'Doorman',
 'Dishwasher',
 'No Fee',
 'Laundry in Building',
 'Fitness Center',
 'Pre-War',
 'Laundry in Unit',
 'Roof Deck',
 'Outdoor Space',
 'Dining Room',
 'High Speed Internet',
 '',
 'Balcony',
 'Swimming Pool',
 'Laundry In Building',
 'New Construction',
 'bathrooms',
 'bedrooms']

## **Model implementation:**

In [21]:
X_train = train_features[feature_list]
y_train = train_data['price']
X_test = test_features[feature_list_test]
y_test = test_data['price']

In [22]:
class LinearRegressionCustom:
  def __init__(self):
    self.learning_rate = None
    self.n_epochs = None
    self.weights = None
    self.bias = None

  def fit_analytical(self, X, y):
    X = np.array(X)
    y = np.array(y)
    ones = np.ones((len(X), 1))
    X_b = np.c_[ones, X]
    w_full = np.linalg.inv(np.dot(X_b.T, X_b)).dot(X_b.T).dot(y)
    self.weights = w_full[1:]
    self.bias = w_full[0]

  def fit_bgd(self, X, y, learning_rate=0.001, n_epochs=500):
    self.learning_rate = learning_rate
    self.n_epochs = n_epochs

    X = np.array(X)
    y = np.array(y)
    n_samples, n_features = X.shape
    self.weights = np.zeros(n_features)
    self.bias = 0

    for epoch in range(self.n_epochs):
      y_pred = np.dot(X, self.weights) + self.bias
      error = y_pred - y
      dw = (2/n_samples) * np.dot(X.T, error)
      db = (2/n_samples) * np.sum(error)

      self.weights -= self.learning_rate * dw
      self.bias -= self.learning_rate * db

  def fit_sgd(self, X, y, learning_rate=0.001, n_epochs=500, penalty=None, alpha=0.01, l1_ratio=0.5):
    self.learning_rate = learning_rate
    self.n_epochs = n_epochs

    X = np.array(X)
    y = np.array(y)
    n_samples, n_features = X.shape
    self.weights = np.zeros(n_features)
    self.bias = 0
    np.random.seed(42)

    for epoch in range(self.n_epochs):
      indices = np.random.permutation(n_samples)
      X_shuffled = X[indices]
      y_shuffled = y[indices]

      for i in range(n_samples):
        xi = X_shuffled[i]
        yi = y_shuffled[i]

        y_pred = np.dot(xi.T, self.weights) + self.bias
        error = y_pred - yi

        dw = 2 * error * xi
        db = 2 * error

        if penalty == 'l2': # Ridge
            dw += 2 * alpha * self.weights

        elif penalty == 'l1': # Lasso
            dw += alpha * np.sign(self.weights)

        elif penalty == 'elasticnet': # Both
            l1_part = alpha * l1_ratio * np.sign(self.weights)
            l2_part = alpha * (1 - l1_ratio) * 2 * self.weights
            dw += l1_part + l2_part

        self.weights -= self.learning_rate * dw
        self.bias -= self.learning_rate * db

  def predict(self, X):
    X = np.array(X)
    return np.dot(X, self.weights) + self.bias

## **Testing Metrics:**

In [23]:
def r2_score(y_true, y_pred):
  y_true = np.array(y_true)
  y_pred = np.array(y_pred)
  ss_res = np.sum((y_true - y_pred) ** 2)
  ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
  return 1 - (ss_res / ss_tot)

def mae(y_true, y_pred):
  y_true = np.array(y_true)
  y_pred = np.array(y_pred)
  return np.mean(np.abs(y_true - y_pred))

def rmse(y_true, y_pred):
  y_true = np.array(y_true)
  y_pred = np.array(y_pred)
  return np.sqrt(np.mean((y_true - y_pred) ** 2))

## **Analytical Solution:**

In [24]:
model_analytical = LinearRegressionCustom()
model_analytical.fit_analytical(X_train, y_train)

print(f'Learned bias: {model_analytical.bias}')
print(f'Learned weights: {model_analytical.weights}')

Learned bias: -102.2560324307375
Learned weights: [ 515.07432271 -165.66643644 -325.67596185  511.05328552 1549.91615129
 -140.15988733 -382.63894945 -615.07696131 -458.6284542   -23.57451281
  571.26321729  -87.85839273  -82.62330144  236.75911931 -316.79666092
 -185.06096857  -85.57784792   97.31801672 -838.76962431   30.71855985
 2179.21401775  568.25443956]


In [25]:
y_pred = model_analytical.predict(X_test)

In [26]:
r2_analytical = r2_score(y_test, y_pred)
mae_analytical = mae(y_test, y_pred)
rmse_analytical = rmse(y_test, y_pred)
print(f'R2 test result: {r2_analytical}')
print(f'MAE test result: {mae_analytical}')
print(f'RMSE test result: {rmse_analytical}')

R2 test result: 0.019037512620476815
MAE test result: 1083.69514708662
RMSE test result: 9620.12635778934


## **SGD Solution:**

In [27]:
model_sgd = LinearRegressionCustom()
model_sgd.fit_sgd(X_train, y_train)

print(f'Learned bias: {model_sgd.bias}')
print(f'Learned weights: {model_sgd.weights}')

Learned bias: -77.48297052080576
Learned weights: [ 6.89491963e+02 -4.28384770e+02 -5.71746411e+01  5.75796987e+02
  2.07031170e+03 -4.18652074e+01 -2.78247347e+02 -8.73286919e+02
 -1.10694093e+03 -1.37355976e+02  7.42426559e+02 -1.45522582e+02
 -1.56215113e+01  2.55679255e+02 -3.29678193e+02 -8.15754761e-01
 -4.56661260e+02 -4.09688740e+01 -1.17063767e+03  2.70420192e+02
  1.72734903e+03  6.75655002e+02]


In [28]:
y_pred_sgd = model_sgd.predict(X_test)

In [29]:
r2_sgd = r2_score(y_test, y_pred_sgd)
mae_sgd = mae(y_test, y_pred_sgd)
rmse_sgd = rmse(y_test, y_pred_sgd)
print(f'R2 test result: {r2_sgd}')
print(f'MAE test result: {mae_sgd}')
print(f'RMSE test result: {rmse_sgd}')

R2 test result: 0.018944078714984047
MAE test result: 1129.9662507533576
RMSE test result: 9620.584491808933


## **Sklearn Solution:**

In [30]:
sklearn_model = LinearRegression()
sklearn_model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [31]:
y_pred_sklearn = sklearn_model.predict(X_test)

In [32]:
r2_sk = r2_score(y_test, y_pred_sklearn)
mae_sk = mae(y_test, y_pred_sklearn)
rmse_sk = rmse(y_test, y_pred_sklearn)
print(f"Sklearn R2: {r2_sk}")
print(f"Sklearn MAE: {mae_sk}")
print(f"Sklearn RMSE: {rmse_sk}")

Sklearn R2: 0.019037512620476815
Sklearn MAE: 1083.695147086616
Sklearn RMSE: 9620.12635778934


## **Results Comparison:**

In [33]:
results_data = {
    'Model': ['Analytical', 'Custom SGD', 'Sklearn'],
    'MAE': [mae_analytical, mae_sgd, mae_sk],
    'RMSE': [rmse_analytical, rmse_sgd, rmse_sk],
    'R2': [r2_analytical, r2_sgd, r2_sk]
}

df_results = pd.DataFrame(results_data)
df_results

Unnamed: 0,Model,MAE,RMSE,R2
0,Analytical,1083.695147,9620.126358,0.019038
1,Custom SGD,1129.966251,9620.584492,0.018944
2,Sklearn,1083.695147,9620.126358,0.019038


## **Ridge, Lasso and ElasticNet Implementation (Custom):**

In [34]:
model_ridge = LinearRegressionCustom()
model_ridge.fit_sgd(X_train, y_train, penalty='l2')

In [35]:
pred_ridge = model_ridge.predict(X_test)

In [36]:
model_lasso = LinearRegressionCustom()
model_lasso.fit_sgd(X_train, y_train, penalty='l1')

In [37]:
pred_lasso = model_lasso.predict(X_test)

In [38]:
model_elastic = LinearRegressionCustom()
model_elastic.fit_sgd(X_train, y_train, penalty='elasticnet')

In [39]:
pred_elastic = model_elastic.predict(X_test)

In [40]:
results_custom_models = {
    'Model': ['Ridge', 'Lasso', 'Elastic'],
    'MAE': [mae(y_test, pred_ridge), mae(y_test, pred_lasso), mae(y_test, pred_elastic)],
    'RMSE': [rmse(y_test, pred_ridge), rmse(y_test, pred_lasso), rmse(y_test, pred_elastic)],
    'R2': [r2_score(y_test, pred_ridge), r2_score(y_test, pred_lasso), r2_score(y_test, pred_elastic)]
}

results_df_custom_models = pd.DataFrame(results_custom_models)
results_df_custom_models

Unnamed: 0,Model,MAE,RMSE,R2
0,Ridge,1088.21219,9614.571011,0.02017
1,Lasso,1129.945123,9620.582538,0.018944
2,Elastic,1107.743718,9617.337863,0.019606


## **Ridge, Lasso and ElasticNet Implementation (Sklearn):**

In [41]:
ridge_sk = Ridge(alpha=1)
ridge_sk.fit(X_train, y_train)

0,1,2
,alpha,1
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [42]:
y_pred_sk_ridge = ridge_sk.predict(X_test)

In [43]:
lasso_sk = Lasso(alpha=1)
lasso_sk.fit(X_train, y_train)

0,1,2
,alpha,1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [44]:
y_pred_sk_lasso = lasso_sk.predict(X_test)

In [45]:
elasticnet_sk = ElasticNet(alpha=1, l1_ratio=0.5)
elasticnet_sk.fit(X_train, y_train)

0,1,2
,alpha,1
,l1_ratio,0.5
,fit_intercept,True
,precompute,False
,max_iter,1000
,copy_X,True
,tol,0.0001
,warm_start,False
,positive,False
,random_state,


In [46]:
y_pred_sk_elasticnet = elasticnet_sk.predict(X_test)

In [47]:
results_sk_models = {
    'Model': ['Ridge (SK)', 'Lasso (SK)', 'Elastic (SK)'],
    'MAE': [mae(y_test, y_pred_sk_ridge), mae(y_test, y_pred_sk_lasso), mae(y_test, y_pred_sk_elasticnet)],
    'RMSE': [rmse(y_test, y_pred_sk_ridge), rmse(y_test, y_pred_sk_lasso), rmse(y_test, y_pred_sk_elasticnet)],
    'R2': [r2_score(y_test, y_pred_sk_ridge), r2_score(y_test, y_pred_sk_lasso), r2_score(y_test, y_pred_sk_elasticnet)]
}

results_df_sk_models = pd.DataFrame(results_sk_models)
results_df_sk_models

Unnamed: 0,Model,MAE,RMSE,R2
0,Ridge (SK),1083.646696,9620.115039,0.01904
1,Lasso (SK),1080.315415,9619.90655,0.019082
2,Elastic (SK),1048.313452,9611.638364,0.020768


## **Results of Models Comparison:**

In [48]:
pd.concat([results_df_custom_models, results_df_sk_models], axis=0)

Unnamed: 0,Model,MAE,RMSE,R2
0,Ridge,1088.21219,9614.571011,0.02017
1,Lasso,1129.945123,9620.582538,0.018944
2,Elastic,1107.743718,9617.337863,0.019606
0,Ridge (SK),1083.646696,9620.115039,0.01904
1,Lasso (SK),1080.315415,9619.90655,0.019082
2,Elastic (SK),1048.313452,9611.638364,0.020768


## **Feature Normalisation:**


******
*Mandatory:*

1.   **Distance-Based Algorithms (KNN, K-Means Clustering): These algorithms calculate the "distance" (Euclidean) between points.**
2.   **Gradient Descent-Based Models (Linear/Logistic Regression, Neural Networks): As we discussed, gradients are scaled by the input \(X\) (\(\nabla =2\cdot \text{Error}\cdot X\)). If one feature is \(0.01\) and another is \(1,000,000\), the gradients will be wildly unbalanced. This causes the "Loss Valley" to become a long, thin canyon, making the model bounce back and forth and take forever to converge.**
3. **Regularization (L1 Lasso / L2 Ridge): These techniques penalize large weights.**

*Not mandatory:*


1.   **Tree-Based Models**
2.   **Target Variables (\(y\)): In most regression cases, you don't need to normalize the target \(y\) (though it can sometimes help with stability in very deep neural networks).**

*MinMaxScaler:*

$$x_{scaled}=\frac{x-x_{min}}{x_{max}-x_{min}}$$

#### **Custom Scaler (MinMax):**

In [49]:
class MinMaxScalerCustom:
  def __init__(self) -> None:
    self.min = None
    self.max = None
    self.range_val = None

  def fit(self, X):
    X = np.array(X)
    self.min = X.min(axis=0)
    self.max = X.max(axis=0)
    self.range_val = self.max - self.min
    return self

  def transform(self, X):
    X = np.array(X)
    with np.errstate(divide='ignore', invalid='ignore'):
      X_scaled = np.where(self.range_val == 0, 0, (X - self.min)/(self.range_val))

    return X_scaled

  def fit_transform(self, X):
    X = np.array(X)
    self.fit(X)
    return self.transform(X)

In [50]:
scaler_custom = MinMaxScalerCustom()
scaled_custom = scaler_custom.fit_transform(X_train)
pd.DataFrame(scaled_custom)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10,0.125
1,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10,0.250
2,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10,0.250
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.15,0.375
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49347,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10,0.375
49348,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.10,0.250
49349,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10,0.125
49350,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10,0.250


#### **Sklearn Scaler(MinMax):**

In [51]:
scaler_sk = MinMaxScaler()
scaled_sk = scaler_sk.fit_transform(X_train)
pd.DataFrame(scaled_sk)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10,0.125
1,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10,0.250
2,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10,0.250
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.15,0.375
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49347,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10,0.375
49348,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.10,0.250
49349,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10,0.125
49350,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10,0.250


#### **Result Comparison:**

In [52]:
difference = np.abs(scaled_custom - scaled_sk).sum()

print(f"\nTotal Difference between my code and Sklearn: {difference}")

if difference < 1e-5:
    print("✅ SUCCESS: Scaler matches Sklearn perfectly!")
else:
    print("❌ FAIL: The math is different. Something went wrong.")


Total Difference between my code and Sklearn: 6.369904603786836e-14
✅ SUCCESS: Scaler matches Sklearn perfectly!


#### **StandardScaler Custom:**

In [53]:
class StandardScalerCustom:
  def __init__(self):
    self.mean = None
    self.std = None

  def fit(self, X):
    X = np.array(X)
    self.mean = np.mean(X, axis=0)
    self.std = np.std(X, axis=0)
    return self

  def transform(self, X):
    X = np.array(X)
    epsilon = 1e-8
    return (X - self.mean)/(self.std + epsilon)

  def fit_transform(self, X):
    X = np.array(X)
    self.fit(X)
    return self.transform(X)

In [54]:
stand_sc_custom = StandardScalerCustom()
stand_sc_scaled_custom = stand_sc_custom.fit(X_train).transform(X_train)
pd.DataFrame(stand_sc_scaled_custom)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,-1.051537,1.047147,1.047700,1.113422,-0.857000,1.190015,-0.759766,1.421119,-0.605881,2.096387,...,-0.345686,2.934115,-0.308903,-0.264109,-0.254044,-0.241984,-0.235488,-0.233854,-0.423163,-0.485772
1,0.950989,-0.954976,1.047700,-0.898132,1.166861,1.190015,1.316194,1.421119,-0.605881,-0.477011,...,-0.345686,-0.340818,-0.308903,-0.264109,-0.254044,-0.241984,-0.235488,-0.233854,-0.423163,0.411083
2,0.950989,-0.954976,1.047700,-0.898132,1.166861,1.190015,-0.759766,1.421119,-0.605881,-0.477011,...,-0.345686,-0.340818,-0.308903,-0.264109,-0.254044,-0.241984,-0.235488,-0.233854,-0.423163,0.411083
3,-1.051537,-0.954976,-0.954472,-0.898132,-0.857000,-0.840325,-0.759766,-0.703671,-0.605881,-0.477011,...,-0.345686,-0.340818,-0.308903,3.786322,-0.254044,-0.241984,-0.235488,-0.233854,0.574016,1.307938
4,0.950989,-0.954976,-0.954472,-0.898132,1.166861,-0.840325,-0.759766,1.421119,1.650490,-0.477011,...,-0.345686,-0.340818,-0.308903,-0.264109,-0.254044,-0.241984,-0.235488,-0.233854,-0.423163,-1.382628
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49347,0.950989,-0.954976,1.047700,-0.898132,-0.857000,1.190015,-0.759766,-0.703671,-0.605881,-0.477011,...,-0.345686,-0.340818,-0.308903,-0.264109,-0.254044,-0.241984,-0.235488,-0.233854,-0.423163,1.307938
49348,0.950989,1.047147,-0.954472,1.113422,1.166861,-0.840325,1.316194,-0.703671,-0.605881,-0.477011,...,-0.345686,-0.340818,-0.308903,-0.264109,-0.254044,-0.241984,4.246502,-0.233854,-0.423163,0.411083
49349,0.950989,1.047147,1.047700,1.113422,-0.857000,1.190015,1.316194,1.421119,-0.605881,2.096387,...,-0.345686,2.934115,-0.308903,-0.264109,-0.254044,-0.241984,-0.235488,-0.233854,-0.423163,-0.485772
49350,-1.051537,-0.954976,-0.954472,-0.898132,-0.857000,1.190015,1.316194,-0.703671,-0.605881,2.096387,...,2.892795,-0.340818,-0.308903,-0.264109,-0.254044,-0.241984,-0.235488,-0.233854,-0.423163,0.411083


#### **StandardScaler Sklearn:**

In [55]:
stand_sc_sk = StandardScaler()
stand_sc_scaled_sk = stand_sc_sk.fit_transform(X_train)
pd.DataFrame(stand_sc_scaled_sk)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,-1.051537,1.047147,1.047700,1.113422,-0.857000,1.190015,-0.759766,1.421119,-0.605881,2.096387,...,-0.345686,2.934116,-0.308903,-0.264109,-0.254044,-0.241984,-0.235488,-0.233854,-0.423163,-0.485772
1,0.950989,-0.954976,1.047700,-0.898132,1.166861,1.190015,1.316194,1.421119,-0.605881,-0.477011,...,-0.345686,-0.340818,-0.308903,-0.264109,-0.254044,-0.241984,-0.235488,-0.233854,-0.423163,0.411083
2,0.950989,-0.954976,1.047700,-0.898132,1.166861,1.190015,-0.759766,1.421119,-0.605881,-0.477011,...,-0.345686,-0.340818,-0.308903,-0.264109,-0.254044,-0.241984,-0.235488,-0.233854,-0.423163,0.411083
3,-1.051537,-0.954976,-0.954472,-0.898132,-0.857000,-0.840325,-0.759766,-0.703671,-0.605881,-0.477011,...,-0.345686,-0.340818,-0.308903,3.786322,-0.254044,-0.241984,-0.235488,-0.233854,0.574016,1.307938
4,0.950989,-0.954976,-0.954472,-0.898132,1.166861,-0.840325,-0.759766,1.421119,1.650490,-0.477011,...,-0.345686,-0.340818,-0.308903,-0.264109,-0.254044,-0.241984,-0.235488,-0.233854,-0.423163,-1.382628
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49347,0.950989,-0.954976,1.047700,-0.898132,-0.857000,1.190015,-0.759766,-0.703671,-0.605881,-0.477011,...,-0.345686,-0.340818,-0.308903,-0.264109,-0.254044,-0.241984,-0.235488,-0.233854,-0.423163,1.307938
49348,0.950989,1.047147,-0.954472,1.113422,1.166861,-0.840325,1.316194,-0.703671,-0.605881,-0.477011,...,-0.345686,-0.340818,-0.308903,-0.264109,-0.254044,-0.241984,4.246502,-0.233854,-0.423163,0.411083
49349,0.950989,1.047147,1.047700,1.113422,-0.857000,1.190015,1.316194,1.421119,-0.605881,2.096387,...,-0.345686,2.934116,-0.308903,-0.264109,-0.254044,-0.241984,-0.235488,-0.233854,-0.423163,-0.485772
49350,-1.051537,-0.954976,-0.954472,-0.898132,-0.857000,1.190015,1.316194,-0.703671,-0.605881,2.096387,...,2.892795,-0.340818,-0.308903,-0.264109,-0.254044,-0.241984,-0.235488,-0.233854,-0.423163,0.411083


#### **Result Comparison:**

In [56]:
difference = np.abs(stand_sc_scaled_custom - stand_sc_scaled_sk).sum()

print(f"\nTotal Difference between my code and Sklearn: {difference}")


Total Difference between my code and Sklearn: 0.020808611333091948


## **Fit custom and sklearn models with normalized data:**

In [57]:
scalers = {
    'MinMax': MinMaxScalerCustom(),
    'Standard': StandardScalerCustom()
}

# Define the list of Models to test
# (We use Sklearn versions here for speed and reliability)
models = {
    'LinearReg': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.1),       # Kept alpha small to avoid killing all features
    'ElasticNet': ElasticNet(alpha=0.1, l1_ratio=0.5)
}

# Place to store results
results_list = []

# --- 2. THE LOOP (Automated Testing) ---
print("🚀 Starting Batch Training...\n")

for scaler_name, scaler in scalers.items():
    print(f"--- Applying {scaler_name} Scaler ---")

    # A. Scale the data
    # Remember: Fit on Train, Transform on Train AND Test
    X_train_sc = scaler.fit_transform(X_train.values)
    X_test_sc = scaler.transform(X_test.values)

    for model_name, model in models.items():
        # B. Train the Model
        model.fit(X_train_sc, y_train)

        # C. Predict
        y_pred_train = model.predict(X_train_sc)
        y_pred_test = model.predict(X_test_sc)

        # D. Calculate Metrics (Test Set)
        mae = mean_absolute_error(y_test, y_pred_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
        r2 = r2_score(y_test, y_pred_test)

        # E. Store Result
        results_list.append({
            'Scaler': scaler_name,
            'Model': model_name,
            'MAE': mae,
            'RMSE': rmse,
            'R2': r2
        })
        print(f"   ✅ {model_name} finished.")

# --- 3. FINAL REPORT ---
results_df = pd.DataFrame(results_list)

# Sorting for better readability (Optional)
results_df = results_df.sort_values(by='MAE')

print("\n📊 Final Results Summary:")
print(results_df.sort_index())

🚀 Starting Batch Training...

--- Applying MinMax Scaler ---
   ✅ LinearReg finished.
   ✅ Ridge finished.
   ✅ Lasso finished.
   ✅ ElasticNet finished.
--- Applying Standard Scaler ---
   ✅ LinearReg finished.
   ✅ Ridge finished.
   ✅ Lasso finished.
   ✅ ElasticNet finished.

📊 Final Results Summary:
     Scaler       Model          MAE         RMSE        R2
0    MinMax   LinearReg  1083.695147  9620.126358  0.019038
1    MinMax       Ridge  1083.509502  9619.225114  0.019221
2    MinMax       Lasso  1083.253150  9619.936093  0.019076
3    MinMax  ElasticNet  1228.682971  9643.986124  0.014166
4  Standard   LinearReg  1083.695147  9620.126358  0.019038
5  Standard       Ridge  1083.683829  9620.123913  0.019038
6  Standard       Lasso  1083.537279  9620.115662  0.019040
7  Standard  ElasticNet  1060.150401  9614.955310  0.020092


## **Overfit models:**

In [63]:
# --- 1. Create the Monster (Degree 10) ---
# We use only 'bathrooms' and 'bedrooms' as requested
cols = ['bathrooms', 'bedrooms']
X_train_tiny = X_train[cols].values
X_test_tiny = X_test[cols].values

# Create features up to power 10
poly = PolynomialFeatures(degree=10, include_bias=False)
X_poly_train = poly.fit_transform(X_train_tiny)
X_poly_test = poly.transform(X_test_tiny)

print(f"Original features: {X_train_tiny.shape[1]}") # 2
print(f"Polynomial features: {X_poly_train.shape[1]}") # ~65

# --- 2. Scale (StandardScaler) ---
# Sklearn's scaler is faster and safer here
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_poly_train)
X_test_sc = scaler.transform(X_poly_test)

# --- 3. Define the Contestants ---
models = {
    'LinearReg (No Leash)': LinearRegression(),
    'Ridge (Weak Leash)': Ridge(alpha=1.0),
    'Ridge (Strong Leash)': Ridge(alpha=100.0), # Stronger regularization
    'Lasso (Aggressive)': Lasso(alpha=0.01),    # Lasso hates complex features, keep alpha small
    'ElasticNet': ElasticNet(alpha=0.1, l1_ratio=0.5)
}

# --- 4. Run the Race ---
results_list = []

print("\n🚀 Training Sklearn Models on Degree 10 Data...")

for name, model in models.items():
    # Fit
    model.fit(X_train_sc, y_train)

    # Predict
    y_pred = model.predict(X_test_sc)

    # Metrics
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    results_list.append({
        'Model': name,
        'R2': r2,
        'MAE': mae,
        'RMSE': rmse
    })

# --- 5. The Verdict ---
df_results = pd.DataFrame(results_list).sort_values(by='R2', ascending=False)
display(df_results.sort_index())

Original features: 2
Polynomial features: 65

🚀 Training Sklearn Models on Degree 10 Data...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,Model,R2,MAE,RMSE
0,LinearReg (No Leash),-1.82515e+30,4.80245e+16,1.312212e+19
1,Ridge (Weak Leash),-1.377908e+19,131954300000.0,36054940000000.0
2,Ridge (Strong Leash),-9.540083e+17,34720790000.0,9487038000000.0
3,Lasso (Aggressive),-3.042254e+18,62002790000.0,16941520000000.0
4,ElasticNet,-4.878283e+16,7851400000.0,2145300000000.0


#### ***The best model based on the results: elastic net***

## **Addition task:**

In [64]:
def remove_outliers(X: np.array, y: np.array):
  X = np.asanyarray(X)
  y = np.asanyarray(y)

  q1, q3 = np.percentile(y, [25, 75])
  iqr = q3 - q1
  higher_bound = q3 + (1.5 * iqr)
  lower_bound = q1 - (1.5 * iqr)

  mask = (y >= lower_bound) & (y <= higher_bound)
  X_clean = X[mask]
  y_clean = y[mask]
  return (X_clean, y_clean)

def log_transform(y: np.array):
  y = np.asanyarray(y)
  return np.log1p(y)

def log_detransform(y: np.array):
  y = np.asanyarray(y)
  return np.expm1(y)

In [65]:
class robust_scaler_custom:
  def __init__(self):
    self.med = None
    self.iqr_safe = None

  def fit(self, X: np.array):
    X = np.asanyarray(X)
    q1, self.med, q3 = np.percentile(X, [25, 50, 75], axis=0)
    iqr = q3 - q1
    self.iqr_safe = np.where(iqr == 0, 1e-8, iqr)
    return self

  def transform(self, X: np.array):
    X = np.asanyarray(X)
    if X.shape[1] != len(self.med):
        raise ValueError("Number of features in X must match the features seen during fit.")
    return (X - self.med)/self.iqr_safe

  def fit_transform(self, X: np.array):
    X = np.asanyarray(X)
    self.fit(X)
    return self.transform(X)

In [66]:
X_train_clean, y_train_clean = remove_outliers(X_train, y_train)

y_train_transformed = log_transform(y_train_clean)
scaler = robust_scaler_custom()
X_train_scaled = scaler.fit_transform(X_train_clean)

In [67]:
y_test_transformed = log_transform(y_test)
X_test_scaled = scaler.transform(X_test)

In [68]:
model_en = ElasticNet(alpha=0.0001, l1_ratio=0.5)
model_en.fit(X_train_scaled, y_train_transformed)
y_pred_raw = model_en.predict(X_test_scaled)

In [69]:
y_pred = log_detransform(y_pred_raw)

max_lim_pred = y_train_clean.max() * 2
y_pred = np.clip(y_pred, a_min=0, a_max=max_lim_pred)

In [70]:
res_r2 = r2_score(y_true=y_test, y_pred=y_pred)
res_mae = mean_absolute_error(y_test, y_pred)
res_mse = mean_squared_error(y_test, y_pred)

metrics = [['r2', 'mae', 'mse'], [res_r2, res_mae, res_mse]]
pd.DataFrame(metrics)

Unnamed: 0,0,1,2
0,r2,mae,mse
1,0.02297,918.495075,92175822.534036


## **GridSearchCV tool for automation:**

In [71]:
param_grid = {
    'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001],
    'l1_ratio': [0, 0.01, 0.1, 0.5]
}
model = ElasticNet()
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1
)

grid_search.fit(X_train_scaled, y_train_transformed)

Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sk

0,1,2
,estimator,ElasticNet()
,param_grid,"{'alpha': [1, 0.1, ...], 'l1_ratio': [0, 0.01, ...]}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,0.001
,l1_ratio,0
,fit_intercept,True
,precompute,False
,max_iter,1000
,copy_X,True
,tol,0.0001
,warm_start,False
,positive,False
,random_state,


In [72]:
print(f'Best params: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}\n\n')

Best params: {'alpha': 0.001, 'l1_ratio': 0}
Best score: 0.5032939229408949




In [73]:
best_model = grid_search.best_estimator_

Test data predictions:

In [74]:
y_pred_raw = best_model.predict(X_test_scaled)
y_pred = log_detransform(y_pred_raw)

max_lim_pred = y_train_clean.max() * 2
y_pred = np.clip(y_pred, a_min=0, a_max=max_lim_pred)

Train data predictions:

In [75]:
y_pred_train_raw = best_model.predict(X_train_scaled)
y_pred_train = log_detransform(y_pred_train_raw)

max_lim_pred = y_train_clean.max() * 2
y_pred_train = np.clip(y_pred_train, a_min=0, a_max=max_lim_pred)

**Benchmarks:**

In [76]:
res_r2 = r2_score(y_test, y_pred)
res_mae = mean_absolute_error(y_test, y_pred)
res_mse = mean_squared_error(y_test, y_pred)
res_rmse = root_mean_squared_error(y_test, y_pred)

metrics = [['r2', 'mae', 'mse'], [res_r2, res_mae, res_mse]]
pd.DataFrame(metrics)

Unnamed: 0,0,1,2
0,r2,mae,mse
1,0.022971,918.462464,92175744.602458


In [77]:
res_r2 = r2_score(y_train_clean, y_pred_train)
res_mae = mean_absolute_error(y_train_clean, y_pred_train)
res_mse = mean_squared_error(y_train_clean, y_pred_train)
res_rmse = root_mean_squared_error(y_train_clean, y_pred_train)

metrics = [['r2', 'mae', 'mse'], [res_r2, res_mae, res_mse]]
pd.DataFrame(metrics)

Unnamed: 0,0,1,2
0,r2,mae,mse
1,0.502758,596.218711,635923.434581
