In [1]:
import pandas as pd
import numpy as np

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv

df = pd.read_csv('AB_NYC_2019.csv')

df = df[['latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365']]

df.head()

--2021-09-20 12:06:36--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7077973 (6.8M) [text/plain]
Saving to: ‘AB_NYC_2019.csv’


2021-09-20 12:06:36 (96.1 MB/s) - ‘AB_NYC_2019.csv’ saved [7077973/7077973]



Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,40.64749,-73.97237,149,1,9,0.21,6,365
1,40.75362,-73.98377,225,1,45,0.38,2,355
2,40.80902,-73.9419,150,3,0,,1,365
3,40.68514,-73.95976,89,1,270,4.64,1,194
4,40.79851,-73.94399,80,10,9,0.1,1,0


# 1. Number of missing values

In [3]:
# Per-column non-null counts and dtypes; a non-null count below the number of
# rows reveals which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   latitude                        48895 non-null  float64
 1   longitude                       48895 non-null  float64
 2   price                           48895 non-null  int64  
 3   minimum_nights                  48895 non-null  int64  
 4   number_of_reviews               48895 non-null  int64  
 5   reviews_per_month               38843 non-null  float64
 6   calculated_host_listings_count  48895 non-null  int64  
 7   availability_365                48895 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 3.0 MB


In [4]:
# Number of rows where reviews_per_month is missing.
df.reviews_per_month.isnull().sum()

10052

# 2. Median for minimum_nights

In [5]:
# Median of the minimum_nights column.
df['minimum_nights'].median()

3.0

# 3. Best way to fill NAs

In [6]:
# Total number of rows; the split sizes below are derived from it.
n = len(df)

# 20% validation, 20% test, and the remainder for training.
n_val = n_test = int(n * 0.2)
n_train = n - (n_val + n_test)

# Shuffle the row positions with a fixed seed so the split is reproducible.
np.random.seed(42)
idx = np.arange(n)
np.random.shuffle(idx)

print(idx)
print(n_train, n_val, n_test)

[  879 44383 15394 ... 38158   860 15795]
29337 9779 9779


In [7]:
# Boundaries of the three consecutive slices of the shuffled positions.
train_end = n_train
val_end = n_train + n_val

# Each split gets a fresh 0..k-1 index after being picked by position.
df_train = df.iloc[idx[:train_end]].reset_index(drop=True)
df_val = df.iloc[idx[train_end:val_end]].reset_index(drop=True)
df_test = df.iloc[idx[val_end:]].reset_index(drop=True)

display(df_train.head())

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,40.64354,-73.97777,89,3,62,0.71,1,189
1,40.70666,-73.90779,30,21,0,,1,73
2,40.76116,-73.99016,120,2,17,0.43,1,0
3,40.70763,-74.0105,470,2,5,1.88,327,272
4,40.79658,-73.93287,199,2,30,0.8,1,30


In [8]:
# Feature frames: every column except the target. `columns=` is used because
# passing the axis positionally (drop('price', 1)) fails on pandas >= 2.0.
features_train = df_train.drop(columns='price')
features_val = df_val.drop(columns='price')

# Imputation statistics come from the training split only; filling the
# validation set with its own mean would leak validation information.
train_means = features_train.mean()

X_train_fill_zero = features_train.fillna(0).values
X_train_fill_mean = features_train.fillna(train_means).values

X_val_fill_zero = features_val.fillna(0).values
X_val_fill_mean = features_val.fillna(train_means).values

# The target is modelled on the log1p scale.
y_train = np.log1p(df_train['price'].values)
y_val = np.log1p(df_val['price'].values)

print(X_train_fill_zero[:5])
print(y_train[:5])

[[ 40.64354 -73.97777   3.       62.        0.71      1.      189.     ]
 [ 40.70666 -73.90779  21.        0.        0.        1.       73.     ]
 [ 40.76116 -73.99016   2.       17.        0.43      1.        0.     ]
 [ 40.70763 -74.0105    2.        5.        1.88    327.      272.     ]
 [ 40.79658 -73.93287   2.       30.        0.8       1.       30.     ]]
[4.49980967 3.4339872  4.79579055 6.15485809 5.29831737]


In [9]:
def train_linear_regression(X, y):
    """Fit a linear regression with the normal equation.

    A column of ones is prepended to ``X`` so the bias is learned together
    with the feature weights.

    Parameters:
        X: 2-D feature matrix, one row per sample.
        y: 1-D target vector with one entry per row of ``X``.

    Returns:
        A tuple ``(w0, w)``: the bias term and the array of feature weights.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    gram_inv = np.linalg.inv(design.T @ design)
    weights = gram_inv @ design.T @ y

    return weights[0], weights[1:]

def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``."""
    residual = y - y_pred
    return np.sqrt((residual * residual).mean())

In [10]:
# Validation RMSE for the model trained on data with missing values filled by 0
w0_zero, w_zero = train_linear_regression(X_train_fill_zero, y_train)

y_pred_zero = X_val_fill_zero @ w_zero + w0_zero
rmse_zero = rmse(y_val, y_pred_zero)

print(round(rmse_zero, 2))

0.64


In [11]:
# Validation RMSE for the model trained on data with missing values filled by the mean
w0_mean, w_mean = train_linear_regression(X_train_fill_mean, y_train)

y_pred_mean = X_val_fill_mean @ w_mean + w0_mean
rmse_mean = rmse(y_val, y_pred_mean)

print(round(rmse_mean, 2))

0.64


# 4. Best regularization parameter r

In [12]:
# Feature matrices with missing values filled by 0. `columns=` is used because
# passing the axis positionally (drop('price', 1)) fails on pandas >= 2.0.
X_train = df_train.drop(columns='price').fillna(0).values
X_val = df_val.drop(columns='price').fillna(0).values

# Candidate regularization strengths to compare.
rs = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]

In [13]:
def train_linear_regression_reg(X, y, r=0.001):
    """Fit a regularized linear regression with the normal equation.

    A column of ones is prepended to ``X`` and ``r`` is added to every
    diagonal entry of ``X^T X`` (including the bias entry) before inverting.

    Parameters:
        X: 2-D feature matrix, one row per sample.
        y: 1-D target vector with one entry per row of ``X``.
        r: value added to the diagonal of ``X^T X``.

    Returns:
        A tuple ``(w0, w)``: the bias term and the array of feature weights.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    gram = design.T @ design
    gram_reg = gram + r * np.eye(gram.shape[0])

    weights = np.linalg.inv(gram_reg) @ design.T @ y

    return weights[0], weights[1:]

In [14]:
# For each regularization strength: fit on train, score on validation, and
# print the strength, the learned bias and the rounded RMSE.
for r in rs:
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = X_val @ w + w0
    score = round(rmse(y_val, y_pred), 2)
    print(r, w0, score)

0 -419.91265973795873 0.64
1e-06 -419.8627166795901 0.64
0.0001 -414.97649362219624 0.64
0.001 -375.2736534813748 0.64
0.01 -191.7838407751772 0.66
0.1 -32.562560558416244 0.68
1 -3.499216836898674 0.68
5 -0.7033623203748884 0.68
10 -0.35127676056538004 0.68


# 5. STD of RMSE scores for different seeds

In [15]:
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

rmse_scores = np.zeros(len(seeds))

# Index the score array by position, not by seed value: rmse_scores[seed]
# only works while the seeds happen to be exactly 0..len(seeds)-1.
for i, seed in enumerate(seeds):
    # Re-shuffle the row positions for this seed and rebuild the split.
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)

    df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
    df_val = df.iloc[idx[n_train:n_train+n_val]].reset_index(drop=True)
    df_test = df.iloc[idx[n_train+n_val:]].reset_index(drop=True)

    # `columns=` is used because passing the axis positionally
    # (drop('price', 1)) fails on pandas >= 2.0.
    X_train = df_train.drop(columns='price').fillna(0).values
    X_val = df_val.drop(columns='price').fillna(0).values
    y_train = np.log1p(df_train['price'].values)
    y_val = np.log1p(df_val['price'].values)

    # r=0: no regularization.
    w0, w = train_linear_regression_reg(X_train, y_train, 0)

    y_pred = w0 + X_val.dot(w)

    rmse_scores[i] = rmse(y_val, y_pred)

print(rmse_scores)
print(round(np.std(rmse_scores), 3))

[0.654978   0.64625237 0.64765582 0.63751451 0.64458091 0.630581
 0.62978519 0.65061843 0.64897804 0.64375652]
0.008


# 6. RMSE on test

In [16]:
# Rebuild the split with seed 9.
idx = np.arange(n)
np.random.seed(9)
np.random.shuffle(idx)

df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]

# Train and validation are combined for the final fit; test stays held out.
df_full_train = pd.concat([df_train, df_val]).reset_index(drop=True)
df_test = df.iloc[idx[n_train+n_val:]].reset_index(drop=True)

# `columns=` is used because passing the axis positionally
# (drop('price', 1)) fails on pandas >= 2.0.
X_full_train = df_full_train.drop(columns='price').fillna(0).values
X_test = df_test.drop(columns='price').fillna(0).values
y_full_train = np.log1p(df_full_train['price'].values)
y_test = np.log1p(df_test['price'].values)

w0, w = train_linear_regression_reg(X_full_train, y_full_train, 0.001)

y_pred = w0 + X_test.dot(w)
score = round(rmse(y_test, y_pred), 2)

print(score)

0.65
