In [288]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, RidgeCV, LogisticRegressionCV, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error
from datetime import date, datetime
from sklearn.impute import SimpleImputer

In [289]:
raw_train = pd.read_csv('datasets/train_regression.csv')
raw_test = pd.read_csv('datasets/test_regression.csv')

## Clean and Process

### General Cleaning

In [290]:
train = raw_train.copy()
test = raw_test.copy()

train['price'] = train['price'].str.replace(',', '').str.replace('$', '', regex=False).astype(float)

In [291]:
train['acceptance_rate'] = train['host_acceptance_rate'].str.replace('%', '').astype(float) / 100
train['response_rate'] = train['host_response_rate'].str.replace('%', '').astype(float) / 100

test['acceptance_rate'] = test['host_acceptance_rate'].str.replace('%', '').astype(float) / 100
test['response_rate'] = test['host_response_rate'].str.replace('%', '').astype(float) / 100

train.drop(columns=['host_acceptance_rate', 'host_response_rate'], inplace=True)
test.drop(columns=['host_acceptance_rate', 'host_response_rate'], inplace=True)

In [292]:
train['bathrooms_num'] = train['bathrooms_text'].str.extract('(\d+)').astype(float)
test['bathrooms_num'] = test['bathrooms_text'].str.extract('(\d+)').astype(float)

train.loc[train['bathrooms_text'].str.contains('Half-bath', case=False, na=False) & train['bathrooms_num'].isna(), 'bathrooms_num'] = 0.5
test.loc[test['bathrooms_text'].str.contains('Half-bath', case=False, na=False) & test['bathrooms_num'].isna(), 'bathrooms_num'] = 0.5


In [293]:
# Convert date columns
def strip_date(row):
    if isinstance(row, str):
        row = datetime.strptime(row, '%Y-%m-%d').date()
    return row

# Apply date conversion to train dataset
train['host_since'] = train['host_since'].apply(strip_date)
train['first_review'] = train['first_review'].apply(strip_date)
train['last_review'] = train['last_review'].apply(strip_date)

# Apply date conversion to test dataset
test['host_since'] = test['host_since'].apply(strip_date)
test['first_review'] = test['first_review'].apply(strip_date)
test['last_review'] = test['last_review'].apply(strip_date)

# ----- #

# Calculate months since various dates for train dataset
train['host_since_in_months'] = round(((datetime.now().date() - train['host_since']).dt.days) / 30, 2)
train['first_review_in_months'] = round(((datetime.now().date() - train['first_review']).dt.days) / 30, 2)
train['last_review_in_months'] = round(((datetime.now().date() - train['last_review']).dt.days) / 30, 2)

# Calculate months since various dates for test dataset
test['host_since_in_months'] = round(((datetime.now().date() - test['host_since']).dt.days) / 30,  2)
test['first_review_in_months'] = round(((datetime.now().date() - test['first_review']).dt.days) / 30, 2)
test['last_review_in_months'] = round(((datetime.now().date() - test['last_review']).dt.days) / 30, 2)


In [294]:
# t_f_vars = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'has_availability', 'instant_bookable']
# train[t_f_vars] = train[t_f_vars].replace({'f': 0, 't': 1})
# test[t_f_vars] = test[t_f_vars].replace({'f': 0, 't': 1})

In [295]:
# top and bottom 0.04%
lower_val = np.percentile(train[['price']], 0.04)
upper_val = np.percentile(train[['price']], 99.96)

outliers_idx = train[(train['price'] >= upper_val) | (train['price'] <= lower_val)].index

train_clean = train.drop(outliers_idx).reset_index(drop=True)
test_clean = test.copy()

train.iloc[outliers_idx, :]['price']


523      5000.0
2067       15.0
3129    99998.0
4865       14.0
Name: price, dtype: float64

### Clean/Transform Variables 

*Check poly terms*

lat/long second order
select certain neighbourhood - northside, west_town, lakeview (+/-)
host_listing_count/host_total_listing_count
np.log(price) (can or not)

reviews_per_month fillna with median 


In [296]:
def is_shared(row):
    if 'shared' in str(row['bathrooms_text']):
        row['bathrooms_shared'] = "t"
    
    elif pd.isna(row['bathrooms_text']):
        if 'Shared' in row['room_type']:
            row['bathrooms_shared'] = "t"              
        else:
            row['bathrooms_shared'] = "f"
                           
    else: row['bathrooms_shared'] = "f"
                           
    return row
                           
    

# def clean_hoods(row):
#     if row.loc['neighbourhood_cleansed'] in other_hoods or row.loc['neighbourhood_cleansed'] in test_only_hoods:
#         row['neighbourhood_grouped'] = 'Other'
        
#     else:    
#         row['neighbourhood_grouped'] = row.loc['neighbourhood_cleansed']
        
#     return row
       
    
def clean_rooms(row):    
    if row.loc['room_type'] == 'Hotel room':
        row['room_type'] = 'Private room'
    return row



neighbourhood_counts = train_clean['neighbourhood_cleansed'].value_counts()

other_hoods = [i for i in neighbourhood_counts.index if neighbourhood_counts[i] < 100]

test_only_hoods = [i for i in test_clean['neighbourhood_cleansed'].unique() 
                   if i not in neighbourhood_counts 
                   and i != 'Other']
    

In [297]:
train_clean = train_clean.apply(is_shared, axis=1)
test_clean = test_clean.apply(is_shared, axis=1)

train_clean = train_clean.apply(clean_hoods, axis=1)
test_clean = test_clean.apply(clean_hoods, axis=1)

train_clean = train_clean.apply(clean_rooms, axis=1)
test_clean = test_clean.apply(clean_rooms, axis=1)

In [298]:
# def clean_host_listing(row):
#     if row > 80:
#         row = None
#     return row


# train_clean['calculated_host_listings_count'] = train_clean['calculated_host_listings_count'].apply(clean_host_listing)
# test_clean['calculated_host_listings_count'] = test_clean['calculated_host_listings_count'].apply(clean_host_listing)

# train_clean['calculated_host_listings_count'].fillna(train_clean['calculated_host_listings_count'].median())
# test_clean['calculated_host_listings_count'].fillna(test_clean['calculated_host_listings_count'].median())


In [299]:
train_clean = train_clean.apply(clean_rooms, axis=1)
test_clean = test_clean.apply(clean_rooms, axis=1)

In [300]:
hood_df = pd.DataFrame(index=train_clean['neighbourhood_cleansed'].unique())
neighbourhood_counts = train_clean['neighbourhood_cleansed'].value_counts()

all_mean = train_clean.groupby('neighbourhood_cleansed')['price'].mean()
all_std = train_clean.groupby('neighbourhood_cleansed')['price'].std()

hood_df = pd.concat([hood_df, all_mean, all_std], axis=1)
hood_df.columns = ['mean_price', 'std_price']
hood_df.sort_values('std_price', inplace=True)
hood_df.dropna(inplace=True)
hood_df = hood_df.merge(neighbourhood_counts, left_index=True, right_index=True)


large_std_list = hood_df[hood_df['std_price'] > 175].index.tolist()

hood_df.sort_values('std_price')
filtered_df = hood_df[((hood_df['std_price'] < 40) | (hood_df['neighbourhood_cleansed'] > 100)) & (hood_df['neighbourhood_cleansed'] > 20)]                             
keep_hoods = list(filtered_df.index)

In [301]:
def clean_hoods(row):
    if row.loc['neighbourhood_cleansed'] not in keep_hoods:
        row['neighbourhood_grouped'] = 'Other'
        
    else:    
        row['neighbourhood_grouped'] = row.loc['neighbourhood_cleansed']
        
    return row

train_clean = train_clean.apply(clean_hoods, axis=1)
train_clean['neighbourhood_grouped'].value_counts()


Other              1483
Near North Side     637
West Town           482
Lake View           355
Near West Side      338
Logan Square        261
Loop                252
Lincoln Park        174
Near South Side     152
Lower West Side     135
Uptown              111
Woodlawn            111
Edgewater           111
Irving Park         109
Bridgeport          107
Avondale            106
New City             39
South Lawndale       33
Name: neighbourhood_grouped, dtype: int64

In [302]:
avail_hoods = train_clean['neighbourhood_grouped'].unique()

def test_consistent_hoods(row):
    if row not in avail_hoods:
        row = 'Other'       
    return row

test_clean['neighbourhood_grouped'] = test_clean['neighbourhood_cleansed'].apply(test_consistent_hoods)


In [303]:
words_to_remove = ['room', 'private', 'shared', 'entire', ' in', ' room', ' private', ' shared', ' entire', ' in',]


def remove_words(text):
    text=text.lower()
    for word in words_to_remove:
        word = word.lower()
        text = text.replace(word, '')
    return text.strip()

train_clean['property_type_cleansed'] = train_clean['property_type'].apply(remove_words)
test_clean['property_type_cleansed'] = test_clean['property_type'].apply(remove_words)


property_counts = train_clean['property_type_cleansed'].value_counts()
keep = [i for i in property_counts.index if property_counts[i] > 10]


In [304]:
def clean_property(row):
    if row not in keep:
        row = 'Other'
        
    return row


train_clean['property_type_cleansed'] = train_clean['property_type_cleansed'].apply(clean_property)
test_clean['property_type_cleansed'] = test_clean['property_type_cleansed'].apply(clean_property)

train_clean['property_type_cleansed'].value_counts()


rental unit           2987
home                   734
condo                  562
serviced apartment     164
guest suite            102
hotel                  100
townhouse               94
loft                    58
bed and breakfast       50
boutique hotel          40
guesthouse              40
Other                   39
bungalow                26
Name: property_type_cleansed, dtype: int64

In [305]:
upper_lim = np.percentile(train_clean[['minimum_nights']], 99.9)
outliers_idx = train_clean[train_clean['minimum_nights'] >= upper_lim].index

train_clean = train_clean.drop(outliers_idx).reset_index(drop=True)

In [306]:
train_clean['review_scores_avg'] = np.mean(train_clean[['review_scores_rating', 'review_scores_value', 'review_scores_location', 'review_scores_cleanliness']], axis=1)
test_clean['review_scores_avg'] = np.mean(test_clean[['review_scores_rating', 'review_scores_value', 'review_scores_location', 'review_scores_cleanliness']], axis=1)

train_clean['review_scores_avg'] = train_clean['review_scores_avg'].fillna(value=0)
test_clean['review_scores_avg'] = test_clean['review_scores_avg'].fillna(value=0)


In [307]:
train_filter_2 = train_clean.copy()
test_filter_2 = test_clean.copy()

### Inspect and impute columns with missing values

#### Model imputation

In [308]:
train_filter_temp = train_filter_2.copy()
test_filter_temp = test_filter_2.copy()

train_filter_temp['host_is_superhost'] = train_filter_2['host_is_superhost'].replace({'f': 0, 't': 1})
test_filter_temp['host_is_superhost'] = test_filter_2['host_is_superhost'].replace({'f': 0, 't': 1})

superhost_model = smf.logit(formula="host_is_superhost ~ calculated_host_listings_count*number_of_reviews_ltm + response_rate", data=train_filter_temp).fit()

train_filter_temp['host_is_superhost_imputed'] = superhost_model.predict(train_filter_temp) > 0.486
train_filter_temp['host_is_superhost'].fillna(train_filter_temp['host_is_superhost_imputed'], inplace=True)
train_filter_temp.drop(columns=['host_is_superhost_imputed'], inplace=True)

test_filter_temp['host_is_superhost_imputed'] = superhost_model.predict(test_filter_temp) > 0.486
test_filter_temp['host_is_superhost'].fillna(test_filter_temp['host_is_superhost_imputed'], inplace=True)
test_filter_temp.drop(columns=['host_is_superhost_imputed'], inplace=True)


train_filter_temp['host_is_superhost'] = train_filter_temp['host_is_superhost'].replace({0:'f', 1:'t'})
test_filter_temp['host_is_superhost'] = test_filter_temp['host_is_superhost'].replace({0:'f', 1:'t'})


train_filter_2['host_is_superhost'] = train_filter_temp['host_is_superhost']
test_filter_2['host_is_superhost'] = test_filter_temp['host_is_superhost']


Optimization terminated successfully.
         Current function value: 0.586773
         Iterations 8


In [309]:
acceptance_model = smf.logit(formula="acceptance_rate ~ calculated_host_listings_count + accommodates", data=train_filter_2).fit()
acceptance_model.summary()

missing_acceptance = train_filter_2[train_filter_2['acceptance_rate'].isnull()]
predicted_acceptance = acceptance_model.predict(missing_acceptance)
train_filter_2.loc[missing_acceptance.index, 'acceptance_rate'] = predicted_acceptance


response_model = smf.logit(formula="response_rate ~ accommodates", data=train_filter_2).fit()
response_model.summary()

missing_response = train_filter_2[train_filter_2['response_rate'].isnull()]
predicted_response = response_model.predict(missing_response)
train_filter_2.loc[missing_response.index, 'response_rate'] = predicted_response


Optimization terminated successfully.
         Current function value: 0.192040
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.067415
         Iterations 8


#### Naive imputation

In [310]:
temp_data_train = train_filter_2.dropna(how='any')

for col in temp_data_train.select_dtypes(include='number').columns:
    if train_filter_2.isna().sum()[col] != 0:
        train_filter_2[col].fillna(value=train_filter_2[col].median(), inplace=True) 

train_final = train_filter_2.copy()


In [311]:
temp_data_test = test_filter_2.dropna(how='any')

for col in temp_data_test.select_dtypes(include='number').columns:
    if test_filter_2.isna().sum()[col] != 0:
        test_filter_2[col].fillna(value=test_filter_2[col].median(), inplace=True)  

test_final = test_filter_2.copy()


## Sklearn

In [312]:
cols_for_sklearn = ['acceptance_rate', 'accommodates', 'availability_30', 'availability_90',
                    'bathrooms_num', 'bathrooms_shared', 'beds', 
                    'calculated_host_listings_count', 'last_review_in_months',
                    'host_is_superhost','host_since_in_months', 'id', 'latitude', 'longitude', 
                    'maximum_nights', 'minimum_nights', 
                    'neighbourhood_grouped', 'number_of_reviews_ltm', 
                    'price', 'property_type_cleansed', 'response_rate',  
                    'review_scores_avg', 'reviews_per_month', 'room_type',
                    'maximum_nights_avg_ntm', 'minimum_nights_avg_ntm',]
                    # ,'first_review_in_months', 'last_review_in_months' 


cols_for_sklearn_test = [name for name in cols_for_sklearn if name != 'price']

subset_train = train_final[cols_for_sklearn].copy()
subset_test = test_final[cols_for_sklearn_test].copy()


In [313]:
X_train = subset_train.drop(columns='price')
y_train = np.log(subset_train.price)

X_test = subset_test.copy()

In [314]:
X_train_preprocessed = pd.get_dummies(X_train, drop_first=True)
X_test_preprocessed = pd.get_dummies(X_test, drop_first=True)

In [315]:
room_type_cols = [name for name in X_train_preprocessed.columns if 'room_type' in name]
neighbourhood_groups = [name for name in X_train_preprocessed.columns if 'neighbourhood_grouped' in name]
property_groups = [name for name in X_train_preprocessed.columns if 'property' in name]


interaction_pairs = [('bathrooms_num', 'bathrooms_shared_t'),
                     ('beds', 'accommodates'),
                     ('acceptance_rate', 'response_rate'),
                     
                     ('accommodates', 'availability_30'),
                     ('accommodates', 'availability_90'),
                     ('acceptance_rate', 'host_is_superhost_t'), 
                     ('response_rate', 'host_is_superhost_t'),
                     
                    ('number_of_reviews_ltm', 'review_scores_avg'),
                    ('last_review_in_months', 'number_of_reviews_ltm'),
                    ('reviews_per_month', 'number_of_reviews_ltm'),
                    ('reviews_per_month', 'review_scores_avg'), 
                    ('review_scores_avg', 'calculated_host_listings_count')] 
        # room_type*accommodates, room_type*neighbourhood_grouped, room_type*property_type, 
        # room_type*availability_90, property_type*neighbourhood_grouped        
        
        

for i in room_type_cols:
    interaction_pairs.append((i, 'beds'))
#     interaction_pairs.append((i, 'availability_30'))
    
    for j in neighbourhood_groups: 
        interaction_pairs.append((j, i))
    
    for k in property_groups:
        interaction_pairs.append((i, k))

        
for i in property_groups:
    for j in neighbourhood_groups:
        interaction_pairs.append((i, j))

        
        
### ----- ###

interaction_cols = []
for t in interaction_pairs:
    for item in t:
        interaction_cols.append(item)
        
        
        
interaction_df = pd.DataFrame()
interaction_df_test = pd.DataFrame()


for pair in interaction_pairs:
    interaction_columns = pair
    
    X_train_interaction_subset = X_train_preprocessed[list(interaction_columns)]
    X_test_interaction_subset = X_test_preprocessed[list(interaction_columns)]
    
    poly_features = PolynomialFeatures(interaction_only=True, include_bias=False)
    interaction_terms_train = poly_features.fit_transform(X_train_interaction_subset)
    interaction_terms_test = poly_features.transform(X_test_interaction_subset)
    
    interaction_column_names = poly_features.get_feature_names_out(input_features=interaction_columns)
    
    interaction_df_pair = pd.DataFrame(interaction_terms_train, columns=interaction_column_names)
    interaction_df_pair_test = pd.DataFrame(interaction_terms_test, columns=interaction_column_names)
    

    interaction_df = pd.concat([interaction_df, interaction_df_pair], axis=1)
    interaction_df_test = pd.concat([interaction_df_test, interaction_df_pair_test], axis=1)

    

In [316]:
X_train_processed = pd.concat([X_train_preprocessed.drop(columns=interaction_cols), interaction_df], axis=1)
X_test_processed = pd.concat([X_test_preprocessed.drop(columns=interaction_cols), interaction_df_test], axis=1)        

X_train_processed = X_train_processed.loc[:,~X_train_processed.columns.duplicated()].copy()
X_train_processed = X_train_processed.reindex(sorted(X_train_processed.columns), axis=1)
X_test_processed = X_test_processed.loc[:,~X_test_processed.columns.duplicated()].copy()
X_test_processed = X_test_processed.reindex(sorted(X_test_processed.columns), axis=1)


In [317]:
X_train_processed['maximum_nights^2'] = X_train_processed['maximum_nights'] ** 2
X_train_processed['maximum_nights^3'] = X_train_processed['maximum_nights'] ** 3
X_train_processed['minimum_nights^2'] = X_train_processed['minimum_nights'] ** 2
X_train_processed['minimum_nights^3'] = X_train_processed['minimum_nights'] ** 3
X_train_processed['longitude^2'] = X_train_processed['longitude'] ** 2
X_train_processed['latitude^2'] = X_train_processed['latitude'] ** 2
X_train_processed['longitude^3'] = X_train_processed['longitude'] ** 3
X_train_processed['latitude^3'] = X_train_processed['latitude'] ** 3
X_train_processed['calculated_host_listings_count_log'] = np.log(X_train_processed['calculated_host_listings_count'])
X_train_processed['calculated_host_listings_count_log^2'] = np.log(X_train_processed['calculated_host_listings_count'])**2
X_train_processed['calculated_host_listings_count_log^3'] = np.log(X_train_processed['calculated_host_listings_count'])**3
X_train_processed['reviews_per_month^2'] = X_train_processed['reviews_per_month'] ** 2
X_train_processed['number_of_reviews_ltm_root'] = np.sqrt(X_train_processed['number_of_reviews_ltm'])

X_test_processed['maximum_nights^2'] = X_test_processed['maximum_nights'] ** 2
X_test_processed['maximum_nights^3'] = X_test_processed['maximum_nights'] ** 3
X_test_processed['minimum_nights^2'] = X_test_processed['minimum_nights'] ** 2
X_test_processed['minimum_nights^3'] = X_test_processed['minimum_nights'] ** 3
X_test_processed['longitude^2'] = X_test_processed['longitude'] ** 2
X_test_processed['latitude^2'] = X_test_processed['latitude'] ** 2
X_test_processed['longitude^3'] = X_test_processed['longitude'] ** 3
X_test_processed['latitude^3'] = X_test_processed['latitude'] ** 3
X_test_processed['calculated_host_listings_count_log'] = np.log(X_test_processed['calculated_host_listings_count'])
X_test_processed['calculated_host_listings_count_log^2'] = np.log(X_test_processed['calculated_host_listings_count'])**2
X_test_processed['calculated_host_listings_count_log^3'] = np.log(X_test_processed['calculated_host_listings_count'])**3
X_test_processed['reviews_per_month^2'] = X_test_processed['reviews_per_month'] ** 2
X_test_processed['number_of_reviews_ltm_root'] = np.sqrt(X_test_processed['number_of_reviews_ltm'])


In [318]:
cat_cols = X_train.select_dtypes(exclude='number').columns
cat_dummy_dict = {}

for cat in cat_cols:
    curr_cat_dummies = pd.get_dummies(X_train[cat], prefix=cat, drop_first=True).columns
    cat_dummy_dict[cat] = curr_cat_dummies

    
possible_cat_cat = []
for k in cat_dummy_dict.keys():
    dummy_list = list(cat_dummy_dict[k])
    
    for j in cat_dummy_dict.keys():
        dummy_list2 = list(cat_dummy_dict[j])
        
        
        for dummy in dummy_list:
            for dummy2 in dummy_list2:
    
                if k != j and dummy != dummy2:
                    final_dummy = dummy + ' ' + dummy2
                    possible_cat_cat.append(final_dummy)

                    
plain_dummies = list(pd.get_dummies(X_train[cat_cols], drop_first=True).columns)

binary_cols = plain_dummies + possible_cat_cat

new_cat_cols = [col for col in X_train_processed.columns if col in binary_cols]
new_num_cols = [col for col in X_train_processed.columns if col not in new_cat_cols]


In [319]:
sc = StandardScaler()
sc.fit(X_train_processed[new_num_cols])

train_scaled = sc.transform(X_train_processed[new_num_cols])
test_scaled = sc.transform(X_test_processed[new_num_cols]) 

X_train_scaled = pd.DataFrame(train_scaled, columns=new_num_cols)
X_test_scaled = pd.DataFrame(test_scaled, columns=new_num_cols)


In [320]:
X_train_processed[new_num_cols] = X_train_scaled
X_test_processed[new_num_cols] = X_test_scaled

In [321]:
X_train_final = X_train_processed.drop(columns=['calculated_host_listings_count'])
X_test_final = X_test_processed.drop(columns=['calculated_host_listings_count'])

X_train_final = X_train_processed.copy()
X_test_final = X_test_processed.copy()

In [322]:
from sklearn.linear_model import LinearRegression
lrm = LinearRegression()
lrm.fit(X_train_final, y_train)

y_pred_train = np.exp(lrm.predict(X_train_final))*1.05
curr_rmse = mean_squared_error(train_final.price, y_pred_train, squared = False) 
curr_mae = mean_absolute_error(train_final.price, y_pred_train) 

print(round(curr_rmse, 4))
print("Diff rmse-mae:", round(curr_rmse-curr_mae, 4))


127.756
Diff rmse-mae: 74.6614


In [333]:
predicted_values = pd.DataFrame(np.exp(lrm.predict(X_test_final))*1.05, columns=['predicted'])
predicted_values = predicted_values.merge(test_final['id'], left_index=True, right_index=True).set_index('id').rename(columns={0:'predicted'})
predicted_values

# predicted_values.to_csv('liner_model_7.csv')

##### Log 

In [324]:
# 125.0812
# Diff rmse-mae: 72.5318

#### mean for review scores avg and fill na with 0
### 124.7993
### Diff rmse-mae: 72.237

## mean for review scores avg and fill na with mean
# 124.8374
# Diff rmse-mae: 72.2813

## mean for review scores avg and fill na with 1
# 124.8009
# Diff rmse-mae: 72.2385


#### Property types from >5 to > 10
### 127.7623
### Diff rmse-mae: 74.6855
# Test rmse 119.39


## Only drop upper outliers for price
# 127.9261
# Diff rmse-mae: 74.8203
# Test 119.51

## price percentile to 0.05
# Test 119.41


## interact lising count with review average and listing count with superhost
# 127.7953
# Diff rmse-mae: 74.7491

## just interact lising count with review average, no listing count and superhost
# 127.7801
# Diff rmse-mae: 74.7144

## raise hood count from 100 to 120 
# 128.5801
# Diff rmse-mae: 74.8061
# test Rmse 119.43

#### Raise minimum neighbourhood count to 20
### 127.756
### Diff rmse-mae: 74.6614
    
    

In [325]:
# residuals = np.exp(y_train) - y_pred_train

# # Calculate the standard deviation of residuals
# residual_std = np.std(residuals)

# # Calculate studentized residuals
# studentized_residuals = residuals / residual_std

# # Convert to DataFrame for easy manipulation
# data = {'residuals': residuals, 'studentized_residuals': studentized_residuals}
# stud_res_df = pd.DataFrame(data)
# stud_res_df


In [326]:
# from scipy import stats

# N = train_final.shape[0]
# p = ridgeCV.coef_.shape[0]
# alpha = 0.05

# critic_val = stats.t.ppf(1 - alpha / 2, N - p - 1)

# outliers_idx = np.array(stud_res_df[stud_res_df.studentized_residuals > critic_val].index)

# train_final.iloc[outliers_idx, :]['price'].describe()

In [327]:
# # Compute hat matrix
# H = X_train_processed.dot(np.linalg.pinv(X_train_processed.T.dot(X_train_processed)).dot(X_train_processed.T))

# # Get diagonal elements of hat matrix as leverage
# leverage = np.diag(H)

# # Calculate average leverage
# avg_leverage = np.mean(leverage)

# # # Find indices of observations with leverage greater than 4 times the average leverage
# high_leverage_indices = np.where(leverage >= 4*avg_leverage)

# # avg_leverage

In [328]:
# high_inf = np.intersect1d(outliers_idx, high_leverage_indices)
# train_final.iloc[high_inf, :]['price']

In [329]:
# X_train_clean = X_train_final.drop(high_inf).reset_index(drop=True)
# y_train_clean = y_train.drop(high_inf).reset_index(drop=True)

In [330]:
# alphas = 10**np.linspace(1, 0, 100)

# final_model = RidgeCV(alphas=alphas, cv=10, scoring='neg_root_mean_squared_error')
# final_model.fit(X_train_clean, y_train_clean)
# optimal_alpha = final_model.alpha_
# print(optimal_alpha)


In [331]:
# clean_train_pred = np.exp(final_model.predict(X_train_clean))*1.05
# rmse = mean_squared_error(y_train_clean, clean_train_pred, squared = False) 
# mae = mean_absolute_error(y_train_clean, clean_train_pred) 

# rmse, mae, rmse-mae