In [50]:
import pandas as pd
import numpy as np
from sklearn import metrics, impute
from sklearn.linear_model import Ridge, Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error, accuracy_score, mean_absolute_error
from sklearn.linear_model import RidgeCV, LassoCV, LogisticRegressionCV
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate

In [52]:
train = pd.read_csv('train_clean.csv')
test = pd.read_csv('test_clean.csv')
train.head()

Unnamed: 0,id,host_response_rate,host_acceptance_rate,host_identity_verified,accommodates,bedrooms,beds,price,minimum_nights,maximum_nights,...,review_scores_value,instant_bookable,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,bathrooms,is_entire_unit,location_chicago,location_kauai,has_luxury_words
0,917611.0,100.0,81.0,1,1.0,1.0,1.0,125.0,3.0,89.0,...,4.94,0,1.0,0.0,2.02,1.0,0,True,False,1
1,298170.0,100.0,94.0,1,8.0,3.0,3.0,230.0,32.0,1125.0,...,4.71,1,0.0,0.0,0.67,2.0,1,True,False,1
2,386102.0,100.0,100.0,0,5.0,2.0,3.0,208.0,3.0,28.0,...,4.88,0,0.0,0.0,4.19,1.0,1,True,False,0
3,762410.0,100.0,100.0,1,2.0,1.0,1.0,167.0,32.0,125.0,...,4.86,0,0.0,0.0,1.94,1.0,1,True,False,1
4,409023.0,100.0,100.0,1,1.0,1.0,1.0,73.0,32.0,120.0,...,4.85,0,0.0,0.0,0.29,1.0,0,True,False,0


In [54]:
features = [col for col in train.columns if col not in ['id', 'price']]


X_train = train[features]
y_train = np.log(train['price'])
X_test = test[features] 

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Ridge

In [57]:
# create an array of hyperparameter values
alphas = 10**np.linspace(10,-2,200)*0.5 # 200 alpha values

# create an empty list
cv_scores = []

# now, we will use ridge, lasso and cross_val_score
for alpha in alphas: # for each hyperparam
    model = Ridge(alpha=alpha) # create a model with that that hyperparam
    cv_score = cross_val_score(model, X_train_scaled, y_train, scoring = 'neg_mean_absolute_error')
    cv_scores.append(cv_score)

avg_cv_results = -np.array(cv_scores).mean(axis=1)

# first, find the best score: lowest MAE
print('Best avg cv performance:', np.min(avg_cv_results))
print('Best hyperparam:', alphas[np.argmin(avg_cv_results)])

Best avg cv performance: 0.4036094930543285
Best hyperparam: 505.81898988310354


In [61]:
# CROSS VAL PREDICT


alphas = 10**np.linspace(10,-2,200)*0.5
cv_preds = []

for alpha in alphas: # for each alpha
    model = Ridge(alpha=alpha) # create model with that alpha
    cv_pred = cross_val_predict(model, X_train_scaled, y_train) #no scoring because the output will be the preds, not performances

    cv_preds.append(cv_pred)

# each row: the prediction for each obs in the training data WHEN that obs was iin the assessment fold

# convert the log predictions into the linear scale 
cv_pred_errors = np.exp(np.array(cv_preds)) - np.array(np.exp(y_train))

# calculate MAE for each alpha
cv_maes = np.abs((cv_pred_errors).mean(axis=1))

print('Price error value:', np.min(cv_maes))
print('Best alpha:', alphas[np.argmin(cv_maes)]) # same as above

Price error value: 47.362838087529376
Best alpha: 0.005


In [67]:
model = Ridge(alpha = 0.005)
model.fit(X_train_scaled, y_train)
y_pred = np.exp(model.predict(X_test_scaled))
y_pred

array([ 69.18426212, 119.30070217, 137.9171165 , ..., 269.23091239,
       282.0485179 , 452.94598416])

In [75]:
submission = pd.DataFrame({
    'id': test['id'],           # original id column from the test data
    'predicted': y_pred         # your model’s predictions
})

submission.to_csv('submission.csv', index=False)

In [71]:
test.head()

Unnamed: 0,id,host_response_rate,host_acceptance_rate,host_identity_verified,accommodates,bedrooms,beds,minimum_nights,maximum_nights,maximum_minimum_nights,...,review_scores_value,instant_bookable,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,bathrooms,is_entire_unit,location_chicago,location_kauai,has_luxury_words
0,329273.0,100.0,100.0,1,2.0,1.0,1.0,2.0,60.0,2.0,...,4.76,0,0.0,0.0,2.98,1.0,1,True,False,0
1,133046.0,100.0,97.0,1,4.0,2.0,2.0,4.0,120.0,4.0,...,4.72,1,0.0,0.0,0.88,1.0,1,True,False,0
2,960346.0,100.0,89.0,1,6.0,3.0,3.0,2.0,10.0,2.0,...,4.72,0,0.0,0.0,1.4,2.0,0,True,False,1
3,727148.0,100.0,97.0,0,2.0,1.0,1.0,3.0,60.0,3.0,...,4.84,0,1.0,0.0,0.77,1.0,0,True,False,1
4,162411.0,100.0,85.0,1,1.0,1.0,1.0,3.0,90.0,3.0,...,4.94,0,0.0,0.0,0.99,1.0,1,True,False,0
