2018-08-27

Machine Learning Fundamentals: Predicting Airbnb Prices

https://www.dataquest.io/blog/machine-learning-tutorial/

In [1]:
import pandas as pd

In [2]:
dc_listings = pd.read_csv('/data/tutorial/listings.csv')
dc_listings.shape

(3723, 92)

In [3]:
dc_listings.head(2)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,7087327,https://www.airbnb.com/rooms/7087327,20151002231825,2015-10-03,Historic DC Condo-Walk to Capitol!,Professional pictures coming soon! Welcome to ...,,Professional pictures coming soon! Welcome to ...,none,,...,,f,,"DISTRICT OF COLUMBIA, WASHINGTON",f,flexible,f,f,18,
1,975833,https://www.airbnb.com/rooms/975833,20151002231825,2015-10-03,Spacious Capitol Hill Townhouse,,Beautifully renovated Capitol Hill townhouse. ...,Beautifully renovated Capitol Hill townhouse. ...,none,,...,9.0,f,,"DISTRICT OF COLUMBIA, WASHINGTON",f,strict,f,f,1,2.11


In [4]:
dc_listings.loc[0,'accommodates']

4

In [5]:
import numpy as np
our_acc_value = 3
first_living_space_value = dc_listings.loc[0, 'accommodates']
first_distance = np.abs(first_living_space_value - our_acc_value)
first_distance

1

In [6]:
dc_listings['distance'] = np.abs(dc_listings.accommodates - our_acc_value)
dc_listings.distance.value_counts()

1     2294
2      503
0      461
3      279
5       73
4       35
7       22
6       17
9       12
13       8
8        7
12       6
11       4
10       2
Name: distance, dtype: int64

In [7]:
dc_listings.distance.value_counts().sort_index()

0      461
1     2294
2      503
3      279
4       35
5       73
6       17
7       22
8        7
9       12
10       2
11       4
12       6
13       8
Name: distance, dtype: int64

In [8]:
dc_listings.distance.value_counts?

In [9]:
dc_listings = dc_listings.sample(frac=1, random_state=0)
dc_listings = dc_listings.sort_values('distance')
dc_listings.price.head()

2645     $75.00
2825    $120.00
2145     $90.00
2541     $50.00
3349    $105.00
Name: price, dtype: object

In [10]:
dc_listings['price'] = dc_listings.price.str.replace('\$|,', '').astype(float)
mean_price = dc_listings.price.iloc[:5].mean()
mean_price

88.0

In [11]:
dc_listings.drop('distance', axis=1)
train_df = dc_listings.copy().iloc[:2792]
test_df = dc_listings.copy().iloc[:2792:]

In [12]:
def predict_price(new_listing_value, feature_column):
    temp_df = train_df
    # 每一个数据都与数据集中其他数据比较得到所有的距离，然后再取排序。distance这一列对每一行都会重新运算一次。
    temp_df['distance'] = np.abs(dc_listings[feature_column] - new_listing_value)  # simple distance...
    temp_df = temp_df.sort_values('distance')
    knn_5 = temp_df.price.iloc[:5]  # key point, so simple...
    predicted_price = knn_5.mean()
    return predicted_price

In [13]:
test_df['predicted_price'] = test_df.accommodates.apply(predict_price, feature_column='accommodates')

In [14]:
test_df['squared_error'] = (test_df['predicted_price'] - test_df['price']) ** 2
mse = test_df['square_feet'].mean()
rmse = np.sqrt(mse)
rmse

28.96933244350346

In [15]:
for feature in ['accommodates','bedrooms','bathrooms','number_of_reviews']:
    # 固定为accommodates列？
    test_df['predicted_price'] = test_df.accommodates.apply(predict_price,feature_column=feature)

    test_df['squared_error'] = (test_df['predicted_price'] - test_df['price'])**(2)
    mse = test_df['squared_error'].mean()
    rmse = mse ** (1/2)
    print("RMSE for the {} column: {}".format(feature,rmse))

RMSE for the accommodates column: 82.33690060096424
RMSE for the bedrooms column: 150.02436449688233
RMSE for the bathrooms column: 94.71267173006166
RMSE for the number_of_reviews column: 83.28364658182072


In [16]:
for feature in ['accommodates','bedrooms','bathrooms','number_of_reviews']:
    # 固定为accommodates列？
    test_df['predicted_price'] = test_df[feature].apply(predict_price,feature_column=feature)

    test_df['squared_error'] = (test_df['predicted_price'] - test_df['price'])**(2)
    mse = test_df['squared_error'].mean()
    rmse = mse ** (1/2)
    print("RMSE for the {} column: {}".format(feature,rmse))

RMSE for the accommodates column: 82.33690060096424
RMSE for the bedrooms column: 86.19206288789624
RMSE for the bathrooms column: 85.354356087735
RMSE for the number_of_reviews column: 85.55654692694789


In [17]:
from scipy.spatial import distance

first_listing = dc_listings.iloc[0][['accommodates', 'bathrooms']]
fifth_listing = dc_listings.iloc[20][['accommodates', 'bathrooms']]
first_fifth_distance = distance.euclidean(first_listing, fifth_listing)
first_fifth_distance

0.0

In [18]:
def predict_price_multivariate(new_listing_value,feature_columns):
    temp_df = norm_train_df
    
    # 依然是让整列的数据与字典里单独的数据进行距离计算，得到所有的距离并进行排序
    temp_df['distance'] = distance.cdist(temp_df[feature_columns],[new_listing_value[feature_columns]])
    temp_df = temp_df.sort_values('distance')
    knn_5 = temp_df.price.iloc[:5]
    predicted_price = knn_5.mean()
    return(predicted_price)

norm_test_df = test_df
norm_train_df = train_df

cols = ['accommodates', 'bathrooms']
norm_test_df['predicted_price'] = norm_test_df[cols].apply(predict_price_multivariate,feature_columns=cols,axis=1)    
norm_test_df['squared_error'] = (norm_test_df['predicted_price'] - norm_test_df['price'])**(2)
mse = norm_test_df['squared_error'].mean()
rmse = mse ** (1/2)
print(rmse)

87.13074036702227


## sklearn part

In [19]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
knn

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [20]:
knn.fit(norm_train_df[cols].fillna(0), norm_train_df['price'])

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [21]:
two_features_predictions = knn.predict(norm_test_df[cols].fillna(0))
two_features_predictions

array([116.6, 140. , 140. , ...,  66.8,  66.8,  66.8])

In [22]:
from sklearn.metrics import mean_squared_error
two_features_rmse = mean_squared_error(norm_test_df['price'], two_features_predictions) ** (1/2)
two_features_rmse

85.62648497028687

In [23]:
knn = KNeighborsRegressor(algorithm='brute')

cols = ['accommodates','bedrooms','bathrooms','beds']

knn.fit(norm_train_df[cols].fillna(0), norm_train_df['price'])
four_features_predictions = knn.predict(norm_test_df[cols].fillna(0))
four_features_mse = mean_squared_error(norm_test_df['price'], four_features_predictions)
four_features_rmse = four_features_mse ** (1/2)
four_features_rmse

82.62119386826703