In [86]:
import numpy as np
import pandas as pd
from scaler import StandardScaler

In [45]:
# Read the San Francisco listings CSV and show which columns it provides.
listings_csv = './data/San Francisco-listings.csv'
df = pd.read_csv(listings_csv)

df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'access', 'interaction', 'house_rules',
       'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url',
       'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'street',
       'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms',

In [46]:
# Narrow the frame to the columns the following cells work with.
feature_columns = [
    'latitude',
    'longitude',
    'accommodates',
    'property_type',
    'room_type',
    'number_of_reviews',
    'review_scores_rating',
    'price',
]
df = df[feature_columns]

In [76]:
# Replace missing review scores with 0. Only `review_scores_rating` is
# targeted: a blanket `df.fillna(0)` would also write the integer 0 into the
# text columns (`property_type`, `room_type`, `price`) wherever they have gaps.
df = df.fillna({'review_scores_rating': 0})

# Check which categories each type column contains and how often they occur.
print("=== variety of property_type ===")
vc_property = df['property_type'].value_counts()
print(vc_property)

print("=== variety of room_type ===")
vc_room = df['room_type'].value_counts()
print(vc_room)

=== variety of property_type ===
Apartment             440
House                 267
Condominium            93
Guest suite            71
Boutique hotel         24
Loft                   21
Townhouse              17
Serviced apartment     16
Hostel                 15
Hotel                  11
Guesthouse              8
Bungalow                3
Resort                  3
Bed and breakfast       2
Cottage                 2
Aparthotel              2
Timeshare               2
Tiny house              1
Other                   1
Boat                    1
Name: property_type, dtype: int64
=== variety of room_type ===
Entire home/apt    612
Private room       360
Shared room         28
Name: room_type, dtype: int64


In [77]:
# Draw 1000 listings at random. The seed is fixed so that restarting the
# kernel and re-running the notebook selects the same rows every time.
df = df.sample(n=1000, random_state=42)
df.head(3)

Unnamed: 0,latitude,longitude,accommodates,property_type,room_type,number_of_reviews,review_scores_rating,price
136,37.805912,-122.408299,4,Condominium,Entire home/apt,4,93.0,$275.00
1503,37.749329,-122.410792,6,Apartment,Entire home/apt,41,95.0,$300.00
4239,37.784313,-122.420136,2,Boutique hotel,Private room,57,94.0,$150.00


In [78]:
def dollartofloat(s):
    """Convert a price such as ``'$1,275.00'`` to a float.

    Parameters
    ----------
    s : str or number
        Price text, optionally containing ``$`` and thousands separators.
        Numeric inputs (for example a 0 used as a fill value for a missing
        price) are accepted as well and converted directly.

    Returns
    -------
    float
        The numeric value of the price.

    Raises
    ------
    ValueError
        If the text left after removing ``$`` and ``,`` is not a number.
    """
    # Going through str() lets non-string values pass instead of failing on
    # the missing `.replace` attribute.
    text = str(s).replace('$', '').replace(',', '')
    return float(text)

# Parse every price string into a float; `y` holds the results in row order.
prices = df['price'].values
y = list(map(dollartofloat, prices))

In [100]:
# Columns that are already numeric go into the feature matrix unchanged.
numeric_columns = [
    'latitude',
    'longitude',
    'accommodates',
    'number_of_reviews',
    'review_scores_rating',
]
df_processed = df[numeric_columns]

# Turn room_type into one-hot columns (the first category is dropped) and
# place them to the right of the numeric columns.
df_room = pd.get_dummies(df[['room_type']], drop_first=True)
X = np.concatenate((df_processed.values, df_room.values), axis=1)
df_room.head(3)

Unnamed: 0,room_type_Private room,room_type_Shared room
136,0,0
1503,0,0
4239,1,0


In [101]:
def addothers(s):
    """Return `s` if it is one of the four kept property types, else 'Others'."""
    kept_types = ['Apartment', 'House', 'Condominium', 'Guest suite']
    return s if s in kept_types else 'Others'

# Collapse property_type into Apartment, House, Condominium, Guest suite and
# Others, then convert it to one-hot columns.
# Series.map is used in place of DataFrame.applymap, which recent pandas
# releases deprecate; to_frame() keeps the 'property_type' column name so the
# dummy columns are still prefixed with it.
df_property = df['property_type'].map(addothers).to_frame()
df_property = pd.get_dummies(df_property, drop_first=True)
# Rebuild X from its three parts instead of appending to the previous X, so
# re-running this cell does not stack the property columns a second time.
X = np.hstack((df_processed.values, df_room.values, df_property.values))
df_property.head(3)

Unnamed: 0,property_type_Condominium,property_type_Guest suite,property_type_House,property_type_Others
136,1,0,0,0
1503,0,0,0,0
4239,0,0,0,1


In [102]:
# Cast the feature matrix to float32 and report its shape.
X = X.astype(np.float32)
print(X.shape)
# Apply StandardScaler to X. The result is kept in X_scaled rather than being
# discarded after display, so later cells can use the scaled features.
# NOTE(review): StandardScaler comes from the local `scaler` module; this
# presumably returns a new array and leaves X untouched — confirm.
sc = StandardScaler()
X_scaled = sc.fit_transform(X)
X_scaled

(1000, 11)


array([[ 1.7195199 ,  0.84687126,  0.3719947 , ..., -0.27645302,
        -0.60353655, -0.38484493],
       [-0.73719734,  0.75047463,  1.3760291 , ..., -0.27645302,
        -0.60353655, -0.38484493],
       [ 0.7817505 ,  0.3882479 , -0.63203967, ..., -0.27645302,
        -0.60353655,  2.5984492 ],
       ...,
       [-0.57753474,  1.0115736 ,  0.87401193, ...,  3.6172512 ,
        -0.60353655, -0.38484493],
       [ 0.50565344, -0.4946982 , -0.63203967, ..., -0.27645302,
        -0.60353655, -0.38484493],
       [ 1.1081976 ,  0.7661465 , -0.63203967, ..., -0.27645302,
        -0.60353655, -0.38484493]], dtype=float32)