<a href="https://colab.research.google.com/github/djvaroli/CS542/blob/master/airbnb_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import datetime
from datetime import datetime as dt
import pytz

# Load the full listings table; the later cells build their features from it.
full_df = pd.read_csv("data.csv")

# Raw zipcode column exactly as loaded (may mix strings, numbers and NaN).
zipcodes_df = full_df.zipcode

# Strip the 'TX' marker and any spaces, then parse to float.
# errors='coerce' turns anything that still is not a number (missing values,
# malformed entries) into NaN instead of aborting the whole cell, and values
# that were already numeric are kept rather than discarded.
zipcodes = pd.to_numeric(
    zipcodes_df.astype(str)
               .str.replace('TX', '', regex=False)
               .str.replace(' ', '', regex=False),
    errors='coerce',
).to_numpy(dtype=float)

full_df['zipcode'] = zipcodes


# Feature columns that go into `cols` without any imputation in this notebook
# (presumably because they contain no missing values -- TODO confirm).
non_nan = [
        'accommodates',
        'bathrooms',
        'longitude',
        'latitude',
        'beds',
        'bedrooms'
        ]

In [0]:
# Continuous features whose missing values are replaced by the column median.
non_binary_nan_features = [
                           'review_scores_rating',
                           'square_feet',
                          ]

for f in non_binary_nan_features:
  med = np.nanmedian(full_df[f])
  # Assign the filled column back: `full_df[f].fillna(..., inplace=True)` acts
  # on a temporary and does not update the frame under pandas Copy-on-Write.
  full_df[f] = full_df[f].fillna(med)

# Missing flags are treated as 'f': if the data is not available, assume the
# host is not a superhost / not verified / the location is not exact.
binary_nan_feautures = ['host_is_superhost','host_identity_verified','is_location_exact']
for bf in binary_nan_feautures:
  full_df[bf] = full_df[bf].fillna('f')
binary_dict = {'t':1,'f':0}
full_df = full_df.replace(binary_dict)


# Encode categorical values as integers via lookup dictionaries.
features_to_engineer = ['room_type']
room_type_dict = {'Entire home/apt':3, 'Private room':1, 'Hotel room':2, 'Shared room':0}
# This dictionary must be defined here: it is used two lines below, and leaving
# it commented out makes the cell fail with NameError on a fresh kernel.
bed_type_dict = {'Real Bed': 4,'Futon':1, 'Couch': 0, 'Airbed': 2, 'Pull-out Sofa': 3}
full_df = full_df.replace(room_type_dict)
full_df = full_df.replace(bed_type_dict)

# Complete list of feature columns used for normalization and KNN distance.
cols = non_nan + non_binary_nan_features + binary_nan_feautures + features_to_engineer

In [0]:
# Per-amenity weights. Every listed weight is currently 1 and unlisted
# amenities also default to 1, so the score is simply the number of amenities.
amenities_weights = {
    'Free parking on premises':1,
    'Free street parking':1,
    'Patio or balcony':1,
    'Gym':1,
    'Pool':1,
    'Air conditioning':1,
    'BBQ grill':1,
    'Wifi':1,
    'Heating':1,
    'Kitchen': 1,
    'Hot water': 1,
    'Internet' :1,
    'Elevator': 1 
  }

# Characters removed from the raw amenities string before splitting on commas.
chars_to_strip = {"'","{","}",'"'}
am_entries = full_df.amenities


def _amenity_score(entry):
  """Return the summed weight of the amenities named in one raw entry.

  Args:
    entry: raw amenities string, e.g. '{Wifi,"Hot water"}'. Non-string values
      (such as NaN for a missing entry) score 0.

  Returns:
    Sum of `amenities_weights` over the amenities, using weight 1 for any
    amenity that has no explicit weight.
  """
  if not isinstance(entry, str):
    return 0
  for char_to_strip in chars_to_strip:
    entry = entry.replace(char_to_strip,'')
  # Skip empty tokens so that an empty set '{}' scores 0 instead of 1.
  return sum(amenities_weights.get(amenity, 1)
             for amenity in entry.split(",") if amenity)


scores = np.array([_amenity_score(entry) for entry in am_entries], dtype=float)

full_df['amenities'] = scores

In [0]:
def normalize(full_df,normalization = '0-mean',columns = None):
  """Normalize feature columns of `full_df` in place and return the frame.

  Args:
    full_df: DataFrame holding the columns to normalize. It is modified in
      place; the same object is returned for convenience.
    normalization: '0-mean' subtracts the column mean and divides by the
      column standard deviation; 'scaling' maps each column to [0, 1] using
      its min and max. Any other value leaves the frame unchanged.
    columns: names of the columns to normalize. Defaults to the notebook-level
      `cols` list when omitted.

  Returns:
    The same DataFrame, with the selected columns normalized.
  """
  if columns is None:
    columns = cols

  if normalization == '0-mean':
    for col in columns:
      col_mean = np.mean(full_df[col])
      col_std = np.std(full_df[col])
      # A constant column has zero spread; divide by 1 so it becomes all zeros
      # instead of NaN/inf from a division by zero.
      if col_std == 0:
        col_std = 1
      full_df[col] = (full_df[col] - col_mean)/col_std

  elif normalization == 'scaling':
    for col in columns:
      col_min = np.min(full_df[col])
      col_range = np.max(full_df[col]) - col_min
      # Same guard as above for a column whose min equals its max.
      if col_range == 0:
        col_range = 1
      full_df[col] = (full_df[col] - col_min)/col_range

  return full_df

# Rescale the feature columns with the min-max ('scaling') variant.
full_df = normalize(full_df, normalization='scaling')

In [0]:

def predict_price_multivariate(new_listing, feature_columns, k = 8, train = None):
    """Predict a listing's price as the mean price of its k nearest neighbours.

    Args:
      new_listing: row-like object (e.g. a pandas Series) that can be indexed
        by each name in `feature_columns`.
      feature_columns: names of the columns used for the Euclidean distance.
      k: number of nearest training rows whose prices are averaged.
      train: DataFrame with the feature columns and a 'price' column. Defaults
        to the notebook-level `train_df` when omitted.

    Returns:
      Mean 'price' of the k training rows closest to `new_listing`.
    """
    if train is None:
        train = train_df

    # Euclidean distance from the new listing to every training row.
    sum_squares = 0
    for feature in feature_columns:
        sum_squares += np.square(train[feature] - new_listing[feature])
    distances = np.sqrt(sum_squares)

    # Rank in a small scratch frame so the training frame itself is never
    # modified (no 'distance' column is written into it on each call).
    ranked = pd.DataFrame({'price': train['price'], 'distance': distances})
    ranked = ranked.sort_values('distance')
    return ranked['price'].iloc[:k].mean()


In [408]:

# Read the id lists that define the train / validation / test splits.
train_ind = pd.read_csv("train.csv")
val_ind = pd.read_csv("val.csv")
test_ind = pd.read_csv("test.csv")

# Attach the engineered features to each split by joining on the listing id.
train_df = full_df.merge(train_ind, on="id")
val_df = full_df.merge(val_ind, on="id")
test_df = full_df.merge(test_ind, on="id")

# Score the KNN predictor on the validation split.
k_ = 15
val_df['predicted_price'] = val_df[cols].apply(
    lambda listing: predict_price_multivariate(listing, feature_columns=cols, k=k_),
    axis=1,
)

residuals = val_df['predicted_price'] - val_df['price']
val_df['squared_error'] = residuals ** 2
mse = val_df['squared_error'].mean()
rmse = mse ** 0.5
print("RMSE with k = %d and %d features: %.2f" % (k_,len(cols),rmse))



RMSE with k = 15 and 12 features: 100.85


In [411]:
# Predict with the same k that was validated above. Without k=k_ the function
# falls back to its default k, while the output filename reports k_.
predicted_prices = test_df[cols].apply(predict_price_multivariate,feature_columns=cols,k=k_,axis=1)

# Time-of-day stamp (New York time) used to keep submission filenames distinct.
tz_NY = pytz.timezone('America/New_York') 
datetime_NY = dt.now(tz_NY)
time_stamp = datetime_NY.strftime("%H_%M_%S")

answ_df = pd.read_csv("test.csv")
# Attach prices by listing id rather than by row position: test_df comes from
# `full_df.merge(...)`, so its row order follows full_df, not test.csv.
price_by_id = predicted_prices.groupby(test_df['id']).mean()
answ_df.insert(1,'price',answ_df['id'].map(price_by_id))
answ_df.to_csv("%d_k%d_f%d_%s.csv" % (rmse,k_,len(cols),time_stamp),index=False)

print("Finished")

Finished


In [0]:
# amenities_unique = {}
# chars_to_strip = {"'","{","}",'"'}
# for entry in am_entries:
#   for char_to_strip in chars_to_strip:
#     entry = entry.replace(char_to_strip,'')

#   entry = entry.split(",")

#   for amenity in entry:
#     if amenity in amenities_unique:
#         amenities_unique[amenity] += 1
#     else:
#         amenities_unique[amenity] = 1


# amenities_unique_sorted = dict(sorted(amenities_unique.items(), key=lambda item: item[1],reverse=True))

# amenities = list(amenities_unique_sorted.keys())
# freq = list(amenities_unique_sorted.values())
# amenities_df = pd.DataFrame({'Amenity Name':amenities, 'Num. Occurences':freq})
# amenities_df.to_csv("amenities.csv",index=False)





In [0]:
# # replace nans in square feet with the median value
# sq_feet_med = np.nanmedian(full_df.square_feet)
# full_df.square_feet.fillna(sq_feet_med,inplace=True)

# review_scores_rating_med = np.nanmedian(full_df.review_scores_rating)
# full_df.review_scores_rating.fillna(review_scores_rating_med,inplace=True)

# review_scores_location_med = np.nanmedian(full_df.review_scores_location)
# full_df.review_scores_location.fillna(review_scores_location_med,inplace=True)

# # replace nan with f, assume that if data not available then not verified etc
# binary_feautures = ['host_is_superhost','host_identity_verified','is_location_exact']
# for bf in binary_feautures:
#   full_df[bf].fillna('f',inplace=True)

# # replace using dictionaries 
# binary_dict = {'t':1,'f':0}
# room_type_dict = {'Entire home/apt':3, 'Private room':1, 'Hotel room':2, 'Shared room':0}
# bed_type_dict = {'Real Bed': 4,'Futon':1, 'Couch': 0, 'Airbed': 2, 'Pull-out Sofa': 3}
# full_df.replace(binary_dict,inplace=True)
# full_df.replace(room_type_dict,inplace=True)
# full_df.replace(bed_type_dict,inplace=True)


# amenities_weights = {
#     'Free parking on premises':14,
#     'Free street parking':13,
#     'Patio or balcony':12,
#     'Gym':11,
#     'Pool':10,
#     'Air conditioning':9,
#     'BBQ grill':6,
#     'Wifi':5,
#     'Heating':5,
#     'Kitchen': 5,
#     'Hot water': 5,
#     'Internet' :5,
#     'Elevator': 5 
#   }