<a href="https://colab.research.google.com/github/dernameistegal/airbnb_price/blob/main/data_utils/data_preparation/listings_transformations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [171]:
%%capture
!pip install ast
import numpy as np
import ast
import pandas as pd
import numpy as np
from google.colab import drive
import matplotlib.pyplot as plt
import json
drive.mount('/content/drive')

# Data preparation translated_listings.pickle

In [172]:
path= ("/content/drive/MyDrive/Colab/airbnb/data/translations/translated_listings.pickle")
listings = pd.read_pickle(path)

In [173]:
# set id as row index and rename to listing_id
listings.set_index('id', inplace=True)
listings.index.name = "listing_id"

In [174]:
# remove columns from listings that do not make sense as predictors

drop_vars = ["name", "listing_url", "scrape_id", "last_scraped", "picture_url", 
       "neighborhood_overview", "description", "host_id", 
       "host_url", "host_name", "host_location", "host_about",
       "host_thumbnail_url", "host_picture_url", "host_neighbourhood", "host_total_listings_count",
       "host_has_profile_pic", "neighbourhood", "neighbourhood_group_cleansed", "bathrooms",
       "maximum_nights", "maximum_nights",	"minimum_minimum_nights", 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'calendar_updated',"has_availability", "calendar_last_scraped",
       "number_of_reviews_ltm", "number_of_reviews_l30d", "license"]

listings = listings.drop(drop_vars, axis=1)

In [175]:
# transform price
listings["price"] = (
    listings["price"].str.replace("$", "").str.replace(",", "").astype(float)
)

# save listing ids where price is zero in missing_data file
with open("/content/drive/MyDrive/Colab/airbnb/data/data1/missing_data.json", "r") as f:
    missing_data = json.load(f)

price_zero = listings[listings["price"] == 0].index
missing_data["price_zero"] = list(price_zero)

with open("/content/drive/MyDrive/Colab/airbnb/data/data1/missing_data.json", "w") as f:
    json.dump(missing_data, f, indent=6)

#remove zeros from price
listings = listings[listings["price"] != 0]

# log price
listings["log_price"] = np.log(listings["price"])

# plot log price
# plt.hist(listings["log_price"], bins=50)
# plt.show()

In [176]:
# recode property into three categories
property_type_entire_unit = listings["property_type"].str.contains("Entire", case=False, na=False)
property_type_entire_unit.name = "property_type_entire_unit"
property_type_shared_room = listings["property_type"].str.contains("Shared", case=False, na=False)
property_type_shared_room.name = "property_type_shared_room"
property_type_private_room = listings["property_type"].str.contains("Private", case=False, na=False)
property_type_private_room.name = "property_type_private_room"
listings = listings.drop("property_type", axis=1)

In [177]:
# recode rooms, neighbourhood_cleansed, properties, host_is_superhost, instant_bookable and host_identity_verified into dummies
rooms = pd.get_dummies(listings["room_type"], prefix="room")
listings = listings.drop("room_type", axis=1)

neighbourhood_cleansed = pd.get_dummies(listings["neighbourhood_cleansed"], prefix="neighbourhood_cleansed")
listings = listings.drop("neighbourhood_cleansed", axis=1)

listings["host_is_superhost"] = listings["host_is_superhost"].map(dict(t=1, f=0))
listings["instant_bookable"] = listings["instant_bookable"].map(dict(t=1, f=0))
listings["host_identity_verified"] = listings["host_identity_verified"].map(
    dict(t=1, f=0)
)

listings = pd.concat([listings, rooms, neighbourhood_cleansed, property_type_entire_unit, property_type_shared_room, property_type_private_room], axis=1)


In [178]:
# recode host_verifications into dummies
listings["host_verifications"] = listings["host_verifications"].apply(ast.literal_eval)
host_verifications = listings["host_verifications"]
host_verifications = pd.get_dummies(host_verifications.apply(pd.Series).stack(dropna=False), prefix="host_verification").sum(level=0)
listings = pd.concat([listings, host_verifications], axis=1)
listings = listings.drop("host_verifications", axis=1)

In [179]:
# recode bath into bath shared dummy and number of baths
bath_is_shared = listings["bathrooms_text"].str.contains("shared", case=False, na=False)
bath_is_shared.name = "bath_is_shared"
bath_is_half = listings["bathrooms_text"].str.contains("half", case=False, na=False)
bath_number = listings["bathrooms_text"].str.extract('(\d+\.?\d?)', expand=False)
bath_number[bath_is_half] = 0.5
bath_number.name = "bath_number"

listings = pd.concat([listings, bath_is_shared, bath_number], axis=1)
listings = listings.drop("bathrooms_text", axis=1)

In [118]:
#@title recode amenities functions
def contains_list(elements, series):
    df = (series == "Not existing value").astype("int")
    for el in elements:
        df = df + series.str.contains(el, case=False, na=0)
    return df
    
def to_binary(series):
  series = (series > 0).astype(int)
  return series

In [119]:
# look at observation with specific amenity
listings["amenities"].loc[contains_list(["\"sauna"], listings["amenities"]).astype(bool)].iloc[0]

'["Wine glasses", "Lock on bedroom door", "Safe", "Iron", "Hair dryer", "First aid kit", "Bed linens", "Room-darkening shades", "Paid parking on premises", "TV", "Hangers", "Long term stays allowed", "Crib", "Dedicated workspace", "Host greets you", "Fire extinguisher", "Shower gel", "Carbon monoxide alarm", "Luggage dropoff allowed", "Paid parking off premises", "Smoke alarm", "Wifi", "Elevator", "Microwave", "Laundromat nearby", "Hot water", "Sauna", "Air conditioning", "Refrigerator", "Essentials", "Body soap", "Clothing storage", "Shampoo", "Heating", "Gym", "Dishes and silverware", "Free street parking"]'

In [120]:
# print number of observations with specific amenity
np.sum(contains_list(["bathtub"], listings["amenities"]))

1491

In [121]:
#@title recode amenities

# binaries1
tmp_amenities = ["Table corner guards"]
amenities_tablecornerguards = contains_list(tmp_amenities, listings["amenities"])
amenities_tablecornerguards.name = "amenities_" + "amenities_tablecornerguards"
amenities_tablecornerguards = to_binary(amenities_tablecornerguards)

tmp_amenities = ["Cleaning before checkout"]
amenities_cleanbeforecheckout = contains_list(tmp_amenities, listings["amenities"])
amenities_cleanbeforecheckout.name = "amenities_" + "amenities_cleanbeforecheckout"
amenities_cleanbeforecheckout = to_binary(amenities_cleanbeforecheckout)

tmp_amenities = ["Cleaning products"]
amenities_cleaningproducts = contains_list(tmp_amenities, listings["amenities"])
amenities_cleaningproducts.name = "amenities_" + "amenities_cleaningproducts"
amenities_cleaningproducts = to_binary(amenities_cleaningproducts)

tmp_amenities = ["host greets you"]
amenities_greetings = contains_list(tmp_amenities, listings["amenities"])
amenities_greetings.name = "amenities_" + "greetings"
amenities_greetings = to_binary(amenities_greetings)

tmp_amenities = ["building staff"]
amenities_staff = contains_list(tmp_amenities, listings["amenities"])
amenities_staff.name = "amenities_" + "staff"
amenities_staff = to_binary(amenities_staff)

tmp_amenities = ["elevator"]
amenities_elevator = contains_list(tmp_amenities, listings["amenities"])
amenities_elevator.name = "amenities_" + "elevator"
amenities_elevator = to_binary(amenities_elevator)

tmp_amenities = ["Single level home"]
amenities_singlelevel = contains_list(tmp_amenities, listings["amenities"])
amenities_singlelevel.name = "amenities_" + "singlelevel"
amenities_singlelevel = to_binary(amenities_singlelevel)

tmp_amenities = ["keypad"]
amenities_keypad = contains_list(tmp_amenities, listings["amenities"])
amenities_keypad.name = "amenities_" + "keypad"
amenities_keypad = to_binary(amenities_keypad)

tmp_amenities = ["private entrance"]
amenities_privateentrance = contains_list(tmp_amenities, listings["amenities"])
amenities_privateentrance.name = "amenities_" + "privateentrance"
amenities_privateentrance = to_binary(amenities_privateentrance)

tmp_amenities = ["bidet"]
amenities_bidet = contains_list(tmp_amenities, listings["amenities"])
amenities_bidet.name = "amenities_" + "bidet"
amenities_bidet = to_binary(amenities_bidet)

tmp_amenities = ["bathtub"]
amenities_bathtub = contains_list(tmp_amenities, listings["amenities"])
amenities_bathtub.name = "amenities_" + "bathtub"
amenities_bathtub = to_binary(amenities_bathtub)

tmp_amenities = ["fire extinguisher"]
amenities_extinguisher = contains_list(tmp_amenities, listings["amenities"])
amenities_extinguisher.name = "amenities_" + "extinguisher"
amenities_extinguisher = to_binary(amenities_extinguisher)

tmp_amenities = ["dining table"]
amenities_diningtable = contains_list(tmp_amenities, listings["amenities"])
amenities_diningtable.name = "amenities_" + "diningtable"
amenities_diningtable = to_binary(amenities_diningtable)

tmp_amenities = ["smart lock"]
amenities_smartlock = contains_list(tmp_amenities, listings["amenities"])
amenities_smartlock.name = "amenities_" + "smartlock"
amenities_smartlock = to_binary(amenities_smartlock)

tmp_amenities = ["Security cameras on property"]
amenities_cameras = contains_list(tmp_amenities, listings["amenities"])
amenities_cameras.name = "amenities_" + "cameras"
amenities_cameras = to_binary(amenities_cameras)

tmp_amenities = ["tv"]
amenities_tv = contains_list(tmp_amenities, listings["amenities"])
amenities_tv.name = "amenities_" + "tv"
amenities_tv = to_binary(amenities_tv)

tmp_amenities = ["sound system", "record player"]
amenities_sound_system = contains_list(tmp_amenities, listings["amenities"])
amenities_sound_system.name = "amenities_" + "soundsystem"
amenities_sound_system = to_binary(amenities_sound_system)

tmp_amenities = ["AC", "air conditioning"]
amenities_aircon = contains_list(tmp_amenities, listings["amenities"])
amenities_aircon.name = "amenities_" + "aircon"
amenities_aircon = to_binary(amenities_aircon)

tmp_amenities = ["lake", "Waterfront", "Boat slip", "beachfront"]
amenities_water_access = contains_list(tmp_amenities, listings["amenities"])
amenities_water_access.name = "amenities_" + "wateraccess"
amenities_water_access = to_binary(amenities_water_access)

tmp_amenities = ["safe", "lockbox"]
amenities_safe = contains_list(tmp_amenities, listings["amenities"])
amenities_safe.name = "amenities_" + "safe"
amenities_safe = to_binary(amenities_safe)

tmp_amenities = ["piano"]
amenities_piano = contains_list(tmp_amenities, listings["amenities"])
amenities_piano.name = "amenities_" + "piano"
amenities_piano = to_binary(amenities_piano)

tmp_amenities = ["fireplace", "fire pit"]
amenities_fireplace = contains_list(tmp_amenities, listings["amenities"])
amenities_fireplace.name = "amenities_" + "fireplace"
amenities_fireplace = to_binary(amenities_fireplace)

tmp_amenities = ["bio", "ecological", "natur", "fairtrade", "organic"]
amenities_bio = contains_list(tmp_amenities, listings["amenities"])
amenities_bio.name = "amenities_" + "bio"
amenities_bio = to_binary(amenities_bio)

tmp_amenities = ["wifi", "Ethernet connection"]
amenities_wifi = contains_list(tmp_amenities, listings["amenities"])
amenities_wifi.name = "amenities_" + "wifi"
amenities_wifi = to_binary(amenities_wifi)

tmp_amenities = ["balcony"]
amenities_balcony = contains_list(tmp_amenities, listings["amenities"])
amenities_balcony.name = "amenities_" + "balcony"
amenities_balcony = to_binary(amenities_balcony)

tmp_amenities = ["\"Outdoor"]
amenities_outdoor = contains_list(tmp_amenities, listings["amenities"])
amenities_outdoor.name = "amenities_" + "outdoor"
amenities_outdoor = to_binary(amenities_outdoor)

# not sure if this is smart
tmp_amenities = ["smoke_alarm", "monoxide alarm"]
amenities_smoke_alarm = contains_list(tmp_amenities, listings["amenities"])
amenities_smoke_alarm.name = "amenities_" + "smokealarm"
amenities_smoke_alarm = to_binary(amenities_smoke_alarm)

tmp_amenities = ["pets"]
amenities_pets = contains_list(tmp_amenities, listings["amenities"])
amenities_pets.name = "amenities_" + "pets"
amenities_pets = to_binary(amenities_pets)

tmp_amenities = ["iron"]
amenities_iron = contains_list(tmp_amenities, listings["amenities"])
amenities_iron.name = "amenities_" + "iron"
amenities_iron = to_binary(amenities_iron)

tmp_amenities = ["heating"]
amenities_heating = contains_list(tmp_amenities, listings["amenities"])
amenities_heating.name = "amenities_" + "heating"
amenities_heating = to_binary(amenities_heating)

tmp_amenities = ["ev charger"]
amenities_ev_charger = contains_list(tmp_amenities, listings["amenities"])
amenities_ev_charger.name = "amenities_" + "evcharger"
amenities_ev_charger = to_binary(amenities_ev_charger)

tmp_amenities = ["\"gym\"", "Gym in building", "Shared gym"]
amenities_gym = contains_list(tmp_amenities, listings["amenities"])
amenities_gym.name = "amenities_" + "gym"
amenities_gym = to_binary(amenities_gym)

tmp_amenities = ["Baby monitor", "baby safety gates", "baby bath", 
                 "Babysitter recommendations", "Changing table", 
                 "crib", "high chair", "Outlet covers"]
amenities_baby = contains_list(tmp_amenities, listings["amenities"])
amenities_baby.name = "amenities_" + "baby"
amenities_baby = to_binary(amenities_baby)

tmp_amenities = ["Beach essentials", "Barbecue utensils", "Bikes", "Board games", "toys", "game console",
                 "ping pong", "Pool table", "bbq grill"]
amenities_games = contains_list(tmp_amenities, listings["amenities"])
amenities_games.name = "amenities_" + "games"
amenities_games = to_binary(amenities_games)

# binaries2
tmp_amenities = ["\"washer"]
amenities_washer = contains_list(tmp_amenities, listings["amenities"])
amenities_washer.name = "amenities_" + "washer"
amenities_washer = to_binary(amenities_washer)

tmp_amenities = ["Free washer"]
amenities_freewasher = contains_list(tmp_amenities, listings["amenities"])
amenities_freewasher.name = "amenities_" + "freewasher"
amenities_freewasher = to_binary(amenities_freewasher)

tmp_amenities = ["Paid washer"]
amenities_paidwasher = contains_list(tmp_amenities, listings["amenities"])
amenities_paidwasher.name = "amenities_" + "paidwasher"
amenities_paidwasher = to_binary(amenities_paidwasher)

tmp_amenities = ["\"dryer"]
amenities_dryer = contains_list(tmp_amenities, listings["amenities"])
amenities_dryer.name = "amenities_" + "dryer"
amenities_dryer = to_binary(amenities_dryer)

tmp_amenities = ["Free dryer"]
amenities_freedryer = contains_list(tmp_amenities, listings["amenities"])
amenities_freedryer.name = "amenities_" + "freedryer"
amenities_freedryer = to_binary(amenities_freedryer)

tmp_amenities = ["Paid dryer"]
amenities_paiddryer = contains_list(tmp_amenities, listings["amenities"])
amenities_paiddryer.name = "amenities_" + "paiddryer"
amenities_paiddryer = to_binary(amenities_paiddryer)

tmp_amenities = ["Private fenced garden or backyard", "Private garden or backyard"]
amenities_privategarden = contains_list(tmp_amenities, listings["amenities"])
amenities_privategarden.name = "amenities_" + "privategarden"
amenities_privategarden = to_binary(amenities_privategarden)

tmp_amenities = ["Shared fenced garden or backyard", "Shared garden or backyard"]
amenities_sharedgarden = contains_list(tmp_amenities, listings["amenities"])
amenities_sharedgarden.name = "amenities_" + "sharedgarden"
amenities_sharedgarden = to_binary(amenities_sharedgarden)

tmp_amenities = ["free carport, free driveway, free parking, free residential"]
amenities_free_parking = contains_list(tmp_amenities, listings["amenities"])
amenities_free_parking.name = "amenities_" + "freeparking"
amenities_free_parking = to_binary(amenities_free_parking)

tmp_amenities = ["paid parking, paid street"]
amenities_paid_parking = contains_list(tmp_amenities, listings["amenities"])
amenities_paid_parking.name = "amenities_" + "paidparking"
amenities_paid_parking = to_binary(amenities_paid_parking)

tmp_amenities = ["\"hot tub", "\"sauna"]
amenities_spa = contains_list(tmp_amenities, listings["amenities"])
amenities_spa.name = "amenities_" + "spa"
amenities_spa = to_binary(amenities_spa)

tmp_amenities = ["shared hot tub", "shared sauna"]
amenities_sharedspa = contains_list(tmp_amenities, listings["amenities"])
amenities_sharedspa.name = "amenities_" + "sharedspa"
amenities_sharedspa = to_binary(amenities_sharedspa)

tmp_amenities = ["private hot tub", "private sauna"]
amenities_privatespa = contains_list(tmp_amenities, listings["amenities"])
amenities_privatespa.name = "amenities_" + "privatespa"
amenities_privatespa = to_binary(amenities_privatespa)

tmp_amenities = ["\"pool\"", "indoor heated pool"]
amenities_pool = contains_list(tmp_amenities, listings["amenities"])
amenities_pool.name = "amenities_" + "pool"
amenities_pool = to_binary(amenities_pool)

tmp_amenities = ["Shared outdoor infinity rooftop pool", "Shared outdoor pool",
                 "Shared outdoor rooftop pool", "Shared pool"]
amenities_sharedpool = contains_list(tmp_amenities, listings["amenities"])
amenities_sharedpool.name = "amenities_" + "sharedpool"
amenities_sharedpool = to_binary(amenities_sharedpool)

tmp_amenities = ["private pool"]
amenities_privatepool = contains_list(tmp_amenities, listings["amenities"])
amenities_privatepool.name = "amenities_" + "privatepool"
amenities_privatepool = to_binary(amenities_privatepool)

# non-binaries
bath_amenities = ["soap", "conditioner", "shampoo", "shower gel", "bathroom essentials"]
amenities_nbath = contains_list(bath_amenities, listings["amenities"])
amenities_nbath.name = "amenities_" + "nbath"

tv_extras_amenities = ["premium", "amazon", "apple tv", "netflix", "HBO"]
amenities_tv_extras = contains_list(tv_extras_amenities, listings["amenities"])
amenities_tv_extras.name = "amenities_" + "tv_extras"

tmp_amenities = ["workspace", "office chair", "monitor"]
amenities_workspace = contains_list(tmp_amenities, listings["amenities"])
amenities_workspace.name = "amenities_" + "workspace"

tmp_amenities = ["clothing storage", "walk-in closet", "dresser", "wardrobe", "closet"]
amenities_clothingstrg = contains_list(tmp_amenities, listings["amenities"])
amenities_clothingstrg.name = "amenities_" + "clothingstrg"

amenities_wifi_speed = listings["amenities"].str.extract("(\d+)[^\d]+Mbps")[0].fillna(16).astype(int)
amenities_wifi_speed.name = "amenities_wifi_speed"

# not so sure if this is so smart maybe big guys like fridge as binary dunno
kitchen_amenities = ["stove", "oven", "refrigerator|fridge", "toaster", "rice maker", "kitchen",
                     "Coffee maker|coffee machine|nespresso machine", "Bread maker", "baking sheet", 
                     "dishwasher", "freezer", "kettle", "cooking basics", "dinnerware", "Dishes and silverware", 
                     "trash compactor", "microwave"]
amenities_nkitchen = contains_list(kitchen_amenities, listings["amenities"])
amenities_nkitchen.name = "amenities_" + "nkitchen"

# add amenities to listings
amenity_categories = [amenities_tablecornerguards, amenities_cleanbeforecheckout, amenities_cleaningproducts, amenities_greetings, amenities_staff, amenities_elevator, amenities_singlelevel, amenities_keypad, amenities_privateentrance, amenities_bidet,
                      amenities_bathtub, amenities_extinguisher, amenities_diningtable, amenities_smartlock, amenities_cameras, amenities_tv, amenities_sound_system,
                      amenities_aircon, amenities_water_access, amenities_safe, amenities_piano, amenities_fireplace, amenities_bio, amenities_wifi, amenities_balcony,
                      amenities_outdoor, amenities_smoke_alarm, amenities_pets, amenities_iron, amenities_heating, amenities_ev_charger, amenities_gym, amenities_baby,
                      amenities_games, amenities_washer, amenities_freewasher, amenities_paidwasher, amenities_dryer, amenities_freedryer, amenities_paiddryer,
                      amenities_privategarden, amenities_sharedgarden, amenities_free_parking, amenities_paid_parking, amenities_spa, amenities_sharedspa, amenities_privatespa,
                      amenities_pool, amenities_sharedpool, amenities_privatepool, amenities_nbath, amenities_tv_extras, amenities_workspace, amenities_clothingstrg,
                      amenities_nkitchen, amenities_wifi_speed]

listings = pd.concat([listings, *amenity_categories], axis=1)
listings = listings.drop("amenities", axis=1)

In [122]:
# remove text garbage	
listings["description_en"] = listings["description_en"].str.replace("<.*?>", "")
listings["name_en"] = listings["name_en"].str.replace("<.*?>", "")
listings["host_about_en"] = listings["host_about_en"].str.replace("<.*?>", "")

In [124]:
# append reviews to listings and remove zero price observations
reviews = pd.read_pickle("/content/drive/MyDrive/Colab/airbnb/data/data1/reviews_workfile.pickle")
listings = pd.concat([listings, reviews.rename("reviews")], axis=1)
listings = listings.loc[listings.index.difference(price_zero)]

In [134]:
# missing observations
for j, i in enumerate(np.sum(listings.isna())):
    if not i == 0:
        print(np.sum(listings.isna()).index[j], i)

host_since 22
host_response_time 4752
host_response_rate 4752
host_acceptance_rate 4541
host_is_superhost 22
host_listings_count 22
host_identity_verified 22
bedrooms 1353
beds 439
first_review 2328
last_review 2328
review_scores_rating 2327
review_scores_accuracy 2437
review_scores_cleanliness 2437
review_scores_checkin 2438
review_scores_communication 2436
review_scores_location 2438
review_scores_value 2439
reviews_per_month 2328
0 8
reviews 2328
reviews 2328


In [None]:
# host_since to be removed
# host_is_superhost nas to 0 (check website)
# recode host_listings_count na values to actual number of listings in dataset
# recode host_identity_verified nas to 0 (check website)
# recode bedrooms nas to 1 (scrape)
# recodes beds nas to 1 (scrape)
# recode nas in all review_scores aggregates to mean values
# make new variable "no_reviews"
# recode reviews_per_month nas to 0
# replace bath_number nas to 0

In [None]:
#### dealing with na's

# remove host_response_time, acceptance rate and response rate # remove first_review and last_review
listings.drop(["host_response_time", "host_acceptance_rate ", "first_review", "last_review"], axis=1, inplace=True)

# replace nan in review with ["no review"]






In [182]:
# get indices of missing data
missing = listings.isna()["host_identity_verified"]
missing = missing[missing].index

In [183]:
# original listing
listings_original = pd.read_pickle("/content/drive/MyDrive/Colab/airbnb/data/translations/translated_listings.pickle")
listings_original.set_index("id", inplace=True)
listings_original["host_identity_verified"][missing]

listing_id
13880640    NaN
8808555     NaN
38240023    NaN
49303478    NaN
51380533    NaN
51074863    NaN
2808957     NaN
7079941     NaN
27801673    NaN
24443812    NaN
24443823    NaN
24443832    NaN
24443833    NaN
24443846    NaN
24443909    NaN
24443919    NaN
24451666    NaN
24451904    NaN
24452272    NaN
24452650    NaN
24452917    NaN
21632198    NaN
Name: host_identity_verified, dtype: object

In [187]:
listings["bath_number"].unique()

array(['1', '1.5', '2', '3', '2.5', 0.5, '5', '4', '9', nan, '3.5', '0',
       '7', '15', '6', '4.5', '7.5', '11', '12', '8'], dtype=object)

In [101]:
# get indices of missing data
missing_amenities = listings.isna()["host_response_rate "]
missing_amenities = missing_amenities[missing_amenities].index

In [102]:
missing_amenities

Int64Index([41740620, 42583207, 43012829, 43012835, 45693243], dtype='int64', name='listing_id')

In [108]:
# original listing
listings_original = pd.read_pickle("/content/drive/MyDrive/Colab/airbnb/data/translations/translated_listings.pickle")
listings_original.set_index("id", inplace=True)
listings_original["price"][missing_amenities]

listing_id
41740620    $0.00
42583207    $0.00
43012829    $0.00
43012835    $0.00
45693243    $0.00
Name: price, dtype: object

In [None]:
# save transformed listings
path = ("/content/drive/MyDrive/Colab/airbnb/data/data1/listings_workfile.pickle")
listings.to_pickle(path)