Adapted from the tutorial <a href="https://medium.com/bigdatarepublic/advanced-pandas-optimize-speed-and-memory-a654b53be6c2">here</a>.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the listings table from the data directory, report its
# (rows, columns) shape, and preview the first five rows.
listings = pd.read_csv(filepath_or_buffer='../data/listings.csv')
print(listings.shape)
listings.head(n=5)

(22552, 16)


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2015,Berlin-Mitte Value! Quiet courtyard/very central,2217,Ian,Mitte,Brunnenstr. Süd,52.534537,13.402557,Entire home/apt,60,4,118,2018-10-28,3.76,4,141
1,2695,Prenzlauer Berg close to Mauerpark,2986,Michael,Pankow,Prenzlauer Berg Nordwest,52.548513,13.404553,Private room,17,2,6,2018-10-01,1.42,1,0
2,3176,Fabulous Flat in great Location,3718,Britta,Pankow,Prenzlauer Berg Südwest,52.534996,13.417579,Entire home/apt,90,62,143,2017-03-20,1.25,1,220
3,3309,BerlinSpot Schöneberg near KaDeWe,4108,Jana,Tempelhof - Schöneberg,Schöneberg-Nord,52.498855,13.349065,Private room,26,5,25,2018-08-16,0.39,1,297
4,7071,BrightRoom with sunny greenview!,17391,Bright,Pankow,Helmholtzplatz,52.543157,13.415091,Private room,42,2,197,2018-11-04,1.75,1,26


In [3]:
# Read the reviews table (one row per listing_id/date pair), report its
# (rows, columns) shape, and preview the first five rows.
reviews = pd.read_csv(filepath_or_buffer='../data/reviews.csv')
print(reviews.shape)
reviews.head(n=5)

(401963, 2)


Unnamed: 0,listing_id,date
0,2015,2016-04-11
1,2015,2016-04-15
2,2015,2016-04-26
3,2015,2016-05-10
4,2015,2016-05-14


### 1. Index Optimization

In [4]:
%%timeit
# Baseline join: match listings.id against reviews.listing_id as plain columns.
listings.merge(reviews, left_on='id', right_on='listing_id')

133 ms ± 1.36 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
%%timeit
# Same join, but on the index of both frames. The two set_index calls sit
# inside the timed cell, so their cost is part of the measurement.
lists = listings.set_index('id')
revs = reviews.set_index('listing_id')
lists.merge(revs, left_index=True, right_index=True)

46.1 ms ± 978 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
# Use `id` as the row index while also keeping it as a regular column
# (drop=False), then preview the result.
listings = listings.set_index(keys='id', drop=False)
listings.head(n=5)

Unnamed: 0_level_0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2015,2015,Berlin-Mitte Value! Quiet courtyard/very central,2217,Ian,Mitte,Brunnenstr. Süd,52.534537,13.402557,Entire home/apt,60,4,118,2018-10-28,3.76,4,141
2695,2695,Prenzlauer Berg close to Mauerpark,2986,Michael,Pankow,Prenzlauer Berg Nordwest,52.548513,13.404553,Private room,17,2,6,2018-10-01,1.42,1,0
3176,3176,Fabulous Flat in great Location,3718,Britta,Pankow,Prenzlauer Berg Südwest,52.534996,13.417579,Entire home/apt,90,62,143,2017-03-20,1.25,1,220
3309,3309,BerlinSpot Schöneberg near KaDeWe,4108,Jana,Tempelhof - Schöneberg,Schöneberg-Nord,52.498855,13.349065,Private room,26,5,25,2018-08-16,0.39,1,297
7071,7071,BrightRoom with sunny greenview!,17391,Bright,Pankow,Helmholtzplatz,52.543157,13.415091,Private room,42,2,197,2018-11-04,1.75,1,26


In [7]:
%%timeit
# Label lookup of one cell: row label 29844866 (an id), column 'name'.
listings.loc[29844866, 'name']

5.03 µs ± 55.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [8]:
%%timeit
# Same lookup through .at, the accessor for a single row/column label pair.
listings.at[29844866, 'name']

2.44 µs ± 22.6 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [9]:
# Throw away the id-based index (drop=True keeps it from being re-inserted
# as a column) and go back to the default 0..n-1 index, then preview.
listings = listings.reset_index(drop=True)
listings.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2015,Berlin-Mitte Value! Quiet courtyard/very central,2217,Ian,Mitte,Brunnenstr. Süd,52.534537,13.402557,Entire home/apt,60,4,118,2018-10-28,3.76,4,141
1,2695,Prenzlauer Berg close to Mauerpark,2986,Michael,Pankow,Prenzlauer Berg Nordwest,52.548513,13.404553,Private room,17,2,6,2018-10-01,1.42,1,0
2,3176,Fabulous Flat in great Location,3718,Britta,Pankow,Prenzlauer Berg Südwest,52.534996,13.417579,Entire home/apt,90,62,143,2017-03-20,1.25,1,220
3,3309,BerlinSpot Schöneberg near KaDeWe,4108,Jana,Tempelhof - Schöneberg,Schöneberg-Nord,52.498855,13.349065,Private room,26,5,25,2018-08-16,0.39,1,297
4,7071,BrightRoom with sunny greenview!,17391,Bright,Pankow,Helmholtzplatz,52.543157,13.415091,Private room,42,2,197,2018-11-04,1.75,1,26


In [10]:
%%timeit
# Without an id index: compare every id to the target to build a boolean
# mask. The result is a Series of matching names, not a scalar.
listings.loc[listings.id == 29844866, 'name']

118 µs ± 308 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [11]:
%%timeit
# Positional lookup: fetch row 22529 as a Series, then pick 'name' from it.
# NOTE(review): presumably 22529 is the position of id 29844866 used in the
# other lookups -- confirm against the data.
listings.iloc[22529]['name']

58.9 µs ± 778 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


### 2. Vectorize Ops

In [25]:
# Price bounds shared by all the normalisation variants that follow.
min_price, max_price = listings['price'].min(), listings['price'].max()
# Width of the observed price interval; used as the divisor when scaling.
price_range = max_price - min_price
price_range

9000

In [26]:
%%timeit
# Slowest baseline: positional row access on every iteration.
# listings.iloc[i] materialises the whole row as a Series just to read
# its 'price' field.
norm_prices = np.zeros(len(listings))
for i in range(len(listings)):
    norm_prices[i] = (
        (listings.iloc[i]['price'] - min_price) / price_range)
# Store under 'norm_price' (was 'norm_prices'): this is the column name the
# iterrows/.loc/.at/map/apply/vectorised variants write, so the benchmark
# no longer leaves an extra, differently named column on `listings`.
listings['norm_price'] = norm_prices

1.4 s ± 5.88 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [27]:
%%timeit
# Iterate over the price column's values directly instead of going through
# a row accessor; enumerate supplies the output position.
norm_prices = np.zeros(len(listings))
for i, price in enumerate(listings.price):
    norm_prices[i] = (price - min_price) / price_range
# Column name fixed from the typo 'nrom_price', which silently added a
# stray column instead of filling 'norm_price'.
listings['norm_price'] = norm_prices

7.3 ms ± 249 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [29]:
%%timeit
# Row iteration with iterrows(), which yields (index label, row Series) pairs.
norm_prices = np.zeros(len(listings))
for i, row in listings.iterrows():
    # The index label doubles as the array position; this relies on the
    # 0..n-1 index produced by the earlier reset_index(drop=True).
    norm_prices[i] = (row.price - min_price) / price_range
listings['norm_price'] = norm_prices

570 ms ± 4.78 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [30]:
%%timeit
# Scalar access per row through .loc[row label, column].
norm_prices = np.zeros(len(listings))
for i in range(len(norm_prices)):
    # i is used as an index *label*; it coincides with the position only
    # while `listings` keeps a 0..n-1 index.
    norm_prices[i] = (
        (listings.loc[i, 'price'] - min_price) / price_range)
listings['norm_price'] = norm_prices

122 ms ± 1.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [31]:
%%timeit
# Scalar access per row through .at[row label, column].
norm_prices = np.zeros(len(listings))
for i in range(len(norm_prices)):
    # i is used as an index *label*; it coincides with the position only
    # while `listings` keeps a 0..n-1 index.
    norm_prices[i] = (
        (listings.at[i, 'price'] - min_price) / price_range)
listings['norm_price'] = norm_prices

63.9 ms ± 402 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [32]:
%%timeit
# Element-wise transform: Series.map calls the lambda once per price value.
listings['norm_price'] = listings.price.map(
    lambda x: (x - min_price) / price_range)

7.52 ms ± 24.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [33]:
%%timeit
# Element-wise transform: Series.apply calls the lambda once per price value.
listings['norm_price'] = listings.price.apply(
    lambda x: (x - min_price) / price_range)

7.49 ms ± 36 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [34]:
%%timeit
# Fully vectorised: arithmetic on the whole price column, no Python loop.
listings['norm_price'] = (listings.price - min_price) / price_range

163 µs ± 747 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [12]:
# Multiplier contributed by each room type to a listing's score.
room_type_scores = {
    'Entire home/apt': 1,
    'Private room': 0.5,
    'Shared room': 0.2,
}

In [14]:
%%timeit
# Row-by-row scoring baseline: each iteration pulls a full row via .loc.
scores = np.zeros(len(listings))
for i in range(len(listings)):
    # Label lookup; matches the position only with a 0..n-1 index.
    row = listings.loc[i]
    if row.availability_365 == 0:
        # availability_365 of 0 -> score 0.
        scores[i] = 0
    elif row.price > 100:
        # price above 100 -> score 0.
        scores[i] = 0
    else:
        # Raises KeyError for a room_type missing from room_type_scores.
        room_type_score = room_type_scores[row.room_type]
        # Scales linearly from 1 (price 0) down to 0 (price 100).
        price_score = (100 - row.price) / 100
        review_score = 1 if row.number_of_reviews > 50 else 0.5
        # Final score is the product of the three components.
        scores[i] = room_type_score * price_score * review_score
listings['score'] = scores

1.68 s ± 11.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
%%timeit
# Same scoring rules, but the needed columns are pulled out once via
# .values so the loop indexes plain arrays by position instead of
# building a row object on each iteration.
prices = listings.price.values
n_reviews = listings.number_of_reviews.values
availability = listings.availability_365.values
room_types = listings.room_type.values
scores = np.zeros(len(listings))
for i in range(len(listings)):
    if availability[i] == 0:
        # availability_365 of 0 -> score 0.
        scores[i] = 0
    elif prices[i] > 100:
        # price above 100 -> score 0.
        scores[i] = 0
    else:
        # Raises KeyError for a room type missing from room_type_scores.
        rm_type_score = room_type_scores[room_types[i]]
        # Scales linearly from 1 (price 0) down to 0 (price 100).
        price_score = (100 - prices[i]) / 100
        rev_score = 1 if n_reviews[i] > 50 else 0.5
        # Final score is the product of the three components.
        scores[i] = rm_type_score * price_score * rev_score
listings['score'] = scores

16.7 ms ± 140 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [19]:
%%timeit
# Vectorised scoring: every step is a whole-column operation.

# Room-type component: 1 and 0.5 for the two named types, 0.2 for every
# remaining row. Unlike the loop variants, a room type outside
# room_type_scores gets 0.2 here instead of raising KeyError.
listings.loc[
    listings.room_type == 'Entire home/apt', 'room_type_score'] = 1
listings.loc[
    listings.room_type == 'Private room', 'room_type_score'] = 0.5
# Assign the filled column back instead of calling fillna(inplace=True) on
# the `listings[...]` selection: that chained in-place form warns on
# pandas 2.x and, under Copy-on-Write, updates a temporary object and leaves
# the NaNs in `listings`.
listings['room_type_score'] = listings['room_type_score'].fillna(0.2)

# Review component: 1 above 50 reviews, 0.5 otherwise.
listings.loc[listings.number_of_reviews > 50, 'review_score'] = 1
listings['review_score'] = listings['review_score'].fillna(0.5)

# Price component: scales linearly from 1 (price 0) down to 0 (price 100).
listings['price_score'] = (100 - listings.price) / 100

listings['score'] = (
    listings.room_type_score
    * listings.price_score
    * listings.review_score)
# Applied last so it overrides the product: listings with
# availability_365 == 0 or price above 100 score 0.
listings.loc[
    (listings.availability_365 == 0) | (listings.price > 100), 'score'
] = 0

4.24 ms ± 16.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
