In [1]:
import multiprocessing as mp
from multiprocessing import Pool, cpu_count
from cpu_bound_func import cpu_bound_task
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../datasets/AB_NYC_2019.csv')

In [3]:
len(df)

48895

In [4]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


# Multiprocessing

In [21]:
%%time

def split_dataframe(df, num_chunks):
    return np.array_split(df, num_chunks)

num_processes = 5
chunks = split_dataframe(df, num_processes)

CPU times: user 12.2 ms, sys: 6.83 ms, total: 19 ms
Wall time: 22.4 ms


In [22]:
%%time

with mp.Pool(processes=num_processes) as pool:
    # Apply the function to each chunk in parallel
    results = pool.map(cpu_bound_task, chunks)

    df_result = pd.concat(results)

CPU times: user 22.1 ms, sys: 27.6 ms, total: 49.7 ms
Wall time: 746 ms


In [9]:
%%time

l = []
for idx, row in df.iterrows():
    if idx % 10_000 == 0: print(idx)
    res = cpu_bound_task(row)
    l.append(res)



0
10000
20000
30000
40000
CPU times: user 1.04 s, sys: 118 ms, total: 1.15 s
Wall time: 7.46 s


In [10]:
all(df_result[:10] == l[:10])

True

In [11]:
len(df_result) == len(l)

True

In [12]:
len(df_result)

48895

# Vectorization vs Iteration

In [16]:
%%timeit

l = []
for idx, row in df.iterrows():
    l.append(row['neighbourhood_group'] + row['neighbourhood'])

666 ms ± 2.28 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [81]:
%%timeit
# axis = 0 -- column
# axis = 1 -- row
res = df.apply(lambda row: row['neighbourhood_group'] + row['neighbourhood'], axis=1)

130 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [77]:
%%timeit

df['combined'] = df['neighbourhood_group'] + ' - ' + df['neighbourhood']

3.38 ms ± 49.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [78]:
137 / 3.93

34.86005089058524

In [79]:
696 / 3.93

177.09923664122135

Lets do something more complicated

1. Normalize the price column: Subtract the mean price and divide by the standard deviation to get a normalized value.
2. Calculate a weighted price: Multiply the normalized price by the minimum_nights for each listing.
3. Group by host_name: Aggregate the data to calculate the mean price and mean weighted_price for each host.

In [53]:
start_time = time.time()

price_sums_iter = {}
price_counts_iter = {}
weighted_price_sums_iter = {}

for index, row in df.iterrows():
    host_name = row['host_name']
    normalized_price = (row['price'] - df['price'].mean()) / df['price'].std()
    weight = row['minimum_nights']
    weighted_price = normalized_price * weight

    if host_name in price_sums_iter:
        price_sums_iter[host_name] += row['price']
        price_counts_iter[host_name] += 1
        weighted_price_sums_iter[host_name] += weighted_price
    else:
        price_sums_iter[host_name] = row['price']
        price_counts_iter[host_name] = 1
        weighted_price_sums_iter[host_name] = weighted_price

average_prices_iter = {name: price_sums_iter[name] / price_counts_iter[name] for name in price_sums_iter}
weighted_averages_iter = {name: weighted_price_sums_iter[name] / price_counts_iter[name] for name in weighted_price_sums_iter}

print("Iteration time:", time.time() - start_time)

Iteration time: 6.096760988235474


In [54]:
start_time = time.time()

# Normalize price
df['normalized_price'] = (df['price'] - df['price'].mean()) / df['price'].std()

# Calculate weighted price
df['weighted_price'] = df['normalized_price'] * df['minimum_nights']

# Group by host_name and calculate average and weighted average price
grouped = df.groupby('host_name').agg({
    'price': 'mean',
    'weighted_price': 'mean'
})

average_prices_vectorized = grouped['price'].to_dict()
weighted_averages_vectorized = grouped['weighted_price'].to_dict()

print("Vectorized time:", time.time() - start_time)


Vectorized time: 0.010769844055175781


When is vectorization extremely useful?

1. Operations on Arrays/Matrices: Vectorization is ideal for operations that apply to entire arrays or matrices at once, such as element-wise addition, subtraction, multiplication, or division.

2. Statistical Operations: Many statistical operations, like computing means, sums, or variances, can be efficiently vectorized.

3. Linear Algebra: Vectorization is useful for matrix multiplications, decompositions, and other linear algebra operations.

When is vectorization <b> not </b> useful or even possible?

1. Non-Numeric Data: Vectorization generally applies to numerical data. For categorical data or text processing, other techniques are more appropriate.

2. Complex Dependencies: If the operation on an element depends on the result of the previous element (e.g., cumulative sums with conditions), vectorization may be challenging.

3. Conditional Logic: When operations involve complex conditional logic or branching that can't be easily expressed in a vectorized form, you might need to use loops or other approaches.

4. Sparse Data: For sparse matrices where most elements are zero, special techniques or libraries (e.g., SciPy's sparse matrices) might be required, and standard vectorization might not be efficient.

4. Operations with Side Effects: If your operation involves side effects, like modifying global state or interacting with I/O, vectorization may not be applicable.