Prerequisites: install pandas (numpy, which this notebook also imports, is installed with it as a dependency).

# Input

In [1]:
import numpy

# Fixed seed so that "Restart Kernel -> Run All" regenerates exactly the same
# inputs (and therefore the same total profit) on every run.
RANDOM_SEED = 42
# Number of synthetic sales records (20 million); lower it for a quicker run.
N_SAMPLES = 20000000

numpy.random.seed(RANDOM_SEED)

# The inputs are deliberately stored as plain Python lists rather than arrays.
# Note that randint's upper bound is exclusive.
selling_prices = list(numpy.random.randint(1, 1100, size=N_SAMPLES))
costs_per_unit = list(numpy.random.randint(1, 1000, size=N_SAMPLES))
units_sold_counts = list(numpy.random.randint(1, 2000, size=N_SAMPLES))

# "Exploratory" computation 

In [2]:
import pandas

# Intentionally naive construction (kept slow on purpose, to exaggerate the
# contrast with the later cells): start from an empty frame and attach the
# columns one at a time through .loc.
df = pandas.DataFrame()
for column_name, column_values in (
    ("selling_prices", selling_prices),
    ("costs_per_unit", costs_per_unit),
    ("units_sold_counts", units_sold_counts),
):
    df.loc[:, column_name] = column_values

# Preview the first rows.
df.head()

Unnamed: 0,selling_prices,costs_per_unit,units_sold_counts
0,208,664,1520
1,445,104,1750
2,875,854,754
3,1088,860,307
4,266,135,405


In [3]:
def _row_profit(row):
    """Return the profit for one row: (selling price - unit cost) * units sold."""
    margin = row["selling_prices"] - row["costs_per_unit"]
    return margin * row["units_sold_counts"]


# Row-by-row evaluation (axis=1 passes each row to the function).
profit_per_fruit = df.apply(_row_profit, axis=1)

# Reference total that the later cells are checked against.
total_profit = sum(profit_per_fruit)

print(total_profit)

997034922766


# "Simplify"

In [4]:
# Build the whole frame with a single constructor call instead of adding
# columns one at a time.
df = pandas.DataFrame(
    dict(
        selling_prices=selling_prices,
        costs_per_unit=costs_per_unit,
        units_sold_counts=units_sold_counts,
    )
)

# Column-wise arithmetic: (price - cost) * units, computed on whole columns.
profit_per_fruit = (
    df["selling_prices"]
    .sub(df["costs_per_unit"])
    .mul(df["units_sold_counts"])
)

# Sanity check against the total from the row-wise computation.
assert total_profit == sum(profit_per_fruit)

In [5]:
# Plain Python, no pandas: walk the three input lists in lockstep and collect
# (price - cost) * units for every record.
profit_per_fruit = [
    (price - cost) * units_sold
    for price, cost, units_sold in zip(selling_prices, costs_per_unit, units_sold_counts)
]

# Sanity check against the total from the row-wise computation.
assert total_profit == sum(profit_per_fruit)

In [6]:
# numpy version: convert each input list to an array, then do the arithmetic
# on whole arrays at once.
price_array, cost_array, units_array = (
    numpy.array(values)
    for values in (selling_prices, costs_per_unit, units_sold_counts)
)

profit_per_fruit = units_array * (price_array - cost_array)

# Sanity check against the total from the row-wise computation.
assert total_profit == sum(profit_per_fruit)

# Race!

In [7]:
%%timeit
# Dumbest way: build the frame column by column through .loc, then compute
# the profit one row at a time with DataFrame.apply(axis=1).
# Everything in this cell is timed, including the frame construction and the
# final consistency check.
df = pandas.DataFrame()
df.loc[:, "selling_prices"]  = selling_prices
df.loc[:, "costs_per_unit"]  = costs_per_unit
df.loc[:, "units_sold_counts"]  = units_sold_counts

# The lambda receives one row at a time and returns (price - cost) * units.
profit_per_fruit = df.apply(lambda x: (x["selling_prices"] - x["costs_per_unit"]) * x["units_sold_counts"], axis=1)
# total_profit is the reference value computed in an earlier cell.
assert sum(profit_per_fruit) == total_profit  # Just to check for consistency

4min 40s ± 3.15 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
%%timeit
# Still using pandas (but slightly improved): the frame is created with one
# constructor call and the profit is computed on whole columns.
# Everything in this cell is timed, including the frame construction and the
# final consistency check.
df = pandas.DataFrame({
    "selling_prices": selling_prices,
    "costs_per_unit": costs_per_unit,
    "units_sold_counts": units_sold_counts,
})

# Column arithmetic: (price - cost) * units for all rows at once.
profit_per_fruit = (df["selling_prices"] - df["costs_per_unit"]) * df["units_sold_counts"]
# total_profit is the reference value computed in an earlier cell.
assert sum(profit_per_fruit) == total_profit  # Just to check for consistency

17.7 s ± 113 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%%timeit
# Without pandas: a plain Python loop over the three input lists.
# Everything in this cell is timed, including the final consistency check.
profit_per_fruit = []

# zip walks the three lists in lockstep, one record per iteration.
for price, cost, units_sold in zip(selling_prices, costs_per_unit, units_sold_counts):
    profit = (price - cost) * units_sold
    profit_per_fruit.append(profit)

# total_profit is the reference value computed in an earlier cell.
assert sum(profit_per_fruit) == total_profit  # Just to check for consistency

4.48 s ± 34.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%%timeit
# With numpy: convert the input lists to arrays, then do the arithmetic on
# whole arrays.
# Everything in this cell is timed, including the three list-to-array
# conversions and the final consistency check.
price_array = numpy.array(selling_prices)
cost_array = numpy.array(costs_per_unit)
units_array = numpy.array(units_sold_counts)

# Element-wise (price - cost) * units across the full arrays.
profit_per_fruit = (price_array - cost_array) * units_array
# total_profit is the reference value computed in an earlier cell; note that
# the built-in sum (not the array's own .sum()) is what is being timed here.
assert sum(profit_per_fruit) == total_profit  # Just to check for consistency

3.82 s ± 124 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
