In [None]:
!pip install pandas

In [None]:
!wget https://tiago.org/yellow_tripdata_2020-01.csv.gz

# Memory

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Baseline load: no dtype hints, so pandas infers every column type on its own.
# The memory figures reported by the following cells are measured against this frame.
df = pd.read_csv("yellow_tripdata_2020-01.csv.gz")
df.head()

In [None]:
df.info(memory_usage="deep")

In [None]:
def summarize_columns(df):
    """Print a one-line summary for every column of ``df``.

    Each line holds three tab-separated fields: the column name, the number of
    distinct values (missing values count as one value), and the deep memory
    usage of the column in whole mebibytes (rounded down).
    """
    bytes_per_mib = 1024**2
    for name in df.columns:
        column = df[name]
        distinct = column.nunique(dropna=False)
        size_mib = column.memory_usage(deep=True) // bytes_per_mib
        print(name, distinct, size_mib, sep="\t")

In [None]:
summarize_columns(df)

In [None]:
# Convert both timestamp columns to datetime values, one column at a time.
for time_col in ("tpep_pickup_datetime", "tpep_dropoff_datetime"):
    df[time_col] = pd.to_datetime(df[time_col])

In [None]:
df.info(memory_usage="deep")

In [None]:
# An integer dtype cannot represent NaN, so this cast fails while the column still
# contains missing values. The error is caught and shown instead of propagating, so
# "Restart & Run All" keeps going past this demonstration.
try:
    df["payment_type"] = df["payment_type"].astype(np.int8)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

In [None]:
df["payment_type"].unique()

In [None]:
# Replace missing payment types with 0 first; after that the column has no NaN left
# and can be stored as 8-bit integers.
df["payment_type"] = df["payment_type"].fillna(0).astype(np.int8)

In [None]:
df.info(memory_usage="deep")

In [None]:
summarize_columns(df)

In [None]:
# Re-read the file, choosing compact representations while parsing instead of
# converting the columns afterwards.
df = pd.read_csv("yellow_tripdata_2020-01.csv.gz",
                 # TLC zone IDs go above 255 (up to 265), which would silently wrap around
                 # in uint8; uint16 is the smallest unsigned type that holds them.
                 dtype={"PULocationID": np.uint16, "DOLocationID": np.uint16},
                 parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"],
                 # Converters receive the raw text of each cell; "" means the value is missing.
                 converters={
                     # 0 = missing, 1 / 2 = vendor; any other text raises ValueError.
                     "VendorID": lambda x: np.int8(["", "1", "2"].index(x)),
                     # -1 = missing, 0 = "N", 1 = "Y".
                     "store_and_fwd_flag": lambda x: ["", "N", "Y"].index(x) - 1,
                     # The remaining integer-like columns use -1 as the missing marker.
                     "payment_type": lambda x: -1 if x == "" else int(x),
                     "RatecodeID": lambda x: -1 if x == "" else int(x),
                     "passenger_count": lambda x: -1 if x == "" else int(x)
                    }
                )

In [None]:
df.info(memory_usage="deep")

## Question
How can you make the memory footprint even smaller?

# Speed

## Indexing

In [None]:
# Load only the three columns used by the indexing experiments in this section.
df = pd.read_csv(
    "yellow_tripdata_2020-01.csv.gz", 
    dtype={
        # Stored as 16-bit floats.
        "congestion_surcharge": np.float16,
    },
    # Both timestamp columns are parsed into datetimes while reading.
    parse_dates=[
        "tpep_pickup_datetime",
        "tpep_dropoff_datetime"],
    usecols=[
        "congestion_surcharge",
        "tpep_pickup_datetime",
        "tpep_dropoff_datetime"],
)

In [None]:
df.head()

In [None]:
%timeit df[df["tpep_pickup_datetime"] == "2020-01-06 08:13:00"]

In [None]:
# Sort the rows by pickup time, then run the same boolean-mask lookup on the sorted
# copy; the matching rows are displayed as the cell output.
df_sorted = df.sort_values("tpep_pickup_datetime")
df_sorted[df_sorted["tpep_pickup_datetime"] == "2020-01-06 08:13:00"]

In [None]:
# Two frames indexed by pickup time: one keeps the original row order, the other has
# its index sorted. The two %timeit cells that follow run the same .loc lookup on each.
df_pickup = df.set_index("tpep_pickup_datetime")
df_pickup_sorted = df_pickup.sort_index()

In [None]:
%timeit df_pickup.loc["2020-01-06 08:13:00"]

In [None]:
%timeit df_pickup_sorted.loc["2020-01-06 08:13:00"]

## Row Iteration

In [None]:
df = pd.read_csv("yellow_tripdata_2020-01.csv.gz")
df.head()

In [None]:
# Drop trips whose total is zero (presumably so that the tip / total ratios computed
# below never divide by zero).
df = df[(df.total_amount != 0)]
# A fixed random_state makes every run of the notebook benchmark the same rows.
df_10 = df.sample(frac=0.1, random_state=42)  # sample 10% of the data
df_100 = df.sample(frac=0.01, random_state=42)  # sample 1% of the data

### Using Loops

In [None]:
def get_tip_mean_explicit(df):
    """Return the sum of ``tip_amount`` divided by the sum of ``total_amount``.

    Rows are fetched one at a time by position with ``iloc``. Note that the result
    is a ratio of sums, not the mean of the per-row ratios.
    """
    tip_sum = 0
    total_sum = 0
    n_rows = len(df)
    for position in range(n_rows):
        record = df.iloc[position]
        tip_sum += record["tip_amount"]
        total_sum += record["total_amount"]
    return tip_sum / total_sum

In [None]:
# %timeit get_tip_mean_explicit(df)
%timeit get_tip_mean_explicit(df_100)

In [None]:
def get_tip_mean_iterrows(df):
    """Return the sum of ``tip_amount`` divided by the sum of ``total_amount``.

    Same computation as the positional loop, but the rows come from
    ``DataFrame.iterrows()``; the index label of each row is ignored.
    """
    tip_sum = 0
    total_sum = 0
    for _, record in df.iterrows():
        tip_sum += record["tip_amount"]
        total_sum += record["total_amount"]
    return tip_sum / total_sum

In [None]:
# %timeit get_tip_mean_explicit(df)
%timeit get_tip_mean_explicit(df_100)

In [None]:
def get_tip_mean_itertuples(df):
    """Return the sum of ``tip_amount`` divided by the sum of ``total_amount``.

    Rows are visited as named tuples from ``DataFrame.itertuples()`` and the two
    columns are read as attributes of each tuple.
    """
    tip_sum = 0
    total_sum = 0
    for entry in df.itertuples():
        tip_sum += entry.tip_amount
        total_sum += entry.total_amount
    return tip_sum / total_sum

In [None]:
# %timeit get_tip_mean_itertuples(df)
%timeit get_tip_mean_itertuples(df_100)

### Using apply()

In [None]:
def get_tip_mean_apply(df):
    """Return the mean of the per-row ``tip_amount / total_amount`` ratios.

    The ratio is computed by a Python function that ``DataFrame.apply`` calls
    once per row (``axis=1``).
    """
    def tip_fraction(row):
        return row["tip_amount"] / row["total_amount"]

    per_row = df.apply(tip_fraction, axis=1)
    return per_row.mean()

In [None]:
%timeit get_tip_mean_apply(df_100)

In [None]:
%timeit get_tip_mean_apply(df_10)

In [None]:
# %timeit get_tip_mean_apply(df)

### Using Vectorization

In [None]:
def get_tip_mean_vector(df):
    """Return the mean of ``tip_amount / total_amount``, computed column-wise.

    The division is a single vectorized operation on the two columns; no Python
    code runs per row.
    """
    return df["tip_amount"].div(df["total_amount"]).mean()

In [None]:
%timeit get_tip_mean_vector(df_100)

In [None]:
%timeit get_tip_mean_vector(df)  ## running on full dataframe

## Using NumPy explicitly

In [None]:
# Pull the two columns out as NumPy arrays. Despite the df_ prefix these are arrays,
# not DataFrames.
df_total = df["total_amount"].to_numpy()
df_tip = df["tip_amount"].to_numpy()

In [None]:
print(type(df_tip))

In [None]:
def get_tip_mean_numpy(df_total, df_tip):
    """Return the mean of the element-wise ``tip / total`` ratios.

    Parameters
    ----------
    df_total : numpy.ndarray
        Total amounts, one per trip.
    df_tip : numpy.ndarray
        Tip amounts, aligned element by element with ``df_total``.

    Returns
    -------
    float
        Mean of ``df_tip / df_total``.
    """
    # Tip over total (not the reverse), so the result matches the other
    # get_tip_mean_* variants in this notebook.
    frac_tip = df_tip / df_total
    return frac_tip.mean()

In [None]:
%timeit get_tip_mean_numpy(df_total, df_tip)

## Using NumExpr

In [None]:
def get_tip_mean_numexpr(df):
    """Return the mean of ``tip_amount / total_amount`` evaluated through numexpr.

    The expression is passed as a string to ``DataFrame.eval`` with the numexpr
    engine requested explicitly.
    """
    tip_fraction = df.eval("tip_amount / total_amount", engine="numexpr")
    return tip_fraction.mean()

In [None]:
%timeit get_tip_mean_numexpr(df)

### Complex expressions

In [None]:
def get_tip_mean_numpy4(df_total, df_tip):
    """Return the mean of four summed copies of the ``tip / total`` ratio.

    The ratio is deliberately written out four times so that NumPy evaluates a
    longer expression; it computes the same quantity as ``get_tip_mean_numexpr4``.

    Parameters
    ----------
    df_total : numpy.ndarray
        Total amounts, one per trip.
    df_tip : numpy.ndarray
        Tip amounts, aligned element by element with ``df_total``.
    """
    # Tip over total (not the reverse), mirroring the numexpr expression.
    frac_tip = df_tip / df_total + df_tip / df_total + df_tip / df_total + df_tip / df_total
    return frac_tip.mean()

In [None]:
def get_tip_mean_numexpr4(df):
    """Return the mean of four summed copies of ``tip_amount / total_amount``.

    The whole sum is handed to ``DataFrame.eval`` as one string expression and
    evaluated with the numexpr engine.
    """
    expression = (
        "tip_amount / total_amount + tip_amount / total_amount"
        " + tip_amount / total_amount + tip_amount / total_amount"
    )
    summed = df.eval(expression, engine="numexpr")
    return summed.mean()

In [None]:
%timeit get_tip_mean_numpy4(df_total, df_tip)

In [None]:
%timeit get_tip_mean_numexpr4(df)