# Efficient `pandas`

In [1]:
# NOTE(review): %pylab pulls numpy and matplotlib names into the interactive
# namespace (see the message printed under this cell); `plt` on the next line
# is presumably supplied by it, since no explicit pyplot import is visible.
%pylab inline
# Apply matplotlib's built-in "bmh" style sheet to figures in this session.
plt.style.use("bmh")

Populating the interactive namespace from numpy and matplotlib


In [2]:
import numpy as np
import pandas as pd

In [3]:
import string

# Data

In [4]:
# Fixed seed for the index labels: without it the labels are redrawn from the
# global RNG on every run, so the frame is not reproducible.
INDEX_SEED = 0
# A local generator is used so the global numpy random state is left untouched.
index_rng = np.random.RandomState(INDEX_SEED)

# 1000 rows x 2 columns holding 0..1999 in row-major order ('a' gets the even
# numbers, 'b' the odd ones). The index is 1000 lowercase letters drawn with
# replacement, so labels repeat.
df = pd.DataFrame(np.arange(2000).reshape((1000, 2)),
                  columns=['a', 'b'],
                  index=index_rng.choice(list(string.ascii_lowercase), 1000, replace=True))

In [5]:
df.head()

Unnamed: 0,a,b
p,0,1
t,2,3
c,4,5
c,6,7
t,8,9


# Loops

## Naive

In [6]:
def iterate_df(df):
    """Divide column 'a' by column 'b' one row at a time using positional lookup.

    This is the naive baseline: every row is fetched with ``df.iloc`` inside a
    plain Python loop.

    Returns a Series named "div_result" that shares ``df``'s index.
    """
    # Lazily fetch each row by position; one iloc lookup per row.
    rows = (df.iloc[pos] for pos in range(len(df)))
    quotients = [record['a'] / record['b'] for record in rows]
    return pd.Series(quotients, index=df.index, name="div_result")

In [7]:
%timeit -n 10 -r 5 iterate_df(df)

94.8 ms ± 2.73 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)


## Using `iterrows`

In [8]:
def iterate_df_rows(df):
    """Divide column 'a' by column 'b' row by row using ``DataFrame.iterrows``.

    Loop-based variant that lets pandas hand out the rows instead of indexing
    them by position.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns 'a' and 'b'.

    Returns
    -------
    pandas.Series
        ``a / b`` for every row, named "div_result" and sharing ``df``'s index.
    """

    result = []

    # iterrows yields (index label, row Series); the label is not needed here.
    for _, row in df.iterrows():
        result.append(row['a']/row['b'])
    # Same Series name as iterate_df, so both loop variants return comparable output.
    return pd.Series(result, name="div_result", index=df.index)

In [9]:
%timeit -n 20 -r 5 iterate_df_rows(df)

56.2 ms ± 2.84 ms per loop (mean ± std. dev. of 5 runs, 20 loops each)


## Using `apply`

In [10]:
%timeit -n 20 -r 5 df.apply(lambda x: x['a']/x['b'], axis=1)

16.1 ms ± 655 µs per loop (mean ± std. dev. of 5 runs, 20 loops each)


## Using vectorization

In [11]:
%timeit -n 20 -r 5 df['a']/df['b']

143 µs ± 29.6 µs per loop (mean ± std. dev. of 5 runs, 20 loops each)


In [12]:
df['a']/df['b']

p    0.000000
t    0.666667
c    0.800000
c    0.857143
t    0.888889
       ...   
g    0.999498
q    0.999498
l    0.999499
l    0.999499
c    0.999500
Length: 1000, dtype: float64

In [13]:
%timeit -n 20 -r 5 df['a'].values/df['b'].values

The slowest run took 4.26 times longer than the fastest. This could mean that an intermediate result is being cached.
23.9 µs ± 17.1 µs per loop (mean ± std. dev. of 5 runs, 20 loops each)


In [None]:
df['a'].values/df['b'].values

# Memory

In [16]:
# Load both splits the same way, indexed by passenger id.
titanic_train, titanic_test = [
    pd.read_csv(csv_path, index_col="PassengerId")
    for csv_path in ("train.csv", "test.csv")
]
# Stack train on top of test; sort=False leaves the column order unsorted.
titanic = pd.concat([titanic_train, titanic_test], sort=False)

titanic.head(n=5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
titanic.dtypes

In [None]:
titanic.info(memory_usage="deep")

In [None]:
# Shrink Pclass: downcast="unsigned" asks pd.to_numeric for the smallest
# unsigned integer dtype that can hold the column's values. The result
# overwrites the column on `titanic` itself, so the dtypes/info cells that
# follow report the new dtype.
titanic["Pclass"] = pd.to_numeric(titanic["Pclass"], downcast="unsigned")

In [None]:
titanic.dtypes

In [None]:
titanic.info(memory_usage="deep")

In [None]:
# Apply the same unsigned downcast to the two remaining integer count columns;
# each assignment overwrites the column on `titanic`.
titanic["SibSp"] = pd.to_numeric(titanic["SibSp"], downcast="unsigned")
titanic["Parch"] = pd.to_numeric(titanic["Parch"], downcast="unsigned")

In [None]:
titanic.info(memory_usage="deep")

## How to read it with correct `dtype`s right away?

In [None]:
# Compact dtypes requested at parse time; shared by both splits so the two
# reads cannot drift apart.
TRUNC_DTYPES = {"Pclass": np.uint8,
                "SibSp": np.uint8,
                "Parch": np.uint8,
                "Survived": np.float32,
                "Age": np.float32,
                "Fare": np.float32}

# Integer codes for the text columns, applied per cell while parsing.
# Sex: "female" -> 1, anything else -> 0.
# Embarked: "S" -> 0, "C" -> 1, anything else -> 2.
# NOTE(review): "anything else" presumably includes blank cells, which would
# share code 2 with the remaining port - confirm that is intended.
TRUNC_CONVERTERS = {
    "Sex": lambda x: np.uint8(1) if x == "female" else np.uint8(0),
    "Embarked": lambda x: 0 if x == "S" else (1 if x == "C" else 2),
}


def read_titanic_trunc(path):
    """Read one Titanic CSV with compact dtypes and integer-coded text columns.

    Parameters
    ----------
    path : str
        CSV file to read; must contain a "PassengerId" column, used as index.

    Returns
    -------
    pandas.DataFrame
        Parsed frame with the dtypes from ``TRUNC_DTYPES`` and with the
        converter columns ("Sex", "Embarked") stored as ``uint8``.
    """
    frame = pd.read_csv(path, index_col="PassengerId",
                        dtype=TRUNC_DTYPES, converters=TRUNC_CONVERTERS)
    # Columns handled by converters bypass `dtype=`, and pandas infers their
    # dtype from the converter results. Cast explicitly so they are stored as
    # uint8 (all codes produced above are 0, 1 or 2).
    return frame.astype({column: np.uint8 for column in TRUNC_CONVERTERS})


titanic_train_trunc = read_titanic_trunc("train.csv")
titanic_test_trunc = read_titanic_trunc("test.csv")
titanic_trunc = pd.concat([titanic_train_trunc, titanic_test_trunc], sort=False)

titanic_trunc.info(memory_usage="deep")

In [None]:
titanic_trunc.select_dtypes(np.uint8).head()

In [None]:
titanic_trunc