In [None]:
# Methods to efficiently loop through Pandas DataFrame
# Hint: The most efficient approaches are pandas and numpy built-in functions

In [2]:
import timeit
import pandas as pd
import numpy as np

In [21]:
# Build the benchmark frame: 100,000 rows x 4 integer columns (A-D) with
# values drawn from [0, 10). A locally seeded generator is used so the data
# (and the preview below) is identical after every Restart & Run All, without
# touching NumPy's global random state.
df = pd.DataFrame(
    np.random.default_rng(seed=42).integers(0, 10, size=(100000, 4)),
    columns=list('ABCD'),
)
df.head()

Unnamed: 0,A,B,C,D
0,6,8,7,9
1,3,5,6,2
2,1,3,1,1
3,1,4,2,2
4,8,7,7,0


In [22]:
# Standard python for loop with iloc
def loop_with_for(df):
    """Sum columns 'A' and 'B' with a plain index loop and ``.iloc`` lookups.

    Each iteration selects the column and then the positional element, which
    is the baseline (slow) approach this notebook compares against.

    Returns the accumulated total; 0 when the frame has no rows.
    """
    total = 0
    n_rows = len(df)
    for pos in range(n_rows):
        a_val = df['A'].iloc[pos]
        b_val = df['B'].iloc[pos]
        total += a_val + b_val
    return total

loop_with_for(df)

5.47 s ± 54.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
# Using pandas iterrows function (approx. five times slower)
def loop_with_iterrows(df):
    """Sum columns 'A' and 'B' by walking the frame with ``DataFrame.iterrows``.

    ``iterrows`` yields ``(index, row)`` pairs; the index label is not needed
    here, only the row's 'A' and 'B' entries.

    Returns the accumulated total; 0 when the frame has no rows.
    """
    total = 0
    for _label, record in df.iterrows():
        total += record['A'] + record['B']
    return total

%timeit loop_with_for(df)

5.47 s ± 152 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Using pandas itertuples function (preferred over iterrows)
def loop_with_itertuples(df):
    """Sum columns 'A' and 'B' by iterating ``DataFrame.itertuples``.

    Every row arrives as a namedtuple, so the columns are read as the
    attributes ``A`` and ``B``.

    Returns the accumulated total; 0 when the frame has no rows.
    """
    total = 0
    for record in df.itertuples():
        a_val, b_val = record.A, record.B
        total += a_val + b_val
    return total
%timeit loop_with_for(df)

In [None]:
# Using python zip
def loop_with_zip(df):
    """Sum columns 'A' and 'B' by zipping the two column Series together.

    The columns are looked up once, then walked in lockstep with the built-in
    ``zip``.

    Returns the accumulated total; 0 when the frame has no rows.
    """
    col_a = df['A']
    col_b = df['B']
    total = 0
    for left, right in zip(col_a, col_b):
        total += left + right
    return total
%timeit loop_with_for(df)

In [None]:
# Using pandas apply function
def using_apply(df):
    """Sum columns 'A' and 'B' via a row-wise ``DataFrame.apply``.

    A per-row function adds the row's 'A' and 'B' entries (``axis=1``), and
    the resulting Series is then reduced with ``.sum()``.
    """
    def _row_total(row):
        return row['A'] + row['B']

    per_row = df.apply(_row_total, axis=1)
    return per_row.sum()
%timeit loop_with_for(df)

In [None]:
# Using pandas builtin function
def using_pandas_builtin(df):
    """Sum columns 'A' and 'B' with vectorized pandas Series arithmetic.

    The two columns are added element-wise as Series, then the result is
    reduced with ``Series.sum``.
    """
    pairwise = df['A'].add(df['B'])
    return pairwise.sum()
%timeit loop_with_for(df)

In [None]:
# Using numpy builtin function
def using_numpy_builtin(df):
    """Sum columns 'A' and 'B' on their underlying arrays.

    ``.values`` is taken from each column first, so the element-wise addition
    and the final ``.sum()`` operate on those arrays rather than on Series.
    """
    a_values = df['A'].values
    b_values = df['B'].values
    pairwise = a_values + b_values
    return pairwise.sum()
%timeit loop_with_for(df)