# Looping through Pandas DataFrame

In [7]:
# First, let's generate a sufficiently large DataFrame of random integers
import timeit
import pandas as pd
import numpy as np
# Benchmark fixture: 100,000 rows x 4 columns named 'A', 'B', 'C', 'D'.
# np.random.randint excludes its upper bound, so every value is an integer in 0..9.
# No seed is set, so the generated values differ from run to run.
df = pd.DataFrame(np.random.randint(0, 10, size=(100000, 4)), columns=list('ABCD'))

In [8]:
# Find the shape of the DataFrame
print(df.shape)

(100000, 4)


In [10]:
# Show the first 5 rows of the DataFrame
df.head()

Unnamed: 0,A,B,C,D
0,9,6,3,3
1,9,3,1,0
2,3,9,6,0
3,6,4,0,3
4,0,3,1,2


## Standard python for loop with iloc

In [11]:
# Loop through the DataFrame using .iloc 
def loop_with_for(df):
    """Return the sum of columns 'A' and 'B' over every row of ``df``.

    Rows are visited by integer position and each value is fetched with its
    own ``.iloc`` lookup. The per-row indexing is intentional: this is the
    plain for-loop variant being timed.
    """
    total = 0
    row_count = len(df)
    for position in range(row_count):
        a_value = df['A'].iloc[position]
        b_value = df['B'].iloc[position]
        total += a_value + b_value
    return total

In [12]:
# Check performance using timeit
%timeit loop_with_for(df)

1.43 s ± 60 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
# Using IPython's %prun magic (which runs the statement under Python's profiler), we get a detailed breakdown of the number of function calls and the time spent in each function
%prun -l 4 loop_with_for(df)

 

         12000009 function calls (11800007 primitive calls) in 6.100 seconds

   Ordered by: internal time
   List reduced from 36 to 4 due to restriction <4>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
   200000    0.688    0.000    1.830    0.000 frame.py:3856(__getitem__)
   200000    0.676    0.000    3.238    0.000 indexing.py:1681(_getitem_axis)
  2800000    0.643    0.000    1.357    0.000 {built-in method builtins.isinstance}
   400000    0.423    0.000    0.639    0.000 indexing.py:2678(check_dict_or_set_indexers)

## Using pandas iterrows function

In [14]:
# The pandas iterrows function yields an (index, Series) pair for each row, with the downside of not preserving dtypes across rows
def loop_with_iterrows(df):
    """Return the sum of columns 'A' and 'B' over every row of ``df``.

    Iterates with ``DataFrame.iterrows`` and reads both values from the
    per-row object by attribute; the row label is ignored.
    """
    total = 0
    for _label, record in df.iterrows():
        row_sum = record.A + record.B
        total += row_sum
    return total

In [15]:
# Check performance using timeit
%timeit loop_with_iterrows(df)

3.99 s ± 224 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [27]:
# Using IPython's %prun magic (which runs the statement under Python's profiler), we get a detailed breakdown of the number of function calls and the time spent in each function
%prun -l 4 loop_with_iterrows(df)

 

         16600016 function calls (16200016 primitive calls) in 12.567 seconds

   Ordered by: internal time
   List reduced from 86 to 4 due to restriction <4>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
   100000    1.036    0.000    7.119    0.000 series.py:371(__init__)
  3700000    0.670    0.000    0.799    0.000 {built-in method builtins.isinstance}
   100000    0.664    0.000    1.381    0.000 construction.py:518(sanitize_array)
   200000    0.558    0.000    2.273    0.000 series.py:1016(__getitem__)

## Using pandas itertuples function

In [16]:
# The pandas itertuples function is similar to iterrows, except it returns a namedtuple for each row, and preserves dtypes across rows
def loop_with_itertuples(df):
    """Return the sum of columns 'A' and 'B' over every row of ``df``.

    Iterates with ``DataFrame.itertuples`` and reads the two fields of each
    row tuple by name.
    """
    total = 0
    for record in df.itertuples():
        a_value, b_value = record.A, record.B
        total += a_value + b_value
    return total

In [19]:
# Check performance using timeit
%timeit loop_with_itertuples(df)

67 ms ± 2.65 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [20]:
# Using IPython's %prun magic (which runs the statement under Python's profiler), we get a detailed breakdown of the number of function calls and the time spent in each function
%prun -l 4 loop_with_itertuples(df)

 

         400814 function calls (400805 primitive calls) in 0.258 seconds

   Ordered by: internal time
   List reduced from 97 to 4 due to restriction <4>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.114    0.114    0.258    0.258 1846764153.py:2(loop_with_itertuples)
   100000    0.053    0.000    0.090    0.000 __init__.py:442(_make)
   100004    0.025    0.000    0.025    0.000 {built-in method __new__ of type object at 0x107d8e2f0}
   100001    0.016    0.000    0.016    0.000 range.py:464(__iter__)

## Using python zip

In [21]:
# Since a column of a Pandas DataFrame is an iterable, we can utilize zip to produce a tuple for each row just like itertuples, without all the pandas overhead
def loop_with_zip(df):
    """Return the sum of columns 'A' and 'B' over every row of ``df``.

    Each column is looked up once, then the two columns are walked in
    lockstep with the built-in ``zip``.
    """
    column_a = df['A']
    column_b = df['B']
    total = 0
    for left, right in zip(column_a, column_b):
        total += left + right
    return total

In [22]:
# Check performance using timeit
%timeit loop_with_zip(df)

19.1 ms ± 186 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [23]:
# Using IPython's %prun magic (which runs the statement under Python's profiler), we get a detailed breakdown of the number of function calls and the time spent in each function
%prun -l 4 loop_with_zip(df)

 

         56 function calls in 0.037 seconds

   Ordered by: internal time
   List reduced from 20 to 4 due to restriction <4>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.036    0.036    0.037    0.037 1935659825.py:2(loop_with_zip)
        1    0.000    0.000    0.037    0.037 {built-in method builtins.exec}
        2    0.000    0.000    0.000    0.000 frame.py:3856(__getitem__)
        2    0.000    0.000    0.000    0.000 base.py:5299(__contains__)

## Using pandas apply function

In [24]:
# We can always use the well-known pandas apply function, which is commonly used to do complex operations on DataFrame rows and columns
def using_apply(df):
    """Return the sum of columns 'A' and 'B' over every row of ``df``.

    A per-row function is applied with ``DataFrame.apply(..., axis=1)`` and
    the resulting per-row totals are summed.
    """
    def _add_a_and_b(row):
        # Called once per row because of axis=1 below.
        return row['A'] + row['B']

    per_row_totals = df.apply(_add_a_and_b, axis=1)
    return per_row_totals.sum()

In [25]:
# Check performance using timeit
%timeit using_apply(df)

757 ms ± 32.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [26]:
# Using IPython's %prun magic (which runs the statement under Python's profiler), we get a detailed breakdown of the number of function calls and the time spent in each function
%prun -l 4 using_apply(df)

 

         4600484 function calls (4600472 primitive calls) in 2.603 seconds

   Ordered by: internal time
   List reduced from 178 to 4 due to restriction <4>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
   200000    0.430    0.000    1.641    0.000 series.py:1016(__getitem__)
   200000    0.248    0.000    0.703    0.000 series.py:1139(_get_value)
   200000    0.209    0.000    0.314    0.000 indexing.py:2678(check_dict_or_set_indexers)
  1100082    0.189    0.000    0.291    0.000 {built-in method builtins.isinstance}

## Using pandas builtin add function

In [28]:
# Unfortunately, many of the computations we do in practice do not have a simple built-in operation in Pandas.
# But this approach gives us a good indicator of how efficient these Pandas built-in functions are in practice
def using_pandas_builtin(df):
    """Return the sum of columns 'A' and 'B' over every row of ``df``.

    The two columns are added as whole Series and the result is reduced
    with ``Series.sum``; there is no Python-level loop.
    """
    column_a = df['A']
    column_b = df['B']
    combined = column_a + column_b
    return combined.sum()

In [29]:
# Check performance using timeit
%timeit using_pandas_builtin(df)

397 µs ± 25 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [30]:
# Using IPython's %prun magic (which runs the statement under Python's profiler), we get a detailed breakdown of the number of function calls and the time spent in each function
%prun -l 4 using_pandas_builtin(df)

 

         293 function calls (288 primitive calls) in 0.018 seconds

   Ordered by: internal time
   List reduced from 136 to 4 due to restriction <4>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.010    0.010    0.010    0.010 {built-in method _operator.add}
        2    0.003    0.002    0.003    0.002 _ufunc_config.py:33(seterr)
        1    0.001    0.001    0.001    0.001 cast.py:1569(maybe_cast_to_integer_array)
        1    0.001    0.001    0.004    0.004 _ufunc_config.py:430(__enter__)

## Using numpy builtin function

In [31]:
# We convert each column into a NumPy array and do all the heavy lifting with NumPy's built-in functionality
def using_numpy(df):
    """Return the sum of columns 'A' and 'B' over every row of ``df``.

    Each column's underlying array is taken via ``.values``; the arrays are
    added element-wise and the result is reduced with its ``sum`` method.
    """
    a_values = df['A'].values
    b_values = df['B'].values
    combined = a_values + b_values
    return combined.sum()

In [32]:
# Check performance using timeit
%timeit using_numpy(df)

250 µs ± 13.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [33]:
# Using IPython's %prun magic (which runs the statement under Python's profiler), we get a detailed breakdown of the number of function calls and the time spent in each function
%prun -l 4 using_numpy(df)

 

         59 function calls in 0.017 seconds

   Ordered by: internal time
   List reduced from 24 to 4 due to restriction <4>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.010    0.010    0.016    0.016 778788821.py:2(using_numpy)
        2    0.003    0.001    0.005    0.003 frame.py:3856(__getitem__)
        2    0.002    0.001    0.002    0.001 base.py:5299(__contains__)
        2    0.001    0.000    0.001    0.000 frame.py:4405(_get_item_cache)