# Iterating through a DataFrame

Avoid iterating through a `DataFrame` when the same result can be computed with expressions, because expressions are much faster.

In [1]:
import polars as pl

In [2]:
# Relative path to the Titanic CSV file that is loaded in the next cell
csv_file = "data/titanic.csv"

In [3]:
# Read the Titanic CSV into a Polars DataFrame
df = pl.read_csv(csv_file)
# Show the first 3 rows as the cell output
df.head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


### Iterating over a single column

In [4]:
# Iterating over a Series yields one Python value per row; missing ages come back as None
ages = list(df["Age"])
ages

[22.0,
 38.0,
 26.0,
 35.0,
 35.0,
 None,
 54.0,
 2.0,
 27.0,
 14.0,
 4.0,
 58.0,
 20.0,
 39.0,
 14.0,
 55.0,
 2.0,
 None,
 31.0,
 None,
 35.0,
 34.0,
 15.0,
 28.0,
 8.0,
 38.0,
 None,
 19.0,
 None,
 None,
 40.0,
 None,
 None,
 66.0,
 28.0,
 42.0,
 None,
 21.0,
 18.0,
 14.0,
 40.0,
 27.0,
 None,
 3.0,
 19.0,
 None,
 None,
 None,
 None,
 18.0,
 7.0,
 21.0,
 49.0,
 29.0,
 65.0,
 None,
 21.0,
 28.5,
 5.0,
 11.0,
 22.0,
 38.0,
 45.0,
 4.0,
 None,
 None,
 29.0,
 19.0,
 17.0,
 26.0,
 32.0,
 16.0,
 21.0,
 26.0,
 32.0,
 25.0,
 None,
 None,
 0.83,
 30.0,
 22.0,
 29.0,
 None,
 28.0,
 17.0,
 33.0,
 16.0,
 None,
 23.0,
 24.0,
 29.0,
 20.0,
 46.0,
 26.0,
 59.0,
 None,
 71.0,
 23.0,
 34.0,
 34.0,
 28.0,
 None,
 21.0,
 33.0,
 37.0,
 28.0,
 21.0,
 None,
 38.0,
 None,
 47.0,
 14.5,
 22.0,
 20.0,
 17.0,
 21.0,
 70.5,
 29.0,
 24.0,
 2.0,
 21.0,
 None,
 32.5,
 32.5,
 54.0,
 12.0,
 None,
 24.0,
 None,
 45.0,
 33.0,
 20.0,
 47.0,
 29.0,
 25.0,
 23.0,
 19.0,
 37.0,
 16.0,
 24.0,
 None,
 22.0,
 24.0,
 19.0,
 

### Iterating over multiple columns
Using the `rows` method of a `DataFrame`.

In [5]:
# `rows()` returns every row as a plain tuple, so columns are picked by position:
# index 3 is "Name" and index 5 is "Age" (see the column order in the header of `df`)
name_age = [
    (record[3], record[5])
    for record in df.rows()
]
# Show only the first three pairs
name_age[:3]

[('Braund, Mr. Owen Harris', 22.0),
 ('Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 38.0),
 ('Heikkinen, Miss. Laina', 26.0)]

Alternatively, do this with the `iter_rows` method.

In [6]:
# `iter_rows()` yields one row tuple at a time; columns are still picked by position:
# index 3 is "Name" and index 5 is "Age" (see the column order in the header of `df`)
name_age = [
    (record[3], record[5])
    for record in df.iter_rows()
]
# Show only the first three pairs
name_age[:3]

[('Braund, Mr. Owen Harris', 22.0),
 ('Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 38.0),
 ('Heikkinen, Miss. Laina', 26.0)]

#### Difference between `rows` and `iter_rows`?
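Both `rows` and `iter_rows` give us the same rows as Python tuples: `rows` returns them all at once in a list, while `iter_rows` returns an iterator over them.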

The difference is that:
- when we call `rows`, the entire `DataFrame` is materialized up front as a list of Python tuples, where each tuple is a row. We can then iterate over this list of tuples.
- when we call `iter_rows`, Polars materializes each row as a Python tuple only when we iterate over it, rather than materializing the whole `DataFrame` at the outset.

Use `rows` if you are iterating through the full `DataFrame` and have enough memory to materialize the whole `DataFrame` as a list of tuples.

Use `iter_rows` if you want to reduce memory use by not materializing the whole `DataFrame` as a list of tuples.

### Iterating with named columns

In [7]:
# With `named=True` each row is a dict keyed by column name,
# so values are looked up by name instead of by position
name_age = [
    (record["Name"], record["Age"])
    for record in df.iter_rows(named=True)
]
# Show only the first three pairs
name_age[:3]

[('Braund, Mr. Owen Harris', 22.0),
 ('Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 38.0),
 ('Heikkinen, Miss. Laina', 26.0)]

In [8]:
# `rows(named=True)` builds the full list of row dicts up front;
# values are then looked up by column name instead of by position
name_age = [
    (record["Name"], record["Age"])
    for record in df.rows(named=True)
]
# Show only the first three pairs
name_age[:3]

[('Braund, Mr. Owen Harris', 22.0),
 ('Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 38.0),
 ('Heikkinen, Miss. Laina', 26.0)]

## Exercises

### Exercise 1
We compare how long it takes to iterate through the `DataFrame` when we select the first 2 columns on each iteration.

In [12]:
import numpy as np

# Number of rows in the benchmark frame. With 100 float64 columns this is
# roughly 800 MB of data, so lower N if memory is tight.
N = 1_000_000

# Use a seeded Generator instead of the global NumPy random state so the
# frame contains the same values after every kernel restart.
rng = np.random.default_rng(seed=0)

# A 2D NumPy array becomes a DataFrame with columns named column_0 ... column_99
dfRandom = pl.DataFrame(rng.standard_normal((N, 100)))
dfRandom

column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,column_20,column_21,column_22,column_23,column_24,column_25,column_26,column_27,column_28,column_29,column_30,column_31,column_32,column_33,column_34,column_35,column_36,…,column_63,column_64,column_65,column_66,column_67,column_68,column_69,column_70,column_71,column_72,column_73,column_74,column_75,column_76,column_77,column_78,column_79,column_80,column_81,column_82,column_83,column_84,column_85,column_86,column_87,column_88,column_89,column_90,column_91,column_92,column_93,column_94,column_95,column_96,column_97,column_98,column_99
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
-1.164781,-0.070038,-1.729294,-1.422193,-0.207308,0.671221,0.351619,1.841865,1.419423,-1.57015,-1.163337,1.719379,0.91241,0.580959,1.610791,-1.34593,1.084595,-0.794719,0.19123,0.515863,0.712812,0.553116,-0.860036,-0.36616,0.915623,-1.09471,0.353944,1.128129,0.357286,1.76969,1.00722,0.696386,1.29536,-0.049974,-1.310197,-0.663285,0.228812,…,1.02732,-1.029345,-0.145443,0.364117,-1.241702,0.691785,-1.657258,0.37671,1.631367,0.816168,0.578424,-1.880007,1.776485,-0.568615,-1.594181,-0.919303,-0.153705,-0.487181,-0.922334,-0.758925,-1.304779,-0.742403,2.191745,0.751004,0.107717,0.318882,1.961444,2.409777,0.30263,-1.707118,0.660338,0.378522,-0.133142,2.019123,0.773209,0.939155,-0.025671
-1.646543,0.166632,-0.798879,-0.773839,1.819783,-2.039733,-0.163787,1.367067,1.28868,-0.828694,0.660184,-1.447877,-0.603996,-0.644159,0.005569,-0.423889,1.30625,-0.140954,0.291617,1.576322,1.606746,1.015501,0.555552,2.767363,-0.783278,-1.112275,-0.931174,-1.286728,-0.297006,-0.05704,-0.231739,0.454554,-0.021957,0.428325,-0.067247,-0.911469,-0.722624,…,-0.533673,-0.588677,1.696079,2.263042,1.726954,0.365404,0.224735,-0.952058,0.488846,0.418925,1.05019,-0.179621,-0.680896,-0.579089,-0.734238,-0.530722,-0.021984,0.455059,0.447052,-0.772835,-1.243633,-1.50587,0.828322,-0.865589,-0.271607,1.35511,-2.071384,0.2735,0.639316,0.306921,0.027336,-0.648748,0.750459,-1.041649,1.571893,1.02672,0.819728
-0.246229,0.383675,0.465169,-1.115744,-1.671686,1.531959,-0.088546,1.36291,0.351562,1.96074,-0.235987,0.841692,-0.902079,-0.300771,1.051599,1.969963,0.533824,-1.247135,0.047813,-0.95776,-0.463761,0.283953,0.231378,1.005802,0.930924,-0.355434,-0.486288,-1.140801,0.499668,0.654535,1.306645,0.578235,0.510331,-0.920932,0.036678,-1.227284,0.22156,…,-2.399224,-2.596035,1.345273,0.359331,0.440793,3.13806,1.205693,-0.492173,0.041056,-0.167876,0.020262,-0.543153,-0.31744,-0.752348,-0.38331,0.624457,-1.182804,1.003902,-0.458224,1.452544,0.094149,0.079407,0.649323,0.589059,-1.168753,1.347852,0.369314,0.470125,0.563024,0.054561,-0.081201,0.633324,0.638554,0.962883,-0.760922,0.795893,-0.847301
-0.394152,-0.00966,-1.172122,-0.731236,-0.086953,1.521882,0.507532,0.362862,0.033816,-0.11375,-0.468888,0.404764,-0.638838,0.094453,0.156022,1.13008,-1.92628,-2.373432,-1.360256,-1.289944,-0.479182,-0.307104,-0.525397,0.201106,0.532494,-0.438634,0.055154,-0.714834,1.87459,2.272728,0.718894,-0.726518,-0.130726,-1.45562,0.196292,1.035322,-1.671136,…,-0.915496,-0.886895,1.059532,1.366201,0.061887,0.359567,-0.768184,0.957362,-1.373134,-1.631282,-1.269886,-0.303017,-0.158182,0.609806,-1.073232,-0.610284,0.323563,1.479427,-0.348608,1.191177,-0.455397,-0.575827,-0.091742,0.021792,0.395303,0.102827,-0.583216,-1.218242,0.649474,-0.146,-0.613094,0.057233,0.179302,0.080341,-0.391729,-0.001771,1.766666
1.053791,-0.935537,-0.547943,0.152254,-0.45909,1.020304,0.897083,-0.149038,-0.162783,-0.539482,-1.008363,0.992128,0.697358,0.777762,0.497762,-0.354581,-0.921362,-0.341876,0.347532,0.601386,-0.919916,1.236719,-0.939253,0.615968,-1.175504,-1.290711,1.790766,-0.077605,-0.873926,-0.803729,-1.349187,-0.254305,-1.325529,0.889418,-0.697898,-0.165497,0.17653,…,-0.386583,0.178152,0.161746,0.812824,-0.473975,0.888556,-1.30981,-0.443734,0.261826,-0.746819,1.033918,-1.320987,-1.843477,0.041825,-0.346721,0.314492,0.000677,-0.21126,-0.653989,-1.048866,0.729471,-0.410764,2.183808,-1.899666,0.225017,0.662158,1.844978,-0.506592,-0.497348,-2.373747,-0.51684,-0.981813,-0.863778,0.439785,1.713283,1.116212,0.486946
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
-1.579394,0.308192,-0.440738,0.646181,-1.572144,0.28732,0.752152,-1.353065,1.00469,0.00803,-0.56887,0.58433,-1.513876,0.309166,-1.024265,-0.760454,2.573168,1.169208,-0.260137,0.391604,-0.368444,0.714012,-0.660953,0.442846,0.163864,-1.452073,-0.891981,-0.069543,-1.010525,0.696029,-0.076649,1.413275,-0.085483,-1.95364,-0.06469,2.378329,1.387295,…,1.460364,-0.282159,0.925726,-0.14669,-0.089708,0.751628,-0.071486,-1.064316,-0.83999,-0.66877,-1.144904,-0.565279,2.400336,0.96632,-0.426591,-1.252982,0.895259,0.99235,0.130405,-1.650057,-1.583715,0.688954,1.979651,0.254623,0.818384,0.15122,0.395895,-0.071112,0.780718,0.031647,-0.53673,-0.59952,-1.117227,1.062606,-1.783837,1.876276,-0.403633
0.318233,-1.384395,-0.505996,-0.366652,0.516275,1.282053,-0.117974,-0.412632,-0.91999,0.050511,-1.388034,-2.567344,1.532622,-1.318938,1.444164,2.331385,0.219883,1.216182,1.576247,-0.363037,0.495429,-0.629143,-0.135833,-1.593654,-0.463453,-2.114426,-0.025814,0.355699,-0.658774,0.697264,3.364954,1.066108,0.646011,0.073512,-1.051769,-0.289283,-0.375286,…,-1.043277,0.424046,0.33596,-2.360184,0.512096,-0.631307,-0.998994,-0.946764,-0.060897,0.75297,-0.845614,0.833186,2.160909,-0.64143,-0.400486,-1.24968,0.269359,1.247713,-2.16117,0.318553,-0.562406,0.158324,0.432725,0.664376,0.209769,1.750619,0.664556,0.587986,1.17846,-0.745294,-0.026619,1.287394,1.53477,0.594634,1.36154,0.723669,2.332947
-1.318338,1.083186,-0.131859,-1.142923,-0.062799,-0.453593,0.890089,1.625394,0.091928,0.333296,1.474018,0.43532,-0.954546,-0.107952,0.288773,-0.556555,0.964539,0.805549,1.564551,1.293691,-0.215666,-0.466409,-0.436157,0.299768,1.19258,-0.108608,1.395785,0.09552,2.40115,-0.253328,-2.598602,0.509976,-1.283985,0.093129,0.54698,-2.035568,0.328497,…,-0.008438,0.616418,1.109215,-0.309939,-0.90165,0.745582,-0.096053,0.968843,0.872544,-0.861672,0.079265,-0.831094,0.286748,-1.700019,0.157455,0.763182,-0.120738,0.13634,-0.176253,-0.852834,0.015235,-2.491846,-0.414601,0.317712,-0.468179,0.293723,-0.332981,-0.144833,0.698251,-0.254594,0.508081,1.277429,-0.926662,0.105243,-0.483631,0.927743,0.024424
-1.251881,0.646241,0.49925,1.059374,0.482768,-0.269914,-0.723395,0.318572,-0.111889,-0.536589,0.758795,-0.247738,-1.265583,-0.297976,0.74456,0.306716,-0.648002,1.895901,1.862432,0.948266,-1.336217,1.673951,-1.329484,1.920909,-0.516288,0.866558,0.108689,1.020301,-1.055902,-0.410624,-0.32919,0.999922,0.703564,-0.373446,0.522307,-0.225922,-0.039348,…,1.107195,2.190938,0.931761,-2.157326,-0.463973,0.08783,1.603248,0.242891,-0.67824,1.276433,-0.277157,1.475238,-0.275668,1.377117,0.534315,2.32128,0.357603,1.100077,1.701997,-0.868078,-1.273199,-0.428095,-0.560402,-0.955422,0.318177,-1.2362,-0.024321,-0.121079,0.481182,0.5296,0.424722,-0.068679,-1.777587,0.426,-0.221299,0.1559,-0.593095


Iterate through `dfRandom` with `iter_rows` to create a list where each element is a tuple with the first two columns of `dfRandom`

In [10]:
%%timeit -n1 -r1
# Baseline: iterate over the full 100-column frame and keep only the first two values of each row tuple.
# -n1 -r1 times a single loop in a single run.
[(row[0], row[1]) for row in dfRandom.iter_rows()]

4.84 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


How long does it take to iterate if we pre-`select` the columns of interest first?

In [11]:
%%timeit -n1 -r1

# Select the two columns of interest first, so each row tuple holds 2 values instead of 100
[(row[0], row[1]) for row in dfRandom.select("column_0", "column_1").iter_rows()]

383 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Use pre-selected columns below.

Do the same iteration with `iter_rows` but use named columns

In [14]:
%%timeit -n1 -r1

# Same pre-selected iteration, but with named=True so values are looked up by column name
[(row["column_0"], row["column_1"]) for row in dfRandom.select("column_0", "column_1").iter_rows(named=True)]

705 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Compare the performance with the `rows` method

In [15]:
%%timeit -n1 -r1

# Same pre-selected columns, but materialize all rows at once with rows() before looping
[(row[0], row[1]) for row in dfRandom.select("column_0", "column_1").rows()]

360 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
