# Pandas vs Polar

In [2]:
#pip installs
%pip install polars pyarrow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting polars
  Downloading polars-0.16.11-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: polars
Successfully installed polars-0.16.11


In [3]:
#imports
import pandas as pd
import polars as pl
import plotly.express as px #using plotly express
import time

# Read CSV
***Pandas:***<br>
`pd.read_csv()`<br>
***Polars:***<br>
`pl.read_csv()`<br>

In [20]:
url = 'https://github.com/byuidatascience/data4dwellings/raw/master/data-raw/dwellings_ml/dwellings_ml.csv'

#Pandas
start1 = time.time()
df1 = pd.read_csv(url) #DS250 Project 4
end1 = time.time()
ttime1 = (end1-start1) * 10**3
print("The time of execution for Pandas is:",
      ttime1, "ms")

#Polars
start2 = time.time()
df2 = pl.read_csv(url)
#df2 = pl.scan_csv(url) # Needs to be a local directory file
end2 = time.time()
ttime2 = (end2-start2) * 10**3
print("The time of execution for Polars is:",
      ttime2, "ms")

#Build a DF to plot time results
df_plot = pl.DataFrame({
    "a": ['Pandas','Polars'], 
    "ms": [ttime1, ttime2]
})
#print(df_plot)

#Plot the Results
fig = px.bar(df_plot, x=df_plot["a"], y=df_plot["ms"])
fig.show()

The time of execution for Pandas is : 459.9871635437012 ms
The time of execution for Polars is : 334.08498764038086 ms


# DataFrames Whats Similar Whats Different


In [None]:
# Example expressions that work both with Pandas and Polars and some that are unique
# Uncomment different lines and run this cell as many times as you want

#Pandas
df1.head() # Get the first n rows
#df1.tail() # Get the last n rows
#df1.info() # Get the basic info of the df
#df1.isnull().sum() # Get the sum of null values in df

#Polars
#df2.head() # Get the first n rows
#df2.tail() # Get the last n rows
#df2.unique() # Get unique values of this expression.
#df2.describe() # Get statistical values about the df
#df2.null_count() # Get the count of null values in the df

Unnamed: 0,parcel,abstrprd,livearea,finbsmnt,basement,yrbuilt,totunits,stories,nocars,numbdrm,...,arcstyle_TRI-LEVEL,arcstyle_TRI-LEVEL WITH BASEMENT,arcstyle_TWO AND HALF-STORY,arcstyle_TWO-STORY,qualified_Q,qualified_U,status_I,status_V,before1980,new_col
0,00102-08-065-065,1130,1346,0,0,2004,1,2,2,2,...,0,0,0,0,1,0,1,0,0,13460
1,00102-08-073-073,1130,1249,0,0,2005,1,1,1,2,...,0,0,0,0,1,0,1,0,0,12490
2,00102-08-078-078,1130,1346,0,0,2005,1,2,1,2,...,0,0,0,0,1,0,1,0,0,13460
3,00102-08-081-081,1130,1146,0,0,2005,1,1,0,2,...,0,0,0,0,1,0,1,0,0,11460
4,00102-08-086-086,1130,1249,0,0,2005,1,1,1,2,...,0,0,0,0,0,1,1,0,0,12490


# Slice vs Select
***Pandas:***<br>
`df1[['parcel', 'livearea']]` <br>
***Polars:***<br>
`df2.select(pl.col(['parcel', 'livearea']))`<br>

In [11]:
# Pandas
start1 = time.time()
df1[['parcel', 'livearea']] 
end1 = time.time()
ttime1 = (end1-start1) * 10**3
print("The time of execution for Pandas is:",
      ttime1, "ms")

#Polars
start2 = time.time()
df2.select(pl.col(['parcel', 'livearea'])) 
end2 = time.time()
ttime2 = (end2-start2) * 10**3
print("The time of execution for Polars is:",
      ttime2, "ms")

#Build a DF to plot time results
df_plot = pl.DataFrame({
    "a": ['Pandas','Polars'], 
    "ms": [ttime1, ttime2]
})
#print(df_plot)

#Plot the Results
fig = px.bar(df_plot, x=df_plot["a"], y=df_plot["ms"])
fig.show()

The time of execution for Pandas is : 9.59324836730957 ms
The time of execution for Polars is : 2.385377883911133 ms


***Polars in Panda Suit:***<br>
`df2[['parcel', 'livearea']]` <br>
***Polars as Recommended:***<br>
`df2.select(pl.col(['parcel', 'livearea']))`<br>

In [13]:
# Polars in Pandas sytax
start1 = time.time()
df2[['parcel', 'livearea']] 
end1 = time.time()
ttime1 = (end1-start1) * 10**3
print("The time of execution for Polars in Pandas syntax is:",
      ttime1, "ms")

#Polars as recommended
start2 = time.time()
df2.select(pl.col(['parcel', 'livearea'])) 
end2 = time.time()
ttime2 = (end2-start2) * 10**3
print("The time of execution for Polars as Recommended is:",
      ttime2, "ms")

#Build a DF to plot time results
df_plot = pl.DataFrame({
    "a": ['Polars in Panda Suit','Polars'], 
    "ms": [ttime1, ttime2]
})
#print(df_plot)

#Plot the Results
fig = px.bar(df_plot, x=df_plot["a"], y=df_plot["ms"])
fig.show()

The time of execution for Polars in Pandas syntax is : 0.14090538024902344 ms
The time of execution for Polars as Recommended is : 1.4767646789550781 ms


# Query vs Filter
***Pandas:***<br>
`df1.query('numbdrm > 5')`<br>
***Polars:***<br>
`df2.filter(pl.col('numbdrm') > 5)`<br>

In [16]:
# Pandas
start1 = time.time()
df1.query('numbdrm > 5')
end1 = time.time()
ttime1 = (end1-start1) * 10**3
print("The time of execution for Pandas is:",
      ttime1, "ms")

# Polars
start2 = time.time()
df2.filter(pl.col('numbdrm') > 5)
end2 = time.time()
ttime2 = (end2-start2) * 10**3
print("The time of execution for Polars is:",
      ttime2, "ms")

#Build a DF to plot time results
df_plot = pl.DataFrame({
    "a": ['Pandas','Polars'], 
    "ms": [ttime1, ttime2]
})
#print(df_plot)

#Plot the Results
fig = px.bar(df_plot, x=df_plot["a"], y=df_plot["ms"])
fig.show()

The time of execution for Polars is : 7.796764373779297 ms
The time of execution for Polars is : 2.7468204498291016 ms


# Adding New Columns
***Pandas:***<br>
`df1["new_col"] = df1["livearea"] * 10`<br>
***Polars:***<br>
`df2.with_columns([(pl.col("livearea") * 10).alias("new_col")])`<br>

In [17]:
# Pandas
start1 = time.time()
df1["new_col"] = df1["livearea"] * 10
end1 = time.time()
ttime1 = (end1-start1) * 10**3
print("The time of execution for Pandas is:",
      ttime1, "ms")

# Polars
start2 = time.time()
df2.with_columns([(pl.col("livearea") * 10).alias("new_col")])
end2 = time.time()
ttime2 = (end2-start2) * 10**3
print("The time of execution for Polars is:",
      ttime2, "ms")

# Polars for multiple columns
# df.with_columns([(pl.col("col") * 10).alias("new_col"), ...])

#Build a DF to plot time results
df_plot = pl.DataFrame({
    "a": ['Pandas','Polars'], 
    "ms": [ttime1, ttime2]
})
#print(df_plot)

#Plot the Results
fig = px.bar(df_plot, x=df_plot["a"], y=df_plot["ms"])
fig.show()

The time of execution for Polars is : 3.684520721435547 ms
The time of execution for Polars is : 2.5250911712646484 ms


# Group By and Aggregate
***Pandas:***<br>
`df1.groupby('yrbuilt')['livearea'].agg('mean')`<br>
***Polars:***<br>
`df2.groupby('yrbuilt').agg([pl.mean('livearea')])`<br>

In [18]:
# Pandas
start1 = time.time()
df1.groupby('yrbuilt')['livearea'].agg('mean')
end1 = time.time()
ttime1 = (end1-start1) * 10**3
print("The time of execution for Pandas is:",
      ttime1, "ms")

# Polars
start2 = time.time()
#df2.groupby('yrbuilt').agg([pl.col('livearea').mean()]) # As suggested in Polars docs
df2.groupby('yrbuilt').agg([pl.mean('livearea')]) # Shorter
end2 = time.time()
ttime2 = (end2-start2) * 10**3
print("The time of execution for Polars is:",
      ttime2, "ms")

#Build a DF to plot time results
df_plot = pl.DataFrame({
    "a": ['Pandas','Polars'], 
    "ms": [ttime1, ttime2]
})
#print(df_plot)

#Plot the Results
fig = px.bar(df_plot, x=df_plot["a"], y=df_plot["ms"])
fig.show()

The time of execution for Pandas is : 9.71531867980957 ms
The time of execution for Polars is : 2.480745315551758 ms


# Fill NA
***Pandas:***<br>
`df1.fillna(np.nan)`<br>
***Polars:***<br>
`df2.fill_null(np.nan)`<br>

In [21]:
import numpy as np
url = 'https://github.com/fivethirtyeight/data/raw/master/star-wars-survey/StarWars.csv' #DS250 Project 5
df1 = pd.read_csv(url, encoding = "ISO-8859-1",header=None)
df2 = pl.read_csv(url, encoding = "ISO-8859-1",has_header=False)

# Pandas
start1 = time.time()
#df1[31].fillna(np.nan) # For a particular column
df1.fillna(np.nan)
end1 = time.time()
ttime1 = (end1-start1) * 10**3
print("The time of execution for Pandas is :",
      ttime1, "ms")


# Polars
start2 = time.time()
#df2.with_column(pl.col('column_32').fill_null(pl.lit(np.nan))) # As suggested in Polars docs
#df2.with_column(pl.col('column_32').fill_null(np.nan)) # Shorter
df2.fill_null(np.nan)
end2 = time.time()
ttime2 = (end2-start2) * 10**3
print("The time of execution for Polars is :",
      ttime2, "ms")

#Build a DF to plot time results
df_plot = pl.DataFrame({
    "a": ['Pandas','Polars'], 
    "ms": [ttime1, ttime2]
})
#print(df_plot)

#Plot the Results
fig = px.bar(df_plot, x=df_plot["a"], y=df_plot["ms"])
fig.show()

The time of execution for Pandas is : 4.272937774658203 ms
The time of execution for Polars is : 1.3153553009033203 ms


Adapted from:
Author: Leonie Monigatti 
Source: https://towardsdatascience.com/pandas-vs-polars-a-syntax-and-speed-comparison-5aa54e27497e