# Polars

In [1]:
!pip install polars==1.22.0



In [2]:
import polars as pl
import time
from datetime import date
import statistics

In [3]:
%run utilities.ipynb



## Reading File

In [4]:
total_read = []
for i in range(REPETITIONS): 
    start_time = time.time()
    df = pl.read_csv("Imdb Movie Dataset.csv")
    time_reading_file = time.time() - start_time
    total_read.append(time_reading_file)
print("--- %s seconds ---" % sum(total_read))

--- 19.296343088150024 seconds ---


In [5]:
df

id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,budget,imdb_id,original_language,original_title,overview,popularity,tagline,genres,production_companies,production_countries,spoken_languages,keywords
i64,str,f64,i64,str,str,i64,i64,bool,i64,str,str,str,str,f64,str,str,str,str,str,str
27205,"""Inception""",8.364,34495,"""Released""","""7/15/2010""",825532764,148,false,160000000,"""tt1375666""","""en""","""Inception""","""Cobb, a skilled thief who comm…",83.952,"""Your mind is the scene of the …","""Action, Science Fiction, Adven…","""Legendary Pictures, Syncopy, W…","""United Kingdom, United States …","""English, French, Japanese, Swa…","""rescue, mission, dream, airpla…"
157336,"""Interstellar""",8.417,32571,"""Released""","""11/5/2014""",701729206,169,false,165000000,"""tt0816692""","""en""","""Interstellar""","""The adventures of a group of e…",140.241,"""Mankind was born on Earth. It …","""Adventure, Drama, Science Fict…","""Legendary Pictures, Syncopy, L…","""United Kingdom, United States …","""English""","""rescue, future, spacecraft, ra…"
155,"""The Dark Knight""",8.512,30619,"""Released""","""7/16/2008""",1004558444,152,false,185000000,"""tt0468569""","""en""","""The Dark Knight""","""Batman raises the stakes in hi…",130.643,"""Welcome to a world without rul…","""Drama, Action, Crime, Thriller""","""DC Comics, Legendary Pictures,…","""United Kingdom, United States …","""English, Mandarin""","""joker, sadism, chaos, secret i…"
19995,"""Avatar""",7.573,29815,"""Released""","""12/15/2009""",2923706026,162,false,237000000,"""tt0499549""","""en""","""Avatar""","""In the 22nd century, a paraple…",79.932,"""Enter the world of Pandora.""","""Action, Adventure, Fantasy, Sc…","""Dune Entertainment, Lightstorm…","""United States of America, Unit…","""English, Spanish""","""future, society, culture clash…"
24428,"""The Avengers""",7.71,29166,"""Released""","""4/25/2012""",1518815515,143,false,220000000,"""tt0848228""","""en""","""The Avengers""","""When an unexpected enemy emerg…",98.082,"""Some assembly required.""","""Science Fiction, Action, Adven…","""Marvel Studios""","""United States of America""","""English, Hindi, Russian""","""new york city, superhero, shie…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
905156,"""鐨勯鏍肩殑椋庢牸儇妒呢蛹刹偎头""",0.0,0,"""Released""",,0,0,false,0,,"""zh""","""鐨勯鏍肩殑椋庢牸儇妒呢蛹刹偎头""",,0.6,,,,,,
905157,"""MILF & Cookies 3""",0.0,0,"""Released""",,0,0,true,0,,"""en""","""MILF & Cookies 3""",,0.6,,,,,,
905158,"""The Choice of Staying""",0.0,0,"""Released""","""10/8/2020""",0,0,false,0,"""tt13925132""","""it""","""The Choice of Staying""",,0.6,,"""Documentary""",,,"""English, Italian, Swedish""",
905161,"""Luisa Schluckt Schon Wieder""",0.0,0,"""Released""","""5/13/2016""",0,0,true,0,,"""de""","""Luisa Schluckt Schon Wieder""",,0.6,,,"""John Thompson Productions""",,,


In [6]:
df.estimated_size()

402900275

## Cleaning data

Things that we are going to remove and measure only once
- No release Date and status is released
- Runtime 0 or Null
- Budget 0. It has to cost something
- It has to have a genre

In [7]:
data_cleaned = df.filter(
    (~((pl.col("status") == "Released") & (pl.col("release_date").is_null()))) &
    ((pl.col("runtime") > 0) | (pl.col("release_date").is_not_null())) &
    (pl.col("runtime") > 0) &
    (pl.col("genres").is_not_null())
)

In [8]:
data_cleaned

id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,budget,imdb_id,original_language,original_title,overview,popularity,tagline,genres,production_companies,production_countries,spoken_languages,keywords
i64,str,f64,i64,str,str,i64,i64,bool,i64,str,str,str,str,f64,str,str,str,str,str,str
27205,"""Inception""",8.364,34495,"""Released""","""7/15/2010""",825532764,148,false,160000000,"""tt1375666""","""en""","""Inception""","""Cobb, a skilled thief who comm…",83.952,"""Your mind is the scene of the …","""Action, Science Fiction, Adven…","""Legendary Pictures, Syncopy, W…","""United Kingdom, United States …","""English, French, Japanese, Swa…","""rescue, mission, dream, airpla…"
157336,"""Interstellar""",8.417,32571,"""Released""","""11/5/2014""",701729206,169,false,165000000,"""tt0816692""","""en""","""Interstellar""","""The adventures of a group of e…",140.241,"""Mankind was born on Earth. It …","""Adventure, Drama, Science Fict…","""Legendary Pictures, Syncopy, L…","""United Kingdom, United States …","""English""","""rescue, future, spacecraft, ra…"
155,"""The Dark Knight""",8.512,30619,"""Released""","""7/16/2008""",1004558444,152,false,185000000,"""tt0468569""","""en""","""The Dark Knight""","""Batman raises the stakes in hi…",130.643,"""Welcome to a world without rul…","""Drama, Action, Crime, Thriller""","""DC Comics, Legendary Pictures,…","""United Kingdom, United States …","""English, Mandarin""","""joker, sadism, chaos, secret i…"
19995,"""Avatar""",7.573,29815,"""Released""","""12/15/2009""",2923706026,162,false,237000000,"""tt0499549""","""en""","""Avatar""","""In the 22nd century, a paraple…",79.932,"""Enter the world of Pandora.""","""Action, Adventure, Fantasy, Sc…","""Dune Entertainment, Lightstorm…","""United States of America, Unit…","""English, Spanish""","""future, society, culture clash…"
24428,"""The Avengers""",7.71,29166,"""Released""","""4/25/2012""",1518815515,143,false,220000000,"""tt0848228""","""en""","""The Avengers""","""When an unexpected enemy emerg…",98.082,"""Some assembly required.""","""Science Fiction, Action, Adven…","""Marvel Studios""","""United States of America""","""English, Hindi, Russian""","""new york city, superhero, shie…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
905065,"""Science Please! : Wheel Meets …",0.0,0,"""Released""","""1/1/1998""",5477,1,false,5477,"""tt0240207""","""en""","""Science Please! : Wheel Meets …","""A clip in the Science Please! …",0.6,,"""Animation""","""ONF | NFB""","""Canada""","""English""","""big wheel"""
905072,"""Revue Cinema: Reel Communities""",0.0,0,"""Released""","""12/13/2021""",0,50,false,0,,"""en""","""Revue Cinema: Reel Communities""","""The Revue Cinema, which opened…",0.618,,"""Documentary""","""Roy Zheng Studio""","""Canada""","""English""",
905074,"""SHIBUYA, TOKYO 16:30""",0.0,0,"""Released""","""4/27/2020""",0,15,false,0,,"""ja""","""SHIBUYA, TOKYO 16:30""","""Aoi, an aspiring filmmaker/ass…",0.6,,"""Drama""","""Rogue Works, Ltd.""",,"""Japanese""",
905076,"""The Fraction""",0.0,0,"""Released""","""1/1/1972""",0,7,false,0,,"""uk""","""Дріб""",,0.6,,"""Animation""","""Kyivnaukfilm""","""Soviet Union""","""No Language""","""short film"""


## Select

In [9]:
total_select = []
for i in range(REPETITIONS): 
    start_time = time.time()
    data_cleaned = data_cleaned.select([pl.col(c) for c in data_cleaned.columns if c not in [
    "overview", "tagline","genres", "production_companies","production_countries","spoken_languages", "keywords"]
                                   ])
    time_select = time.time() - start_time
    total_select.append(time_select)
print("--- %s seconds ---" % sum(total_select))

--- 0.0037229061126708984 seconds ---


In [10]:
data_cleaned

id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,budget,imdb_id,original_language,original_title,popularity
i64,str,f64,i64,str,str,i64,i64,bool,i64,str,str,str,f64
27205,"""Inception""",8.364,34495,"""Released""","""7/15/2010""",825532764,148,false,160000000,"""tt1375666""","""en""","""Inception""",83.952
157336,"""Interstellar""",8.417,32571,"""Released""","""11/5/2014""",701729206,169,false,165000000,"""tt0816692""","""en""","""Interstellar""",140.241
155,"""The Dark Knight""",8.512,30619,"""Released""","""7/16/2008""",1004558444,152,false,185000000,"""tt0468569""","""en""","""The Dark Knight""",130.643
19995,"""Avatar""",7.573,29815,"""Released""","""12/15/2009""",2923706026,162,false,237000000,"""tt0499549""","""en""","""Avatar""",79.932
24428,"""The Avengers""",7.71,29166,"""Released""","""4/25/2012""",1518815515,143,false,220000000,"""tt0848228""","""en""","""The Avengers""",98.082
…,…,…,…,…,…,…,…,…,…,…,…,…,…
905065,"""Science Please! : Wheel Meets …",0.0,0,"""Released""","""1/1/1998""",5477,1,false,5477,"""tt0240207""","""en""","""Science Please! : Wheel Meets …",0.6
905072,"""Revue Cinema: Reel Communities""",0.0,0,"""Released""","""12/13/2021""",0,50,false,0,,"""en""","""Revue Cinema: Reel Communities""",0.618
905074,"""SHIBUYA, TOKYO 16:30""",0.0,0,"""Released""","""4/27/2020""",0,15,false,0,,"""ja""","""SHIBUYA, TOKYO 16:30""",0.6
905076,"""The Fraction""",0.0,0,"""Released""","""1/1/1972""",0,7,false,0,,"""uk""","""Дріб""",0.6


## Filter

In [11]:
total_filter = []
for i in range(REPETITIONS): 
    start_time = time.time()
    df_filter = data_cleaned.filter(pl.col("budget") > 550000000)
    time_filter = time.time() - start_time
    total_filter.append(time_filter)
print("--- %s seconds ---" % sum(total_filter))

--- 0.3086822032928467 seconds ---


In [12]:
df_filter

id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,budget,imdb_id,original_language,original_title,popularity
i64,str,f64,i64,str,str,i64,i64,bool,i64,str,str,str,f64
1057999,"""Enea""",0.0,0,"""Released""","""9/21/2023""",0,115,False,888000000,"""tt27219440""","""it""","""Enea""",4.319
1224207,"""Adventures in Bora Bora""",0.0,0,"""Released""","""8/23/2023""",3000000000,5,False,800000000,,"""en""","""Adventures in Bora Bora""",0.0


## Aggregation

In [13]:
total_agg = []
for i in range(REPETITIONS):
    start_time = time.time()
    df_agg = data_cleaned.group_by(["status"]).agg(pl.col("runtime").mean())
    time_agg = time.time() - start_time
    total_agg.append(time_agg)
print("--- %s seconds ---" % sum(total_agg))

--- 0.46825623512268066 seconds ---


In [14]:
df_agg

status,runtime
str,f64
"""Post Production""",60.983746
"""Planned""",59.428302
"""Released""",68.709632
"""Canceled""",45.2
"""Rumored""",50.846154
"""In Production""",48.70313


## Sorting

In [15]:
total_sort = []
for i in range(REPETITIONS):
    start_time = time.time()
    df_sort = data_cleaned.sort(["revenue"])
    time_sort = time.time() - start_time
    total_sort.append(time_sort)
print("--- %s seconds ---" % sum(total_sort))

--- 0.9552304744720459 seconds ---


In [16]:
df_sort

id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,budget,imdb_id,original_language,original_title,popularity
i64,str,f64,i64,str,str,i64,i64,bool,i64,str,str,str,f64
405774,"""Bird Box""",6.854,9227,"""Released""","""12/13/2018""",0,124,false,19800000,"""tt2737304""","""en""","""Bird Box""",27.913
791373,"""Zack Snyder's Justice League""",8.19,9202,"""Released""","""3/18/2021""",0,242,false,70000000,"""tt12361974""","""en""","""Zack Snyder's Justice League""",130.69
466282,"""To All the Boys I've Loved Bef…",7.642,8045,"""Released""","""8/16/2018""",0,100,false,0,"""tt3846674""","""en""","""To All the Boys I've Loved Bef…",38.926
454983,"""The Kissing Booth""",7.229,6926,"""Released""","""5/11/2018""",0,105,false,0,"""tt3799232""","""en""","""The Kissing Booth""",36.649
766507,"""Prey""",7.762,5821,"""Released""","""8/2/2022""",0,100,false,65000000,"""tt11866324""","""en""","""Prey""",154.579
…,…,…,…,…,…,…,…,…,…,…,…,…,…
76600,"""Avatar: The Way of Water""",7.654,9830,"""Released""","""12/14/2022""",2320250281,192,false,460000000,"""tt1630029""","""en""","""Avatar: The Way of Water""",241.285
299534,"""Avengers: Endgame""",8.263,23857,"""Released""","""4/24/2019""",2800000000,181,false,356000000,"""tt4154796""","""en""","""Avengers: Endgame""",91.756
19995,"""Avatar""",7.573,29815,"""Released""","""12/15/2009""",2923706026,162,false,237000000,"""tt0499549""","""en""","""Avatar""",79.932
1270893,"""TikTok Rizz Party""",10.0,1,"""Released""","""4/1/2024""",3000000000,180,false,250000000,,"""en""","""TikTok Rizz Party""",0.0


## Complex Execution

In [17]:
total_complex = []
for i in range(REPETITIONS):
    start_time = time.time()
    revenue_max_non_en_movie_2024 = (
        data_cleaned.with_columns(
            (pl.col("release_date").str.to_date("%m/%d/%Y", strict=False)).alias("release_date")
        )
        .filter(
            (pl.col("release_date").is_between(date(2024,1,1),date(2024,12,31)))
        )
        .group_by(["original_language"]).agg(pl.col("revenue").max()).filter((pl.col("original_language") != "en"))   
    )
    time_complex = time.time() - start_time
    total_complex.append(time_complex)
print("--- %s seconds ---" % sum(total_complex))

--- 4.0509421825408936 seconds ---


In [18]:
revenue_max_non_en_movie_2024

original_language,revenue
str,i64
"""te""",0
"""ne""",0
"""cy""",0
"""or""",0
"""de""",10
…,…
"""ky""",0
"""ko""",0
"""sn""",0
"""da""",0


## Saving files

In [19]:
filename = "Polars.csv"
data_list = ["sum","avg","min","max"]
write_list = []
for x in data_list:
    data = {}
    fun = sum if x == "sum" else statistics.mean if x == "avg" else min if x == "min" else max
    data["type"] = x
    data["read_file"] = fun(total_read)
    data["memory_size_read"] = df.estimated_size()
    data["select"] = fun(total_select)
    data["memory_size_select"] = data_cleaned.estimated_size()
    data["filter"] = fun(total_filter)
    data["memory_size_filter"] = df_filter.estimated_size()
    data["agg"] = fun(total_agg)
    data["memory_size_agg"] = df_agg.estimated_size()
    data["sort"] = fun(total_sort)
    data["memory_size_sort"] = df_sort.estimated_size()
    data["complex"] = fun(total_complex)
    data["memory_size_complex"] = revenue_max_non_en_movie_2024.estimated_size()
    data["repetitions"] = REPETITIONS
    write_list.append(data)
write_csv(filename,write_list)