# Install polars 

In [None]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import polars as pl

# Series & DataFrames

## Series

In [None]:
import polars as pl

# Build a named integer Series from a plain Python list and show it.
my_series = pl.Series(name="a", values=[1, 2, 3, 4, 5])
print(my_series)

shape: (5,)
Series: 'a' [i64]
[
	1
	2
	3
	4
	5
]


### Methods: AGGREGATIONS

In [None]:
# Aggregations collapse the whole Series into one scalar; print the smallest
# and the largest value, one per line.
my_series = pl.Series(name="a", values=[1, 2, 3, 4, 5])
print(my_series.min(), my_series.max(), sep="\n")

1
5


### Methods: STRING 

In [None]:
# Build a string Series, then use the `.str` namespace to substitute "pola" for "polar".
my_series = pl.Series("a", ["polar", "bear", "arctic", "polar fox", "polar bear"])
# The result is bound to a new name (`my_series_2`) and printed.
my_series_2 = my_series.str.replace("polar", "pola")
print(my_series_2)

shape: (5,)
Series: 'a' [str]
[
	"pola"
	"bear"
	"arctic"
	"pola fox"
	"pola bear"
]


### Methods: DATETIME

In [None]:
from datetime import datetime

t_start = datetime(2001, 1, 1)
t_stop = datetime(2001, 1, 9)
# Datetime Series running from t_start to t_stop in 2-day steps.
my_series = pl.date_range(t_start, t_stop, interval="2d")
print(my_series)

# `.dt.day()` returns a new Series and leaves `my_series` untouched, so the result
# has to be kept and printed; as a bare statement it was silently thrown away.
my_days = my_series.dt.day()
print(my_days)

shape: (5,)
Series: '' [datetime[μs]]
[
	2001-01-01 00:00:00
	2001-01-03 00:00:00
	2001-01-05 00:00:00
	2001-01-07 00:00:00
	2001-01-09 00:00:00
]


## DataFrame 

In [None]:
# Five-row frame with three columns: integers 1..5, the first five days of
# January 2022, and floats 4.0..8.0.
df = pl.DataFrame(
    {
        "integer": list(range(1, 6)),
        "date": [datetime(2022, 1, day) for day in range(1, 6)],
        "float": [float(value) for value in range(4, 9)],
    }
)

print(df)

shape: (5, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date                ┆ float │
│ ---     ┆ ---                 ┆ ---   │
│ i64     ┆ datetime[μs]        ┆ f64   │
╞═════════╪═════════════════════╪═══════╡
│ 1       ┆ 2022-01-01 00:00:00 ┆ 4.0   │
│ 2       ┆ 2022-01-02 00:00:00 ┆ 5.0   │
│ 3       ┆ 2022-01-03 00:00:00 ┆ 6.0   │
│ 4       ┆ 2022-01-04 00:00:00 ┆ 7.0   │
│ 5       ┆ 2022-01-05 00:00:00 ┆ 8.0   │
└─────────┴─────────────────────┴───────┘


In [None]:
# Show the first 3 rows of the frame.
df.head(3)

integer,date,float
i64,datetime[μs],f64
1,2022-01-01 00:00:00,4.0
2,2022-01-02 00:00:00,5.0
3,2022-01-03 00:00:00,6.0


In [None]:
# Show the last 3 rows of the frame.
df.tail(3)

integer,date,float
i64,datetime[μs],f64
3,2022-01-03 00:00:00,6.0
4,2022-01-04 00:00:00,7.0
5,2022-01-05 00:00:00,8.0


In [None]:
# Summary statistics (count, null_count, mean, std, min, max, quantiles) per column.
df.describe()

describe,integer,date,float
str,f64,str,f64
"""count""",5.0,"""5""",5.0
"""null_count""",0.0,"""0""",0.0
"""mean""",3.0,,6.0
"""std""",1.581139,,1.581139
"""min""",1.0,"""2022-01-01 00:…",4.0
"""max""",5.0,"""2022-01-05 00:…",8.0
"""median""",3.0,,6.0
"""25%""",2.0,,5.0
"""75%""",4.0,,7.0


# Reading & Writing 

In [None]:
# Three-row frame (integers, consecutive dates, floats) for the read/write examples.
df = pl.DataFrame(
    {
        "integer": list(range(1, 4)),
        "date": [datetime(2022, 1, day) for day in range(1, 4)],
        "float": [float(value) for value in range(4, 7)],
    }
)

df

integer,date,float
i64,datetime[μs],f64
1,2022-01-01 00:00:00,4.0
2,2022-01-02 00:00:00,5.0
3,2022-01-03 00:00:00,6.0


In [None]:
# Write the frame to a CSV file, then read it straight back with default settings.
df.write_csv("output.csv")
df_csv = pl.read_csv("output.csv")

# In the recorded output the `date` column comes back as `str`, not as a datetime.
df_csv

integer,date,float
i64,str,f64
1,"""2022-01-01T00:…",4.0
2,"""2022-01-02T00:…",5.0
3,"""2022-01-03T00:…",6.0


In [None]:
# Read the same CSV again, this time asking the reader to try parsing date-like columns.
df_csv = pl.read_csv("output.csv", try_parse_dates=True)
print(df_csv)

shape: (3, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date                ┆ float │
│ ---     ┆ ---                 ┆ ---   │
│ i64     ┆ datetime[μs]        ┆ f64   │
╞═════════╪═════════════════════╪═══════╡
│ 1       ┆ 2022-01-01 00:00:00 ┆ 4.0   │
│ 2       ┆ 2022-01-02 00:00:00 ┆ 5.0   │
│ 3       ┆ 2022-01-03 00:00:00 ┆ 6.0   │
└─────────┴─────────────────────┴───────┘


In [None]:
# Round-trip the frame through a JSON file: write it out, then read it back.
df.write_json("output.json")
df_json = pl.read_json("output.json")

df_json

integer,date,float
i64,datetime[μs],f64
1,2022-01-01 00:00:00,4.0
2,2022-01-02 00:00:00,5.0
3,2022-01-03 00:00:00,6.0


# Expressions

In [None]:
# Select every column ("*" acts as a wildcard).
df.select(pl.col("*"))

integer,date,float
i64,datetime[μs],f64
1,2022-01-01 00:00:00,4.0
2,2022-01-02 00:00:00,5.0
3,2022-01-03 00:00:00,6.0


In [None]:
# Select several columns at once by handing a list of names to a single pl.col.
df.select(pl.col(["integer", "float"]))

integer,float
i64,f64
1,4.0
2,5.0
3,6.0


In [None]:
# Select two columns through separate expressions and keep only the first 2 rows.
df.select([pl.col("integer"), pl.col("float")]).limit(2)

integer,float
i64,f64
1,4.0
2,5.0


In [None]:
# Select every column except `integer`.
df.select([pl.exclude("integer")])

date,float
datetime[μs],f64
2022-01-01 00:00:00,4.0
2022-01-02 00:00:00,5.0
2022-01-03 00:00:00,6.0


In [None]:
# Filter: keep the rows whose `float` value lies between 4 and 5
# (both 4.0 and 5.0 are kept in the recorded output).
df.filter(
    pl.col("float").is_between(4, 5),
)

integer,date,float
i64,datetime[μs],f64
1,2022-01-01 00:00:00,4.0
2,2022-01-02 00:00:00,5.0


In [None]:
import numpy as np 

# Seven-row frame whose `float` column contains two NaN values, used by the
# filtering examples.
df = pl.DataFrame(
    {
        "integer": list(range(1, 8)),
        "date": [datetime(2022, 1, day) for day in range(1, 8)],
        "float": [4.0, 5.0, 6.0, 7.0, np.nan, np.nan, 9.0],
    }
)

df

integer,date,float
i64,datetime[μs],f64
1,2022-01-01 00:00:00,4.0
2,2022-01-02 00:00:00,5.0
3,2022-01-03 00:00:00,6.0
4,2022-01-04 00:00:00,7.0
5,2022-01-05 00:00:00,
6,2022-01-06 00:00:00,
7,2022-01-07 00:00:00,9.0


In [None]:
# Combine two conditions with `&`: `integer` greater than 3 and `float` not NaN.
df.filter((pl.col("integer") > 3) & (pl.col("float").is_not_nan()))

integer,date,float
i64,datetime[μs],f64
4,2022-01-04 00:00:00,7.0
7,2022-01-07 00:00:00,9.0


**with_columns** cho phép chúng ta tạo thêm một hoặc nhiều cột mới trong quá trình xử lý dữ liệu

In [None]:
# Add two derived columns: `sum` holds the total of `integer` on every row,
# and `sub-1` is `float` minus one.
df = df.with_columns(
    [
        pl.col("integer").sum().alias("sum"),
        (pl.col("float") - 1).alias("sub-1"),
    ]
)

In [None]:
# Group-by examples: an integer column `x` (0..7) and a label column `y`
# with group sizes 3, 2, 1 and 2.
df2 = pl.DataFrame(
    {
        "x": np.arange(8),
        "y": ["A"] * 3 + ["B"] * 2 + ["C"] + ["X"] * 2,
    }
)

df2

x,y
i64,str
0,"""A"""
1,"""A"""
2,"""A"""
3,"""B"""
4,"""B"""
5,"""C"""
6,"""X"""
7,"""X"""


In [None]:
# Count the rows in each `y` group; with maintain_order=True the recorded output
# lists the groups in the order they occur in `df2`.
# NOTE(review): newer Polars releases spell this method `group_by` — confirm against the installed version.
df2.groupby("y", maintain_order=True).count()

y,count
str,u32
"""A""",3
"""B""",2
"""C""",1
"""X""",2


In [None]:
# Per-group aggregation: for every `y` group compute the number of rows and the
# sum of the remaining column, renaming the results to `count` and `sum`.
# NOTE(review): newer Polars releases spell this method `group_by` — confirm against the installed version.
df2.groupby("y", maintain_order=True).agg(
    [
        pl.col("*").count().alias("count"),
        pl.col("*").sum().alias("sum"),
    ]
)

y,count,sum
str,u32,i64
"""A""",3,3
"""B""",2,7
"""C""",1,5
"""X""",2,13


In [None]:
# Display the current `df`; the recorded output includes the `sum` and `sub-1` columns.
df

integer,date,float,sum,sub-1
i64,datetime[μs],f64,i64,f64
1,2022-01-01 00:00:00,4.0,28,3.0
2,2022-01-02 00:00:00,5.0,28,4.0
3,2022-01-03 00:00:00,6.0,28,5.0
4,2022-01-04 00:00:00,7.0,28,6.0
5,2022-01-05 00:00:00,,28,
6,2022-01-06 00:00:00,,28,
7,2022-01-07 00:00:00,9.0,28,8.0


In [None]:
# Add the product of `integer` and `sum` as a new column, then keep every column
# except `date` and `float`. The result is bound to `df_x`; `df` is not reassigned.
df_x = df.with_columns((pl.col("integer") * pl.col("sum")).alias("integer * sum")).select(
    [pl.all().exclude(["date", "float"])]
)

df_x

integer,sum,sub-1,integer * sum
i64,i64,f64,i64
1,28,3.0,28
2,28,4.0,56
3,28,5.0,84
4,28,6.0,112
5,28,,140
6,28,,168
7,28,8.0,196


# Combine DF 

In [None]:
# Join: build two frames that share an integer key and combine them.
# Column `b` is random and no seed is set in this cell, so its values differ between runs.
df = pl.DataFrame(
    {
        "a": np.arange(0, 8),
        "b": np.random.rand(8),
        # Use `np.nan`: the `np.NaN` alias was removed in NumPy 2.0, and `np.nan`
        # is the spelling already used elsewhere in this notebook.
        "d": [1, 2.0, np.nan, np.nan, 0, -5, -42, None],
    }
)

df2 = pl.DataFrame(
    {
        "x": np.arange(0, 8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)

# Match rows of `df` to rows of `df2` where `a` equals `x`.
df_joined = df.join(df2, left_on="a", right_on="x")

df_joined

a,b,d,y
i64,f64,f64,str
0,0.748164,1.0,"""A"""
1,0.200531,2.0,"""A"""
2,0.035221,,"""A"""
3,0.267183,,"""B"""
4,0.804834,0.0,"""B"""
5,0.418853,-5.0,"""C"""
6,0.322342,-42.0,"""X"""
7,0.085542,,"""X"""


# Polars & Pandas 

In [None]:
import pandas as pd 

In [None]:
import os 
# Kaggle credentials are set as environment variables, presumably for the `kaggle`
# CLI call below. The values are blank in this copy of the notebook; supply your
# own locally and do not commit real values.
os.environ['KAGGLE_USERNAME'] = '' # `username` field of kaggle.json
os.environ['KAGGLE_KEY'] = '' # `key` field of kaggle.json

# Download the Open Food Facts dataset archive with the Kaggle CLI.
!kaggle datasets download -d openfoodfacts/world-food-facts

Downloading world-food-facts.zip to /content
 89% 97.0M/109M [00:01<00:00, 102MB/s]
100% 109M/109M [00:01<00:00, 95.5MB/s]


In [None]:
!unzip world-food-facts.zip 

Archive:  world-food-facts.zip
  inflating: en.openfoodfacts.org.products.tsv  


In [None]:
!pip install ipython-autotime
%load_ext autotime

In [None]:
%time
pd_df = pd.read_csv("en.openfoodfacts.org.products.tsv", sep="\t")

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.63 µs


  pd_df = pd.read_csv("en.openfoodfacts.org.products.tsv", sep="\t")


time: 25.2 s (started: 2023-05-08 04:33:32 +00:00)


In [None]:
# The file is tab-separated. Without an explicit separator Polars splits on commas,
# so it would not parse the same table that the pandas cell reads with sep="\t".
# NOTE(review): with the correct separator the reader may need further options
# (e.g. `infer_schema_length`, `ignore_errors`) on mixed-type columns — confirm on this dataset.
pl_df = pl.read_csv("en.openfoodfacts.org.products.tsv", separator="\t")

time: 4.84 s (started: 2023-05-08 04:34:40 +00:00)


In [None]:
# Download the files of the `fake-news` competition with the Kaggle CLI.
!kaggle competitions download -c fake-news

Downloading fake-news.zip to /content
 82% 38.0M/46.5M [00:00<00:00, 110MB/s] 
100% 46.5M/46.5M [00:00<00:00, 94.0MB/s]
time: 1.51 s (started: 2023-05-08 04:47:23 +00:00)


In [None]:
!unzip fake-news.zip

Archive:  fake-news.zip
  inflating: submit.csv              
  inflating: test.csv                
  inflating: train.csv               
time: 1.63 s (started: 2023-05-08 04:47:38 +00:00)


In [None]:
# Load the fake-news training set with pandas (rebinds `pd_df`).
pd_df = pd.read_csv("train.csv")

time: 1.45 s (started: 2023-05-08 04:48:01 +00:00)


In [None]:
# Load the same file with Polars (rebinds `pl_df`).
pl_df = pl.read_csv("train.csv")

time: 1.05 s (started: 2023-05-08 04:48:15 +00:00)


In [None]:
# pandas summary statistics; the recorded output covers only the `id` and `label` columns.
pd_df.describe()

Unnamed: 0,id,label
count,20800.0,20800.0
mean,10399.5,0.500625
std,6004.587135,0.500012
min,0.0,0.0
25%,5199.75,0.0
50%,10399.5,1.0
75%,15599.25,1.0
max,20799.0,1.0


time: 56.1 ms (started: 2023-05-08 04:48:25 +00:00)


In [None]:
# Polars summary statistics; the recorded output also covers the string columns
# (`title`, `author`, `text`) and includes a null_count row.
pl_df.describe()

describe,id,title,author,text,label
str,f64,str,str,str,f64
"""count""",20800.0,"""20800""","""20800""","""20800""",20800.0
"""null_count""",0.0,"""558""","""0""","""39""",0.0
"""mean""",10399.5,,,,0.500625
"""std""",6004.587135,,,,0.500012
"""min""",0.0,"""""Allahu Akbar,…","""# 1 NWO Hatr""",""" """,0.0
"""max""",20799.0,"""🚨Bill Clinton …","""“Shoot First A…","""🚨Bill Clinton …",1.0
"""median""",10399.5,,,,1.0
"""25%""",5200.0,,,,0.0
"""75%""",15600.0,,,,1.0


time: 36.3 ms (started: 2023-05-08 04:48:37 +00:00)
