In [1]:
import polars as pl
from datetime import date

In [2]:
pl.__version__

'1.4.1'

In [3]:
!python --version

Python 3.12.1


In [4]:
series = pl.Series("age", [24,20,55,44,77,16])

In [5]:
series

age
i64
24
20
55
44
77
16


In [6]:
series.max(), series.sum(), series.min()

(77, 236, 16)

In [7]:
dir(series)

['__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_ufunc__',
 '__arrow_c_stream__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rmatmul__',
 '__rmod__',
 '__rmul__',
 '__ror__',
 '__rpow__',
 '__rsub__',
 '__rtruediv__',
 '__rxor__',
 '__setattr__',
 '__setitem__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__truediv__',
 '__weakref__',
 '__xor__',
 '_accessors',
 '_arithmetic',
 '_comp',
 '_export_arr

In [8]:
series.sort()

age
i64
16
20
24
44
55
77


In [9]:
series.sort(descending=True)

age
i64
77
55
44
24
20
16


In [10]:
series.to_pandas()

0    24
1    20
2    55
3    44
4    77
5    16
Name: age, dtype: int64

In [11]:
df = pl.DataFrame({
    "Employee ID": [1,2,3,4,5],
    "Name" : ["Alice", "Bob", "Charlie", "David", "Eve"],
    "Age": [29,34,31,28,42],
    "Department": ["HR", "Engineering", "Finance", "HR", "Finance"],
    "Salary": [50000., 75000., 60000., 58000., 85000.],
    "Start Date": [date(2023,5,21), date(2019,8,14), date(2021,1,10), date(2018,11,3), date(2022,6,27)]
    })

In [12]:
df

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
1,"""Alice""",29,"""HR""",50000.0,2023-05-21
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14
3,"""Charlie""",31,"""Finance""",60000.0,2021-01-10
4,"""David""",28,"""HR""",58000.0,2018-11-03
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27


In [13]:
df.head(2)

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
1,"""Alice""",29,"""HR""",50000.0,2023-05-21
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14


In [14]:
df.sample(3)

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14
3,"""Charlie""",31,"""Finance""",60000.0,2021-01-10
1,"""Alice""",29,"""HR""",50000.0,2023-05-21


In [15]:
df.describe()

statistic,Employee ID,Name,Age,Department,Salary,Start Date
str,f64,str,f64,str,f64,str
"""count""",5.0,"""5""",5.0,"""5""",5.0,"""5"""
"""null_count""",0.0,"""0""",0.0,"""0""",0.0,"""0"""
"""mean""",3.0,,32.8,,65600.0,"""2021-01-26 00:00:00"""
"""std""",1.581139,,5.630275,,14117.365193,
"""min""",1.0,"""Alice""",28.0,"""Engineering""",50000.0,"""2018-11-03"""
"""25%""",2.0,,29.0,,58000.0,"""2019-08-14"""
"""50%""",3.0,,31.0,,60000.0,"""2021-01-10"""
"""75%""",4.0,,34.0,,75000.0,"""2022-06-27"""
"""max""",5.0,"""Eve""",42.0,"""HR""",85000.0,"""2023-05-21"""


In [16]:
df.schema

Schema([('Employee ID', Int64),
        ('Name', String),
        ('Age', Int64),
        ('Department', String),
        ('Salary', Float64),
        ('Start Date', Date)])

In [17]:
df_selected = df.select(["Name", "Age"])

In [18]:
df_selected

Name,Age
str,i64
"""Alice""",29
"""Bob""",34
"""Charlie""",31
"""David""",28
"""Eve""",42


In [19]:
df["Name", "Age", "Salary"]

Name,Age,Salary
str,i64,f64
"""Alice""",29,50000.0
"""Bob""",34,75000.0
"""Charlie""",31,60000.0
"""David""",28,58000.0
"""Eve""",42,85000.0


In [20]:
#Empleados que tienen más de 30 años
df.filter(pl.col("Age") > 30)

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14
3,"""Charlie""",31,"""Finance""",60000.0,2021-01-10
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27


In [21]:
df.filter(pl.col("Department") ==  "Finance")

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
3,"""Charlie""",31,"""Finance""",60000.0,2021-01-10
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27


In [22]:
df.filter(pl.col("Department") !=  "Finance")

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
1,"""Alice""",29,"""HR""",50000.0,2023-05-21
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14
4,"""David""",28,"""HR""",58000.0,2018-11-03


In [23]:
df.filter(pl.col("Department") ==  "Finance", pl.col("Salary") > 62000)

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27


In [24]:
# Calculemos sueldo mensual
df.with_columns((pl.col("Salary") /12).alias("Monthly Salary"))

Employee ID,Name,Age,Department,Salary,Start Date,Monthly Salary
i64,str,i64,str,f64,date,f64
1,"""Alice""",29,"""HR""",50000.0,2023-05-21,4166.666667
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14,6250.0
3,"""Charlie""",31,"""Finance""",60000.0,2021-01-10,5000.0
4,"""David""",28,"""HR""",58000.0,2018-11-03,4833.333333
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27,7083.333333


In [25]:
# Calculemos sueldo mensual redondeado
df.with_columns((pl.col("Salary") / 12).round(1).alias("Monthly Salary"))

Employee ID,Name,Age,Department,Salary,Start Date,Monthly Salary
i64,str,i64,str,f64,date,f64
1,"""Alice""",29,"""HR""",50000.0,2023-05-21,4166.7
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14,6250.0
3,"""Charlie""",31,"""Finance""",60000.0,2021-01-10,5000.0
4,"""David""",28,"""HR""",58000.0,2018-11-03,4833.3
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27,7083.3


In [26]:
# Esto no cambia el df
df

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
1,"""Alice""",29,"""HR""",50000.0,2023-05-21
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14
3,"""Charlie""",31,"""Finance""",60000.0,2021-01-10
4,"""David""",28,"""HR""",58000.0,2018-11-03
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27


In [27]:
df = df.with_columns((pl.col("Salary") / 12).round(1).alias("Monthly Salary"))
df

Employee ID,Name,Age,Department,Salary,Start Date,Monthly Salary
i64,str,i64,str,f64,date,f64
1,"""Alice""",29,"""HR""",50000.0,2023-05-21,4166.7
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14,6250.0
3,"""Charlie""",31,"""Finance""",60000.0,2021-01-10,5000.0
4,"""David""",28,"""HR""",58000.0,2018-11-03,4833.3
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27,7083.3


In [28]:
# Borremos esa columna
df.drop("Monthly Salary")

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
1,"""Alice""",29,"""HR""",50000.0,2023-05-21
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14
3,"""Charlie""",31,"""Finance""",60000.0,2021-01-10
4,"""David""",28,"""HR""",58000.0,2018-11-03
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27


In [29]:
# No es inplace 
df

Employee ID,Name,Age,Department,Salary,Start Date,Monthly Salary
i64,str,i64,str,f64,date,f64
1,"""Alice""",29,"""HR""",50000.0,2023-05-21,4166.7
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14,6250.0
3,"""Charlie""",31,"""Finance""",60000.0,2021-01-10,5000.0
4,"""David""",28,"""HR""",58000.0,2018-11-03,4833.3
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27,7083.3


In [30]:
df = df.drop("Monthly Salary")

In [31]:
df

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
1,"""Alice""",29,"""HR""",50000.0,2023-05-21
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14
3,"""Charlie""",31,"""Finance""",60000.0,2021-01-10
4,"""David""",28,"""HR""",58000.0,2018-11-03
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27


In [32]:
df.sort("Salary")

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
1,"""Alice""",29,"""HR""",50000.0,2023-05-21
4,"""David""",28,"""HR""",58000.0,2018-11-03
3,"""Charlie""",31,"""Finance""",60000.0,2021-01-10
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27


In [33]:
df.sort("Start Date", descending=True)

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
1,"""Alice""",29,"""HR""",50000.0,2023-05-21
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27
3,"""Charlie""",31,"""Finance""",60000.0,2021-01-10
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14
4,"""David""",28,"""HR""",58000.0,2018-11-03


In [34]:
df["Salary"].max()

85000.0

In [35]:
df["Age"].median()

31.0

In [36]:
df.filter(pl.col("Age") > 30)["Salary"].max()

85000.0

In [37]:
type(df.filter(pl.col("Age") > 30)["Salary"].max())

float

In [38]:
df.group_by("Department")

<polars.dataframe.group_by.GroupBy at 0x1162591f0>

In [39]:
df.group_by("Department").agg(
    [pl.col("Salary").mean().alias("Avg. Salary")]
)

Department,Avg. Salary
str,f64
"""Finance""",72500.0
"""Engineering""",75000.0
"""HR""",54000.0


In [40]:
df.group_by("Department").agg([
    pl.col("Salary").mean().alias("Avg. Salary"),
    pl.col("Salary").max().alias("Max Salary")
]
)

Department,Avg. Salary,Max Salary
str,f64,f64
"""Engineering""",75000.0,75000.0
"""Finance""",72500.0,85000.0
"""HR""",54000.0,58000.0


In [41]:
df.group_by("Department").agg([
    pl.col("Salary").mean().alias("Avg. Salary"),
    pl.col("Age").min().alias("Youngest Age")
]
)

Department,Avg. Salary,Youngest Age
str,f64,i64
"""Engineering""",75000.0,34
"""Finance""",72500.0,31
"""HR""",54000.0,28


In [42]:
df = pl.DataFrame({
    "Employee ID": [1,2,3,4,5],
    "Name" : ["Alice", "Bob", "Charlie", "David", "Eve"],
    "Age": [29,34,31,28,42],
    "Department": ["HR", "Engineering", "Finance", "HR", "Finance"],
    "Salary": [50000., 75000., None, None, 85000.],
    "Start Date": [date(2023,5,21), date(2019,8,14), date(2021,1,10), date(2018,11,3), date(2022,6,27)]
    })

In [43]:
df

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
1,"""Alice""",29,"""HR""",50000.0,2023-05-21
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14
3,"""Charlie""",31,"""Finance""",,2021-01-10
4,"""David""",28,"""HR""",,2018-11-03
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27


In [44]:
df.null_count()

Employee ID,Name,Age,Department,Salary,Start Date
u32,u32,u32,u32,u32,u32
0,0,0,0,2,0


In [45]:
df.select(pl.col("Salary").is_null())

Salary
bool
False
False
True
True
False


In [46]:
df.filter(pl.col("Salary").is_null())

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
3,"""Charlie""",31,"""Finance""",,2021-01-10
4,"""David""",28,"""HR""",,2018-11-03


In [47]:
df.filter(pl.col("Salary").is_not_null())

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
1,"""Alice""",29,"""HR""",50000.0,2023-05-21
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27


In [48]:
df.with_columns(pl.col("Salary").fill_null(pl.lit(50)))

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
1,"""Alice""",29,"""HR""",50000.0,2023-05-21
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14
3,"""Charlie""",31,"""Finance""",50.0,2021-01-10
4,"""David""",28,"""HR""",50.0,2018-11-03
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27


In [49]:
df.with_columns(pl.col("Salary").fill_null(50))

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
1,"""Alice""",29,"""HR""",50000.0,2023-05-21
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14
3,"""Charlie""",31,"""Finance""",50.0,2021-01-10
4,"""David""",28,"""HR""",50.0,2018-11-03
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27


In [50]:
df.with_columns(pl.col("Salary").fill_null(strategy="forward"))

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
1,"""Alice""",29,"""HR""",50000.0,2023-05-21
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14
3,"""Charlie""",31,"""Finance""",75000.0,2021-01-10
4,"""David""",28,"""HR""",75000.0,2018-11-03
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27


In [51]:
df.with_columns(pl.col("Salary").fill_null(strategy="backward"))

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
1,"""Alice""",29,"""HR""",50000.0,2023-05-21
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14
3,"""Charlie""",31,"""Finance""",85000.0,2021-01-10
4,"""David""",28,"""HR""",85000.0,2018-11-03
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27


In [52]:
df.with_columns(pl.col("Salary").interpolate())

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,date
1,"""Alice""",29,"""HR""",50000.0,2023-05-21
2,"""Bob""",34,"""Engineering""",75000.0,2019-08-14
3,"""Charlie""",31,"""Finance""",78333.333333,2021-01-10
4,"""David""",28,"""HR""",81666.666667,2018-11-03
5,"""Eve""",42,"""Finance""",85000.0,2022-06-27


In [53]:
df.write_csv("employees.csv")
df.write_json("employees.json")

In [54]:
df2= pl.read_csv("employees.csv")
df2 #ojo que start date ahora es string

Employee ID,Name,Age,Department,Salary,Start Date
i64,str,i64,str,f64,str
1,"""Alice""",29,"""HR""",50000.0,"""2023-05-21"""
2,"""Bob""",34,"""Engineering""",75000.0,"""2019-08-14"""
3,"""Charlie""",31,"""Finance""",,"""2021-01-10"""
4,"""David""",28,"""HR""",,"""2018-11-03"""
5,"""Eve""",42,"""Finance""",85000.0,"""2022-06-27"""


In [58]:
df["Age"].plot.bar()

In [59]:
df = pl.DataFrame({
    "Employee ID": [1,2,3,4,5],
    "Name" : ["Alice", "Bob", "Charlie", "David", "Eve"],
    "Age": [29,34,31,28,42],
    "Department": ["HR", "Engineering", "Finance", "HR", "Finance"],
    "Salary": [50000., 75000., 60000., 58000., 85000.],
    "Start Date": [date(2023,5,21), date(2019,8,14), date(2021,1,10), date(2018,11,3), date(2022,6,27)]
    })

In [61]:
df.plot.scatter(x="Age", y="Salary", by="Name")