In [1]:
# Built-in library
import re
import json
import logging
from typing import Any, Dict, List, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import pandas as pd
import polars as pl
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

### Create A DataFrame

In [2]:
from datetime import datetime


df: pl.DataFrame = pl.DataFrame(
    {
        "integer": [1, 2, 3, 4, 5],
        "date": [
            datetime(2022, 1, 1),
            datetime(2022, 1, 2),
            datetime(2022, 1, 3),
            datetime(2022, 1, 4),
            datetime(2022, 1, 5),
        ],
        "float": [4.0, 5.0, 6.0, 7.0, 8.0],
    }
)

print(df)

#### Viewing Data

```text
- df.head()
- df.tail()
```

In [3]:
df.head(2)

integer,date,float
i64,datetime[μs],f64
1,2022-01-01 00:00:00,4.0
2,2022-01-02 00:00:00,5.0


In [4]:
df.tail(2)

integer,date,float
i64,datetime[μs],f64
4,2022-01-04 00:00:00,7.0
5,2022-01-05 00:00:00,8.0


#### Descriptive Stats

In [5]:
df.describe()

describe,integer,date,float
str,f64,str,f64
"""count""",5.0,"""5""",5.0
"""null_count""",0.0,"""0""",0.0
"""mean""",3.0,,6.0
"""std""",1.581139,,1.581139
"""min""",1.0,"""2022-01-01 00:…",4.0
"""25%""",2.0,,5.0
"""50%""",3.0,,6.0
"""75%""",4.0,,7.0
"""max""",5.0,"""2022-01-05 00:…",8.0


In [6]:
df: pl.DataFrame = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["foo", "ham", "spam", "egg", None],
        "random": np.random.rand(5),
        "groups": ["A", "A", "B", "C", "B"],
    }
)
print(df)

In [7]:
# Select
out: pl.DataFrame = df.select(
    pl.sum("nrs"),  # sum this column
    pl.col("names").sort(),  # select and sort this column
    # select the first row and rename this column
    pl.col("names").first().alias("first name"),
    # calculate the mean, multiply by 10 and rename
    (pl.mean("nrs") * 10).alias("10xnrs"),
)
print(out)

In [9]:
df

nrs,names,random,groups
i64,str,f64,str
1.0,"""foo""",0.938371,"""A"""
2.0,"""ham""",0.11866,"""A"""
3.0,"""spam""",0.575215,"""B"""
,"""egg""",0.859469,"""C"""
5.0,,0.020818,"""B"""


In [8]:
# Add new column(s)
df_1: pl.DataFrame = df.with_columns(
    pl.sum("nrs").alias("nrs_sum"),
    pl.col("random").count().alias("count"),
)
print(df_1)

In [11]:
# Filter
out: pl.DataFrame = df_1.filter(pl.col("nrs") > 2)
print(out)

### Group By

In [14]:
print(df_1)

In [12]:
out: pl.DataFrame = df_1.group_by("groups").agg(
    pl.sum("nrs"),  # sum nrs by groups
    pl.col("random").count().alias("count"),  # count group members
    # sum random where name != null
    pl.col("random").filter(pl.col("names").is_not_null()).sum().name.suffix("_sum"),
    pl.col("names").reverse().alias("reversed names"),
)
print(out)

### Expressions

```text
- Polars has a powerful concept called expressions that is central to its very fast performance.
- Expressions are at the core of many data science operations:
  * taking a sample of rows from a column
  * multiplying values in a column
  * extracting a column of years from dates
  * convert a column of strings to lowercase

- However, expressions are also used within other operations:
  * taking the mean of a group in a group_by operation
  * calculating the size of groups in a group_by operation
  * taking the sum horizontally across columns
  * Polars performs these core data transformations very quickly by:

- Polars expressions are a mapping from a series to a series (or mathematically Fn(Series) -> Series). 
- As expressions have a Series as an input and a Series as an output then it is straightforward to do a sequence of expressions (similar to method chaining in Pandas).
```

In [15]:
# Select column "foo"
# Then sort the column (not in reversed order)
# Then take the first two values of the sorted output
pl.col("foo").sort().head(2)

In [18]:
df_1

nrs,names,random,groups,nrs_sum,count
i64,str,f64,str,i64,u32
1.0,"""foo""",0.938371,"""A""",11,5
2.0,"""ham""",0.11866,"""A""",11,5
3.0,"""spam""",0.575215,"""B""",11,5
,"""egg""",0.859469,"""C""",11,5
5.0,,0.020818,"""B""",11,5


In [19]:
df

nrs,names,random,groups
i64,str,f64,str
1.0,"""foo""",0.938371,"""A"""
2.0,"""ham""",0.11866,"""A"""
3.0,"""spam""",0.575215,"""B"""
,"""egg""",0.859469,"""C"""
5.0,,0.020818,"""B"""


### [Lazy Vs Eager API](https://pola-rs.github.io/polars/user-guide/concepts/lazy-vs-eager/)

In [21]:
# Polars settings
pl.set_option("max_rows", 1000)
pl.set_option("max_columns", 1000)
pl.set_option("max_colwidth", 600)

AttributeError: module 'polars' has no attribute 'set_option'

In [30]:
fp: str = "../../data/AirlineTweets.csv"
df: pl.DataFrame = pl.read_csv(fp)
pl.Config.set_tbl_column_data_type_inline(True)
N: int = 8
with pl.Config(tbl_cols=N):  # display N columns
    print(df.head())

In [31]:
df.describe()

describe,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
str,f64,str,f64,str,f64,str,str,str,str,f64,str,str,str,str,str
"""count""",14640.0,"""14640""",14640.0,"""14640""",14640.0,"""14640""","""14640""","""14640""","""14640""",14640.0,"""14640""","""14640""","""14640""","""14640""","""14640"""
"""null_count""",0.0,"""0""",0.0,"""5462""",4118.0,"""0""","""14600""","""0""","""14608""",0.0,"""0""","""13621""","""0""","""4733""","""4820"""
"""mean""",5.6922e+17,,0.900169,,0.638298,,,,,0.08265,,,,,
"""std""",779110000000000.0,,0.16283,,0.33044,,,,,0.745778,,,,,
"""min""",5.6759e+17,"""negative""",0.335,"""Bad Flight""",0.0,"""American""","""negative""","""0504Traveller""","""Bad Flight""",0.0,"""""LOL you guys …","""[-33.87144962,…","""2015-02-16 23:…",""" || san anton…","""Abu Dhabi"""
"""25%""",5.6856e+17,,0.6923,,0.3606,,,,,0.0,,,,,
"""50%""",5.6948e+17,,1.0,,0.6706,,,,,0.0,,,,,
"""75%""",5.6989e+17,,1.0,,1.0,,,,,0.0,,,,,
"""max""",5.7031e+17,"""positive""",1.0,"""longlines""",1.0,"""Virgin America…","""positive""","""zupshawrl""","""Lost Luggage D…",44.0,"""😳 LOLOLOLOLOL …","""[59.38247253, …","""2015-02-24 11:…","""명동서식 37.56638,…","""West Central A…"


In [32]:
VALUE: float = 0.567
df_small: pl.DataFrame = df.filter(pl.col("airline_sentiment_confidence") > VALUE)
df_agg: pl.DataFrame = df_small.group_by("negativereason").agg(
    pl.col("retweet_count").mean()
)
print(df_agg)