# Data Modelling

In [3]:
// Add dependencies from Cargo.toml
:dep polars = { version = "0.39.2", features = ["lazy", "temporal", "describe", "json", "parquet", "dtype-datetime"] }


In [4]:
// Import libraries
use polars::prelude::*;

##### Reading CSV

In [5]:
let dataset = LazyCsvReader::new("./data/amazon_prime_titles.csv")
        .has_header(true)
        .finish()
        .unwrap();

let df = dataset.clone().collect().unwrap();

df

shape: (9_668, 12)
┌─────────┬─────────┬─────────────┬─────────────┬───┬────────┬───────────┬────────────┬────────────┐
│ show_id ┆ type    ┆ title       ┆ director    ┆ … ┆ rating ┆ duration  ┆ listed_in  ┆ descriptio │
│ ---     ┆ ---     ┆ ---         ┆ ---         ┆   ┆ ---    ┆ ---       ┆ ---        ┆ n          │
│ str     ┆ str     ┆ str         ┆ str         ┆   ┆ str    ┆ str       ┆ str        ┆ ---        │
│         ┆         ┆             ┆             ┆   ┆        ┆           ┆            ┆ str        │
╞═════════╪═════════╪═════════════╪═════════════╪═══╪════════╪═══════════╪════════════╪════════════╡
│ s1      ┆ Movie   ┆ The Grand   ┆ Don         ┆ … ┆ null   ┆ 113 min   ┆ Comedy,    ┆ A small    │
│         ┆         ┆ Seduction   ┆ McKellar    ┆   ┆        ┆           ┆ Drama      ┆ fishing    │
│         ┆         ┆             ┆             ┆   ┆        ┆           ┆            ┆ village    │
│         ┆         ┆             ┆             ┆   ┆        ┆          

##### Schema

In [37]:
df.schema()

Schema:
name: show_id, data type: String
name: type, data type: String
name: title, data type: String
name: director, data type: String
name: cast, data type: String
name: country, data type: String
name: date_added, data type: String
name: release_year, data type: Int64
name: rating, data type: String
name: duration, data type: String
name: listed_in, data type: String
name: description, data type: String


##### Selecting Columns

In [6]:
let df_types = df.clone().lazy().select([
    col("show_id"),
    col("type"),
]).collect().unwrap();

println!("{:?}", df_types);

shape: (9_668, 2)


┌─────────┬─────────┐
│ show_id ┆ type    │
│ ---     ┆ ---     │
│ str     ┆ str     │
╞═════════╪═════════╡
│ s1      ┆ Movie   │
│ s2      ┆ Movie   │
│ s3      ┆ Movie   │
│ s4      ┆ Movie   │
│ s5      ┆ Movie   │
│ …       ┆ …       │
│ s9664   ┆ Movie   │
│ s9665   ┆ TV Show │
│ s9666   ┆ Movie   │
│ s9667   ┆ TV Show │
│ s9668   ┆ Movie   │
└─────────┴─────────┘


##### GroupBy Aggregation

In [7]:
let df_types_grouped = df.clone().lazy()
    .group_by(["type"])
    .agg(vec![
        col("type").count().alias("unique")
    ])
    .collect()
    .unwrap();

println!("{:?}", df_types_grouped);

shape: (2, 2)
┌─────────┬────────┐
│ type    ┆ unique │
│ ---     ┆ ---    │
│ str     ┆ u32    │
╞═════════╪════════╡
│ TV Show ┆ 1854   │
│ Movie   ┆ 7814   │
└─────────┴────────┘


In [9]:
let df_listed_in = df
        .clone()
        .lazy()
        .select([
            // Split "listed_in" column by comma and space, then explode
            col("listed_in")
                .str
                .split(", ")
                .list() // Create a List of Strings
        ])
        .explode("listed_in") // Explode the list into individual rows
        .group_by("listed_in")
        .agg([col("listed_in").count().alias("count")])
        .collect()?;

    println!("{:?}", df_listed_in);

Error: no field `str` on type `Expr`

Error: the trait bound `Expr: From<u8>` is not satisfied

Error: the trait bound `Expr: From<u8>` is not satisfied

In [11]:
let df_listed_in = df
    .clone()
    .lazy()
    .select(col("listed_in").str.split(", "))
    .explode("listed_in")
    .group_by("listed_in")
    .agg(col("listed_in").count().alias("count"))
    .collect()
    .unwrap();

println!("{:?}", df_listed_in);

Error: no method named `str` found for enum `Expr` in the current scope

Error: the trait bound `Expr: From<u8>` is not satisfied

Error: the trait bound `Expr: From<u8>` is not satisfied