#### Top

# Chapter 11 Extending Polars

* [11.0 Imports and Setup](#11.0-Imports-and-Setup)
* [11.1 Loans with Polars](#11.1-Loans-with-Polars)
* [11.4 Adding a PCA API to Polars](#11.4-Adding-a-PCA-API-to-Polars)
* [11.5 Calculating the Principal Components](#11.5-Calculating-the-Principal-Components)


---
# 11.0 Imports and Setup

[back to Top](#Top)

In [1]:
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
import chardet
import pprint as pp
import hvplot.polars
# Select matplotlib as the backend used by hvplot.
hvplot.extension('matplotlib')

# Render inline matplotlib figures in the "retina" format.
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")
# Switch pandas to copy-on-write mode and echo the setting to confirm it.
pd.options.mode.copy_on_write = True
print(pd.options.mode.copy_on_write)
# Turn on Polars verbose logging, then print Polars and dependency versions.
pl.Config.set_verbose(True)
pl.show_versions()

def HR():
    """Print a horizontal rule: a line of 40 dashes."""
    rule = "-" * 40
    print(rule)

# NOTE(review): tbl_cols=-1 is set both by this decorator and by the inner
# context manager below; confirm whether both are needed.
@pl.Config(tbl_cols=-1, ascii_tables=True)
def tight_layout(df: pl.DataFrame, n: int = 5) -> None:
    """Print ``df`` with a compact, temporary Polars display configuration.

    Args:
        df: Dataframe to print.
        n: Value passed to the ``fmt_str_lengths`` display option while
            printing.
    """
    # The extra options apply only for the duration of the print call.
    with pl.Config(tbl_cols=-1, fmt_str_lengths=n):
        print(df)

def detect_encoding(filename: str) -> str | None:
    """Return the most probable character encoding for a file.

    The whole file is read as bytes and passed to ``chardet.detect``.

    Args:
        filename: Path of the file to inspect.

    Returns:
        The ``"encoding"`` entry of chardet's detection result. chardet may
        report ``None`` when it cannot identify an encoding.
    """
    with open(filename, "rb") as f:
        raw_data = f.read()
    # The raw bytes are intentionally not printed: echoing an entire file to
    # stdout was debugging residue and floods the notebook output.
    return chardet.detect(raw_data)["encoding"]

True
--------Version info---------
Polars:              1.9.0
Index type:          UInt32
Platform:            macOS-12.7.6-x86_64-i386-64bit
Python:              3.11.5 (main, Jan 16 2024, 17:25:53) [Clang 14.0.0 (clang-1400.0.29.202)]

----Optional dependencies----
adbc_driver_manager  1.1.0
altair               5.4.0
cloudpickle          3.0.0
connectorx           0.3.3
deltalake            0.19.1
fastexcel            0.11.6
fsspec               2023.12.2
gevent               24.2.1
great_tables         0.10.0
matplotlib           3.9.2
nest_asyncio         1.6.0
numpy                2.0.2
openpyxl             3.1.5
pandas               2.2.2
pyarrow              17.0.0
pydantic             2.8.2
pyiceberg            0.6.1
sqlalchemy           2.0.32
torch                <not installed>
xlsx2csv             0.8.3
xlsxwriter           3.2.0


# 11.1 Loans with Polars

[back to Top](#Top)

In [2]:
# Loan inputs; these are later passed to payment_schedule_gen as the
# principal, monthly interest rate, and number of payments.
L = 600_000  # amount borrowed
c = 0.055 / 12  # monthly rate: 0.055 divided across 12 months
n = 360  # number of monthly payments
# Amortization formula: P = L * c(1+c)^n / ((1+c)^n - 1)
P = (L * (c * (1 + c) ** n) / ((1 + c) ** n - 1))
P  # monthly payment 3_406.734

3406.734008082003

* Use pure Python code to solve this.
* Use Python generators to create a sequence of dictionaries.
* Use the `DataFrame` constructor to create a dataframe from the sequence of dictionaries.

In [3]:
def payment_schedule_gen(
    principal, number_of_payments, monthly_interest_rate, monthly_payment):
    """Yield one dict per month describing a loan's amortization schedule.

    Each dict has the keys 'month' (0-based), 'Principal', 'Interest',
    'Remaining Balance' and 'Monthly Payment'. Once the balance drops below
    the regular payment, a final row pays off the balance plus that month's
    interest and the generator stops. At most ``number_of_payments`` rows
    are produced.
    """
    balance = principal
    for month in range(number_of_payments):
        interest = balance * monthly_interest_rate
        is_final = balance < monthly_payment
        if is_final:
            # Last instalment: clear the whole balance plus its interest.
            principal_part = balance
            monthly_payment = balance + interest
            balance = 0
        else:
            principal_part = monthly_payment - interest
            balance -= principal_part
        row = {
            'month': month,
            'Principal': principal_part,
            'Interest': interest,
            'Remaining Balance': balance,
            'Monthly Payment': monthly_payment,
        }
        yield row
        if is_final:
            return

pl.DataFrame(payment_schedule_gen(L, n, c, P))

month,Principal,Interest,Remaining Balance,Monthly Payment
i64,f64,f64,f64,f64
0,656.734008,2750.0,599343.265992,3406.734008
1,659.744039,2746.989969,598683.521953,3406.734008
2,662.767866,2743.966142,598020.754087,3406.734008
3,665.805552,2740.928456,597354.948535,3406.734008
4,668.857161,2737.876847,596686.091375,3406.734008
…,…,…,…,…
355,3329.725121,77.008887,13472.213962,3406.734008
356,3344.986361,61.747647,10127.227601,3406.734008
357,3360.317548,46.41646,6766.910053,3406.734008
358,3375.719004,31.015004,3391.191049,3406.734008


---
# 11.4 Adding a PCA API to Polars
[back to Top](#Top)

* Polars provides a registration API for extending its own API with custom namespaces.
* We can create attributes like `.str` and `.dt` that provide additional functionality.
* Create a PCA API for Polars.
* Create a dataframe attribute called `.pca` with methods for working with principal component analysis (PCA).

In [4]:
@pl.StringCache()
def tweak_auto(df):
    """Select, cast, and derive the vehicle columns used in this chapter.

    Keeps a fixed set of columns, casts them to narrower numeric, categorical
    and datetime types, and adds two columns derived from 'trany':
    'is_automatic' (whether the text contains 'Auto') and 'num_gears' (the
    first run of digits, cast to Int8).
    """
    keep = [
        'year', 'make', 'model', 'displ', 'cylinders',
        'trany', 'drive', 'VClass', 'fuelType', 'barrels08',
        'city08', 'highway08', 'createdOn'
    ]
    small_uints = ['cylinders', 'highway08', 'city08']
    floats = ['displ', 'barrels08']
    categoricals = ['make', 'model', 'VClass', 'drive', 'fuelType']
    trany = pl.col('trany')

    # Type conversions that overwrite existing columns in place.
    casts = [
        pl.col('year').cast(pl.Int16),
        pl.col(small_uints).cast(pl.UInt8),
        pl.col(floats).cast(pl.Float32),
        pl.col(categoricals).cast(pl.Categorical),
        pl.col('createdOn').str.to_datetime('%a %b %d %H:%M:%S %Z %Y'),
    ]
    # New columns parsed out of the transmission description.
    derived = {
        'is_automatic': trany.str.contains('Auto'),
        'num_gears': trany.str.extract(r'(\d+)').cast(pl.Int8),
    }
    return df.select(keep).with_columns(*casts, **derived)


In [5]:
# Load the vehicles CSV, treating the literal string 'NA' as null.
raw = pl.read_csv(
    './data/vehicles.csv',
    null_values='NA'
)
# NOTE(review): only one table appears in the recorded output, so this
# intermediate head() result is presumably not displayed.
raw.head(1)

# Select, cast, and derive columns, then preview the first cleaned row.
autos = tweak_auto(raw)
autos.head(1)

avg line length: 434.78027
std. dev. line length: 23.885818
initial row estimate: 47850
no. of chunks: 4 processed by: 4 threads.


year,make,model,displ,cylinders,trany,drive,VClass,fuelType,barrels08,city08,highway08,createdOn,is_automatic,num_gears
i16,cat,cat,f32,u8,str,cat,cat,cat,f32,u8,u8,datetime[μs],bool,i8
1985,"""Alfa Romeo""","""Spider Veloce 2000""",2.0,4,"""Manual 5-spd""","""Rear-Wheel Drive""","""Two Seaters""","""Regular""",14.167143,19,25,2013-01-01 00:00:00,False,5


---
# 11.5 Calculating the Principal Components

[back to Top](#Top)

Enumerated steps to calculate the principal components:

1. Standardize the data (optional but recommended)
2. Center the data (required if you don't standardize the data)
3. Calculate the covariance matrix
4. Calculate the eigenvalues and eigenvectors of the covariance matrix
5. Calculate explained variance from sorted eigenvalues
6. Calculate weights of the principal components from the eigenvectors
7. Calculate the principal components from the dot product of the centered data and the eigenvectors


In [6]:
import numpy as np
import polars.selectors as cs
# Step 1: keep the numeric columns, replace nulls with 0, then standardize
# each column (subtract its mean, divide by its standard deviation).
X = (autos
     .select(cs.numeric().fill_null(0))
     .select((pl.all() - pl.all().mean()) / pl.all().std()) # 1
)
num_df = X
# Step 2: center each column. X was already mean-centered by step 1, so this
# changes little; it is kept to mirror the enumerated steps.
centered = (num_df # 2
            .select((pl.all() - pl.all().mean()))
           )
# Step 3: covariance matrix. The frame is transposed so that each row handed
# to np.cov is one variable.
cov = np.cov(centered.transpose()) # 3
# Step 4: eigenvalues and eigenvectors of the covariance matrix.
vals, vecs = np.linalg.eig(cov) # 4

# Step 5: eigenvalues sorted from largest to smallest, labelled PC1, PC2, ...
exp_var = pl.DataFrame( # 5
    {'PC': [f'PC{i+1}' for i in range(len(num_df.columns))],
     'var':sorted(vals, reverse=True)})

# Column order that sorts the eigenvectors by descending eigenvalue.
idxs = np.argsort(vals)[::-1]
# Step 6: reordered eigenvectors as a frame; the default column_<i> names are
# renamed to PC<i+1>.
comps = (pl.DataFrame(vecs[:, idxs]) # 6
         .rename(mapping={f'column_{i}': f'PC{i+1}' 
                          for i in range(len(num_df.columns))})
)

# Step 7: project the centered data onto the components.
pcas = (pl.DataFrame(np.dot(centered, comps)) # 7
    .rename(mapping={f'column_{i}': f'PC{i+1}' 
                     for i in range(len(num_df.columns))})
)

In [7]:
import polars as pl
import numpy as np

@pl.api.register_dataframe_namespace('pca')
class PCA:
    """Principal component analysis helpers exposed as ``DataFrame.pca``.

    The wrapped frame is used as-is: ``fit`` only centers the columns, so any
    standardization must be done beforehand. Assumes every column is numeric.
    """

    def __init__(self, df):
        # Dataframe the namespace was accessed on.
        self.df = df

    def fit(self):
        """Compute the principal components of the wrapped dataframe.

        Stores the explained variance, the component vectors, and the
        projected data on this namespace object for the accessor methods.

        Returns:
            The wrapped dataframe (``self.df``), not the projected data; use
            ``transform()`` to get the projection.
        """
        # Size everything from the wrapped frame itself rather than from a
        # notebook global, so the namespace works on any dataframe.
        n_features = len(self.df.columns)
        pc_names = [f'PC{i+1}' for i in range(n_features)]
        rename_map = {f'column_{i}': name for i, name in enumerate(pc_names)}

        centered = self.df.select(pl.all() - pl.all().mean())
        data = centered.to_numpy()
        # rowvar=False treats columns as variables, which avoids transposing
        # a long frame into one with a column per observation. atleast_2d
        # keeps the single-column case decomposable.
        cov = np.atleast_2d(np.cov(data, rowvar=False))
        vals, vecs = np.linalg.eig(cov)

        # Order components from largest to smallest eigenvalue.
        idxs = np.argsort(vals)[::-1]
        sorted_vecs = vecs[:, idxs]

        self._explained_variance = pl.DataFrame(
            {'PC': pc_names, 'var': vals[idxs]}
        )
        comps = pl.DataFrame(sorted_vecs).rename(mapping=rename_map)
        # Projection of the centered data onto the sorted components.
        self.pcs = pl.DataFrame(data @ sorted_vecs).rename(mapping=rename_map)
        self._components = comps
        return self.df

    def transform(self):
        """Return the projected data computed by ``fit()`` (call it first)."""
        return self.pcs

    def explained_variance(self):
        """Return the 'PC'/'var' frame computed by ``fit()`` (call it first)."""
        return self._explained_variance

    def components(self):
        """Return the component vectors with a 'Feature' column appended.

        'Feature' holds the wrapped dataframe's column names, one per row.
        Requires ``fit()`` to have been called.
        """
        return (self._components
                .with_columns(Feature=pl.Series(self.df.columns))
               )

    def filter_components(self, limit_components, mag_threshold):
        """Keep the first components and the features that load on them.

        Args:
            limit_components: Number of leading component columns to keep.
            mag_threshold: A feature row is kept when the absolute value of
                any retained component exceeds this threshold.

        Returns:
            Frame with the retained component columns plus 'Feature'.
        """
        comps = self.components()
        columns = comps.columns[:limit_components]
        res = (comps
           .select(*columns, pl.col('Feature'))
           .filter(pl.any_horizontal(cs.numeric().abs() > mag_threshold))
        )
        return res

    def component_plot(self, limit_components=3, mag_threshold=.1):
        """Bar-plot the filtered component weights, one group per component.

        Args:
            limit_components: Number of leading components to include.
            mag_threshold: Minimum absolute weight for a feature to be shown.
        """
        comps = self.filter_components(limit_components, mag_threshold)
        loadings = comps.select(cs.numeric())
        features = comps['Feature'].to_list()
        # Label rows with the components actually retained, so the labels
        # still line up when fewer than ``limit_components`` exist.
        wide = (loadings
                .transpose()
                .rename(mapping={f'column_{i}': col
                                 for i, col in enumerate(features)})
                .select(pl.Series('PC', loadings.columns), *features)
               )
        # NOTE(review): an earlier draft passed rot=90 here and it was
        # disabled; presumably this plotting backend does not accept it.
        return wide.plot.bar(x='PC')

    def scatter3d_plot(self, x='PC1', y='PC2', z='PC3', color_col=None,
                       size_col=None, symbol_col=None, cmap='viridis',
                       biplot=True, biplot_scale=20, biplot_limit=.2,
                       alpha=1, width=600, height=600):
        """Delegate to ``plot_pca_3d`` with the fitted data and components.

        NOTE(review): ``plot_pca_3d`` is not defined in the visible part of
        this file, and ``z`` is accepted but never forwarded; confirm the
        helper's signature before relying on this method.
        """
        return plot_pca_3d(self.pcs, x, y, color_col, size_col,
                symbol_col, cmap, self._components, biplot, biplot_scale,
                biplot_limit, alpha, width, height)

In [8]:
# Run the fit through the registered `.pca` namespace. fit() returns the
# dataframe stored on the namespace, not the projected data.
fit = (
    X.pca.fit()
)

In [9]:
fit

year,displ,cylinders,barrels08,city08,highway08,num_gears
f64,f32,f64,f32,f64,f64,f64
-1.545048,-0.851517,-0.821913,-0.205892,-0.073411,-0.074076,-0.107017
-1.545048,1.197237,3.327206,2.618402,-0.863424,-1.066215,-0.107017
-1.545048,-0.710223,-0.821913,-0.896275,0.242594,0.64748,-0.107017
-1.545048,1.409177,1.252646,2.618402,-0.784423,-1.246604,-1.152933
-0.914205,-0.710223,-0.821913,0.121132,-0.231413,-0.254465,-0.107017
…,…,…,…,…,…,…
-0.914205,-0.710223,-0.821913,-0.347107,-0.073411,0.016118,-0.629975
-0.914205,-0.710223,-0.821913,-0.476042,0.005591,0.196507,-0.107017
-0.914205,-0.710223,-0.821913,-0.205892,-0.152412,-0.164271,-0.629975
-0.914205,-0.710223,-0.821913,-0.205892,-0.152412,-0.164271,-0.107017


In [10]:
# NOTE: this import rebinds the module-level name `PCA`, which earlier in the
# file referred to the class registered as the `.pca` namespace.
from sklearn.decomposition import PCA
import sklearn

# Ask scikit-learn transformers to return Polars dataframes.
sklearn.set_config(transform_output='polars')
pca = PCA()
# Fit scikit-learn's PCA on X and return the projected data.
pca.fit_transform(X)

pca0,pca1,pca2,pca3,pca4,pca5,pca6
f64,f64,f64,f64,f64,f64,f64
0.557681,-1.320249,-1.134497,0.712835,-0.02666,-0.021445,0.045411
-4.197779,-0.959774,1.196523,1.00051,-0.023678,1.632437,0.005648
1.277504,-1.189073,-0.716832,1.00654,0.461666,-0.213793,-0.168074
-3.324079,-1.946352,0.797528,0.087553,-0.877934,0.210463,-0.011611
0.279791,-0.913741,-1.040788,0.227641,-0.332086,-0.040137,-0.000154
…,…,…,…,…,…,…
0.741085,-1.203261,-0.726808,0.014824,0.049644,-0.095235,-0.04101
0.860223,-0.820595,-0.825586,0.407634,0.089278,-0.13224,-0.076606
0.560476,-1.229398,-0.817225,-0.05678,-0.042933,-0.080308,0.016647
0.504627,-0.871472,-1.007932,0.263912,-0.085774,-0.105242,0.040829


---
---
---

In [11]:
import polars as pl
import numpy as np

@pl.api.register_dataframe_namespace('pca2')
class PCA:
    """Principal component analysis helpers exposed as ``DataFrame.pca2``.

    The wrapped frame is used as-is: ``fit`` only centers the columns, so any
    standardization must be done beforehand. Assumes every column is numeric.
    Plots are drawn through the ``hvplot`` dataframe accessor.
    """

    def __init__(self, df):
        # Dataframe the namespace was accessed on.
        self.df = df

    def fit(self):
        """Compute the principal components of the wrapped dataframe.

        Stores the explained variance, the component vectors, and the
        projected data on this namespace object for the accessor methods.

        Returns:
            The wrapped dataframe (``self.df``), not the projected data; use
            ``transform()`` to get the projection.
        """
        # Size everything from the wrapped frame itself rather than from a
        # notebook global, so the namespace works on any dataframe.
        n_features = len(self.df.columns)
        pc_names = [f'PC{i+1}' for i in range(n_features)]
        rename_map = {f'column_{i}': name for i, name in enumerate(pc_names)}

        centered = self.df.select(pl.all() - pl.all().mean())
        data = centered.to_numpy()
        # rowvar=False treats columns as variables, which avoids transposing
        # a long frame into one with a column per observation. atleast_2d
        # keeps the single-column case decomposable.
        cov = np.atleast_2d(np.cov(data, rowvar=False))
        vals, vecs = np.linalg.eig(cov)

        # Order components from largest to smallest eigenvalue.
        idxs = np.argsort(vals)[::-1]
        sorted_vecs = vecs[:, idxs]

        self._explained_variance = pl.DataFrame(
            {'PC': pc_names, 'var': vals[idxs]}
        )
        comps = pl.DataFrame(sorted_vecs).rename(mapping=rename_map)
        # Projection of the centered data onto the sorted components.
        self.pcs = pl.DataFrame(data @ sorted_vecs).rename(mapping=rename_map)
        self._components = comps
        return self.df

    def transform(self):
        """Return the projected data computed by ``fit()`` (call it first)."""
        return self.pcs

    def explained_variance(self):
        """Return the 'PC'/'var' frame computed by ``fit()`` (call it first)."""
        return self._explained_variance

    def components(self):
        """Return the component vectors with a 'Feature' column appended.

        'Feature' holds the wrapped dataframe's column names, one per row.
        Requires ``fit()`` to have been called.
        """
        return (self._components
                .with_columns(Feature=pl.Series(self.df.columns))
               )

    def filter_components(self, limit_components, mag_threshold):
        """Keep the first components and the features that load on them.

        Args:
            limit_components: Number of leading component columns to keep.
            mag_threshold: A feature row is kept when the absolute value of
                any retained component exceeds this threshold.

        Returns:
            Frame with the retained component columns plus 'Feature'.
        """
        comps = self.components()
        columns = comps.columns[:limit_components]
        res = (comps
           .select(*columns, pl.col('Feature'))
           .filter(pl.any_horizontal(cs.numeric().abs() > mag_threshold))
        )
        return res

    def component_plot(self, limit_components=3, mag_threshold=.1):
        """Bar-plot the filtered component weights, one group per component.

        Args:
            limit_components: Number of leading components to include.
            mag_threshold: Minimum absolute weight for a feature to be shown.
        """
        comps = self.filter_components(limit_components, mag_threshold)
        loadings = comps.select(cs.numeric())
        features = comps['Feature'].to_list()
        # Label rows with the components actually retained, so the labels
        # still line up when fewer than ``limit_components`` exist.
        wide = (loadings
                .transpose()
                .rename(mapping={f'column_{i}': col
                                 for i, col in enumerate(features)})
                .select(pl.Series('PC', loadings.columns), *features)
               )
        return wide.hvplot.bar(x='PC', rot=90)

    def scatter3d_plot(self, x='PC1', y='PC2', z='PC3', color_col=None,
                       size_col=None, symbol_col=None, cmap='viridis',
                       biplot=True, biplot_scale=20, biplot_limit=.2,
                       alpha=1, width=600, height=600):
        """Delegate to ``plot_pca_3d`` with the fitted data and components.

        NOTE(review): ``plot_pca_3d`` is not defined in the visible part of
        this file, and ``z`` is accepted but never forwarded; confirm the
        helper's signature before relying on this method.
        """
        return plot_pca_3d(self.pcs, x, y, color_col, size_col,
                symbol_col, cmap, self._components, biplot, biplot_scale,
                biplot_limit, alpha, width, height)

In [12]:
# Run the fit through the `.pca2` namespace. fit() returns the dataframe
# stored on the namespace, not the projected data.
fit = (X.pca2.fit())

In [13]:
# Try to express each component's variance as a share of the total.
# pl.all() also selects the string-valued 'PC' label column, so the
# division/sum cannot be evaluated; the exception is caught and only its
# message is printed. Restricting the expression to the numeric 'var' column
# would avoid the error.
# The expression's result is neither assigned nor printed.
try:
    (
        fit
        .pca2
        .explained_variance()
        .select(
            pl.all()/pl.all().sum()
        ) 
    )
except Exception as e:
    print(e)

`sum` operation not supported for dtype `str`

Hint: you may mean to call `str.concat` or `list.join`


In [14]:
# Build the component bar chart; any exception is printed instead of raised.
# The returned plot object is neither assigned nor displayed here.
try:
    fit.pca2.component_plot()
except Exception as e:
    print(e)

dataframe filtered
