# Feature Engineering

- Create a pipeline containing:
  - vectorized text features
  - numeric features
  - encoded  categorical features
- Use ColumnTransformer to combine the features
- Use a pipeline to combine the transformer and the model
- Perform a grid search to find the best model and hyperparameters.

In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,mlxtend,omegaconf --conda

Python implementation: CPython
Python version       : 3.10.8
IPython version      : 8.26.0

numpy    : 1.26.0
pandas   : 2.2.2
polars   : 1.4.1
mlxtend  : 0.23.1
omegaconf: not installed

conda environment: n/a



In [2]:
# Built-in library
from pathlib import Path
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")


# auto reload imports# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(500)

warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

### Load Data

In [3]:
pl.Config.set_fmt_str_lengths(100)


fp: str = "../data/cleaned_articles.parquet"
articles_df: pl.DataFrame = pl.read_parquet(fp)
print(f"{articles_df.shape = }")

articles_df.head(2)

articles_df.shape = (9335, 9)


articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata,metadata_1,metadata_2
str,i64,str,str,str,str,str,str,str
"""58927e0495d0e0392607e1b3""",1129,"""article""","""2017-02-02 00:26:16""","""The New York Times""","""news""","""ken belson n f l vs politics has been battle all season long football super bowl national football l…","""ken belson n f l vs politics has been battle all season long football super bowl national football l…","""ken belson n f l vs politics has been battle all season long football super bowl national football l…"
"""5893033d95d0e0392607e2d6""",3082,"""article""","""2017-02-02 10:00:24""","""The New York Times""","""news""","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…"


In [4]:
print(articles_df["documentType"].value_counts(sort=True))
print(articles_df["source"].value_counts(sort=True))
print(articles_df["typeOfMaterial"].value_counts(sort=True))

shape: (2, 2)
┌──────────────┬───────┐
│ documentType ┆ count │
│ ---          ┆ ---   │
│ str          ┆ u32   │
╞══════════════╪═══════╡
│ article      ┆ 9168  │
│ blogpost     ┆ 167   │
└──────────────┴───────┘
shape: (2, 2)
┌──────────────────────────────┬───────┐
│ source                       ┆ count │
│ ---                          ┆ ---   │
│ str                          ┆ u32   │
╞══════════════════════════════╪═══════╡
│ The New York Times           ┆ 9284  │
│ International New York Times ┆ 51    │
└──────────────────────────────┴───────┘
shape: (7, 2)
┌────────────────┬───────┐
│ typeOfMaterial ┆ count │
│ ---            ┆ ---   │
│ str            ┆ u32   │
╞════════════════╪═══════╡
│ news           ┆ 6219  │
│ op-ed          ┆ 1693  │
│ review         ┆ 513   │
│ editorial      ┆ 302   │
│ briefing       ┆ 298   │
│ blog           ┆ 167   │
│ other          ┆ 143   │
└────────────────┴───────┘


In [5]:
from polars import selectors as cs


# One-hot encode
ohe_columns: list[str] = ["documentType", "source", "typeOfMaterial"]
articles_temp_df: pl.DataFrame = articles_df.clone()
articles_temp_df = articles_temp_df.to_dummies(columns=ohe_columns)
articles_temp_df.head(3)

articleID,articleWordCount,documentType_article,documentType_blogpost,pubDate,source_International New York Times,source_The New York Times,typeOfMaterial_blog,typeOfMaterial_briefing,typeOfMaterial_editorial,typeOfMaterial_news,typeOfMaterial_op-ed,typeOfMaterial_other,typeOfMaterial_review,metadata,metadata_1,metadata_2
str,i64,u8,u8,str,u8,u8,u8,u8,u8,u8,u8,u8,u8,str,str,str
"""58927e0495d0e0392607e1b3""",1129,1,0,"""2017-02-02 00:26:16""",0,1,0,0,0,1,0,0,0,"""ken belson n f l vs politics has been battle all season long football super bowl national football l…","""ken belson n f l vs politics has been battle all season long football super bowl national football l…","""ken belson n f l vs politics has been battle all season long football super bowl national football l…"
"""5893033d95d0e0392607e2d6""",3082,1,0,"""2017-02-02 10:00:24""",0,1,0,0,0,1,0,0,0,"""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…"
"""5893039595d0e0392607e2da""",693,1,0,"""2017-02-02 10:01:53""",0,1,0,0,0,0,0,0,1,"""manohla dargis a stand up s downward slide movies the comedian movie de niro robert hackford taylor …","""manohla dargis a stand up s downward slide movies the comedian movie de niro robert hackford taylor …","""manohla dargis a stand up s downward slide movies the comedian movie de niro robert hackford taylor …"


### Create Cyclic Features

- day_of_the_week: e.g Monday
- day: e.g 1st
- week_of_the_year: e.g 1st week of the year
- week_of_the_month: e.g 1st week of the month
- month: e.g January
- year: e.g 2017
- quarter: e.g 1st quarter
- half: e.g 1st half
- year_month: e.g 2017-jan

In [6]:
# Convert pubDate to datetime
date_format: str = "%Y-%m-%d %H:%M:%S"
articles_df = articles_df.with_columns(
    pubDate=pl.col("pubDate").str.to_date(date_format)
)

articles_df.head()

articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata,metadata_1,metadata_2
str,i64,str,date,str,str,str,str,str
"""58927e0495d0e0392607e1b3""",1129,"""article""",2017-02-02,"""The New York Times""","""news""","""ken belson n f l vs politics has been battle all season long football super bowl national football l…","""ken belson n f l vs politics has been battle all season long football super bowl national football l…","""ken belson n f l vs politics has been battle all season long football super bowl national football l…"
"""5893033d95d0e0392607e2d6""",3082,"""article""",2017-02-02,"""The New York Times""","""news""","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…"
"""5893039595d0e0392607e2da""",693,"""article""",2017-02-02,"""The New York Times""","""review""","""manohla dargis a stand up s downward slide movies the comedian movie de niro robert hackford taylor …","""manohla dargis a stand up s downward slide movies the comedian movie de niro robert hackford taylor …","""manohla dargis a stand up s downward slide movies the comedian movie de niro robert hackford taylor …"
"""5893109995d0e0392607e2ef""",1049,"""article""",2017-02-02,"""The New York Times""","""briefing""","""alexandra s levine new york today a groundhog has her day new york city groundhogs metro thursday a …","""alexandra s levine new york today a groundhog has her day new york city groundhogs metro thursday a …","""alexandra s levine new york today a groundhog has her day new york city groundhogs metro thursday a …"
"""5893114495d0e0392607e2f1""",1283,"""article""",2017-02-02,"""The New York Times""","""news""","""bonnie tsui a swimmer s communion with the ocean travel and vacations swimming oceans and seas hawai…","""bonnie tsui a swimmer s communion with the ocean travel and vacations swimming oceans and seas hawai…","""bonnie tsui a swimmer s communion with the ocean travel and vacations swimming oceans and seas hawai…"


In [7]:
articles_df.with_columns(
    day=pl.col("pubDate").dt.day(),
    day_of_week=pl.col("pubDate").dt.weekday(),
    week_of_year=pl.col("pubDate").dt.week(),
    month=pl.col("pubDate").dt.month(),
    year=pl.col("pubDate").dt.year(),
    quarter=pl.col("pubDate").dt.quarter(),
).sample(n=2, seed=1)

articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata,metadata_1,metadata_2,day,day_of_week,week_of_year,month,year,quarter
str,i64,str,date,str,str,str,str,str,i8,i8,i8,i8,i32,i8
"""58a2e33d95d0e02474636550""",154,"""article""",2017-02-14,"""The New York Times""","""news""","""david gelles how to be mindful while eating chocolate chocolate valentine s day meditation well take…","""david gelles how to be mindful while eating chocolate chocolate valentine s day meditation well take…","""david gelles how to be mindful while eating chocolate chocolate valentine s day meditation well take…",14,2,7,2,2017,1
"""59084d4c7c459f24986dd363""",837,"""article""",2017-05-02,"""International New York Times""","""op-ed""","""roger cohen uncertainty bordering on dismay international trade and world market north atlantic trea…","""roger cohen uncertainty bordering on dismay international trade and world market north atlantic trea…","""roger cohen uncertainty bordering on dismay international trade and world market north atlantic trea…",2,2,18,5,2017,2


In [8]:
def extract_temporal_features(
    data: pl.DataFrame, date_column: str, date_format: str = "%Y-%m-%d %H:%M:%S"
) -> pl.DataFrame:
    # Convert to datetime
    if not data[date_column].dtype == pl.Date:
        data = data.with_columns(pubDate=pl.col(date_column).str.to_date(date_format))
    else:
        data = data

    try:
        data = data.with_columns(
            day=pl.col(date_column).dt.day(),
            # where monday = 1 and sunday = 7
            day_of_week=pl.col(date_column).dt.weekday(),
            week_of_year=pl.col(date_column).dt.week(),
            month=pl.col(date_column).dt.month(),
            year=pl.col(date_column).dt.year(),
            quarter=pl.col(date_column).dt.quarter(),
        )
    except:
        print("Error creating temporal features")
        return pl.DataFrame()

    return data

In [9]:
df: pl.DataFrame = extract_temporal_features(
    data=articles_df, date_column="pubDate", date_format="%Y-%m-%d %H:%M:%S"
)

df.sample(n=3, seed=1)

articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata,metadata_1,metadata_2,day,day_of_week,week_of_year,month,year,quarter
str,i64,str,date,str,str,str,str,str,i8,i8,i8,i8,i32,i8
"""58a2e33d95d0e02474636550""",154,"""article""",2017-02-14,"""The New York Times""","""news""","""david gelles how to be mindful while eating chocolate chocolate valentine s day meditation well take…","""david gelles how to be mindful while eating chocolate chocolate valentine s day meditation well take…","""david gelles how to be mindful while eating chocolate chocolate valentine s day meditation well take…",14,2,7,2,2017,1
"""58f8e6b37c459f24986db2bf""",455,"""article""",2017-04-20,"""The New York Times""","""review""","""jeannette catsoulis a love triangle near the front lines movies the promise movie george terry bale …","""jeannette catsoulis a love triangle near the front lines movies the promise movie george terry bale …","""jeannette catsoulis a love triangle near the front lines movies the promise movie george terry bale …",20,4,16,4,2017,2
"""59084d4c7c459f24986dd363""",837,"""article""",2017-05-02,"""International New York Times""","""op-ed""","""roger cohen uncertainty bordering on dismay international trade and world market north atlantic trea…","""roger cohen uncertainty bordering on dismay international trade and world market north atlantic trea…","""roger cohen uncertainty bordering on dismay international trade and world market north atlantic trea…",2,2,18,5,2017,2


In [10]:
# Create Cyclic Temporal Features
day_factor: int = 30
day_of_week_factor: int = 7
week_of_year_factor: int = 52
month_factor: int = 12

df.with_columns(
    day_sin=pl.col("day").map_elements(lambda x: np.sin(2 * np.pi * x / day_factor)),
    day_cos=pl.col("day").map_elements(lambda x: np.cos(2 * np.pi * x / day_factor)),
    # where monday = 1 and sunday = 7
    day_of_week_sin=pl.col("day_of_week").map_elements(
        lambda x: np.sin(2 * np.pi * x / day_of_week_factor)
    ),
    day_of_week_cos=pl.col("day_of_week").map_elements(
        lambda x: np.cos(2 * np.pi * x / day_of_week_factor)
    ),
    week_of_year_sin=pl.col("week_of_year").map_elements(
        lambda x: np.sin(2 * np.pi * x / week_of_year_factor)
    ),
    week_of_year_cos=pl.col("week_of_year").map_elements(
        lambda x: np.cos(2 * np.pi * x / week_of_year_factor),
    ),
    month_sin=pl.col("month").map_elements(
        lambda x: np.sin(2 * np.pi * x / month_factor)
    ),
    month_cos=pl.col("month").map_elements(
        lambda x: np.cos(2 * np.pi * x / month_factor)
    ),
).head(3)

articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata,metadata_1,metadata_2,day,day_of_week,week_of_year,month,year,quarter,day_sin,day_cos,day_of_week_sin,day_of_week_cos,week_of_year_sin,week_of_year_cos,month_sin,month_cos
str,i64,str,date,str,str,str,str,str,i8,i8,i8,i8,i32,i8,f64,f64,f64,f64,f64,f64,f64,f64
"""58927e0495d0e0392607e1b3""",1129,"""article""",2017-02-02,"""The New York Times""","""news""","""ken belson n f l vs politics has been battle all season long football super bowl national football l…","""ken belson n f l vs politics has been battle all season long football super bowl national football l…","""ken belson n f l vs politics has been battle all season long football super bowl national football l…",2,4,5,2,2017,1,0.406737,0.913545,-0.433884,-0.900969,0.568065,0.822984,0.866025,0.5
"""5893033d95d0e0392607e2d6""",3082,"""article""",2017-02-02,"""The New York Times""","""news""","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…",2,4,5,2,2017,1,0.406737,0.913545,-0.433884,-0.900969,0.568065,0.822984,0.866025,0.5
"""5893039595d0e0392607e2da""",693,"""article""",2017-02-02,"""The New York Times""","""review""","""manohla dargis a stand up s downward slide movies the comedian movie de niro robert hackford taylor …","""manohla dargis a stand up s downward slide movies the comedian movie de niro robert hackford taylor …","""manohla dargis a stand up s downward slide movies the comedian movie de niro robert hackford taylor …",2,4,5,2,2017,1,0.406737,0.913545,-0.433884,-0.900969,0.568065,0.822984,0.866025,0.5


In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
import numpy as np
from typing import Callable


def sin_transformer(period: float) -> Callable[[np.ndarray], np.ndarray]:
    """
    Create a sine transformer function.

    Parameters
    ----------
    period : float
        The period of the sine function.

    Returns
    -------
    Callable[[np.ndarray], np.ndarray]
        A function that applies a sine transformation to input data.

    Notes
    -----
    The transformer applies the following function to the input:
    f(x) = sin(2 * pi * x / period)

    The input and output arrays are expected to have the same shape.
    """
    return FunctionTransformer(lambda x: np.sin(2 * np.pi * x / period))


def cos_transformer(period: float) -> Callable[[np.ndarray], np.ndarray]:
    """
    Create a cosine transformer function.

    Parameters
    ----------
    period : float
        The period of the cosine function.

    Returns
    -------
    Callable[[np.ndarray], np.ndarray]
        A function that applies a cosine transformation to input data.

    Notes
    -----
    The transformer applies the following function to the input:
    f(x) = cos(2 * pi * x / period)

    The input and output arrays are expected to have the same shape.
    """
    return FunctionTransformer(lambda x: np.cos(2 * np.pi * x / period))

In [12]:
cyclical_columns: list[str] = ["day", "day_of_week", "week_of_year", "month"]
cyclical_encoder: ColumnTransformer = ColumnTransformer(
    [
        ("day_sin", sin_transformer(day_factor), ["day"]),
        ("day_cos", cos_transformer(day_factor), ["day"]),
        ("day_of_week_sin", sin_transformer(day_of_week_factor), ["day_of_week"]),
        ("day_of_week_cos", cos_transformer(day_of_week_factor), ["day_of_week"]),
        ("week_of_year_sin", sin_transformer(week_of_year_factor), ["week_of_year"]),
        ("week_of_year_cos", cos_transformer(week_of_year_factor), ["day_of_week"]),
        ("month_sin", sin_transformer(12), ["month"]),
        ("month_cos", cos_transformer(12), ["month"]),
    ],
    remainder="passthrough",
)

encoded_features: np.ndarray = cyclical_encoder.fit_transform(
    df.select(cyclical_columns)
)

encoded_features

array([[ 0.4067,  0.9135, -0.4339, ...,  0.8855,  0.866 ,  0.5   ],
       [ 0.4067,  0.9135, -0.4339, ...,  0.8855,  0.866 ,  0.5   ],
       [ 0.4067,  0.9135, -0.4339, ...,  0.8855,  0.866 ,  0.5   ],
       ...,
       [ 0.2079,  0.9781,  0.9749, ...,  0.9709,  0.5   , -0.866 ],
       [ 0.2079,  0.9781,  0.9749, ...,  0.9709,  0.5   , -0.866 ],
       [ 0.2079,  0.9781,  0.9749, ...,  0.9709,  0.5   , -0.866 ]])

In [13]:
articles_df.head(1)

articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata,metadata_1,metadata_2
str,i64,str,date,str,str,str,str,str
"""58927e0495d0e0392607e1b3""",1129,"""article""",2017-02-02,"""The New York Times""","""news""","""ken belson n f l vs politics has been battle all season long football super bowl national football l…","""ken belson n f l vs politics has been battle all season long football super bowl national football l…","""ken belson n f l vs politics has been battle all season long football super bowl national football l…"


In [14]:
from sklearn.base import BaseEstimator, TransformerMixin
from typing import Union, Optional
import polars as pl
import pandas as pd


def extract_temporal_features(
    data: Union[pl.DataFrame, pd.DataFrame],
    date_column: str,
    date_format: str = "%Y-%m-%d %H:%M:%S",
) -> pl.DataFrame:
    """
    Extract temporal features from a date column in a Polars or Pandas DataFrame.

    Parameters
    ----------
    data : Union[pl.DataFrame, pd.DataFrame]
        Input DataFrame containing the date column.
    date_column : str
        Name of the column containing date information.
    date_format : str, optional
        Format of the date string, by default "%Y-%m-%d %H:%M:%S".

    Returns
    -------
    pl.DataFrame
        DataFrame with additional temporal features.
    """
    if isinstance(data, pd.DataFrame):
        data = pl.from_pandas(data)
    # Convert to datetime
    if not data[date_column].dtype == pl.Date:
        data = data.with_columns(pubDate=pl.col(date_column).str.to_date(date_format))
    else:
        data = data

    try:
        data = data.with_columns(
            day=pl.col(date_column).dt.day(),
            # where monday = 1 and sunday = 7
            day_of_week=pl.col(date_column).dt.weekday(),
            week_of_year=pl.col(date_column).dt.week(),
            month=pl.col(date_column).dt.month(),
            year=pl.col(date_column).dt.year(),
            quarter=pl.col(date_column).dt.quarter(),
        )
    except:
        print("Error creating temporal features")
        return pl.DataFrame()

    return data


class ExtractTemporalFeatures(BaseEstimator, TransformerMixin):
    """
    Transformer to extract temporal features from a date column.

    Parameters
    ----------
    date_column : str
        Name of the column containing date information.
    date_format : str, optional
        Format of the date string, by default "%Y-%m-%d %H:%M:%S".
    """

    def __init__(self, date_column: str, date_format: str = "%Y-%m-%d %H:%M:%S"):
        self.date_column: str = date_column
        self.date_format: str = date_format

    def fit(
        self, X: pl.DataFrame, y: Optional[pl.DataFrame] = None
    ) -> "ExtractTemporalFeatures":
        """
        Fit the transformer (no-op).

        Parameters
        ----------
        X : pl.DataFrame
            Input features.
        y : Optional[pl.DataFrame], optional
            Target variable, by default None.

        Returns
        -------
        ExtractTemporalFeatures
            Fitted transformer.
        """
        return self

    def transform(self, X: pl.DataFrame) -> pl.DataFrame:
        """
        Transform the input DataFrame by extracting temporal features.

        Parameters
        ----------
        X : pl.DataFrame
            Input features.

        Returns
        -------
        pl.DataFrame
            Transformed DataFrame with additional temporal features.
        """
        return extract_temporal_features(X, self.date_column, self.date_format)


class DropFeatures(BaseEstimator, TransformerMixin):
    """
    Transformer to drop specified features from a DataFrame.

    Parameters
    ----------
    features : list[str]
        List of feature names to be dropped.
    """

    def __init__(self, features: list[str]):
        self.features: list[str] = features

    def fit(
        self,
        X: Union[pl.DataFrame, pd.DataFrame],
        y: Optional[Union[pl.DataFrame, pd.DataFrame]] = None,
    ) -> "DropFeatures":
        """
        Fit the transformer (no-op).

        Parameters
        ----------
        X : Union[pl.DataFrame, pd.DataFrame]
            Input features.
        y : Optional[Union[pl.DataFrame, pd.DataFrame]], optional
            Target variable, by default None.

        Returns
        -------
        DropFeatures
            Fitted transformer.
        """
        return self

    def transform(
        self, X: Union[pl.DataFrame, pd.DataFrame]
    ) -> Union[pl.DataFrame, pd.DataFrame]:
        """
        Transform the input DataFrame by dropping specified features.

        Parameters
        ----------
        X : Union[pl.DataFrame, pd.DataFrame]
            Input features.

        Returns
        -------
        Union[pl.DataFrame, pd.DataFrame]
            Transformed DataFrame with specified features dropped.

        Raises
        ------
        ValueError
            If the input is not a pandas DataFrame or a polars DataFrame.
        """
        if isinstance(X, pl.DataFrame):
            return X.drop(self.features)
        elif isinstance(X, pd.DataFrame):
            return X.drop(columns=self.features)
        else:
            raise ValueError("Input must be a pandas DataFrame or a polars DataFrame")

In [15]:
import numpy as np
import polars as pl
import pandas as pd
from typing import Literal, Union, Optional
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler


def create_cyclic_features(data: Union[pl.DataFrame, pd.DataFrame]) -> pl.DataFrame:
    """
    Create cyclic temporal features from the input DataFrame.

    Parameters
    ----------
    data : Union[pl.DataFrame, pd.DataFrame]
        Input DataFrame containing temporal features.

    Returns
    -------
    pl.DataFrame
        DataFrame with additional cyclic temporal features.

    Notes
    -----
    This function creates sine and cosine transformations for day, day of week,
    week of year, and month columns.
    """
    day_factor: int = 30
    day_of_week_factor: int = 7
    week_of_year_factor: int = 52
    month_factor: int = 12

    if isinstance(data, pd.DataFrame):
        data = pl.from_pandas(data)

    try:
        data = data.with_columns(
            day_sin=pl.col("day").map_elements(
                lambda x: np.sin(2 * np.pi * x / day_factor)
            ),
            day_cos=pl.col("day").map_elements(
                lambda x: np.cos(2 * np.pi * x / day_factor)
            ),
            # where monday = 1 and sunday = 7
            day_of_week_sin=pl.col("day_of_week").map_elements(
                lambda x: np.sin(2 * np.pi * x / day_of_week_factor)
            ),
            day_of_week_cos=pl.col("day_of_week").map_elements(
                lambda x: np.cos(2 * np.pi * x / day_of_week_factor)
            ),
            week_of_year_sin=pl.col("week_of_year").map_elements(
                lambda x: np.sin(2 * np.pi * x / week_of_year_factor)
            ),
            week_of_year_cos=pl.col("week_of_year").map_elements(
                lambda x: np.cos(2 * np.pi * x / week_of_year_factor),
            ),
            month_sin=pl.col("month").map_elements(
                lambda x: np.sin(2 * np.pi * x / month_factor)
            ),
            month_cos=pl.col("month").map_elements(
                lambda x: np.cos(2 * np.pi * x / month_factor)
            ),
        )
    except:
        print("Error creating cyclic temporal features")
        return pl.DataFrame()

    return data


class CyclicTemporalFeatures(BaseEstimator, TransformerMixin):
    """
    Transformer for creating cyclic temporal features.

    This transformer applies the create_cyclic_features function to the input data.
    """

    def __init__(self) -> None:
        """
        Initialize the CyclicTemporalFeatures transformer.
        """
        ...

    def fit(
        self, X: pl.DataFrame, y: Optional[pl.DataFrame] = None
    ) -> "CyclicTemporalFeatures":
        """
        Fit the transformer to the data.

        This method is a no-op and returns self.

        Parameters
        ----------
        X : pl.DataFrame, shape (n_samples, n_features)
            Input features.
        y : Optional[pl.DataFrame], default=None
            Target values (ignored).

        Returns
        -------
        CyclicTemporalFeatures
            The fitted transformer.
        """
        return self

    def transform(self, X: pl.DataFrame) -> pl.DataFrame:
        """
        Transform the input DataFrame by extracting temporal features.

        Parameters
        ----------
        X : pl.DataFrame, shape (n_samples, n_features)
            Input features.

        Returns
        -------
        pl.DataFrame, shape (n_samples, n_features + n_cyclic_features)
            Transformed DataFrame with additional temporal features.
        """
        return create_cyclic_features(X)

In [16]:
class NumericalScaler(BaseEstimator, TransformerMixin):
    """
    Transformer for scaling numerical features.

    This transformer applies either StandardScaler or MinMaxScaler to the specified features.

    Parameters
    ----------
    scaler_type : Literal["standard", "min_max"]
        Type of scaler to use.
    features : list[str] | None, optional
        List of features to scale. If None, all features will be scaled.
    exclude_features : list[str] | None, optional
        List of features to exclude from scaling.

    Attributes
    ----------
    scaler : StandardScaler | MinMaxScaler
        The scaler object used for transformation.
    features : list[str] | None
        List of features to scale.
    exclude_features : list[str]
        List of features to exclude from scaling.
    ignore_columns_ : list[str]
        List of columns to ignore during scaling.
    """

    def __init__(
        self,
        scaler_type: Literal["standard", "min_max"],
        features: list[str] | None = None,
        exclude_features: list[str] | None = None,
    ) -> None:
        self.scaler_type: Literal["standard", "min_max"] = scaler_type
        if features is None and exclude_features is None:
            raise ValueError("`features` and `exclude_features` cannot both be None")
        if features is not None and exclude_features is not None:
            raise ValueError(
                "`features` and `exclude_features` cannot both be not None"
            )

        self.features: list[str] | None = features

        if exclude_features is None:
            self.exclude_features: list[str] = []
        else:
            assert isinstance(
                exclude_features, list
            ), "`exclude_features` must be of type List"
            self.exclude_features: list[str] = exclude_features

        if scaler_type not in ["standard", "min_max"]:
            raise ValueError("scaler_type must be either 'standard' or 'min_max'")
        if scaler_type == "standard":
            self.scaler: StandardScaler = StandardScaler()
        else:
            self.scaler: MinMaxScaler = MinMaxScaler(feature_range=(0, 1), clip=True)

    def fit(
        self, X: pl.DataFrame, y: Optional[pl.DataFrame] = None
    ) -> "NumericalScaler":
        """
        Fit the transformer to the data.

        Parameters
        ----------
        X : pl.DataFrame, shape (n_samples, n_features)
            Input features.
        y : Optional[pl.DataFrame], default=None
            Target values (ignored).

        Returns
        -------
        NumericalScaler
            The fitted transformer.
        """
        if isinstance(X, pd.DataFrame):
            X = pl.from_pandas(X)
        if self.features is not None:
            self.ignore_columns_: list[str] = sorted(
                set(X.columns) - set(self.features)
            )

        elif self.exclude_features and self.features is None:
            self.ignore_columns_: list[str] = self.exclude_features
            self.features = sorted(set(X.columns) - set(self.exclude_features))

        self.ignore_columns_ = self.ignore_columns_
        self.scaler.fit(X.select(self.features))
        return self

    def transform(self, X: pl.DataFrame) -> pl.DataFrame:
        """
        Transform the input DataFrame by scaling numerical features.

        Parameters
        ----------
        X : pl.DataFrame, shape (n_samples, n_features)
            Input features.

        Returns
        -------
        pl.DataFrame, shape (n_samples, n_features)
            Transformed DataFrame with scaled features.
        """
        if isinstance(X, pd.DataFrame):
            X = pl.from_pandas(X)
        vector: np.ndarray = self.scaler.transform(X.select(self.features))
        ignore_df: pl.DataFrame = X.select(self.ignore_columns_)
        vector_df: pl.DataFrame = pl.DataFrame(vector, schema=self.features)
        df: pl.DataFrame = pl.concat([ignore_df, vector_df], how="horizontal")
        return df


class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    """
    A custom one-hot encoder that works with Polars DataFrames.

    Parameters
    ----------
    features : list[str] | None, optional
        List of column names to encode. If None, all columns will be encoded.

    Attributes
    ----------
    features : list[str] | None
        List of column names to encode.
    encoder : OneHotEncoder
        The underlying scikit-learn OneHotEncoder.
    ignore_columns : list[str]
        List of column names to ignore during encoding.
    ignore_columns_ : list[str]
        List of column names ignored during encoding after fitting.
    """

    def __init__(
        self,
        features: list[str] | None = None,
    ) -> None:
        self.features: list[str] | None = features
        self.encoder: OneHotEncoder = OneHotEncoder(handle_unknown="ignore")
        self.ignore_columns: list[str] = []
        self.ignore_columns_: list[str] = []

    def fit(
        self, X: pl.DataFrame, y: pl.DataFrame | None = None
    ) -> "CustomOneHotEncoder":
        """
        Fit the OneHotEncoder to the input data.

        Parameters
        ----------
        X : pl.DataFrame, shape (n_samples, n_features)
            Input features to fit the encoder.
        y : pl.DataFrame | None, optional
            Ignored. Kept for scikit-learn compatibility.

        Returns
        -------
        CustomOneHotEncoder
            The fitted encoder.
        """
        if isinstance(X, pd.DataFrame):
            X = pl.from_pandas(X)
        if self.features is None:
            self.features = X.columns
        self.ignore_columns_: list[str] = list(set(X.columns) - set(self.features))
        self.encoder.fit(X.select(self.features))
        return self

    def transform(self, X: pl.DataFrame) -> pl.DataFrame:
        """
        Transform the input data using the fitted encoder.

        Parameters
        ----------
        X : pl.DataFrame, shape (n_samples, n_features)
            Input features to transform.

        Returns
        -------
        pl.DataFrame, shape (n_samples, n_encoded_features)
            Transformed DataFrame with one-hot encoded features.
        """
        if isinstance(X, pd.DataFrame):
            X = pl.from_pandas(X)
        vector: np.ndarray = self.encoder.transform(X.select(self.features)).toarray()
        ignore_df: pl.DataFrame = X.select(self.ignore_columns_)
        vector_df: pl.DataFrame = pl.DataFrame(
            vector, schema=sorted(self.encoder.get_feature_names_out())
        )
        df: pl.DataFrame = pl.concat([ignore_df, vector_df], how="horizontal")
        return df

In [17]:
temporal_transf: ExtractTemporalFeatures = ExtractTemporalFeatures(
    date_column="pubDate", date_format="%Y-%m-%d %H:%M:%S"
)
temporal_transf.fit_transform(articles_df).head(2)

articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata,metadata_1,metadata_2,day,day_of_week,week_of_year,month,year,quarter
str,i64,str,date,str,str,str,str,str,i8,i8,i8,i8,i32,i8
"""58927e0495d0e0392607e1b3""",1129,"""article""",2017-02-02,"""The New York Times""","""news""","""ken belson n f l vs politics has been battle all season long football super bowl national football l…","""ken belson n f l vs politics has been battle all season long football super bowl national football l…","""ken belson n f l vs politics has been battle all season long football super bowl national football l…",2,4,5,2,2017,1
"""5893033d95d0e0392607e2d6""",3082,"""article""",2017-02-02,"""The New York Times""","""news""","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…",2,4,5,2,2017,1


In [18]:
cyclic_transf: CyclicTemporalFeatures = CyclicTemporalFeatures()
cyclic_transf.fit_transform(df).head(2)

articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata,metadata_1,metadata_2,day,day_of_week,week_of_year,month,year,quarter,day_sin,day_cos,day_of_week_sin,day_of_week_cos,week_of_year_sin,week_of_year_cos,month_sin,month_cos
str,i64,str,date,str,str,str,str,str,i8,i8,i8,i8,i32,i8,f64,f64,f64,f64,f64,f64,f64,f64
"""58927e0495d0e0392607e1b3""",1129,"""article""",2017-02-02,"""The New York Times""","""news""","""ken belson n f l vs politics has been battle all season long football super bowl national football l…","""ken belson n f l vs politics has been battle all season long football super bowl national football l…","""ken belson n f l vs politics has been battle all season long football super bowl national football l…",2,4,5,2,2017,1,0.406737,0.913545,-0.433884,-0.900969,0.568065,0.822984,0.866025,0.5
"""5893033d95d0e0392607e2d6""",3082,"""article""",2017-02-02,"""The New York Times""","""news""","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…",2,4,5,2,2017,1,0.406737,0.913545,-0.433884,-0.900969,0.568065,0.822984,0.866025,0.5


In [19]:
A: pl.DataFrame = pl.DataFrame(
    {"income": [10000, 20000, 30000, 40000, 50000], "age": [25, 30, 35, 40, 45]}
)

scaler: NumericalScaler = NumericalScaler(
    scaler_type="min_max", features=["age"], exclude_features=None
)
print(scaler.fit_transform(A).describe())

shape: (9, 3)
┌────────────┬──────────────┬──────────┐
│ statistic  ┆ income       ┆ age      │
│ ---        ┆ ---          ┆ ---      │
│ str        ┆ f64          ┆ f64      │
╞════════════╪══════════════╪══════════╡
│ count      ┆ 5.0          ┆ 5.0      │
│ null_count ┆ 0.0          ┆ 0.0      │
│ mean       ┆ 30000.0      ┆ 0.5      │
│ std        ┆ 15811.388301 ┆ 0.395285 │
│ min        ┆ 10000.0      ┆ 0.0      │
│ 25%        ┆ 20000.0      ┆ 0.25     │
│ 50%        ┆ 30000.0      ┆ 0.5      │
│ 75%        ┆ 40000.0      ┆ 0.75     │
│ max        ┆ 50000.0      ┆ 1.0      │
└────────────┴──────────────┴──────────┘


In [20]:
ohe: CustomOneHotEncoder = CustomOneHotEncoder(features=["documentType"])
ohe.fit_transform(articles_df).head(2)

metadata,metadata_2,pubDate,source,articleID,articleWordCount,typeOfMaterial,metadata_1,documentType_article,documentType_blogpost
str,str,date,str,str,i64,str,str,f64,f64
"""ken belson n f l vs politics has been battle all season long football super bowl national football l…","""ken belson n f l vs politics has been battle all season long football super bowl national football l…",2017-02-02,"""The New York Times""","""58927e0495d0e0392607e1b3""",1129,"""news""","""ken belson n f l vs politics has been battle all season long football super bowl national football l…",1.0,0.0
"""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…",2017-02-02,"""The New York Times""","""5893033d95d0e0392607e2d6""",3082,"""news""","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…",1.0,0.0


<hr>

### Vectorize The Metadata

- using TF-IDF

In [21]:
# tfidf, SVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix


n_components: int = 100
seed: int = 123

tfidf_vec: TfidfVectorizer = TfidfVectorizer(stop_words="english", max_df=0.9, min_df=2)
tfidf_matrix: csr_matrix = tfidf_vec.fit_transform(df["metadata"])

# Extract the latent features
svd = TruncatedSVD(n_components=n_components, random_state=seed)
svd_matrix: np.ndarray = svd.fit_transform(tfidf_matrix)

print(f"{svd_matrix.shape = }")

svd_matrix.shape = (9335, 100)


In [22]:
dir(tfidf_vec)
tfidf_vec.get_feature_names_out()

array(['000', '039', '10', ..., 'zuckerberg', 'zuma', 'zwaan'],
      dtype=object)

In [23]:
class TFIDFTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer for TF-IDF vectorization of text data.

    Parameters
    ----------
    feature : str
        The name of the feature column containing text data.
    max_df : float, default=1.0
        The maximum document frequency for TfidfVectorizer.
    min_df : int, default=1
        The minimum document frequency for TfidfVectorizer.

    Attributes
    ----------
    tfidf : TfidfVectorizer
        The TF-IDF vectorizer object.
    ignore_columns_ : list[str]
        List of columns to ignore during transformation.
    """

    def __init__(
        self,
        feature: str,
        max_df: float = 1.0,
        min_df: int = 1,
    ) -> None:
        self.feature: str = feature
        self.max_df: float = max_df
        self.min_df: int = min_df
        self.tfidf: TfidfVectorizer = TfidfVectorizer(
            stop_words="english", max_df=max_df, min_df=min_df
        )

    def fit(self, X: pd.DataFrame | pl.DataFrame, y: None = None) -> "TFIDFTransformer":
        """
        Fit the TF-IDF vectorizer to the input data.

        Parameters
        ----------
        X : pd.DataFrame | pl.DataFrame, shape (n_samples, n_features)
            Input DataFrame containing the text feature.
        y : None
            Ignored. Kept for compatibility with scikit-learn API.

        Returns
        -------
        TFIDFTransformer
            The fitted transformer.
        """
        if isinstance(X, pd.DataFrame):
            X = pl.from_pandas(X)
        self.ignore_columns_: list[str] = sorted(set(X.columns) - {self.feature})
        self.tfidf.fit(X[self.feature])
        return self

    def transform(self, X: pd.DataFrame | pl.DataFrame) -> pl.DataFrame:
        """
        Transform the input data using the fitted TF-IDF vectorizer.

        Parameters
        ----------
        X : pd.DataFrame | pl.DataFrame, shape (n_samples, n_features)
            Input DataFrame containing the text feature.

        Returns
        -------
        pl.DataFrame, shape (n_samples, n_features + n_tfidf_features)
            Transformed DataFrame with TF-IDF features.
        """
        if isinstance(X, pd.DataFrame):
            X = pl.from_pandas(X)
        tfidf_matrix: csr_matrix = self.tfidf.transform(X[self.feature])
        schema: list[str] = list(self.tfidf.get_feature_names_out())
        schema = [f"tfidf__{s}" for s in schema]
        tfidf_df: pl.DataFrame = pl.DataFrame(tfidf_matrix.toarray(), schema=schema)
        ignore_df: pl.DataFrame = X.select(self.ignore_columns_)
        df: pl.DataFrame = pl.concat([ignore_df, tfidf_df], how="horizontal")
        return df


class SVDTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer for dimensionality reduction using Truncated SVD.

    Parameters
    ----------
    exclude_features : list[str] | None, default=None
        List of feature names to exclude from SVD transformation.
    include_pattern : str | None, default=None
        Regular expression pattern to include features for SVD transformation.
    n_components : int, default=100
        Number of components to keep in the SVD transformation.
    random_state : int, default=42
        Random state for reproducibility.

    Attributes
    ----------
    svd : TruncatedSVD
        The TruncatedSVD object for dimensionality reduction.
    features : list[str]
        List of feature names to include in SVD transformation.
    ignore_columns_ : list[str]
        List of feature names to exclude from SVD transformation.
    """

    def __init__(
        self,
        exclude_features: list[str] | None = None,
        include_pattern: str | None = None,
        n_components: int = 100,
        random_state: int = 42,
    ) -> None:
        if exclude_features is None and include_pattern is None:
            raise ValueError(
                "`exclude_features` and `include_pattern` cannot both be None"
            )
        if exclude_features is not None and include_pattern is not None:
            raise ValueError(
                "`exclude_features` and `include_pattern` cannot both be not None"
            )
        assert (
            isinstance(exclude_features, list) or exclude_features is None
        ), "`exclude_features` must be of type List"
        self.exclude_features: list[str] | None = exclude_features
        if include_pattern is not None:
            assert isinstance(
                include_pattern, str
            ), "`include_pattern` must be of type str"
            self.include_pattern: str = include_pattern
        self.n_components: int = n_components
        self.random_state: int = random_state
        self.svd: TruncatedSVD = TruncatedSVD(
            n_components=n_components, random_state=random_state
        )

    def fit(self, X: pd.DataFrame | pl.DataFrame, y: None = None) -> "SVDTransformer":
        """
        Fit the SVD transformer to the input data.

        Parameters
        ----------
        X : pd.DataFrame | pl.DataFrame, shape (n_samples, n_features)
            Input DataFrame containing the features.
        y : None
            Ignored. Kept for compatibility with scikit-learn API.

        Returns
        -------
        SVDTransformer
            The fitted transformer.
        """
        if isinstance(X, pd.DataFrame):
            X = pl.from_pandas(X)
        if self.exclude_features:
            self.features: list[str] = list(set(X.columns) - set(self.exclude_features))
            self.ignore_columns_: list[str] = self.exclude_features
        elif hasattr(self, "include_pattern"):
            self.features: list[str] = [
                col
                for col in X.columns
                if re.match(self.include_pattern, col, flags=re.IGNORECASE)
            ]
            self.ignore_columns_: list[str] = sorted(
                set(X.columns) - set(self.features)
            )
        self.svd.fit(X.select(self.features))
        return self

    def transform(self, X: pd.DataFrame | pl.DataFrame) -> pl.DataFrame:
        """
        Transform the input data using the fitted SVD transformer.

        Parameters
        ----------
        X : pd.DataFrame | pl.DataFrame, shape (n_samples, n_features)
            Input DataFrame containing the features.

        Returns
        -------
        pl.DataFrame, shape (n_samples, n_excluded_features + n_components)
            Transformed DataFrame with reduced dimensions.
        """
        if isinstance(X, pd.DataFrame):
            X = pl.from_pandas(X)
        svd_matrix: np.ndarray = self.svd.transform(X.select(self.features))
        svd_df: pl.DataFrame = pl.DataFrame(
            svd_matrix,
            schema=list(self.svd.get_feature_names_out()),
        )
        ignore_df: pl.DataFrame = X.select(self.ignore_columns_)
        df: pl.DataFrame = pl.concat([ignore_df, svd_df], how="horizontal")
        return df

In [24]:
A: pl.DataFrame = pl.DataFrame(
    {
        "income": [10000, 20000, 30000, 40000, 50000],
        "age": [25, 30, 35, 40, 45],
        "text": [
            "If completed the deal will be a record sale for Bournemouth - exceeding the £40m they "
            "received from Manchester City for defender  Nathan Ake in 2020.",
            "New York Times has a wide audience and plays a prominent role in shaping people's",
            "replyCount: Number of replies the comment received. on clause on any profit.I know about it",
            "Solanke joined Bournemouth for £19m in 2019 from Liverpool, who are understood to have a 20% sell-",
            "Gimme my money back I really already. I know about it  I'm out of here",
        ],
    }
)
vec: TFIDFTransformer = TFIDFTransformer(feature="text")
T: pl.DataFrame = vec.fit_transform(A)
T.head()

age,income,tfidf__19m,tfidf__20,tfidf__2019,tfidf__2020,tfidf__40m,tfidf__ake,tfidf__audience,tfidf__bournemouth,tfidf__city,tfidf__clause,tfidf__comment,tfidf__completed,tfidf__deal,tfidf__defender,tfidf__exceeding,tfidf__gimme,tfidf__joined,tfidf__know,tfidf__liverpool,tfidf__manchester,tfidf__money,tfidf__nathan,tfidf__new,tfidf__number,tfidf__people,tfidf__plays,tfidf__profit,tfidf__prominent,tfidf__really,tfidf__received,tfidf__record,tfidf__replies,tfidf__replycount,tfidf__role,tfidf__sale,tfidf__sell,tfidf__shaping,tfidf__solanke,tfidf__times,tfidf__understood,tfidf__wide,tfidf__york
i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
25,10000,0.0,0.0,0.0,0.274185,0.274185,0.274185,0.0,0.221211,0.274185,0.0,0.0,0.274185,0.274185,0.274185,0.274185,0.0,0.0,0.0,0.0,0.274185,0.0,0.274185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.221211,0.274185,0.0,0.0,0.0,0.274185,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30,20000,0.0,0.0,0.0,0.0,0.0,0.0,0.316228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.316228,0.0,0.316228,0.316228,0.0,0.316228,0.0,0.0,0.0,0.0,0.0,0.316228,0.0,0.0,0.316228,0.0,0.316228,0.0,0.316228,0.316228
35,30000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.37007,0.37007,0.0,0.0,0.0,0.0,0.0,0.0,0.29857,0.0,0.0,0.0,0.0,0.0,0.37007,0.0,0.0,0.37007,0.0,0.0,0.29857,0.0,0.37007,0.37007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,40000,0.339992,0.339992,0.339992,0.0,0.0,0.0,0.0,0.274304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.339992,0.0,0.339992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.339992,0.0,0.339992,0.0,0.339992,0.0,0.0
45,50000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.523358,0.0,0.422242,0.0,0.0,0.523358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.523358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
svd_t: SVDTransformer = SVDTransformer(
    exclude_features=None, n_components=39, include_pattern="tfidf__"
)
T_t: pl.DataFrame = svd_t.fit_transform(T)
T_t.head()

age,income,truncatedsvd0,truncatedsvd1,truncatedsvd2,truncatedsvd3,truncatedsvd4
i64,i64,f64,f64,f64,f64,f64
25,10000,0.405314,0.612646,2.9631e-16,-0.581242,0.350062
30,20000,5.0187e-16,-2.4244e-17,1.0,1.2578e-15,1.5751e-15
35,30000,0.737692,-0.162015,8.218e-16,-0.153711,-0.63713
40,40000,0.169017,0.707133,-9.5244e-16,0.670886,-0.145977
45,50000,0.639122,-0.388524,-1.759e-15,0.368608,0.551997


In [26]:
# One-hot encode
ohe_columns: list[str] = ["documentType", "source", "typeOfMaterial", "year"]

# Others
text_columns: list[str] = ["metadata"]
numeric_columns: list[str] = ["articleWordCount"]
date_columns: list[str] = ["pubDate"]

# Columns to drop
drop_columns: list[str] = cyclical_columns + date_columns
drop_columns

['day', 'day_of_week', 'week_of_year', 'month', 'pubDate']

In [27]:
max_df: float = 0.9
min_df: float = 2
n_components: int = 100


preprocessor_pipe: Pipeline = Pipeline(
    steps=[
        (
            "temporal_features",
            ExtractTemporalFeatures(
                date_column="pubDate", date_format="%Y-%m-%d %H:%M:%S"
            ),
        ),
        ("cyclical_features", CyclicTemporalFeatures()),
        ("ohe", CustomOneHotEncoder(features=ohe_columns)),
        (
            "tfidf_vectorizer",
            TFIDFTransformer(feature="metadata", max_df=max_df, min_df=min_df),
        ),
        (
            "svd",
            SVDTransformer(
                include_pattern="tfidf__",
                n_components=n_components,
                random_state=seed,
            ),
        ),
        ("drop_features", DropFeatures(features=drop_columns)),
        (
            "scaler",
            NumericalScaler(scaler_type="min_max", exclude_features=["articleID"]),
        ),
    ]
)

preprocessor_pipe

In [28]:
# temp_df: pl.DataFrame = preprocessor_pipe.fit_transform(articles_df)


# temp_df.head(2)

<hr>

### Load Comments Data

- Extract the targets from the comments data
  - Combine the `recommendations`, `replyCount` and create a new column `target`

In [29]:
fp: str = "../data/all_comments.parquet"

comments_df: pl.DataFrame = pl.read_parquet(fp)
print(f"{comments_df.shape = }")
comments_df.head(2)

comments_df.shape = (2176364, 34)


approveDate,articleID,articleWordCount,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,editorsSelection,inReplyTo,newDesk,parentID,parentUserDisplayName,permID,picURL,printPage,recommendations,recommendedFlag,replyCount,reportAbuseFlag,sectionName,sharing,status,timespeople,trusted,typeOfMaterial,updateDate,userDisplayName,userID,userLocation,userTitle,userURL
i64,str,f64,str,f64,f64,str,str,f64,f64,i64,f64,str,f64,str,str,str,f64,f64,null,f64,null,str,i64,str,f64,f64,str,i64,str,f64,str,str,str
1517529462,"""5a7258e410f40f00018bed7d""",835.0,"""The snake-filled heads comment made me think of Medusa. <br/><br/>I hope he loses, so retrograde.""",25791250.0,25791250.0,"""<br/>""","""comment""",1517500000.0,1.0,0,0.0,"""OpEd""",0.0,,"""25791250""","""https://graphics8.nytimes.com/images/apps/timespeople/none.png""",23.0,5.0,,0.0,,"""Unknown""",0,"""approved""",1.0,0.0,"""Op-Ed""",1517529462,"""Jennie""",79172841.0,"""WA""",,
1517529428,"""5a7258e410f40f00018bed7d""",835.0,"""She-devil reporting for duty!""",25795675.0,25795675.0,"""<br/>""","""comment""",1517500000.0,1.0,0,0.0,"""OpEd""",0.0,,"""25795675""","""https://graphics8.nytimes.com/images/apps/timespeople/none.png""",23.0,2.0,,0.0,,"""Unknown""",0,"""approved""",1.0,0.0,"""Op-Ed""",1517529428,"""Nice White Lady""",66376882.0,"""Seattle""",,


In [30]:
comments_engagments_df: pl.DataFrame = comments_df.group_by("articleID").agg(
    avg_comment_length=pl.col("commentBody").str.len_chars().mean().round(2),
    avg_upvotes=pl.col("recommendations").mean().round(2),
    avg_replies=pl.col("replyCount").mean().round(2),
)

comments_engagments_df.head()

articleID,avg_comment_length,avg_upvotes,avg_replies
str,f64,f64,f64
"""5a90da4d10f40f00018c2c5f""",536.6,3.1,0.29
"""5a5e7c2d7c459f29e79b4481""",388.34,12.62,0.75
"""58c26c147c459f247a91297d""",360.61,0.0,0.0
"""5aa7931d47de81a90120df8e""",285.05,7.36,0.28
"""5ac004e547de81a90121a997""",289.17,1.64,0.59


In [31]:
percentiles: np.ndarray = np.linspace(0.5, 0.999, 8)

comments_engagments_df.describe(percentiles=percentiles)

statistic,articleID,avg_comment_length,avg_upvotes,avg_replies
str,str,f64,f64,f64
"""count""","""9450""",9450.0,9450.0,9450.0
"""null_count""","""0""",0.0,0.0,0.0
"""mean""",,415.727201,11.018242,0.353254
"""std""",,161.292958,11.6116,0.228909
"""min""","""58691a5795d0e039260788b9""",4.0,0.0,0.0
"""50%""",,403.0,8.24,0.35
"""57.1286%""",,426.5,10.03,0.39
"""64.2571%""",,452.96,11.97,0.43
"""71.3857%""",,482.57,14.16,0.47
"""78.5143%""",,515.26,17.08,0.52


In [32]:
scaler: NumericalScaler = NumericalScaler(
    scaler_type="min_max", exclude_features=["articleID"]
)
comments_engagments_scaled_df: pl.DataFrame = scaler.fit_transform(
    comments_engagments_df
)
comments_engagments_scaled_df.head()

articleID,avg_comment_length,avg_replies,avg_upvotes
str,f64,f64,f64
"""5a90da4d10f40f00018c2c5f""",0.299045,0.124464,0.00787
"""5a5e7c2d7c459f29e79b4481""",0.2158,0.321888,0.032039
"""58c26c147c459f247a91297d""",0.20023,0.0,0.0
"""5aa7931d47de81a90120df8e""",0.157805,0.120172,0.018685
"""5ac004e547de81a90121a997""",0.160118,0.253219,0.004164


In [33]:
# Weights for creating the final score
w1: float = 0.10  # avg_comment_length
w2: float = 0.20  # avg_replies
w3: float = 0.70  # avg_upvotes


comments_engagments_scaled_df = comments_engagments_scaled_df.with_columns(
    final_score=(
        (pl.col("avg_comment_length") * w1)
        + (pl.col("avg_replies") * w2)
        + (pl.col("avg_upvotes") * w3)
    )
)
threshold: float = np.percentile(comments_engagments_scaled_df["final_score"], 60)

comments_engagments_scaled_df = comments_engagments_scaled_df.with_columns(
    is_high_engagement=(pl.col("final_score") > threshold).cast(pl.UInt8)
)
print(f"{threshold = }")

comments_engagments_scaled_df.head()

threshold = 0.08123442347585084


articleID,avg_comment_length,avg_replies,avg_upvotes,final_score,is_high_engagement
str,f64,f64,f64,f64,u8
"""5a90da4d10f40f00018c2c5f""",0.299045,0.124464,0.00787,0.060306,0
"""5a5e7c2d7c459f29e79b4481""",0.2158,0.321888,0.032039,0.108385,1
"""58c26c147c459f247a91297d""",0.20023,0.0,0.0,0.020023,0
"""5aa7931d47de81a90120df8e""",0.157805,0.120172,0.018685,0.052895,0
"""5ac004e547de81a90121a997""",0.160118,0.253219,0.004164,0.06957,0


In [34]:
percentiles: np.ndarray = np.linspace(0.5, 0.999, 5)

comments_engagments_scaled_df.describe(percentiles=percentiles)

statistic,articleID,avg_comment_length,avg_replies,avg_upvotes,final_score,is_high_engagement
str,str,f64,f64,f64,f64,f64
"""count""","""9450""",9450.0,9450.0,9450.0,9450.0,9450.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0
"""mean""",,0.231178,0.151611,0.027973,0.073021,0.4
"""std""",,0.090563,0.098244,0.029479,0.032446,0.489924
"""min""","""58691a5795d0e039260788b9""",0.0,0.0,0.0,0.0,0.0
"""50%""",,0.224031,0.150215,0.02092,0.073961,0.0
"""62.475%""",,0.248327,0.180258,0.029145,0.083266,1.0
"""74.95%""",,0.277558,0.214592,0.039326,0.093467,1.0
"""87.425%""",,0.315873,0.261803,0.056945,0.107704,1.0
"""99.9%""",,0.816957,0.515021,0.258422,0.227705,1.0


In [35]:
pl.Config.set_fmt_str_lengths(20)

# Join the data
articles_final_df: pl.DataFrame = articles_df.join(
    comments_engagments_scaled_df.select(["articleID", "is_high_engagement"]),
    on="articleID",
    how="left",
)
print(f"{articles_final_df.shape = }")

articles_final_df.head(10)

articles_final_df.shape = (9335, 10)


articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata,metadata_1,metadata_2,is_high_engagement
str,i64,str,date,str,str,str,str,str,u8
"""58927e0495d0e0392607…",1129,"""article""",2017-02-02,"""The New York Times""","""news""","""ken belson n f l vs …","""ken belson n f l vs …","""ken belson n f l vs …",0
"""5893033d95d0e0392607…",3082,"""article""",2017-02-02,"""The New York Times""","""news""","""unknown voice vice v…","""unknown voice vice v…","""unknown voice vice v…",1
"""5893039595d0e0392607…",693,"""article""",2017-02-02,"""The New York Times""","""review""","""manohla dargis a sta…","""manohla dargis a sta…","""manohla dargis a sta…",0
"""5893109995d0e0392607…",1049,"""article""",2017-02-02,"""The New York Times""","""briefing""","""alexandra s levine n…","""alexandra s levine n…","""alexandra s levine n…",0
"""5893114495d0e0392607…",1283,"""article""",2017-02-02,"""The New York Times""","""news""","""bonnie tsui a swimme…","""bonnie tsui a swimme…","""bonnie tsui a swimme…",0
"""5892a0d995d0e0392607…",837,"""article""",2017-02-02,"""The New York Times""","""news""","""caitlin lovinger tra…","""caitlin lovinger tra…","""caitlin lovinger tra…",0
"""5892e7bd95d0e0392607…",97,"""article""",2017-02-02,"""The New York Times""","""news""","""katherine schulten s…","""katherine schulten s…","""katherine schulten s…",0
"""5892ec0795d0e0392607…",870,"""article""",2017-02-02,"""The New York Times""","""op-ed""","""ioan grillo trump s …","""ioan grillo trump s …","""ioan grillo trump s …",1
"""5892ec0795d0e0392607…",804,"""article""",2017-02-02,"""The New York Times""","""op-ed""","""gail collins pence s…","""gail collins pence s…","""gail collins pence s…",1
"""5892ec0795d0e0392607…",864,"""article""",2017-02-02,"""The New York Times""","""op-ed""","""charles m blow fruit…","""charles m blow fruit…","""charles m blow fruit…",1


In [36]:
articles_final_df.filter(pl.col("is_high_engagement").eq(1)).sample(5)

articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata,metadata_1,metadata_2,is_high_engagement
str,i64,str,date,str,str,str,str,str,u8
"""58943d8995d0e0392607…",829,"""article""",2017-02-03,"""The New York Times""","""op-ed""","""david brooks a retur…","""david brooks a retur…","""david brooks a retur…",1
"""5aa0305b47de81a90120…",1081,"""article""",2018-03-07,"""The New York Times""","""news""","""vivian wang and cher…","""vivian wang and cher…","""vivian wang and cher…",1
"""58b5330a95d0e024902f…",1540,"""article""",2017-02-28,"""The New York Times""","""op-ed""","""amber batura how pla…","""amber batura how pla…","""amber batura how pla…",1
"""5874998695d0e0392607…",547,"""article""",2017-01-10,"""The New York Times""","""editorial""","""the editorial board …","""the editorial board …","""the editorial board …",1
"""5a68ba9e10f40f00018b…",816,"""article""",2018-01-24,"""The New York Times""","""op-ed""","""sam brinton tortured…","""sam brinton tortured…","""sam brinton tortured…",1


In [37]:
sp: str = "../data/articles_final_df.parquet"
articles_final_df.write_parquet(sp)