# Feature Engineering

- Create a pipeline containing:
  - vectorized text features
  - numeric features
  - encoded  categorical features
- Use ColumnTransformer to combine the features
- Use a pipeline to combine the transformer and the model
- Perform a grid search to find the best model and hyperparameters.

In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,mlxtend,omegaconf --conda

Python implementation: CPython
Python version       : 3.10.8
IPython version      : 8.26.0

numpy    : 1.26.0
pandas   : 2.2.2
polars   : 1.4.1
mlxtend  : 0.23.1
omegaconf: not installed

conda environment: n/a



In [2]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")


# auto reload imports# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(500)

warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

### Load Data

In [3]:
pl.Config.set_fmt_str_lengths(100)


fp: str = "../data/cleaned_articles.parquet"
articles_df: pl.DataFrame = pl.read_parquet(fp)
print(f"{articles_df.shape = }")

articles_df.head(2)

articles_df.shape = (9335, 7)


articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata
str,i64,str,str,str,str,str
"""58927e0495d0e0392607e1b3""",1129,"""article""","""2017-02-02 00:26:16""","""The New York Times""","""news""","""ken belson n f l vs politics has been battle all season long football super bowl national football l…"
"""5893033d95d0e0392607e2d6""",3082,"""article""","""2017-02-02 10:00:24""","""The New York Times""","""news""","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…"


In [4]:
print(articles_df["documentType"].value_counts(sort=True))
print(articles_df["source"].value_counts(sort=True))
print(articles_df["typeOfMaterial"].value_counts(sort=True))

shape: (2, 2)
┌──────────────┬───────┐
│ documentType ┆ count │
│ ---          ┆ ---   │
│ str          ┆ u32   │
╞══════════════╪═══════╡
│ article      ┆ 9168  │
│ blogpost     ┆ 167   │
└──────────────┴───────┘
shape: (2, 2)
┌──────────────────────────────┬───────┐
│ source                       ┆ count │
│ ---                          ┆ ---   │
│ str                          ┆ u32   │
╞══════════════════════════════╪═══════╡
│ The New York Times           ┆ 9284  │
│ International New York Times ┆ 51    │
└──────────────────────────────┴───────┘
shape: (7, 2)
┌────────────────┬───────┐
│ typeOfMaterial ┆ count │
│ ---            ┆ ---   │
│ str            ┆ u32   │
╞════════════════╪═══════╡
│ news           ┆ 6219  │
│ op-ed          ┆ 1693  │
│ review         ┆ 513   │
│ editorial      ┆ 302   │
│ briefing       ┆ 298   │
│ blog           ┆ 167   │
│ other          ┆ 143   │
└────────────────┴───────┘


In [5]:
from polars import selectors as cs


# One-hot encode
ohe_columns: list[str] = ["documentType", "source", "typeOfMaterial"]
articles_temp_df: pl.DataFrame = articles_df.clone()
articles_temp_df = articles_temp_df.to_dummies(columns=ohe_columns)
articles_temp_df.head(3)

articleID,articleWordCount,documentType_article,documentType_blogpost,pubDate,source_International New York Times,source_The New York Times,typeOfMaterial_blog,typeOfMaterial_briefing,typeOfMaterial_editorial,typeOfMaterial_news,typeOfMaterial_op-ed,typeOfMaterial_other,typeOfMaterial_review,metadata
str,i64,u8,u8,str,u8,u8,u8,u8,u8,u8,u8,u8,u8,str
"""58927e0495d0e0392607e1b3""",1129,1,0,"""2017-02-02 00:26:16""",0,1,0,0,0,1,0,0,0,"""ken belson n f l vs politics has been battle all season long football super bowl national football l…"
"""5893033d95d0e0392607e2d6""",3082,1,0,"""2017-02-02 10:00:24""",0,1,0,0,0,1,0,0,0,"""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…"
"""5893039595d0e0392607e2da""",693,1,0,"""2017-02-02 10:01:53""",0,1,0,0,0,0,0,0,1,"""manohla dargis a stand up s downward slide movies the comedian movie de niro robert hackford taylor …"


### Create Cyclic Features

- day_of_the_week: e.g Monday
- day: e.g 1st
- week_of_the_year: e.g 1st week of the year
- week_of_the_month: e.g 1st week of the month
- month: e.g January
- year: e.g 2017
- quarter: e.g 1st quarter
- half: e.g 1st half
- year_month: e.g 2017-jan

In [6]:
# Convert pubDate to datetime
date_format: str = "%Y-%m-%d %H:%M:%S"
articles_df = articles_df.with_columns(
    pubDate=pl.col("pubDate").str.to_date(date_format)
)

articles_df.head()

articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata
str,i64,str,date,str,str,str
"""58927e0495d0e0392607e1b3""",1129,"""article""",2017-02-02,"""The New York Times""","""news""","""ken belson n f l vs politics has been battle all season long football super bowl national football l…"
"""5893033d95d0e0392607e2d6""",3082,"""article""",2017-02-02,"""The New York Times""","""news""","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…"
"""5893039595d0e0392607e2da""",693,"""article""",2017-02-02,"""The New York Times""","""review""","""manohla dargis a stand up s downward slide movies the comedian movie de niro robert hackford taylor …"
"""5893109995d0e0392607e2ef""",1049,"""article""",2017-02-02,"""The New York Times""","""briefing""","""alexandra s levine new york today a groundhog has her day new york city groundhogs metro thursday a …"
"""5893114495d0e0392607e2f1""",1283,"""article""",2017-02-02,"""The New York Times""","""news""","""bonnie tsui a swimmer s communion with the ocean travel and vacations swimming oceans and seas hawai…"


In [7]:
articles_df.with_columns(
    day=pl.col("pubDate").dt.day(),
    day_of_week=pl.col("pubDate").dt.weekday(),
    week_of_year=pl.col("pubDate").dt.week(),
    month=pl.col("pubDate").dt.month(),
    year=pl.col("pubDate").dt.year(),
    quarter=pl.col("pubDate").dt.quarter(),
).sample(n=2, seed=1)

articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata,day,day_of_week,week_of_year,month,year,quarter
str,i64,str,date,str,str,str,i8,i8,i8,i8,i32,i8
"""58a2e33d95d0e02474636550""",154,"""article""",2017-02-14,"""The New York Times""","""news""","""david gelles how to be mindful while eating chocolate chocolate valentine s day meditation well take…",14,2,7,2,2017,1
"""59084d4c7c459f24986dd363""",837,"""article""",2017-05-02,"""International New York Times""","""op-ed""","""roger cohen uncertainty bordering on dismay international trade and world market north atlantic trea…",2,2,18,5,2017,2


In [8]:
def extract_temporal_features(
    data: pl.DataFrame, date_column: str, date_format: str = "%Y-%m-%d %H:%M:%S"
) -> pl.DataFrame:
    # Convert to datetime
    if not data[date_column].dtype == pl.Date:
        data = data.with_columns(pubDate=pl.col(date_column).str.to_date(date_format))
    else:
        data = data

    try:
        data = data.with_columns(
            day=pl.col(date_column).dt.day(),
            # where monday = 1 and sunday = 7
            day_of_week=pl.col(date_column).dt.weekday(),
            week_of_year=pl.col(date_column).dt.week(),
            month=pl.col(date_column).dt.month(),
            year=pl.col(date_column).dt.year(),
            quarter=pl.col(date_column).dt.quarter(),
        )
    except:
        print("Error creating temporal features")
        return pl.DataFrame()

    return data

In [9]:
df: pl.DataFrame = extract_temporal_features(
    data=articles_df, date_column="pubDate", date_format="%Y-%m-%d %H:%M:%S"
)

df.sample(n=3, seed=1)

articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata,day,day_of_week,week_of_year,month,year,quarter
str,i64,str,date,str,str,str,i8,i8,i8,i8,i32,i8
"""58a2e33d95d0e02474636550""",154,"""article""",2017-02-14,"""The New York Times""","""news""","""david gelles how to be mindful while eating chocolate chocolate valentine s day meditation well take…",14,2,7,2,2017,1
"""58f8e6b37c459f24986db2bf""",455,"""article""",2017-04-20,"""The New York Times""","""review""","""jeannette catsoulis a love triangle near the front lines movies the promise movie george terry bale …",20,4,16,4,2017,2
"""59084d4c7c459f24986dd363""",837,"""article""",2017-05-02,"""International New York Times""","""op-ed""","""roger cohen uncertainty bordering on dismay international trade and world market north atlantic trea…",2,2,18,5,2017,2


In [10]:
# Create Cyclic Temporal Features
day_factor: int = 30
day_of_week_factor: int = 7
week_of_year_factor: int = 52
month_factor: int = 12

df.with_columns(
    day_sin=pl.col("day").map_elements(lambda x: np.sin(2 * np.pi * x / day_factor)),
    day_cos=pl.col("day").map_elements(lambda x: np.cos(2 * np.pi * x / day_factor)),
    # where monday = 1 and sunday = 7
    day_of_week_sin=pl.col("day_of_week").map_elements(
        lambda x: np.sin(2 * np.pi * x / day_of_week_factor)
    ),
    day_of_week_cos=pl.col("day_of_week").map_elements(
        lambda x: np.cos(2 * np.pi * x / day_of_week_factor)
    ),
    week_of_year_sin=pl.col("week_of_year").map_elements(
        lambda x: np.sin(2 * np.pi * x / week_of_year_factor)
    ),
    week_of_year_cos=pl.col("week_of_year").map_elements(
        lambda x: np.cos(2 * np.pi * x / week_of_year_factor),
    ),
    month_sin=pl.col("month").map_elements(
        lambda x: np.sin(2 * np.pi * x / month_factor)
    ),
    month_cos=pl.col("month").map_elements(
        lambda x: np.cos(2 * np.pi * x / month_factor)
    ),
).head(3)

articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata,day,day_of_week,week_of_year,month,year,quarter,day_sin,day_cos,day_of_week_sin,day_of_week_cos,week_of_year_sin,week_of_year_cos,month_sin,month_cos
str,i64,str,date,str,str,str,i8,i8,i8,i8,i32,i8,f64,f64,f64,f64,f64,f64,f64,f64
"""58927e0495d0e0392607e1b3""",1129,"""article""",2017-02-02,"""The New York Times""","""news""","""ken belson n f l vs politics has been battle all season long football super bowl national football l…",2,4,5,2,2017,1,0.406737,0.913545,-0.433884,-0.900969,0.568065,0.822984,0.866025,0.5
"""5893033d95d0e0392607e2d6""",3082,"""article""",2017-02-02,"""The New York Times""","""news""","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…",2,4,5,2,2017,1,0.406737,0.913545,-0.433884,-0.900969,0.568065,0.822984,0.866025,0.5
"""5893039595d0e0392607e2da""",693,"""article""",2017-02-02,"""The New York Times""","""review""","""manohla dargis a stand up s downward slide movies the comedian movie de niro robert hackford taylor …",2,4,5,2,2017,1,0.406737,0.913545,-0.433884,-0.900969,0.568065,0.822984,0.866025,0.5


In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
import numpy as np
from typing import Callable


def sin_transformer(period: float) -> Callable[[np.ndarray], np.ndarray]:
    """
    Create a sine transformer function.

    Parameters
    ----------
    period : float
        The period of the sine function.

    Returns
    -------
    Callable[[np.ndarray], np.ndarray]
        A function that applies a sine transformation to input data.

    Notes
    -----
    The transformer applies the following function to the input:
    f(x) = sin(2 * pi * x / period)

    The input and output arrays are expected to have the same shape.
    """
    return FunctionTransformer(lambda x: np.sin(2 * np.pi * x / period))


def cos_transformer(period: float) -> Callable[[np.ndarray], np.ndarray]:
    """
    Create a cosine transformer function.

    Parameters
    ----------
    period : float
        The period of the cosine function.

    Returns
    -------
    Callable[[np.ndarray], np.ndarray]
        A function that applies a cosine transformation to input data.

    Notes
    -----
    The transformer applies the following function to the input:
    f(x) = cos(2 * pi * x / period)

    The input and output arrays are expected to have the same shape.
    """
    return FunctionTransformer(lambda x: np.cos(2 * np.pi * x / period))

In [12]:
cyclical_columns: list[str] = ["day", "day_of_week", "week_of_year", "month"]
cyclical_encoder: ColumnTransformer = ColumnTransformer(
    [
        ("day_sin", sin_transformer(day_factor), ["day"]),
        ("day_cos", cos_transformer(day_factor), ["day"]),
        ("day_of_week_sin", sin_transformer(day_of_week_factor), ["day_of_week"]),
        ("day_of_week_cos", cos_transformer(day_of_week_factor), ["day_of_week"]),
        ("week_of_year_sin", sin_transformer(week_of_year_factor), ["week_of_year"]),
        ("week_of_year_cos", cos_transformer(week_of_year_factor), ["day_of_week"]),
        ("month_sin", sin_transformer(12), ["month"]),
        ("month_cos", cos_transformer(12), ["month"]),
    ],
    remainder="passthrough",
)

encoded_features: np.ndarray = cyclical_encoder.fit_transform(
    df.select(cyclical_columns)
)

encoded_features

array([[ 0.4067,  0.9135, -0.4339, ...,  0.8855,  0.866 ,  0.5   ],
       [ 0.4067,  0.9135, -0.4339, ...,  0.8855,  0.866 ,  0.5   ],
       [ 0.4067,  0.9135, -0.4339, ...,  0.8855,  0.866 ,  0.5   ],
       ...,
       [ 0.2079,  0.9781,  0.9749, ...,  0.9709,  0.5   , -0.866 ],
       [ 0.2079,  0.9781,  0.9749, ...,  0.9709,  0.5   , -0.866 ],
       [ 0.2079,  0.9781,  0.9749, ...,  0.9709,  0.5   , -0.866 ]])

In [13]:
articles_df.head(1)

articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata
str,i64,str,date,str,str,str
"""58927e0495d0e0392607e1b3""",1129,"""article""",2017-02-02,"""The New York Times""","""news""","""ken belson n f l vs politics has been battle all season long football super bowl national football l…"


In [14]:
from sklearn.base import BaseEstimator, TransformerMixin
from typing import Union, Optional
import polars as pl
import pandas as pd


def extract_temporal_features(
    data: Union[pl.DataFrame, pd.DataFrame],
    date_column: str,
    date_format: str = "%Y-%m-%d %H:%M:%S",
) -> pl.DataFrame:
    """
    Extract temporal features from a date column in a Polars or Pandas DataFrame.

    Parameters
    ----------
    data : Union[pl.DataFrame, pd.DataFrame]
        Input DataFrame containing the date column.
    date_column : str
        Name of the column containing date information.
    date_format : str, optional
        Format of the date string, by default "%Y-%m-%d %H:%M:%S".

    Returns
    -------
    pl.DataFrame
        DataFrame with additional temporal features.
    """
    if isinstance(data, pd.DataFrame):
        data = pl.from_pandas(data)
    # Convert to datetime
    if not data[date_column].dtype == pl.Date:
        data = data.with_columns(pubDate=pl.col(date_column).str.to_date(date_format))
    else:
        data = data

    try:
        data = data.with_columns(
            day=pl.col(date_column).dt.day(),
            # where monday = 1 and sunday = 7
            day_of_week=pl.col(date_column).dt.weekday(),
            week_of_year=pl.col(date_column).dt.week(),
            month=pl.col(date_column).dt.month(),
            year=pl.col(date_column).dt.year(),
            quarter=pl.col(date_column).dt.quarter(),
        )
    except:
        print("Error creating temporal features")
        return pl.DataFrame()

    return data


class ExtractTemporalFeatures(BaseEstimator, TransformerMixin):
    """
    Transformer to extract temporal features from a date column.

    Parameters
    ----------
    date_column : str
        Name of the column containing date information.
    date_format : str, optional
        Format of the date string, by default "%Y-%m-%d %H:%M:%S".
    """

    def __init__(self, date_column: str, date_format: str = "%Y-%m-%d %H:%M:%S"):
        self.date_column: str = date_column
        self.date_format: str = date_format

    def fit(
        self, X: pl.DataFrame, y: Optional[pl.DataFrame] = None
    ) -> "ExtractTemporalFeatures":
        """
        Fit the transformer (no-op).

        Parameters
        ----------
        X : pl.DataFrame
            Input features.
        y : Optional[pl.DataFrame], optional
            Target variable, by default None.

        Returns
        -------
        ExtractTemporalFeatures
            Fitted transformer.
        """
        return self

    def transform(self, X: pl.DataFrame) -> pl.DataFrame:
        """
        Transform the input DataFrame by extracting temporal features.

        Parameters
        ----------
        X : pl.DataFrame
            Input features.

        Returns
        -------
        pl.DataFrame
            Transformed DataFrame with additional temporal features.
        """
        return extract_temporal_features(X, self.date_column, self.date_format)


class DropFeatures(BaseEstimator, TransformerMixin):
    """
    Transformer to drop specified features from a DataFrame.

    Parameters
    ----------
    features : list[str]
        List of feature names to be dropped.
    """

    def __init__(self, features: list[str]):
        self.features: list[str] = features

    def fit(
        self,
        X: Union[pl.DataFrame, pd.DataFrame],
        y: Optional[Union[pl.DataFrame, pd.DataFrame]] = None,
    ) -> "DropFeatures":
        """
        Fit the transformer (no-op).

        Parameters
        ----------
        X : Union[pl.DataFrame, pd.DataFrame]
            Input features.
        y : Optional[Union[pl.DataFrame, pd.DataFrame]], optional
            Target variable, by default None.

        Returns
        -------
        DropFeatures
            Fitted transformer.
        """
        return self

    def transform(
        self, X: Union[pl.DataFrame, pd.DataFrame]
    ) -> Union[pl.DataFrame, pd.DataFrame]:
        """
        Transform the input DataFrame by dropping specified features.

        Parameters
        ----------
        X : Union[pl.DataFrame, pd.DataFrame]
            Input features.

        Returns
        -------
        Union[pl.DataFrame, pd.DataFrame]
            Transformed DataFrame with specified features dropped.

        Raises
        ------
        ValueError
            If the input is not a pandas DataFrame or a polars DataFrame.
        """
        if isinstance(X, pl.DataFrame):
            return X.drop(self.features)
        elif isinstance(X, pd.DataFrame):
            return X.drop(columns=self.features)
        else:
            raise ValueError("Input must be a pandas DataFrame or a polars DataFrame")

In [15]:
import numpy as np
import polars as pl
import pandas as pd
from typing import Literal, Union, Optional
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler


def create_cyclic_features(data: Union[pl.DataFrame, pd.DataFrame]) -> pl.DataFrame:
    """
    Create cyclic temporal features from the input DataFrame.

    Parameters
    ----------
    data : Union[pl.DataFrame, pd.DataFrame]
        Input DataFrame containing temporal features.

    Returns
    -------
    pl.DataFrame
        DataFrame with additional cyclic temporal features.

    Notes
    -----
    This function creates sine and cosine transformations for day, day of week,
    week of year, and month columns.
    """
    day_factor: int = 30
    day_of_week_factor: int = 7
    week_of_year_factor: int = 52
    month_factor: int = 12

    if isinstance(data, pd.DataFrame):
        data = pl.from_pandas(data)

    try:
        data = data.with_columns(
            day_sin=pl.col("day").map_elements(
                lambda x: np.sin(2 * np.pi * x / day_factor)
            ),
            day_cos=pl.col("day").map_elements(
                lambda x: np.cos(2 * np.pi * x / day_factor)
            ),
            # where monday = 1 and sunday = 7
            day_of_week_sin=pl.col("day_of_week").map_elements(
                lambda x: np.sin(2 * np.pi * x / day_of_week_factor)
            ),
            day_of_week_cos=pl.col("day_of_week").map_elements(
                lambda x: np.cos(2 * np.pi * x / day_of_week_factor)
            ),
            week_of_year_sin=pl.col("week_of_year").map_elements(
                lambda x: np.sin(2 * np.pi * x / week_of_year_factor)
            ),
            week_of_year_cos=pl.col("week_of_year").map_elements(
                lambda x: np.cos(2 * np.pi * x / week_of_year_factor),
            ),
            month_sin=pl.col("month").map_elements(
                lambda x: np.sin(2 * np.pi * x / month_factor)
            ),
            month_cos=pl.col("month").map_elements(
                lambda x: np.cos(2 * np.pi * x / month_factor)
            ),
        )
    except:
        print("Error creating cyclic temporal features")
        return pl.DataFrame()

    return data


class CyclicTemporalFeatures(BaseEstimator, TransformerMixin):
    """
    Transformer for creating cyclic temporal features.

    This transformer applies the create_cyclic_features function to the input data.
    """

    def __init__(self) -> None:
        """
        Initialize the CyclicTemporalFeatures transformer.
        """
        ...

    def fit(
        self, X: pl.DataFrame, y: Optional[pl.DataFrame] = None
    ) -> "CyclicTemporalFeatures":
        """
        Fit the transformer to the data.

        This method is a no-op and returns self.

        Parameters
        ----------
        X : pl.DataFrame, shape (n_samples, n_features)
            Input features.
        y : Optional[pl.DataFrame], default=None
            Target values (ignored).

        Returns
        -------
        CyclicTemporalFeatures
            The fitted transformer.
        """
        return self

    def transform(self, X: pl.DataFrame) -> pl.DataFrame:
        """
        Transform the input DataFrame by extracting temporal features.

        Parameters
        ----------
        X : pl.DataFrame, shape (n_samples, n_features)
            Input features.

        Returns
        -------
        pl.DataFrame, shape (n_samples, n_features + n_cyclic_features)
            Transformed DataFrame with additional temporal features.
        """
        return create_cyclic_features(X)

In [16]:
class NumericalScaler(BaseEstimator, TransformerMixin):
    """
    Transformer for scaling numerical features.

    This transformer applies either StandardScaler or MinMaxScaler to the specified features.

    Parameters
    ----------
    scaler_type : Literal["standard", "min_max"]
        Type of scaler to use.
    features : list[str] | None, optional
        List of features to scale. If None, all features will be scaled.
    exclude_features : list[str] | None, optional
        List of features to exclude from scaling.

    Attributes
    ----------
    scaler : StandardScaler | MinMaxScaler
        The scaler object used for transformation.
    features : list[str] | None
        List of features to scale.
    exclude_features : list[str]
        List of features to exclude from scaling.
    ignore_columns_ : list[str]
        List of columns to ignore during scaling.
    """

    def __init__(
        self,
        scaler_type: Literal["standard", "min_max"],
        features: list[str] | None = None,
        exclude_features: list[str] | None = None,
    ) -> None:
        self.scaler_type: Literal["standard", "min_max"] = scaler_type
        if features is None and exclude_features is None:
            raise ValueError("`features` and `exclude_features` cannot both be None")
        if features is not None and exclude_features is not None:
            raise ValueError(
                "`features` and `exclude_features` cannot both be not None"
            )

        self.features: list[str] | None = features

        if exclude_features is None:
            self.exclude_features: list[str] = []
        else:
            self.exclude_features: list[str] = exclude_features

        if scaler_type not in ["standard", "min_max"]:
            raise ValueError("scaler_type must be either 'standard' or 'min_max'")
        if scaler_type == "standard":
            self.scaler: StandardScaler = StandardScaler()
        else:
            self.scaler: MinMaxScaler = MinMaxScaler(feature_range=(0, 1), clip=True)

    def fit(
        self, X: pl.DataFrame, y: Optional[pl.DataFrame] = None
    ) -> "NumericalScaler":
        """
        Fit the transformer to the data.

        Parameters
        ----------
        X : pl.DataFrame, shape (n_samples, n_features)
            Input features.
        y : Optional[pl.DataFrame], default=None
            Target values (ignored).

        Returns
        -------
        NumericalScaler
            The fitted transformer.
        """
        if isinstance(X, pd.DataFrame):
            X = pl.from_pandas(X)
        if self.features is not None:
            self.ignore_columns_: list[str] = sorted(
                set(X.columns) - set(self.features)
            )

        elif self.exclude_features and self.features is None:
            self.ignore_columns_: list[str] = self.exclude_features
            self.features = sorted(set(X.columns) - set(self.exclude_features))

        self.ignore_columns_ = self.ignore_columns_
        self.scaler.fit(X.select(self.features))
        return self

    def transform(self, X: pl.DataFrame) -> pl.DataFrame:
        """
        Transform the input DataFrame by scaling numerical features.

        Parameters
        ----------
        X : pl.DataFrame, shape (n_samples, n_features)
            Input features.

        Returns
        -------
        pl.DataFrame, shape (n_samples, n_features)
            Transformed DataFrame with scaled features.
        """
        if isinstance(X, pd.DataFrame):
            X = pl.from_pandas(X)
        vector: np.ndarray = self.scaler.transform(X.select(self.features))
        ignore_df: pl.DataFrame = X.select(self.ignore_columns_)
        vector_df: pl.DataFrame = pl.DataFrame(vector, schema=self.features)
        df: pl.DataFrame = pl.concat([ignore_df, vector_df], how="horizontal")
        return df


class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    """
    A custom one-hot encoder that works with Polars DataFrames.

    Parameters
    ----------
    features : list[str] | None, optional
        List of column names to encode. If None, all columns will be encoded.

    Attributes
    ----------
    features : list[str] | None
        List of column names to encode.
    encoder : OneHotEncoder
        The underlying scikit-learn OneHotEncoder.
    ignore_columns : list[str]
        List of column names to ignore during encoding.
    ignore_columns_ : list[str]
        List of column names ignored during encoding after fitting.
    """

    def __init__(
        self,
        features: list[str] | None = None,
    ) -> None:
        self.features: list[str] | None = features
        self.encoder: OneHotEncoder = OneHotEncoder(handle_unknown="ignore")
        self.ignore_columns: list[str] = []
        self.ignore_columns_: list[str] = []

    def fit(
        self, X: pl.DataFrame, y: pl.DataFrame | None = None
    ) -> "CustomOneHotEncoder":
        """
        Fit the OneHotEncoder to the input data.

        Parameters
        ----------
        X : pl.DataFrame, shape (n_samples, n_features)
            Input features to fit the encoder.
        y : pl.DataFrame | None, optional
            Ignored. Kept for scikit-learn compatibility.

        Returns
        -------
        CustomOneHotEncoder
            The fitted encoder.
        """
        if isinstance(X, pd.DataFrame):
            X = pl.from_pandas(X)
        if self.features is None:
            self.features = X.columns
        self.ignore_columns_: list[str] = list(set(X.columns) - set(self.features))
        self.encoder.fit(X.select(self.features))
        return self

    def transform(self, X: pl.DataFrame) -> pl.DataFrame:
        """
        Transform the input data using the fitted encoder.

        Parameters
        ----------
        X : pl.DataFrame, shape (n_samples, n_features)
            Input features to transform.

        Returns
        -------
        pl.DataFrame, shape (n_samples, n_encoded_features)
            Transformed DataFrame with one-hot encoded features.
        """
        if isinstance(X, pd.DataFrame):
            X = pl.from_pandas(X)
        vector: np.ndarray = self.encoder.transform(X.select(self.features)).toarray()
        ignore_df: pl.DataFrame = X.select(self.ignore_columns_)
        vector_df: pl.DataFrame = pl.DataFrame(
            vector, schema=sorted(self.encoder.get_feature_names_out())
        )
        df: pl.DataFrame = pl.concat([ignore_df, vector_df], how="horizontal")
        return df

In [17]:
temporal_transf: ExtractTemporalFeatures = ExtractTemporalFeatures(
    date_column="pubDate", date_format="%Y-%m-%d %H:%M:%S"
)
temporal_transf.fit_transform(articles_df).head(2)

articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata,day,day_of_week,week_of_year,month,year,quarter
str,i64,str,date,str,str,str,i8,i8,i8,i8,i32,i8
"""58927e0495d0e0392607e1b3""",1129,"""article""",2017-02-02,"""The New York Times""","""news""","""ken belson n f l vs politics has been battle all season long football super bowl national football l…",2,4,5,2,2017,1
"""5893033d95d0e0392607e2d6""",3082,"""article""",2017-02-02,"""The New York Times""","""news""","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…",2,4,5,2,2017,1


In [18]:
cyclic_transf: CyclicTemporalFeatures = CyclicTemporalFeatures()
cyclic_transf.fit_transform(df).head(2)

articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata,day,day_of_week,week_of_year,month,year,quarter,day_sin,day_cos,day_of_week_sin,day_of_week_cos,week_of_year_sin,week_of_year_cos,month_sin,month_cos
str,i64,str,date,str,str,str,i8,i8,i8,i8,i32,i8,f64,f64,f64,f64,f64,f64,f64,f64
"""58927e0495d0e0392607e1b3""",1129,"""article""",2017-02-02,"""The New York Times""","""news""","""ken belson n f l vs politics has been battle all season long football super bowl national football l…",2,4,5,2,2017,1,0.406737,0.913545,-0.433884,-0.900969,0.568065,0.822984,0.866025,0.5
"""5893033d95d0e0392607e2d6""",3082,"""article""",2017-02-02,"""The New York Times""","""news""","""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…",2,4,5,2,2017,1,0.406737,0.913545,-0.433884,-0.900969,0.568065,0.822984,0.866025,0.5


In [19]:
A: pl.DataFrame = pl.DataFrame(
    {"income": [10000, 20000, 30000, 40000, 50000], "age": [25, 30, 35, 40, 45]}
)

scaler: NumericalScaler = NumericalScaler(
    scaler_type="min_max", features=["age"], exclude_features=None
)
print(scaler.fit_transform(A).describe())

shape: (9, 3)
┌────────────┬──────────────┬──────────┐
│ statistic  ┆ income       ┆ age      │
│ ---        ┆ ---          ┆ ---      │
│ str        ┆ f64          ┆ f64      │
╞════════════╪══════════════╪══════════╡
│ count      ┆ 5.0          ┆ 5.0      │
│ null_count ┆ 0.0          ┆ 0.0      │
│ mean       ┆ 30000.0      ┆ 0.5      │
│ std        ┆ 15811.388301 ┆ 0.395285 │
│ min        ┆ 10000.0      ┆ 0.0      │
│ 25%        ┆ 20000.0      ┆ 0.25     │
│ 50%        ┆ 30000.0      ┆ 0.5      │
│ 75%        ┆ 40000.0      ┆ 0.75     │
│ max        ┆ 50000.0      ┆ 1.0      │
└────────────┴──────────────┴──────────┘


In [20]:
ohe: CustomOneHotEncoder = CustomOneHotEncoder(features=["documentType"])
ohe.fit_transform(articles_df).head(2)

pubDate,articleID,source,typeOfMaterial,articleWordCount,metadata,documentType_article,documentType_blogpost
date,str,str,str,i64,str,f64,f64
2017-02-02,"""58927e0495d0e0392607e1b3""","""The New York Times""","""news""",1129,"""ken belson n f l vs politics has been battle all season long football super bowl national football l…",1.0,0.0
2017-02-02,"""5893033d95d0e0392607e2d6""","""The New York Times""","""news""",3082,"""unknown voice vice veracity television home box office girls tv program dunham lena mamet zosia kirk…",1.0,0.0


<hr>

### Vectorize The Metadata

- using TF-IDF

In [21]:
# tfidf, SVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix


n_components: int = 100
seed: int = 123

tfidf_vec: TfidfVectorizer = TfidfVectorizer(stop_words="english", max_df=0.9, min_df=2)
tfidf_matrix: csr_matrix = tfidf_vec.fit_transform(df["metadata"])

# Extract the latent features
svd = TruncatedSVD(n_components=n_components, random_state=seed)
svd_matrix: np.ndarray = svd.fit_transform(tfidf_matrix)

print(f"{svd_matrix.shape = }")

svd_matrix.shape = (9335, 100)


In [22]:
class TextDimensionalityReducer(BaseEstimator, TransformerMixin):
    """
    A transformer that reduces the dimensionality of text data using TF-IDF and SVD.

    Parameters
    ----------
    feature : str
        The name of the feature column containing the text data.
    max_df : float, default=0.9
        The maximum document frequency for TfidfVectorizer.
    min_df : int, default=2
        The minimum document frequency for TfidfVectorizer.
    n_components : int, default=100
        The number of components to keep in TruncatedSVD.
    random_state : int, default=42
        The random state for TruncatedSVD.

    Attributes
    ----------
    tfidf : TfidfVectorizer
        The TF-IDF vectorizer.
    svd : TruncatedSVD
        The SVD transformer.
    ignore_columns_ : list[str]
        List of columns to ignore during transformation.
    """

    def __init__(
        self,
        feature: str,
        max_df: float = 0.9,
        min_df: int = 2,
        n_components: int = 100,
        random_state: int = 42,
    ) -> None:
        self.feature: str = feature
        self.max_df: float = max_df
        self.min_df: int = min_df
        self.n_components: int = n_components
        self.random_state: int = random_state
        self.tfidf: TfidfVectorizer = TfidfVectorizer(
            stop_words="english", max_df=max_df, min_df=min_df
        )
        self.svd: TruncatedSVD = TruncatedSVD(
            n_components=n_components, random_state=random_state
        )

    def fit(
        self, X: pd.DataFrame | pl.DataFrame, y: None = None
    ) -> "TextDimensionalityReducer":
        """
        Fit the TF-IDF vectorizer and SVD transformer.

        Parameters
        ----------
        X : pd.DataFrame | pl.DataFrame, shape (n_samples, n_features)
            The input DataFrame containing the text data.
        y : None
            Ignored. This parameter exists only for compatibility with sklearn.

        Returns
        -------
        self : TextDimensionalityReducer
            The fitted transformer.
        """
        if isinstance(X, pd.DataFrame):
            X = pl.from_pandas(X)
        self.ignore_columns_: list[str] = list(set(X.columns) - {self.feature})
        self.tfidf.fit(X[self.feature])
        self.svd.fit(tfidf_matrix)
        return self

    def transform(self, X: pd.DataFrame | pl.DataFrame) -> pl.DataFrame:
        """
        Transform the input text data using the fitted TF-IDF vectorizer and SVD transformer.

        Parameters
        ----------
        X : pd.DataFrame | pl.DataFrame, shape (n_samples, n_features)
            The input DataFrame containing the text data.

        Returns
        -------
        pl.DataFrame, shape (n_samples, n_components + n_ignore_columns)
            The transformed data as a Polars DataFrame.
        """
        if isinstance(X, pd.DataFrame):
            X = pl.from_pandas(X)
        tfidf_matrix: csr_matrix = self.tfidf.transform(X[self.feature])
        svd_matrix: np.ndarray = self.svd.transform(tfidf_matrix)
        svd_df: pl.DataFrame = pl.DataFrame(
            svd_matrix, schema=[f"svd_{i}" for i in range(self.svd.n_components)]
        )
        ignore_df: pl.DataFrame = X.select(self.ignore_columns_)
        df: pl.DataFrame = pl.concat([ignore_df, svd_df], how="horizontal")
        return df

In [23]:
vectorizer: TextDimensionalityReducer = TextDimensionalityReducer(
    feature="metadata", max_df=0.9, min_df=2, n_components=100, random_state=42
)

vectorizer.fit_transform(articles_df).head()

pubDate,articleID,source,documentType,articleWordCount,typeOfMaterial,svd_0,svd_1,svd_2,svd_3,svd_4,svd_5,svd_6,svd_7,svd_8,svd_9,svd_10,svd_11,svd_12,svd_13,svd_14,svd_15,svd_16,svd_17,svd_18,svd_19,svd_20,svd_21,svd_22,svd_23,svd_24,svd_25,svd_26,svd_27,svd_28,svd_29,svd_30,svd_31,svd_32,svd_33,svd_34,svd_35,svd_36,svd_37,svd_38,svd_39,svd_40,svd_41,svd_42,svd_43,svd_44,svd_45,svd_46,svd_47,svd_48,svd_49,svd_50,svd_51,svd_52,svd_53,svd_54,svd_55,svd_56,svd_57,svd_58,svd_59,svd_60,svd_61,svd_62,svd_63,svd_64,svd_65,svd_66,svd_67,svd_68,svd_69,svd_70,svd_71,svd_72,svd_73,svd_74,svd_75,svd_76,svd_77,svd_78,svd_79,svd_80,svd_81,svd_82,svd_83,svd_84,svd_85,svd_86,svd_87,svd_88,svd_89,svd_90,svd_91,svd_92,svd_93,svd_94,svd_95,svd_96,svd_97,svd_98,svd_99
date,str,str,str,i64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2017-02-02,"""58927e0495d0e0392607e1b3""","""The New York Times""","""article""",1129,"""news""",0.100895,0.022488,-0.023352,-0.018642,0.002417,0.032182,-0.024601,-0.002699,-0.029058,-0.063426,0.008866,-0.022927,-0.035261,-0.004732,-0.079328,-0.041945,0.008468,0.032477,0.021302,0.099695,-0.022621,-0.198508,0.008905,-0.014157,0.07043,0.098127,0.072687,-0.080219,0.027539,-0.00941,-0.006321,-8.1e-05,-0.00272,0.01373,-0.04271,0.055854,-0.025265,0.009899,-0.015902,0.016586,-0.050916,-0.083509,-0.032591,0.017223,0.035879,-0.036042,0.055502,0.006153,-0.038048,0.018617,0.040787,-0.079771,0.066853,0.127337,-0.027599,0.060539,-0.039366,-0.020806,0.035107,-0.18739,0.06328,-0.018303,0.010282,0.104725,0.08834,-0.037025,0.112204,-0.004099,-0.154091,-0.039279,0.226976,0.180241,-0.045481,0.019167,0.244074,-0.003071,-0.007402,0.056651,0.078674,-0.020901,0.10741,0.078625,0.081812,-0.043659,-0.01258,-0.019796,0.011319,0.026419,0.065572,-0.002698,-0.050238,0.019966,0.035497,0.002884,-0.027083,-0.004999,-0.027153,0.034294,-0.00785,-0.043813
2017-02-02,"""5893033d95d0e0392607e2d6""","""The New York Times""","""article""",3082,"""news""",0.041555,0.058805,-0.02818,0.001639,0.124177,0.308575,-0.038061,-0.035133,-0.005005,-0.011528,-0.004598,0.04539,-0.00771,0.01497,-0.020833,0.058481,-0.040176,0.043832,-0.071226,-0.071868,0.013714,-0.032309,0.029302,-0.036629,-0.057186,-0.012025,-0.012631,-0.009917,0.021313,-0.005152,0.033018,0.000208,-0.002369,-0.016012,0.037983,-0.034068,-0.011144,-0.005574,0.012004,0.006483,-0.009692,-0.001914,0.007596,-0.048251,0.035707,-0.006464,-0.001257,0.036971,0.010817,0.01674,0.01783,-0.024946,-0.010972,-0.013334,0.014042,0.021861,-0.02585,0.001552,0.008714,0.005651,-0.016779,0.025961,-0.005864,-0.008823,0.019601,-0.009458,-0.033054,0.0072,-0.024101,-0.003405,0.005678,-0.044982,-0.020048,0.014874,0.005044,0.018393,0.002576,-0.009803,0.013122,-0.011626,-0.005898,0.001834,0.015161,0.012166,-0.023194,0.019016,-0.001903,-0.027748,0.016442,-0.016357,-0.010636,0.009057,0.002796,-0.020858,0.005906,-0.000172,-0.010808,-0.005737,0.01132,-0.010516
2017-02-02,"""5893039595d0e0392607e2da""","""The New York Times""","""article""",693,"""review""",0.030966,0.032109,-0.011958,-0.003396,0.012757,0.01639,-0.009756,0.011066,-0.008788,-0.009722,-0.006251,0.031462,0.012068,-0.002771,-0.006811,-0.015055,0.009893,0.010433,0.046428,-0.017879,0.003603,-0.002502,0.005325,0.014589,0.014721,0.000152,-0.021086,0.003915,0.018467,0.00969,-0.007866,-0.016882,0.029003,0.00475,-0.013102,-0.01276,-0.010295,-0.015794,-0.004824,-0.011895,0.022185,-0.012387,0.003158,0.02942,0.009181,-0.017519,0.031438,-0.025329,0.042852,-0.024429,-0.005455,-0.027267,-0.013211,0.038425,-0.018333,-0.079209,0.037963,0.012434,0.050055,0.062355,-0.02679,0.059736,-0.064228,0.074443,0.060387,0.076455,0.013092,-0.044053,-0.062355,0.042078,-0.002416,-0.095753,-0.01508,0.05168,-0.006775,0.031804,-0.008068,0.050228,0.0268,-0.010438,0.000234,0.012462,0.034272,0.014695,-0.00891,0.026493,0.005453,0.022017,0.011463,0.011974,-0.000244,-0.025305,0.023798,-0.004011,0.016579,0.008921,0.011715,-0.011884,-0.004107,0.002139
2017-02-02,"""5893109995d0e0392607e2ef""","""The New York Times""","""article""",1049,"""briefing""",0.062635,0.250217,-0.212203,-0.010359,-0.170476,-0.018051,-0.032361,-0.127276,0.051845,-0.054418,0.011024,0.01381,0.009555,0.001174,0.015495,0.034745,-0.015226,-0.041232,-0.001054,0.000935,-0.020575,0.00742,0.0074,0.000259,-0.00572,-0.015784,-0.014189,-0.041085,0.00723,-0.024442,-0.015389,0.021408,0.014139,-0.027881,-0.004779,-0.034449,2.5e-05,0.00662,0.01326,-0.036029,-0.001541,-0.016801,-0.026094,0.000748,0.018119,-0.003227,0.018734,-0.002024,-0.00713,-0.022174,-0.042681,0.003473,-0.003275,0.015525,-0.004715,0.012637,0.014904,0.031276,0.023136,-0.024663,0.007261,-0.002209,-0.001269,0.002493,-0.023573,-0.017415,-0.02779,0.008781,-0.003988,-0.006393,-0.0433,-0.000779,0.010382,-0.006996,0.026055,0.01668,-0.006054,0.009759,0.015302,0.005052,0.003658,0.031809,-0.037286,0.030493,0.019201,0.032971,-0.056591,-0.02735,-0.043749,-0.069914,-0.035567,-0.064828,0.033528,0.011405,-0.01617,-0.004999,-0.013951,0.035276,0.041575,-0.009052
2017-02-02,"""5893114495d0e0392607e2f1""","""The New York Times""","""article""",1283,"""news""",0.017078,0.030619,-0.014294,0.00037,0.014991,0.006356,0.014298,0.018701,-0.017072,-0.014875,-0.006343,0.028603,0.000795,-0.009578,-0.02139,-0.030533,0.009234,-0.004695,0.009761,0.05839,0.013345,0.004477,0.000683,0.042255,0.029589,-0.015623,-0.076327,-0.014968,0.038243,0.071236,0.077405,-0.031501,0.1288,-0.054009,0.193003,0.039292,0.113187,0.031528,-0.116956,0.001387,-0.01882,0.033208,-0.022851,-0.016735,-0.0281,0.040313,-0.001682,-0.018842,-0.028066,0.011163,-0.02713,0.001483,-0.002988,0.01335,0.015921,-0.014958,0.003447,-0.004519,-0.014044,-0.015712,-0.014374,-0.006123,0.010723,0.034792,-0.007591,0.003766,-0.019346,-0.035151,0.016207,-0.030456,-0.029779,0.012984,-0.005369,-0.00979,0.015454,0.007897,0.021799,0.021894,-0.004975,-0.020386,-0.017996,-0.012728,-0.001723,-0.016804,0.003863,-0.000101,0.01166,0.006501,-0.001733,-0.002659,0.006069,-0.007165,-0.013148,0.006045,0.003475,-0.005855,0.005089,-0.017252,-0.031038,0.004825


In [24]:
# One-hot encode
ohe_columns: list[str] = ["documentType", "source", "typeOfMaterial", "year"]
# Others
text_columns: list[str] = ["metadata"]
numeric_columns: list[str] = ["articleWordCount"]
date_columns: list[str] = ["pubDate"]

# Columns to drop
drop_columns: list[str] = cyclical_columns + date_columns
drop_columns

['day', 'day_of_week', 'week_of_year', 'month', 'pubDate']

In [25]:
max_df: float = 0.9
min_df: float = 2
n_components: int = 100

preprocessor_pipe: Pipeline = Pipeline(
    steps=[
        (
            "temporal_features",
            ExtractTemporalFeatures(
                date_column="pubDate", date_format="%Y-%m-%d %H:%M:%S"
            ),
        ),
        ("cyclical_features", CyclicTemporalFeatures()),
        ("ohe", CustomOneHotEncoder(features=ohe_columns)),
        (
            "tfidf_dim_reducer",
            TextDimensionalityReducer(
                feature="metadata",
                max_df=max_df,
                min_df=min_df,
                n_components=n_components,
                random_state=seed,
            ),
        ),
        ("drop_features", DropFeatures(features=drop_columns)),
        (
            "scaler",
            NumericalScaler(scaler_type="min_max", exclude_features=["articleID"]),
        ),
    ]
)

preprocessor_pipe

In [26]:
articles_df.head(1)

articleID,articleWordCount,documentType,pubDate,source,typeOfMaterial,metadata
str,i64,str,date,str,str,str
"""58927e0495d0e0392607e1b3""",1129,"""article""",2017-02-02,"""The New York Times""","""news""","""ken belson n f l vs politics has been battle all season long football super bowl national football l…"


In [27]:
temp_df: pl.DataFrame = preprocessor_pipe.fit_transform(articles_df)
temp_df.head(2)

articleID,articleWordCount,day_cos,day_of_week_cos,day_of_week_sin,day_sin,documentType_article,documentType_blogpost,month_cos,month_sin,quarter,source_International New York Times,source_The New York Times,svd_0,svd_1,svd_10,svd_11,svd_12,svd_13,svd_14,svd_15,svd_16,svd_17,svd_18,svd_19,svd_2,svd_20,svd_21,svd_22,svd_23,svd_24,svd_25,svd_26,svd_27,svd_28,svd_29,svd_3,svd_30,svd_31,svd_32,svd_33,svd_34,svd_35,svd_36,svd_37,svd_38,svd_39,svd_4,svd_40,svd_41,svd_42,svd_43,svd_44,svd_45,svd_46,svd_47,svd_48,svd_49,svd_5,svd_50,svd_51,svd_52,svd_53,svd_54,svd_55,svd_56,svd_57,svd_58,svd_59,svd_6,svd_60,svd_61,svd_62,svd_63,svd_64,svd_65,svd_66,svd_67,svd_68,svd_69,svd_7,svd_70,svd_71,svd_72,svd_73,svd_74,svd_75,svd_76,svd_77,svd_78,svd_79,svd_8,svd_80,svd_81,svd_82,svd_83,svd_84,svd_85,svd_86,svd_87,svd_88,svd_89,svd_9,svd_90,svd_91,svd_92,svd_93,svd_94,svd_95,svd_96,svd_97,svd_98,svd_99,typeOfMaterial_blog,typeOfMaterial_briefing,typeOfMaterial_editorial,typeOfMaterial_news,typeOfMaterial_op-ed,typeOfMaterial_other,typeOfMaterial_review,week_of_year_cos,week_of_year_sin,year_2017,year_2018
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""58927e0495d0e0392607e1b3""",0.068484,0.956773,0.0,0.277479,0.704489,1.0,0.0,0.803848,0.866025,0.0,0.0,1.0,0.221033,0.224845,0.439223,0.629152,0.441818,0.417972,0.40732,0.255846,0.384934,0.408569,0.453728,0.661723,0.355873,0.37715,0.105389,0.541629,0.293892,0.703852,0.596195,0.630198,0.232013,0.429232,0.388551,0.202904,0.475688,0.356828,0.460916,0.412173,0.325551,0.546227,0.291511,0.504872,0.398499,0.341415,0.348516,0.513175,0.2962,0.387273,0.383808,0.642245,0.462672,0.520645,0.622692,0.266177,0.608373,0.317671,0.527612,0.353455,0.588171,0.232648,0.364551,0.666549,0.395409,0.403708,0.560287,0.029548,0.425799,0.509684,0.54772,0.493882,0.707498,0.696135,0.305023,0.303554,0.357959,0.122474,0.059384,0.322989,0.043071,0.975874,0.549446,0.770333,0.68325,0.42825,0.607905,0.504084,0.454795,0.682183,0.346192,0.482609,0.383231,0.43304,0.41259,0.237244,0.342547,0.464673,0.434048,0.588901,0.164109,0.41491,0.327889,0.454726,0.377299,0.330293,0.45115,0.437275,0.445588,0.404767,0.505335,0.583252,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.906115,0.568065,1.0,0.0
"""5893033d95d0e0392607e2d6""",0.188116,0.956773,0.0,0.277479,0.704489,1.0,0.0,0.803848,0.866025,0.0,0.0,1.0,0.085475,0.280732,0.420079,0.732249,0.487813,0.450618,0.493545,0.425642,0.321191,0.428561,0.308506,0.318579,0.350913,0.435888,0.370166,0.579572,0.260129,0.423124,0.397649,0.423097,0.368915,0.419444,0.39696,0.22507,0.527499,0.356854,0.461112,0.357428,0.461275,0.355368,0.323737,0.481986,0.455444,0.333798,0.490713,0.580282,0.46006,0.48781,0.248732,0.64079,0.501682,0.383896,0.691055,0.360212,0.579203,0.654952,0.481954,0.456426,0.433663,0.585768,0.510012,0.53418,0.415686,0.390055,0.553613,0.569359,0.40715,0.385237,0.443113,0.412654,0.480637,0.42159,0.389437,0.503989,0.443793,0.310672,0.495726,0.277242,0.361981,0.337403,0.614467,0.32672,0.456525,0.615385,0.529001,0.471156,0.458697,0.388267,0.377877,0.510209,0.375302,0.428508,0.539983,0.391533,0.469195,0.459612,0.445002,0.551439,0.190916,0.491709,0.479321,0.480244,0.477779,0.382118,0.465296,0.477828,0.472797,0.406203,0.319982,0.598885,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.906115,0.568065,1.0,0.0
