Skip to content

Commit

Permalink
refactor: add support for polars (#87)
Browse files (browse the repository at this point in the history)
Co-authored-by: Chris Lemke <1@lemke.ai>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people committed Apr 18, 2023
1 parent d2cf4ae commit b831485
Show file tree
Hide file tree
Showing 12 changed files with 927 additions and 1,781 deletions.
2 changes: 1 addition & 1 deletion examples/playground.ipynb
Original file line number | Diff line number | Diff line change
Expand Up @@ -665,7 +665,7 @@
"from sk_transformers import StringSlicerTransformer\n",
"\n",
"X = pd.DataFrame({\"foo\": [\"abc\", \"def\", \"ghi\"], \"bar\": [\"jkl\", \"mno\", \"pqr\"]})\n",
"transformer = StringSlicerTransformer([(\"foo\", (0, 3, 2)), (\"bar\", (2,))])\n",
"transformer = StringSlicerTransformer([(\"foo\", (1, 3)), (\"bar\", (2,))])\n",
"transformer.fit_transform(X)"
]
},
Expand Down
1,205 changes: 527 additions & 678 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number | Diff line number | Diff line change
Expand Up @@ -46,8 +46,9 @@ numpy = "1.23.5"
pandas = "^1.5.2"
phonenumbers = "^8.13.4"
scikit-learn = "^1.2.0"
swifter = "^1.3.4"
rich = "^13.3.1"
polars = "^0.16.7"
pyarrow = "^11.0.0"

[tool.poetry.group.test.dependencies]
pytest = "^7.2.0"
Expand Down Expand Up @@ -108,6 +109,7 @@ disallow_subclassing_any = false
ignore_missing_imports = true
disallow_any_generics = false
warn_return_any = false
warn_unused_ignores = false
no_namespace_packages = true
exclude = ["tests", "docs"]

Expand Down
799 changes: 0 additions & 799 deletions requirements.txt

This file was deleted.

130 changes: 73 additions & 57 deletions src/sk_transformers/datetime_transformer.py
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
from datetime import datetime
from typing import List, Tuple
from typing import Dict, List, Tuple

import pandas as pd
import polars as pl

from sk_transformers.base_transformer import BaseTransformer
from sk_transformers.utils import check_ready_to_transform
Expand Down Expand Up @@ -78,43 +78,49 @@ def transform( # pylint: disable=too-many-branches
pandas.DataFrame: Dataframe with transformed columns.
"""

X = check_ready_to_transform(self, X, self.features)
X = check_ready_to_transform(self, X, self.features, return_polars=True)

for column in self.features:
X[column] = pd.to_datetime(
X[column], format=self.date_format, errors=self.errors
for column in self.features: # pylint: disable=duplicate-code
X = X.with_columns(
pl.col(column)
.str.strptime(pl.Datetime, fmt=self.date_format)
.alias(column + "_datetime")
)
if "year" in self.date_elements:
X[f"{column}_year"] = X[column].dt.year
if "month" in self.date_elements:
X[f"{column}_month"] = X[column].dt.month
if "day" in self.date_elements:
X[f"{column}_day"] = X[column].dt.day
if "day_of_week" in self.date_elements:
X[f"{column}_day_of_week"] = X[column].dt.dayofweek
if "day_of_year" in self.date_elements:
X[f"{column}_day_of_year"] = X[column].dt.dayofyear
if "week_of_year" in self.date_elements:
X[f"{column}_week_of_year"] = X[column].dt.isocalendar().week
if "quarter" in self.date_elements:
X[f"{column}_quarter"] = X[column].dt.quarter
if "is_leap_year" in self.date_elements:
X[f"{column}_is_leap_year"] = X[column].dt.is_leap_year
if "is_month_start" in self.date_elements:
X[f"{column}_is_month_start"] = X[column].dt.is_month_start
if "is_month_end" in self.date_elements:
X[f"{column}_is_month_end"] = X[column].dt.is_month_end
if "is_quarter_start" in self.date_elements:
X[f"{column}_is_quarter_start"] = X[column].dt.is_quarter_start
if "is_quarter_end" in self.date_elements:
X[f"{column}_is_quarter_end"] = X[column].dt.is_quarter_end
if "is_year_start" in self.date_elements:
X[f"{column}_is_year_start"] = X[column].dt.is_year_start
if "is_year_end" in self.date_elements:
X[f"{column}_is_year_end"] = X[column].dt.is_year_end
if "is_weekend" in self.date_elements:
X[f"{column}_is_weekend"] = X[column].dt.dayofweek.isin([5, 6])
return X

date_element_dict: Dict[str, pl.Expr] = {
"year": pl.col(f"{column}_datetime").dt.year(),
"month": pl.col(f"{column}_datetime").dt.month(),
"day": pl.col(f"{column}_datetime").dt.day(),
"day_of_week": pl.col(f"{column}_datetime").dt.weekday() - 1,
"day_of_year": pl.col(f"{column}_datetime").dt.ordinal_day(),
"week_of_year": pl.col(f"{column}_datetime").dt.week(),
"quarter": pl.col(f"{column}_datetime").dt.quarter(),
"is_leap_year": pl.col(f"{column}_datetime").dt.year() % 4 == 0,
"is_month_start": pl.col(f"{column}_datetime").dt.day() == 1,
"is_month_end": pl.col(f"{column}_datetime")
.dt.day()
.is_in([28, 29, 30, 31]),
"is_quarter_start": pl.col(f"{column}_datetime")
.dt.ordinal_day()
.is_in([1, 91, 183, 275]),
"is_quarter_end": pl.col(f"{column}_datetime")
.dt.ordinal_day()
.is_in([90, 182, 274, 365]),
"is_year_start": pl.col(f"{column}_datetime").dt.ordinal_day() == 1,
"is_year_end": pl.col(f"{column}_datetime")
.dt.ordinal_day()
.is_in([365, 366]),
"is_weekend": pl.col(f"{column}_datetime").dt.weekday().is_in([6, 7]),
}

X = X.with_columns(
[
date_element_dict[element].alias(f"{column}_{element}")
for element in self.date_elements
]
).drop(f"{column}_datetime")

return X.to_pandas()


class DurationCalculatorTransformer(BaseTransformer):
Expand Down Expand Up @@ -167,18 +173,25 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
Returns:
pandas.DataFrame: The transformed DataFrame.
"""
X = check_ready_to_transform(self, X, list(self.features))

duration_series = pd.to_datetime(
X[self.features[1]], utc=True, errors="raise"
) - pd.to_datetime(X[self.features[0]], utc=True, errors="raise")

X[self.new_column_name] = (
duration_series.dt.days
if self.unit == "days"
else duration_series.dt.total_seconds()
)
return X
X = check_ready_to_transform(self, X, list(self.features), return_polars=True)

if self.unit == "seconds":
return X.with_columns(
(
pl.col(self.features[1]).str.strptime(pl.Datetime, fmt="%Y-%m-%d")
- pl.col(self.features[0]).str.strptime(pl.Datetime, fmt="%Y-%m-%d")
)
.dt.seconds()
.alias(self.new_column_name)
).to_pandas()
return X.with_columns(
(
pl.col(self.features[1]).str.strptime(pl.Datetime, fmt="%Y-%m-%d")
- pl.col(self.features[0]).str.strptime(pl.Datetime, fmt="%Y-%m-%d")
)
.dt.days()
.alias(self.new_column_name)
).to_pandas()


class TimestampTransformer(BaseTransformer):
Expand Down Expand Up @@ -224,11 +237,14 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
Returns:
pandas.DataFrame: Dataframe with transformed columns.
"""
X = check_ready_to_transform(self, X, self.features)

for column in self.features:
X[column] = pd.to_datetime(
X[column], format=self.date_format, errors="raise"
)
X[column] = (X[column] - datetime(1970, 1, 1)).dt.total_seconds()
return X
X = check_ready_to_transform(self, X, self.features, return_polars=True)

return X.with_columns(
[
pl.col(column)
.str.strptime(pl.Datetime, self.date_format)
.dt.timestamp("ms")
/ 1000
for column in self.features
]
).to_pandas()
Loading

0 comments on commit b831485

Please sign in to comment.