From e87758387163939887064137cd261163c26f6473 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Frederik=20Peter=20H=C3=B8ngaard?= Date: Sun, 7 May 2023 22:40:51 +0200 Subject: [PATCH 1/8] datetime seems to be interpreted correctly --- pyproject.toml | 2 +- .../interpreter_step.py | 23 ++++++++++++------- python/src/lazylearn/lazylearn.py | 2 +- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 425f362..a45714a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "lazylearn" -version = "0.0.1" +version = "0.0.2" authors = [ { name="Frederik P. Høngaard", email="mail@frederikhoengaard.com" }, ] diff --git a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py index 10f9434..8d80d7e 100644 --- a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py +++ b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd from pandas import Series from pipeline.pipeline import IngestionPipeline @@ -33,16 +34,20 @@ def analyze_column(self, column: Series): values = column.tolist() types = [type(value) for value in values] - if self.categorical_test(values): - return "categorical" + column_type = None + if self.categorical_test(values): + column_type = "categorical" elif self.numeric_test(types): - return "numeric" + column_type = "numeric" + + if self.datetime_check(column) and not self.numeric_test(types): + column_type = "datetime" - elif self.datetime_check(column): - return "datetime" - else: - return "object" + if column_type is None: + column_type = "object" + + return column_type @staticmethod def categorical_test(values: list): @@ -79,8 +84,10 @@ def string_test(types: set): raise NotImplementedError def datetime_check(self, column: Series): + if self.df[column.name].dtype.type == np.datetime64: + return True try: - self.df[column.name] = pd.to_datetime(column) + self.df[column.name] = pd.to_datetime(self.df[column.name]) return True except Exception as e: # noqa return False diff --git a/python/src/lazylearn/lazylearn.py b/python/src/lazylearn/lazylearn.py index a6d620a..dc746ca 100644 --- a/python/src/lazylearn/lazylearn.py +++ b/python/src/lazylearn/lazylearn.py @@ -7,7 +7,7 @@ def __init__(self): def create_project(self, data, target, task="infer"): # ingest data - ingestion_response = Ingestion().run(data) # noqa + self.dataset = Ingestion().run(data) # noqa # preprocess From 523d1901796e492574e04d8b9a8a8c72b9fe052a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Frederik=20Peter=20H=C3=B8ngaard?= Date: Mon, 8 May 2023 14:53:34 +0200 Subject: [PATCH 2/8] Update interpreter_step.py --- .../interpreter_step.py | 49 ++++++++++++++++--- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py index 8d80d7e..362919a 100644 --- a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py +++ b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py @@ -1,11 +1,14 @@ import numpy as np import pandas as pd -from pandas import Series +from pandas import DataFrame, Series from pipeline.pipeline import IngestionPipeline from tqdm import tqdm class ColumnTypeInterpreter: + def __int__(self): + self.df: DataFrame = None + def apply(self, pipeline: IngestionPipeline): """ This method is responsible for inferring the @@ -38,6 +41,8 @@ def analyze_column(self, column: Series): if self.categorical_test(values): column_type = "categorical" + elif self.numeric_test(types) and self.id_check(types, values): + column_type = "id" elif self.numeric_test(types): column_type = "numeric" @@ -77,17 +82,45 @@ def numeric_test(types: list): :param types: list of type objects :return: True if column is numeric, False otherwise """ - return all([item == float or item == int for item in set(types)]) + return all( + [item == float or item == int for item in set(types) if item is not None] + ) @staticmethod def string_test(types: set): raise NotImplementedError def datetime_check(self, column: Series): - if self.df[column.name].dtype.type == np.datetime64: - return True - try: - self.df[column.name] = pd.to_datetime(self.df[column.name]) + """ + + :param column: + :return: + """ + col_name = str(column.name) + + # if type of column is actually datetime + if self.df[col_name].dtype.type == np.datetime64: return True - except Exception as e: # noqa - return False + + # if date or time is in column name and can be cast as date + if "date" in col_name.lower() or "time" in col_name.lower(): + try: + self.df[col_name] = pd.to_datetime(self.df[col_name]) + return True + except Exception as e: # noqa + pass + + # if format of values look like dates + + return False + + def id_check(self, types, values): + """ + + :param types: + :param values: + :return: + """ + return all([item == int for item in set(types) if item is not None]) and len( + set(values) + ) == len(self.df) From bb76e1c7a3c97e9eb03f84ce0b273dbc597a047c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Frederik=20Peter=20H=C3=B8ngaard?= Date: Wed, 17 May 2023 14:29:17 +0200 Subject: [PATCH 3/8] add some date preprocessing --- .../data_parser_step.py | 1 + .../interpreter_step.py | 19 +++++++++-- python/src/lazylearn/lazylearn.py | 17 +++++++++- python/src/lazylearn/models/models.py | 10 +++++- python/src/lazylearn/pipeline/pipeline.py | 19 ++++++++++- .../lazylearn/preprocessing/time/__init__.py | 0 .../preprocessing/time/date_processor.py | 34 +++++++++++++++++++ python/src/lazylearn/regression/__init__.py | 0 .../lazylearn/regression/models/__init__.py | 0 .../models/randomforest/__init__.py | 0 .../models/randomforest/randomforest.py | 16 +++++++++ .../regression/models/xgboost/__init__.py | 0 python/src/lazylearn/strategies/__init__.py | 0 13 files changed, 111 insertions(+), 5 deletions(-) create mode 100644 python/src/lazylearn/preprocessing/time/__init__.py create mode 100644 python/src/lazylearn/preprocessing/time/date_processor.py create mode 100644 python/src/lazylearn/regression/__init__.py create mode 100644 python/src/lazylearn/regression/models/__init__.py create mode 100644 python/src/lazylearn/regression/models/randomforest/__init__.py create mode 100644 python/src/lazylearn/regression/models/randomforest/randomforest.py create mode 100644 python/src/lazylearn/regression/models/xgboost/__init__.py create mode 100644 python/src/lazylearn/strategies/__init__.py diff --git a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py index 42f323a..94a5399 100644 --- a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py +++ b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py @@ -17,5 +17,6 @@ def apply(self, pipeline: IngestionPipeline): if isinstance(pipeline.raw_data, DataFrame): pipeline.df = pipeline.raw_data + # check if raw data is a path to a csv file and read it into csv else: raise DataSourceError diff --git a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py index 362919a..baad31c 100644 --- a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py +++ b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py @@ -27,6 +27,10 @@ def apply(self, pipeline: IngestionPipeline): ) # noqa pipeline.column_type_map = column_types + if "unknown" in pipeline.column_type_map.values(): + pipeline.needs_type_map = True + + pipeline.type_collections = self.build_type_collections(column_types) def analyze_column(self, column: Series): """ @@ -50,7 +54,7 @@ def analyze_column(self, column: Series): column_type = "datetime" if column_type is None: - column_type = "object" + column_type = "unknown" return column_type @@ -110,7 +114,7 @@ def datetime_check(self, column: Series): except Exception as e: # noqa pass - # if format of values look like dates + # if format of values looks like dates return False @@ -124,3 +128,14 @@ def id_check(self, types, values): return all([item == int for item in set(types) if item is not None]) and len( set(values) ) == len(self.df) + + @staticmethod + def build_type_collections(column_type_map): + collections = {} + + for data_type in ["datetime", "numeric", "categorical"]: + collections[data_type] = [ + col for col in column_type_map if column_type_map[col] == data_type + ] + + return collections diff --git a/python/src/lazylearn/lazylearn.py b/python/src/lazylearn/lazylearn.py index dc746ca..255f311 100644 --- a/python/src/lazylearn/lazylearn.py +++ b/python/src/lazylearn/lazylearn.py @@ -1,13 +1,28 @@ from ingestion.ingestion_pipeline import Ingestion +from preprocessing.time.date_processor import date_processor class LazyLearner: def __init__(self): self.dataset = None + self.task = None + self.models = None + self.leaderboard = None def create_project(self, data, target, task="infer"): # ingest data - self.dataset = Ingestion().run(data) # noqa + self.dataset = Ingestion().run(data) + + if task == "infer": + # if target is numeric then regression, else classification + if self.dataset.column_type_map[target] == "numeric": + self.task = "regression" + else: + self.task = "classification" + + # process dates + + self.dataset = date_processor(self.dataset) # preprocess diff --git a/python/src/lazylearn/models/models.py b/python/src/lazylearn/models/models.py index d826534..dda68de 100644 --- a/python/src/lazylearn/models/models.py +++ b/python/src/lazylearn/models/models.py @@ -2,11 +2,19 @@ class Dataset: - def __init__(self, df: DataFrame, column_type_map: dict): + def __init__( + self, + df: DataFrame, + column_type_map: dict, + summary_stats: dict, + type_collections: dict, + ): self.name = None self.description = None self.df = df self.column_type_map = column_type_map + self.summary_stats = summary_stats + self.type_collections = type_collections def save(self): raise NotImplementedError diff --git a/python/src/lazylearn/pipeline/pipeline.py b/python/src/lazylearn/pipeline/pipeline.py index e7256f2..a395eed 100644 --- a/python/src/lazylearn/pipeline/pipeline.py +++ b/python/src/lazylearn/pipeline/pipeline.py @@ -29,6 +29,23 @@ def __init__(self): self.df: DataFrame = None self.column_type_map: dict = None self.summary_stats: dict = {} + self.needs_type_map: bool = False + self.type_collections: dict = None def response(self): - return Dataset(df=self.df, column_type_map=self.column_type_map) + return Dataset( + df=self.df, + column_type_map=self.column_type_map, + summary_stats=self.summary_stats, + type_collections=self.type_collections, + ) + + +class ModelPipeline(Pipeline): + def __init__(self): + super().__init__() + + +class RegressionPipeline(ModelPipeline): + def __init__(self): + super().__init__() diff --git a/python/src/lazylearn/preprocessing/time/__init__.py b/python/src/lazylearn/preprocessing/time/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/src/lazylearn/preprocessing/time/date_processor.py b/python/src/lazylearn/preprocessing/time/date_processor.py new file mode 100644 index 0000000..37ddbf6 --- /dev/null +++ b/python/src/lazylearn/preprocessing/time/date_processor.py @@ -0,0 +1,34 @@ +from models.models import Dataset + + +def date_processor(dataset: Dataset) -> Dataset: + """ + Method that transform date variables into + categorical features. + + :param dataset: Dataset object with date features + :return: Dataset object with categorical date + features + """ + new_categorical_cols = [] + + for date_column in dataset.type_collections["datetime"]: + dataset.df[f"{date_column}_year"] = ( + dataset.df[date_column].dt.isocalendar().year + ) + dataset.df[f"{date_column}_month"] = dataset.df[date_column].dt.month + dataset.df[f"{date_column}_week"] = ( + dataset.df[date_column].dt.isocalendar().week + ) + dataset.df[f"{date_column}_day"] = dataset.df[date_column].dt.isocalendar().day + + new_categorical_cols.append(f"{date_column}_year") + new_categorical_cols.append(f"{date_column}_month") + new_categorical_cols.append(f"{date_column}_week") + new_categorical_cols.append(f"{date_column}_day") + + for cat in new_categorical_cols: + dataset.column_type_map[cat] = "categorical" + dataset.type_collections["categorical"].append(cat) + + return dataset diff --git a/python/src/lazylearn/regression/__init__.py b/python/src/lazylearn/regression/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/src/lazylearn/regression/models/__init__.py b/python/src/lazylearn/regression/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/src/lazylearn/regression/models/randomforest/__init__.py b/python/src/lazylearn/regression/models/randomforest/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/src/lazylearn/regression/models/randomforest/randomforest.py b/python/src/lazylearn/regression/models/randomforest/randomforest.py new file mode 100644 index 0000000..c5fdb06 --- /dev/null +++ b/python/src/lazylearn/regression/models/randomforest/randomforest.py @@ -0,0 +1,16 @@ +from models.models import Dataset +from pipeline.pipeline import RegressionPipeline +from sklearn.ensemble import RandomForestRegressor + + +class RandomForestRegressionPipeline(RegressionPipeline): + def __init__(self): + self.target = None + self.dataset: Dataset = None + + def run(self): + # preprocess numeric vars + + # preprocess categorical vars + + pass diff --git a/python/src/lazylearn/regression/models/xgboost/__init__.py b/python/src/lazylearn/regression/models/xgboost/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/src/lazylearn/strategies/__init__.py b/python/src/lazylearn/strategies/__init__.py new file mode 100644 index 0000000..e69de29 From d7fca34e745c7850be54466cd73cd783787a9488 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Frederik=20Peter=20H=C3=B8ngaard?= Date: Wed, 17 May 2023 14:31:55 +0200 Subject: [PATCH 4/8] linting --- .../interpreter_step.py | 18 ++++++++++++++---- .../preprocessing/time/date_processor.py | 4 +++- .../models/randomforest/randomforest.py | 1 - 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py index baad31c..6bdcd31 100644 --- a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py +++ b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py @@ -87,7 +87,11 @@ def numeric_test(types: list): :return: True if column is numeric, False otherwise """ return all( - [item == float or item == int for item in set(types) if item is not None] + [ + item == float or item == int + for item in set(types) + if item is not None # noqa + ] ) @staticmethod @@ -125,9 +129,13 @@ def id_check(self, types, values): :param values: :return: """ - return all([item == int for item in set(types) if item is not None]) and len( + return all( + [item == int for item in set(types) if item is not None] + ) and len( # noqa set(values) - ) == len(self.df) + ) == len( + self.df + ) @staticmethod def build_type_collections(column_type_map): @@ -135,7 +143,9 @@ def build_type_collections(column_type_map): for data_type in ["datetime", "numeric", "categorical"]: collections[data_type] = [ - col for col in column_type_map if column_type_map[col] == data_type + col + for col in column_type_map + if column_type_map[col] == data_type # noqa ] return collections diff --git a/python/src/lazylearn/preprocessing/time/date_processor.py b/python/src/lazylearn/preprocessing/time/date_processor.py index 37ddbf6..aed4f3d 100644 --- a/python/src/lazylearn/preprocessing/time/date_processor.py +++ b/python/src/lazylearn/preprocessing/time/date_processor.py @@ -20,7 +20,9 @@ def date_processor(dataset: Dataset) -> Dataset: dataset.df[f"{date_column}_week"] = ( dataset.df[date_column].dt.isocalendar().week ) - dataset.df[f"{date_column}_day"] = dataset.df[date_column].dt.isocalendar().day + dataset.df[f"{date_column}_day"] = ( + dataset.df[date_column].dt.isocalendar().day + ) # noqa new_categorical_cols.append(f"{date_column}_year") new_categorical_cols.append(f"{date_column}_month") diff --git a/python/src/lazylearn/regression/models/randomforest/randomforest.py b/python/src/lazylearn/regression/models/randomforest/randomforest.py index c5fdb06..4a78978 100644 --- a/python/src/lazylearn/regression/models/randomforest/randomforest.py +++ b/python/src/lazylearn/regression/models/randomforest/randomforest.py @@ -1,6 +1,5 @@ from models.models import Dataset from pipeline.pipeline import RegressionPipeline -from sklearn.ensemble import RandomForestRegressor class RandomForestRegressionPipeline(RegressionPipeline): From f3fbd0d59b3a84cf64659a2d874cd901d083e071 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Frederik=20Peter=20H=C3=B8ngaard?= Date: Sat, 20 May 2023 16:15:51 +0200 Subject: [PATCH 5/8] durations and simple splits --- .../data_parser_step.py | 4 ++++ .../src/lazylearn/ingestion/utils/__init__.py | 0 python/src/lazylearn/ingestion/utils/csv.py | 2 ++ python/src/lazylearn/lazylearn.py | 15 ++++++++++-- .../src/lazylearn/model_selection/__init__.py | 0 .../lazylearn/model_selection/splitters.py | 17 +++++++++++++ python/src/lazylearn/models/models.py | 1 + .../lazylearn/preprocessing/time/duration.py | 24 +++++++++++++++++++ 8 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 python/src/lazylearn/ingestion/utils/__init__.py create mode 100644 python/src/lazylearn/ingestion/utils/csv.py create mode 100644 python/src/lazylearn/model_selection/__init__.py create mode 100644 python/src/lazylearn/model_selection/splitters.py create mode 100644 python/src/lazylearn/preprocessing/time/duration.py diff --git a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py index 94a5399..19608ad 100644 --- a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py +++ b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py @@ -1,4 +1,6 @@ +import pandas as pd from errors.errors import DataSourceError +from ingestion.utils.csv import csv_check from pandas import DataFrame from pipeline.pipeline import IngestionPipeline, PipelineStep @@ -18,5 +20,7 @@ def apply(self, pipeline: IngestionPipeline): if isinstance(pipeline.raw_data, DataFrame): pipeline.df = pipeline.raw_data # check if raw data is a path to a csv file and read it into csv + elif csv_check(pipeline.df): + pipeline.df = pd.read_csv(pipeline.raw_data) else: raise DataSourceError diff --git a/python/src/lazylearn/ingestion/utils/__init__.py b/python/src/lazylearn/ingestion/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/src/lazylearn/ingestion/utils/csv.py b/python/src/lazylearn/ingestion/utils/csv.py new file mode 100644 index 0000000..22d44fb --- /dev/null +++ b/python/src/lazylearn/ingestion/utils/csv.py @@ -0,0 +1,2 @@ +def csv_check(path): + raise NotImplementedError diff --git a/python/src/lazylearn/lazylearn.py b/python/src/lazylearn/lazylearn.py index 255f311..d0cc518 100644 --- a/python/src/lazylearn/lazylearn.py +++ b/python/src/lazylearn/lazylearn.py @@ -1,13 +1,16 @@ from ingestion.ingestion_pipeline import Ingestion +from model_selection.splitters import test_train_splitter from preprocessing.time.date_processor import date_processor +from preprocessing.time.duration import duration_builder class LazyLearner: - def __init__(self): + def __init__(self, random_state=None): self.dataset = None self.task = None self.models = None self.leaderboard = None + self.random_state = random_state def create_project(self, data, target, task="infer"): # ingest data @@ -23,11 +26,19 @@ def create_project(self, data, target, task="infer"): # process dates self.dataset = date_processor(self.dataset) + self.dataset = duration_builder(self.dataset) - # preprocess + # split partitions + + self.dataset = test_train_splitter(self.dataset, random_state=self.random_state) # set modelling configurations + def run_autopilot(self): + raise NotImplementedError + + # preprocess + # train # eval diff --git a/python/src/lazylearn/model_selection/__init__.py b/python/src/lazylearn/model_selection/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/src/lazylearn/model_selection/splitters.py b/python/src/lazylearn/model_selection/splitters.py new file mode 100644 index 0000000..7773deb --- /dev/null +++ b/python/src/lazylearn/model_selection/splitters.py @@ -0,0 +1,17 @@ +from models.models import Dataset +from sklearn.model_selection import train_test_split + + +def test_train_splitter(dataset: Dataset, random_state=None) -> Dataset: + train_partition, test_partition = train_test_split( + dataset.df, test_size=0.2, random_state=random_state + ) + + dataset.partitions["test"] = test_partition + dataset.partitions["train"] = train_partition + + return dataset + + +def cv_splitter(dataset: Dataset) -> Dataset: + return dataset diff --git a/python/src/lazylearn/models/models.py b/python/src/lazylearn/models/models.py index dda68de..dfdf2e3 100644 --- a/python/src/lazylearn/models/models.py +++ b/python/src/lazylearn/models/models.py @@ -15,6 +15,7 @@ def __init__( self.column_type_map = column_type_map self.summary_stats = summary_stats self.type_collections = type_collections + self.partitions: dict = {} def save(self): raise NotImplementedError diff --git a/python/src/lazylearn/preprocessing/time/duration.py b/python/src/lazylearn/preprocessing/time/duration.py new file mode 100644 index 0000000..b21d05e --- /dev/null +++ b/python/src/lazylearn/preprocessing/time/duration.py @@ -0,0 +1,24 @@ +from models.models import Dataset + + +def duration_builder(dataset: Dataset) -> Dataset: + """ + + :param dataset: + :return: + """ + date_cols = dataset.type_collections.get("datetime") + + if len(date_cols) > 1: + for i in range(len(date_cols)): + for j in range(i + 1, len(date_cols)): + col_name = f"duration({date_cols[i]}-{date_cols[j]})" + dataset.df[col_name] = ( + (dataset.df[date_cols[i]] - dataset.df[date_cols[j]]) + .astype("timedelta64[D]") + .astype(int) + ) + dataset.column_type_map[col_name] = "numeric" + dataset.type_collections["numeric"].append(col_name) + + return dataset From 744f495c37b0cdb993c6ded45327e21b06070709 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Frederik=20Peter=20H=C3=B8ngaard?= Date: Sat, 20 May 2023 23:22:39 +0200 Subject: [PATCH 6/8] crude categorical ordinal encoding by frequency --- python/src/lazylearn/lazylearn.py | 18 +++++--- python/src/lazylearn/pipeline/pipeline.py | 16 ++++++- .../preprocessing/encoding/encoders.py | 45 ++++++++++++++++++- .../preprocessing/imputation/__init__.py | 0 .../random_forest_steps/__init__.py | 0 .../random_forest_steps/regressor_step.py | 17 +++++++ .../models/randomforest/randomforest.py | 31 ++++++++++--- .../lazylearn/strategies/strategy_builder.py | 1 + 8 files changed, 111 insertions(+), 17 deletions(-) create mode 100644 python/src/lazylearn/preprocessing/imputation/__init__.py create mode 100644 python/src/lazylearn/regression/models/randomforest/random_forest_steps/__init__.py create mode 100644 python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py create mode 100644 python/src/lazylearn/strategies/strategy_builder.py diff --git a/python/src/lazylearn/lazylearn.py b/python/src/lazylearn/lazylearn.py index d0cc518..52df770 100644 --- a/python/src/lazylearn/lazylearn.py +++ b/python/src/lazylearn/lazylearn.py @@ -2,6 +2,7 @@ from model_selection.splitters import test_train_splitter from preprocessing.time.date_processor import date_processor from preprocessing.time.duration import duration_builder +from regression.models.randomforest.randomforest import RandomForestRegressionRunner class LazyLearner: @@ -11,9 +12,11 @@ def __init__(self, random_state=None): self.models = None self.leaderboard = None self.random_state = random_state + self.target = None def create_project(self, data, target, task="infer"): # ingest data + self.target = target self.dataset = Ingestion().run(data) if task == "infer": @@ -30,15 +33,16 @@ def create_project(self, data, target, task="infer"): # split partitions - self.dataset = test_train_splitter(self.dataset, random_state=self.random_state) + self.dataset = test_train_splitter( + self.dataset, random_state=self.random_state + ) # noqa # set modelling configurations def run_autopilot(self): - raise NotImplementedError + simple_random_forest = RandomForestRegressionRunner( + target=self.target, dataset=self.dataset + ) + simple_random_forest.fit() - # preprocess - - # train - - # eval + return simple_random_forest diff --git a/python/src/lazylearn/pipeline/pipeline.py b/python/src/lazylearn/pipeline/pipeline.py index a395eed..2e69e14 100644 --- a/python/src/lazylearn/pipeline/pipeline.py +++ b/python/src/lazylearn/pipeline/pipeline.py @@ -1,7 +1,7 @@ from typing import List from models.models import Dataset -from pandas import DataFrame +from pandas import DataFrame, Series class Pipeline: @@ -21,6 +21,9 @@ class PipelineStep: def apply(self, pipeline: Pipeline): pass + def fit(self, pipeline: Pipeline): + pass + class IngestionPipeline(Pipeline): def __init__(self): @@ -44,8 +47,19 @@ def response(self): class ModelPipeline(Pipeline): def __init__(self): super().__init__() + self._is_fitted = False + self.feature_list: list = [] + + def fit(self): + [step.fit(self) for step in self._steps] + self._is_fitted = True class RegressionPipeline(ModelPipeline): def __init__(self): super().__init__() + self.train_features_df: DataFrame = None + self.train_targets: Series = None + self.holdout_features_df: DataFrame = None + self.holdout_targets: Series = None + self.holdout_score: float = None diff --git a/python/src/lazylearn/preprocessing/encoding/encoders.py b/python/src/lazylearn/preprocessing/encoding/encoders.py index a206bc0..6126cdf 100644 --- a/python/src/lazylearn/preprocessing/encoding/encoders.py +++ b/python/src/lazylearn/preprocessing/encoding/encoders.py @@ -1,15 +1,56 @@ +from models.models import Dataset +from pipeline.pipeline import ModelPipeline + + class OrdinalConverter: def __init__( self, + cat_vars: list, max_cardinality: int = None, min_support: int = 5, other_category: bool = True, method: str = "freq", ): + self.cat_vars = cat_vars self.card_max = max_cardinality self.min_support = min_support self.other_category = other_category self.method = method + self.cat_freqs = {} + self.cat_maps = {} + + def fit(self, pipeline: ModelPipeline): + for var in self.cat_vars: + pipeline.train_features_df = self.convert(pipeline.train_features_df, var) + pipeline.feature_list.append(var) + + def convert(self, df, col_name): + """ + + :param df: + :param col_name: + :return: + """ + if self.method == "freq": + self.cat_freqs[col_name] = {} + for item in df[col_name].tolist(): + if item in self.cat_freqs[col_name]: + self.cat_freqs[col_name][item] += 1 + else: + self.cat_freqs[col_name][item] = 1 + + freq_pairs = sorted( + [(key, val) for key, val in self.cat_freqs[col_name].items()], + key=lambda x: x[1], + ) + print(freq_pairs) + self.cat_maps[col_name] = {key: val for key, val in freq_pairs} - def convert(self, df, col): - pass + df[col_name] = df[col_name].apply( + lambda x: self.cat_maps[col_name][x] + if self.cat_maps[col_name][x] >= self.min_support + else -1 + ) + return df + else: + raise ValueError("Unsupported encoding method, try [freq]") diff --git a/python/src/lazylearn/preprocessing/imputation/__init__.py b/python/src/lazylearn/preprocessing/imputation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/src/lazylearn/regression/models/randomforest/random_forest_steps/__init__.py b/python/src/lazylearn/regression/models/randomforest/random_forest_steps/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py b/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py new file mode 100644 index 0000000..dd62ca7 --- /dev/null +++ b/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py @@ -0,0 +1,17 @@ +from pipeline.pipeline import PipelineStep, RegressionPipeline +from sklearn.ensemble import RandomForestRegressor +from sklearn.metrics import mean_absolute_error + + +class RandomForestRegressorStep(PipelineStep): + def __init__(self): + self.regressor = RandomForestRegressor() + + def fit(self, pipeline: RegressionPipeline): + self.regressor.fit(X=pipeline.train_features_df, y=pipeline.train_targets) + + #y_hat = self.regressor.predict(X=pipeline.holdout_features_df) + #pipeline.holdout_score = mean_absolute_error(pipeline.holdout_targets, y_hat) + + def predict(self, pipeline: RegressionPipeline): + raise NotImplementedError \ No newline at end of file diff --git a/python/src/lazylearn/regression/models/randomforest/randomforest.py b/python/src/lazylearn/regression/models/randomforest/randomforest.py index 4a78978..cad9c61 100644 --- a/python/src/lazylearn/regression/models/randomforest/randomforest.py +++ b/python/src/lazylearn/regression/models/randomforest/randomforest.py @@ -1,15 +1,32 @@ from models.models import Dataset from pipeline.pipeline import RegressionPipeline +from preprocessing.encoding.encoders import OrdinalConverter +from regression.models.randomforest.random_forest_steps.regressor_step import ( + RandomForestRegressorStep, +) +from sklearn.ensemble import RandomForestRegressor -class RandomForestRegressionPipeline(RegressionPipeline): - def __init__(self): - self.target = None - self.dataset: Dataset = None +class RandomForestRegressionRunner: + def __init__(self, target, dataset): + self.target = target + self.dataset: Dataset = dataset + self.pipeline = RegressionPipeline() - def run(self): + self.pipeline.train_features_df = self.dataset.partitions["train"].copy() + self.pipeline.train_targets = self.dataset.partitions["train"][target] + self.pipeline.holdout_features_df = self.dataset.partitions["test"].copy() + self.pipeline.holdout_targets = self.dataset.partitions["test"][target] + + def fit(self): # preprocess numeric vars + cat_vars = self.dataset.type_collections["categorical"] + + self.pipeline.add(OrdinalConverter(cat_vars=cat_vars)) + + # self.pipeline.add(RandomForestRegressorStep()) - # preprocess categorical vars + self.pipeline.fit() - pass + def predict(self): + raise NotImplementedError diff --git a/python/src/lazylearn/strategies/strategy_builder.py b/python/src/lazylearn/strategies/strategy_builder.py new file mode 100644 index 0000000..b94ab4c --- /dev/null +++ b/python/src/lazylearn/strategies/strategy_builder.py @@ -0,0 +1 @@ +from lazylearn import LazyLearner From cd2c4ffc9d981dee126cf7c1b88a869abb58693d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Frederik=20Peter=20H=C3=B8ngaard?= Date: Sat, 20 May 2023 23:23:20 +0200 Subject: [PATCH 7/8] linting --- .../randomforest/random_forest_steps/regressor_step.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py b/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py index dd62ca7..57c45ee 100644 --- a/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py +++ b/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py @@ -10,8 +10,8 @@ def __init__(self): def fit(self, pipeline: RegressionPipeline): self.regressor.fit(X=pipeline.train_features_df, y=pipeline.train_targets) - #y_hat = self.regressor.predict(X=pipeline.holdout_features_df) - #pipeline.holdout_score = mean_absolute_error(pipeline.holdout_targets, y_hat) + # y_hat = self.regressor.predict(X=pipeline.holdout_features_df) + # pipeline.holdout_score = mean_absolute_error(pipeline.holdout_targets, y_hat) def predict(self, pipeline: RegressionPipeline): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError From 35acb802d6038907606f333a6a9114c6cbda1def Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Frederik=20Peter=20H=C3=B8ngaard?= Date: Sun, 21 May 2023 01:13:03 +0200 Subject: [PATCH 8/8] basic pipeline functional --- python/src/lazylearn/lazylearn.py | 23 ++++++++++-- python/src/lazylearn/pipeline/pipeline.py | 11 ++++++ .../preprocessing/encoding/encoders.py | 36 +++++++++++++++---- .../random_forest_steps/regressor_step.py | 22 +++++++----- .../models/randomforest/randomforest.py | 22 ++++++++---- .../lazylearn/strategies/strategy_builder.py | 1 - 6 files changed, 90 insertions(+), 25 deletions(-) diff --git a/python/src/lazylearn/lazylearn.py b/python/src/lazylearn/lazylearn.py index 52df770..5b91328 100644 --- a/python/src/lazylearn/lazylearn.py +++ b/python/src/lazylearn/lazylearn.py @@ -2,7 +2,10 @@ from model_selection.splitters import test_train_splitter from preprocessing.time.date_processor import date_processor from preprocessing.time.duration import duration_builder -from regression.models.randomforest.randomforest import RandomForestRegressionRunner +from regression.models.randomforest.randomforest import ( # noqa + RandomForestRegressionRunner, +) +from sklearn.metrics import mean_absolute_error class LazyLearner: @@ -40,9 +43,25 @@ def create_project(self, data, target, task="infer"): # set modelling configurations def run_autopilot(self): + """ + TODO: Everything here must be abstracted away into strategies + TODO: such that several models are run and their scores are added to + TODO: the leaderboard + + :return: + """ + simple_random_forest = RandomForestRegressionRunner( - target=self.target, dataset=self.dataset + target=self.target, + dataset=self.dataset, + random_state=self.random_state, # noqa ) simple_random_forest.fit() + # get holdout scores + simple_random_forest.predict(self.dataset.partitions["test"]) + simple_random_forest.pipeline.holdout_score = mean_absolute_error( + self.dataset.partitions["test"][self.target], + simple_random_forest.pipeline.tmp_pred, + ) return simple_random_forest diff --git a/python/src/lazylearn/pipeline/pipeline.py b/python/src/lazylearn/pipeline/pipeline.py index 2e69e14..c26ec72 100644 --- a/python/src/lazylearn/pipeline/pipeline.py +++ b/python/src/lazylearn/pipeline/pipeline.py @@ -24,6 +24,9 @@ def apply(self, pipeline: Pipeline): def fit(self, pipeline: Pipeline): pass + def predict(self, pipeline: Pipeline): + pass + class IngestionPipeline(Pipeline): def __init__(self): @@ -49,11 +52,19 @@ def __init__(self): super().__init__() self._is_fitted = False self.feature_list: list = [] + self.tmp_test = None + self.tmp_pred = None + self.target = None def fit(self): [step.fit(self) for step in self._steps] self._is_fitted = True + def predict(self): + assert self._is_fitted + [step.predict(self) for step in self._steps] + return self.tmp_pred + class RegressionPipeline(ModelPipeline): def __init__(self): diff --git a/python/src/lazylearn/preprocessing/encoding/encoders.py b/python/src/lazylearn/preprocessing/encoding/encoders.py index 6126cdf..4d6b4e7 100644 --- a/python/src/lazylearn/preprocessing/encoding/encoders.py +++ b/python/src/lazylearn/preprocessing/encoding/encoders.py @@ -1,4 +1,4 @@ -from models.models import Dataset +from pandas import DataFrame from pipeline.pipeline import ModelPipeline @@ -21,15 +21,25 @@ def __init__( def fit(self, pipeline: ModelPipeline): for var in self.cat_vars: - pipeline.train_features_df = self.convert(pipeline.train_features_df, var) + pipeline.train_features_df = self.convert( + pipeline.train_features_df, var + ) # noqa pipeline.feature_list.append(var) - def convert(self, df, col_name): + def convert(self, df: DataFrame, col_name: str) -> DataFrame: """ + Encodes a categorical column ordinally. + Currently only the "freq" method is supported, + and it encodes a value with an integer id by + increasing frequency i.e. more frequent values + receive a higher encoding - :param df: - :param col_name: - :return: + Note that this should only be done on the training + data! + + :param df: pandas DataFrame of features + :param col_name: column to consider + :return: transformed DataFrame """ if self.method == "freq": self.cat_freqs[col_name] = {} @@ -43,7 +53,7 @@ def convert(self, df, col_name): [(key, val) for key, val in self.cat_freqs[col_name].items()], key=lambda x: x[1], ) - print(freq_pairs) + self.cat_maps[col_name] = {key: val for key, val in freq_pairs} df[col_name] = df[col_name].apply( @@ -54,3 +64,15 @@ def convert(self, df, col_name): return df else: raise ValueError("Unsupported encoding method, try [freq]") + + def predict(self, pipeline: ModelPipeline): + df = pipeline.tmp_test + + for var in self.cat_vars: + df[var] = df[var].apply( + lambda x: self.cat_maps[var][x] + if x in self.cat_maps[var] + else -2 # noqa + ) + + pipeline.tmp_test = df diff --git a/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py b/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py index 57c45ee..2217332 100644 --- a/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py +++ b/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py @@ -1,17 +1,23 @@ from pipeline.pipeline import PipelineStep, RegressionPipeline from sklearn.ensemble import RandomForestRegressor -from sklearn.metrics import mean_absolute_error class RandomForestRegressorStep(PipelineStep): - def __init__(self): - self.regressor = RandomForestRegressor() + def __init__(self, random_state=None): + self.regressor = RandomForestRegressor(random_state=random_state) def fit(self, pipeline: RegressionPipeline): - self.regressor.fit(X=pipeline.train_features_df, y=pipeline.train_targets) - - # y_hat = self.regressor.predict(X=pipeline.holdout_features_df) - # pipeline.holdout_score = mean_absolute_error(pipeline.holdout_targets, y_hat) + pipeline.feature_list = [ + item for item in pipeline.feature_list if item != pipeline.target + ] + print("Fitting RandomForestRegressor") + self.regressor.fit( + X=pipeline.train_features_df[pipeline.feature_list], + y=pipeline.train_targets, + ) # noqa + print("RandomForestRegressor fitted!") def predict(self, pipeline: RegressionPipeline): - raise NotImplementedError + pipeline.tmp_pred = self.regressor.predict( + X=pipeline.tmp_test[pipeline.feature_list] + ) diff --git a/python/src/lazylearn/regression/models/randomforest/randomforest.py b/python/src/lazylearn/regression/models/randomforest/randomforest.py index cad9c61..2ba5aa3 100644 --- a/python/src/lazylearn/regression/models/randomforest/randomforest.py +++ b/python/src/lazylearn/regression/models/randomforest/randomforest.py @@ -4,29 +4,37 @@ from regression.models.randomforest.random_forest_steps.regressor_step import ( RandomForestRegressorStep, ) -from sklearn.ensemble import RandomForestRegressor class RandomForestRegressionRunner: - def __init__(self, target, dataset): + def __init__(self, target, dataset, random_state=42): self.target = target self.dataset: Dataset = dataset + self.random_state = random_state self.pipeline = RegressionPipeline() + self.pipeline.target = target - self.pipeline.train_features_df = self.dataset.partitions["train"].copy() + self.pipeline.train_features_df = self.dataset.partitions[ + "train" + ].copy() # noqa self.pipeline.train_targets = self.dataset.partitions["train"][target] - self.pipeline.holdout_features_df = self.dataset.partitions["test"].copy() + self.pipeline.holdout_features_df = self.dataset.partitions[ + "test" + ].copy() # noqa self.pipeline.holdout_targets = self.dataset.partitions["test"][target] def fit(self): # preprocess numeric vars cat_vars = self.dataset.type_collections["categorical"] + num_vars = self.dataset.type_collections["numeric"] + self.pipeline.feature_list.extend(num_vars) self.pipeline.add(OrdinalConverter(cat_vars=cat_vars)) - # self.pipeline.add(RandomForestRegressorStep()) + self.pipeline.add(RandomForestRegressorStep()) self.pipeline.fit() - def predict(self): - raise NotImplementedError + def predict(self, features): + self.pipeline.tmp_test = features + return self.pipeline.predict() diff --git a/python/src/lazylearn/strategies/strategy_builder.py b/python/src/lazylearn/strategies/strategy_builder.py index b94ab4c..e69de29 100644 --- a/python/src/lazylearn/strategies/strategy_builder.py +++ b/python/src/lazylearn/strategies/strategy_builder.py @@ -1 +0,0 @@ -from lazylearn import LazyLearner