From e87758387163939887064137cd261163c26f6473 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Frederik=20Peter=20H=C3=B8ngaard?=
 <mail@frederikhoengaard.com>
Date: Sun, 7 May 2023 22:40:51 +0200
Subject: [PATCH 1/8] datetime seems to be interpreted correctly

---
 pyproject.toml                                |  2 +-
 .../interpreter_step.py                       | 23 ++++++++++++-------
 python/src/lazylearn/lazylearn.py             |  2 +-
 3 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 425f362..a45714a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "lazylearn"
-version = "0.0.1"
+version = "0.0.2"
 authors = [
   { name="Frederik P. Høngaard", email="mail@frederikhoengaard.com" },
 ]
diff --git a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py
index 10f9434..8d80d7e 100644
--- a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py
+++ b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py
@@ -1,3 +1,4 @@
+import numpy as np
 import pandas as pd
 from pandas import Series
 from pipeline.pipeline import IngestionPipeline
@@ -33,16 +34,20 @@ def analyze_column(self, column: Series):
         values = column.tolist()
         types = [type(value) for value in values]
 
-        if self.categorical_test(values):
-            return "categorical"
+        column_type = None
 
+        if self.categorical_test(values):
+            column_type = "categorical"
         elif self.numeric_test(types):
-            return "numeric"
+            column_type = "numeric"
+
+        if self.datetime_check(column) and not self.numeric_test(types):
+            column_type = "datetime"
 
-        elif self.datetime_check(column):
-            return "datetime"
-        else:
-            return "object"
+        if column_type is None:
+            column_type = "object"
+
+        return column_type
 
     @staticmethod
     def categorical_test(values: list):
@@ -79,8 +84,10 @@ def string_test(types: set):
         raise NotImplementedError
 
     def datetime_check(self, column: Series):
+        if self.df[column.name].dtype.type == np.datetime64:
+            return True
         try:
-            self.df[column.name] = pd.to_datetime(column)
+            self.df[column.name] = pd.to_datetime(self.df[column.name])
             return True
         except Exception as e:  # noqa
             return False
diff --git a/python/src/lazylearn/lazylearn.py b/python/src/lazylearn/lazylearn.py
index a6d620a..dc746ca 100644
--- a/python/src/lazylearn/lazylearn.py
+++ b/python/src/lazylearn/lazylearn.py
@@ -7,7 +7,7 @@ def __init__(self):
 
     def create_project(self, data, target, task="infer"):
         # ingest data
-        ingestion_response = Ingestion().run(data)  # noqa
+        self.dataset = Ingestion().run(data)  # noqa
 
         # preprocess
 

From 523d1901796e492574e04d8b9a8a8c72b9fe052a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Frederik=20Peter=20H=C3=B8ngaard?=
 <mail@frederikhoengaard.com>
Date: Mon, 8 May 2023 14:53:34 +0200
Subject: [PATCH 2/8] interpreter_step.py: detect id columns, gate datetime cast on column name

---
 .../interpreter_step.py                       | 49 ++++++++++++++++---
 1 file changed, 41 insertions(+), 8 deletions(-)

diff --git a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py
index 8d80d7e..362919a 100644
--- a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py
+++ b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py
@@ -1,11 +1,14 @@
 import numpy as np
 import pandas as pd
-from pandas import Series
+from pandas import DataFrame, Series
 from pipeline.pipeline import IngestionPipeline
 from tqdm import tqdm
 
 
 class ColumnTypeInterpreter:
+    def __int__(self):
+        self.df: DataFrame = None
+
     def apply(self, pipeline: IngestionPipeline):
         """
         This method is responsible for inferring the
@@ -38,6 +41,8 @@ def analyze_column(self, column: Series):
 
         if self.categorical_test(values):
             column_type = "categorical"
+        elif self.numeric_test(types) and self.id_check(types, values):
+            column_type = "id"
         elif self.numeric_test(types):
             column_type = "numeric"
 
@@ -77,17 +82,45 @@ def numeric_test(types: list):
         :param types: list of type objects
         :return: True if column is numeric, False otherwise
         """
-        return all([item == float or item == int for item in set(types)])
+        return all(
+            [item == float or item == int for item in set(types) if item is not None]
+        )
 
     @staticmethod
     def string_test(types: set):
         raise NotImplementedError
 
     def datetime_check(self, column: Series):
-        if self.df[column.name].dtype.type == np.datetime64:
-            return True
-        try:
-            self.df[column.name] = pd.to_datetime(self.df[column.name])
+        """
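+        Tell whether the column holds datetimes, casting it in self.df if so.
+        :param column: Series whose name is looked up in self.df
+        :return: True if already datetime64 or named date/time and castable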
+        """
+        col_name = str(column.name)
+
+        # if type of column is actually datetime
+        if self.df[col_name].dtype.type == np.datetime64:
             return True
-        except Exception as e:  # noqa
-            return False
+
+        # if date or time is in column name and can be cast as date
+        if "date" in col_name.lower() or "time" in col_name.lower():
+            try:
+                self.df[col_name] = pd.to_datetime(self.df[col_name])
+                return True
+            except Exception as e:  # noqa
+                pass
+
+        # if format of values look like dates
+
+        return False
+
+    def id_check(self, types, values):
+        """
+
+        :param types:
+        :param values:
+        :return:
+        """
+        return all([item == int for item in set(types) if item is not None]) and len(
+            set(values)
+        ) == len(self.df)

From bb76e1c7a3c97e9eb03f84ce0b273dbc597a047c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Frederik=20Peter=20H=C3=B8ngaard?=
 <mail@frederikhoengaard.com>
Date: Wed, 17 May 2023 14:29:17 +0200
Subject: [PATCH 3/8] add some date preprocessing

---
 .../data_parser_step.py                       |  1 +
 .../interpreter_step.py                       | 19 +++++++++--
 python/src/lazylearn/lazylearn.py             | 17 +++++++++-
 python/src/lazylearn/models/models.py         | 10 +++++-
 python/src/lazylearn/pipeline/pipeline.py     | 19 ++++++++++-
 .../lazylearn/preprocessing/time/__init__.py  |  0
 .../preprocessing/time/date_processor.py      | 34 +++++++++++++++++++
 python/src/lazylearn/regression/__init__.py   |  0
 .../lazylearn/regression/models/__init__.py   |  0
 .../models/randomforest/__init__.py           |  0
 .../models/randomforest/randomforest.py       | 16 +++++++++
 .../regression/models/xgboost/__init__.py     |  0
 python/src/lazylearn/strategies/__init__.py   |  0
 13 files changed, 111 insertions(+), 5 deletions(-)
 create mode 100644 python/src/lazylearn/preprocessing/time/__init__.py
 create mode 100644 python/src/lazylearn/preprocessing/time/date_processor.py
 create mode 100644 python/src/lazylearn/regression/__init__.py
 create mode 100644 python/src/lazylearn/regression/models/__init__.py
 create mode 100644 python/src/lazylearn/regression/models/randomforest/__init__.py
 create mode 100644 python/src/lazylearn/regression/models/randomforest/randomforest.py
 create mode 100644 python/src/lazylearn/regression/models/xgboost/__init__.py
 create mode 100644 python/src/lazylearn/strategies/__init__.py

diff --git a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py
index 42f323a..94a5399 100644
--- a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py
+++ b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py
@@ -17,5 +17,6 @@ def apply(self, pipeline: IngestionPipeline):
 
         if isinstance(pipeline.raw_data, DataFrame):
             pipeline.df = pipeline.raw_data
+        # check if raw data is a path to a csv file and read it into csv
         else:
             raise DataSourceError
diff --git a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py
index 362919a..baad31c 100644
--- a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py
+++ b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py
@@ -27,6 +27,10 @@ def apply(self, pipeline: IngestionPipeline):
             )  # noqa
 
         pipeline.column_type_map = column_types
+        if "unknown" in pipeline.column_type_map.values():
+            pipeline.needs_type_map = True
+
+        pipeline.type_collections = self.build_type_collections(column_types)
 
     def analyze_column(self, column: Series):
         """
@@ -50,7 +54,7 @@ def analyze_column(self, column: Series):
             column_type = "datetime"
 
         if column_type is None:
-            column_type = "object"
+            column_type = "unknown"
 
         return column_type
 
@@ -110,7 +114,7 @@ def datetime_check(self, column: Series):
             except Exception as e:  # noqa
                 pass
 
-        # if format of values look like dates
+        # if format of values looks like dates
 
         return False
 
@@ -124,3 +128,14 @@ def id_check(self, types, values):
         return all([item == int for item in set(types) if item is not None]) and len(
             set(values)
         ) == len(self.df)
+
+    @staticmethod
+    def build_type_collections(column_type_map):
+        collections = {}
+
+        for data_type in ["datetime", "numeric", "categorical"]:
+            collections[data_type] = [
+                col for col in column_type_map if column_type_map[col] == data_type
+            ]
+
+        return collections
diff --git a/python/src/lazylearn/lazylearn.py b/python/src/lazylearn/lazylearn.py
index dc746ca..255f311 100644
--- a/python/src/lazylearn/lazylearn.py
+++ b/python/src/lazylearn/lazylearn.py
@@ -1,13 +1,28 @@
 from ingestion.ingestion_pipeline import Ingestion
+from preprocessing.time.date_processor import date_processor
 
 
 class LazyLearner:
     def __init__(self):
         self.dataset = None
+        self.task = None
+        self.models = None
+        self.leaderboard = None
 
     def create_project(self, data, target, task="infer"):
         # ingest data
-        self.dataset = Ingestion().run(data)  # noqa
+        self.dataset = Ingestion().run(data)
+
+        if task == "infer":
+            # if target is numeric then regression, else classification
+            if self.dataset.column_type_map[target] == "numeric":
+                self.task = "regression"
+            else:
+                self.task = "classification"
+
+        # process dates
+
+        self.dataset = date_processor(self.dataset)
 
         # preprocess
 
diff --git a/python/src/lazylearn/models/models.py b/python/src/lazylearn/models/models.py
index d826534..dda68de 100644
--- a/python/src/lazylearn/models/models.py
+++ b/python/src/lazylearn/models/models.py
@@ -2,11 +2,19 @@
 
 
 class Dataset:
-    def __init__(self, df: DataFrame, column_type_map: dict):
+    def __init__(
+        self,
+        df: DataFrame,
+        column_type_map: dict,
+        summary_stats: dict,
+        type_collections: dict,
+    ):
         self.name = None
         self.description = None
         self.df = df
         self.column_type_map = column_type_map
+        self.summary_stats = summary_stats
+        self.type_collections = type_collections
 
     def save(self):
         raise NotImplementedError
diff --git a/python/src/lazylearn/pipeline/pipeline.py b/python/src/lazylearn/pipeline/pipeline.py
index e7256f2..a395eed 100644
--- a/python/src/lazylearn/pipeline/pipeline.py
+++ b/python/src/lazylearn/pipeline/pipeline.py
@@ -29,6 +29,23 @@ def __init__(self):
         self.df: DataFrame = None
         self.column_type_map: dict = None
         self.summary_stats: dict = {}
+        self.needs_type_map: bool = False
+        self.type_collections: dict = None
 
     def response(self):
-        return Dataset(df=self.df, column_type_map=self.column_type_map)
+        return Dataset(
+            df=self.df,
+            column_type_map=self.column_type_map,
+            summary_stats=self.summary_stats,
+            type_collections=self.type_collections,
+        )
+
+
+class ModelPipeline(Pipeline):
+    def __init__(self):
+        super().__init__()
+
+
+class RegressionPipeline(ModelPipeline):
+    def __init__(self):
+        super().__init__()
diff --git a/python/src/lazylearn/preprocessing/time/__init__.py b/python/src/lazylearn/preprocessing/time/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/preprocessing/time/date_processor.py b/python/src/lazylearn/preprocessing/time/date_processor.py
new file mode 100644
index 0000000..37ddbf6
--- /dev/null
+++ b/python/src/lazylearn/preprocessing/time/date_processor.py
@@ -0,0 +1,34 @@
+from models.models import Dataset
+
+
+def date_processor(dataset: Dataset) -> Dataset:
+    """
+    Method that transforms date variables into
+    categorical features.
+
+    :param dataset: Dataset object with date features
+    :return: Dataset object with categorical date
+    features
+    """
+    new_categorical_cols = []
+
+    for date_column in dataset.type_collections["datetime"]:
+        dataset.df[f"{date_column}_year"] = (
+            dataset.df[date_column].dt.isocalendar().year
+        )
+        dataset.df[f"{date_column}_month"] = dataset.df[date_column].dt.month
+        dataset.df[f"{date_column}_week"] = (
+            dataset.df[date_column].dt.isocalendar().week
+        )
+        dataset.df[f"{date_column}_day"] = dataset.df[date_column].dt.isocalendar().day
+
+        new_categorical_cols.append(f"{date_column}_year")
+        new_categorical_cols.append(f"{date_column}_month")
+        new_categorical_cols.append(f"{date_column}_week")
+        new_categorical_cols.append(f"{date_column}_day")
+
+    for cat in new_categorical_cols:
+        dataset.column_type_map[cat] = "categorical"
+        dataset.type_collections["categorical"].append(cat)
+
+    return dataset
diff --git a/python/src/lazylearn/regression/__init__.py b/python/src/lazylearn/regression/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/regression/models/__init__.py b/python/src/lazylearn/regression/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/regression/models/randomforest/__init__.py b/python/src/lazylearn/regression/models/randomforest/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/regression/models/randomforest/randomforest.py b/python/src/lazylearn/regression/models/randomforest/randomforest.py
new file mode 100644
index 0000000..c5fdb06
--- /dev/null
+++ b/python/src/lazylearn/regression/models/randomforest/randomforest.py
@@ -0,0 +1,16 @@
+from models.models import Dataset
+from pipeline.pipeline import RegressionPipeline
+from sklearn.ensemble import RandomForestRegressor
+
+
+class RandomForestRegressionPipeline(RegressionPipeline):
+    def __init__(self):
+        self.target = None
+        self.dataset: Dataset = None
+
+    def run(self):
+        # preprocess numeric vars
+
+        # preprocess categorical vars
+
+        pass
diff --git a/python/src/lazylearn/regression/models/xgboost/__init__.py b/python/src/lazylearn/regression/models/xgboost/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/strategies/__init__.py b/python/src/lazylearn/strategies/__init__.py
new file mode 100644
index 0000000..e69de29

From d7fca34e745c7850be54466cd73cd783787a9488 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Frederik=20Peter=20H=C3=B8ngaard?=
 <mail@frederikhoengaard.com>
Date: Wed, 17 May 2023 14:31:55 +0200
Subject: [PATCH 4/8] linting

---
 .../interpreter_step.py                        | 18 ++++++++++++++----
 .../preprocessing/time/date_processor.py       |  4 +++-
 .../models/randomforest/randomforest.py        |  1 -
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py
index baad31c..6bdcd31 100644
--- a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py
+++ b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py
@@ -87,7 +87,11 @@ def numeric_test(types: list):
         :return: True if column is numeric, False otherwise
         """
         return all(
-            [item == float or item == int for item in set(types) if item is not None]
+            [
+                item == float or item == int
+                for item in set(types)
+                if item is not None  # noqa
+            ]
         )
 
     @staticmethod
@@ -125,9 +129,13 @@ def id_check(self, types, values):
         :param values:
         :return:
         """
-        return all([item == int for item in set(types) if item is not None]) and len(
+        return all(
+            [item == int for item in set(types) if item is not None]
+        ) and len(  # noqa
             set(values)
-        ) == len(self.df)
+        ) == len(
+            self.df
+        )
 
     @staticmethod
     def build_type_collections(column_type_map):
@@ -135,7 +143,9 @@ def build_type_collections(column_type_map):
 
         for data_type in ["datetime", "numeric", "categorical"]:
             collections[data_type] = [
-                col for col in column_type_map if column_type_map[col] == data_type
+                col
+                for col in column_type_map
+                if column_type_map[col] == data_type  # noqa
             ]
 
         return collections
diff --git a/python/src/lazylearn/preprocessing/time/date_processor.py b/python/src/lazylearn/preprocessing/time/date_processor.py
index 37ddbf6..aed4f3d 100644
--- a/python/src/lazylearn/preprocessing/time/date_processor.py
+++ b/python/src/lazylearn/preprocessing/time/date_processor.py
@@ -20,7 +20,9 @@ def date_processor(dataset: Dataset) -> Dataset:
         dataset.df[f"{date_column}_week"] = (
             dataset.df[date_column].dt.isocalendar().week
         )
-        dataset.df[f"{date_column}_day"] = dataset.df[date_column].dt.isocalendar().day
+        dataset.df[f"{date_column}_day"] = (
+            dataset.df[date_column].dt.isocalendar().day
+        )  # noqa
 
         new_categorical_cols.append(f"{date_column}_year")
         new_categorical_cols.append(f"{date_column}_month")
diff --git a/python/src/lazylearn/regression/models/randomforest/randomforest.py b/python/src/lazylearn/regression/models/randomforest/randomforest.py
index c5fdb06..4a78978 100644
--- a/python/src/lazylearn/regression/models/randomforest/randomforest.py
+++ b/python/src/lazylearn/regression/models/randomforest/randomforest.py
@@ -1,6 +1,5 @@
 from models.models import Dataset
 from pipeline.pipeline import RegressionPipeline
-from sklearn.ensemble import RandomForestRegressor
 
 
 class RandomForestRegressionPipeline(RegressionPipeline):

From f3fbd0d59b3a84cf64659a2d874cd901d083e071 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Frederik=20Peter=20H=C3=B8ngaard?=
 <mail@frederikhoengaard.com>
Date: Sat, 20 May 2023 16:15:51 +0200
Subject: [PATCH 5/8] durations and simple splits

---
 .../data_parser_step.py                       |  4 ++++
 .../src/lazylearn/ingestion/utils/__init__.py |  0
 python/src/lazylearn/ingestion/utils/csv.py   |  2 ++
 python/src/lazylearn/lazylearn.py             | 15 ++++++++++--
 .../src/lazylearn/model_selection/__init__.py |  0
 .../lazylearn/model_selection/splitters.py    | 17 +++++++++++++
 python/src/lazylearn/models/models.py         |  1 +
 .../lazylearn/preprocessing/time/duration.py  | 24 +++++++++++++++++++
 8 files changed, 61 insertions(+), 2 deletions(-)
 create mode 100644 python/src/lazylearn/ingestion/utils/__init__.py
 create mode 100644 python/src/lazylearn/ingestion/utils/csv.py
 create mode 100644 python/src/lazylearn/model_selection/__init__.py
 create mode 100644 python/src/lazylearn/model_selection/splitters.py
 create mode 100644 python/src/lazylearn/preprocessing/time/duration.py

diff --git a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py
index 94a5399..19608ad 100644
--- a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py
+++ b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py
@@ -1,4 +1,6 @@
+import pandas as pd
 from errors.errors import DataSourceError
+from ingestion.utils.csv import csv_check
 from pandas import DataFrame
 from pipeline.pipeline import IngestionPipeline, PipelineStep
 
@@ -18,5 +20,7 @@ def apply(self, pipeline: IngestionPipeline):
         if isinstance(pipeline.raw_data, DataFrame):
             pipeline.df = pipeline.raw_data
         # check if raw data is a path to a csv file and read it into csv
+        elif csv_check(pipeline.df):
+            pipeline.df = pd.read_csv(pipeline.raw_data)
         else:
             raise DataSourceError
diff --git a/python/src/lazylearn/ingestion/utils/__init__.py b/python/src/lazylearn/ingestion/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/ingestion/utils/csv.py b/python/src/lazylearn/ingestion/utils/csv.py
new file mode 100644
index 0000000..22d44fb
--- /dev/null
+++ b/python/src/lazylearn/ingestion/utils/csv.py
@@ -0,0 +1,2 @@
+def csv_check(path):
+    raise NotImplementedError
diff --git a/python/src/lazylearn/lazylearn.py b/python/src/lazylearn/lazylearn.py
index 255f311..d0cc518 100644
--- a/python/src/lazylearn/lazylearn.py
+++ b/python/src/lazylearn/lazylearn.py
@@ -1,13 +1,16 @@
 from ingestion.ingestion_pipeline import Ingestion
+from model_selection.splitters import test_train_splitter
 from preprocessing.time.date_processor import date_processor
+from preprocessing.time.duration import duration_builder
 
 
 class LazyLearner:
-    def __init__(self):
+    def __init__(self, random_state=None):
         self.dataset = None
         self.task = None
         self.models = None
         self.leaderboard = None
+        self.random_state = random_state
 
     def create_project(self, data, target, task="infer"):
         # ingest data
@@ -23,11 +26,19 @@ def create_project(self, data, target, task="infer"):
         # process dates
 
         self.dataset = date_processor(self.dataset)
+        self.dataset = duration_builder(self.dataset)
 
-        # preprocess
+        # split partitions
+
+        self.dataset = test_train_splitter(self.dataset, random_state=self.random_state)
 
         # set modelling configurations
 
+    def run_autopilot(self):
+        raise NotImplementedError
+
+        # preprocess
+
         # train
 
         # eval
diff --git a/python/src/lazylearn/model_selection/__init__.py b/python/src/lazylearn/model_selection/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/model_selection/splitters.py b/python/src/lazylearn/model_selection/splitters.py
new file mode 100644
index 0000000..7773deb
--- /dev/null
+++ b/python/src/lazylearn/model_selection/splitters.py
@@ -0,0 +1,17 @@
+from models.models import Dataset
+from sklearn.model_selection import train_test_split
+
+
+def test_train_splitter(dataset: Dataset, random_state=None) -> Dataset:
+    train_partition, test_partition = train_test_split(
+        dataset.df, test_size=0.2, random_state=random_state
+    )
+
+    dataset.partitions["test"] = test_partition
+    dataset.partitions["train"] = train_partition
+
+    return dataset
+
+
+def cv_splitter(dataset: Dataset) -> Dataset:
+    return dataset
diff --git a/python/src/lazylearn/models/models.py b/python/src/lazylearn/models/models.py
index dda68de..dfdf2e3 100644
--- a/python/src/lazylearn/models/models.py
+++ b/python/src/lazylearn/models/models.py
@@ -15,6 +15,7 @@ def __init__(
         self.column_type_map = column_type_map
         self.summary_stats = summary_stats
         self.type_collections = type_collections
+        self.partitions: dict = {}
 
     def save(self):
         raise NotImplementedError
diff --git a/python/src/lazylearn/preprocessing/time/duration.py b/python/src/lazylearn/preprocessing/time/duration.py
new file mode 100644
index 0000000..b21d05e
--- /dev/null
+++ b/python/src/lazylearn/preprocessing/time/duration.py
@@ -0,0 +1,24 @@
+from models.models import Dataset
+
+
+def duration_builder(dataset: Dataset) -> Dataset:
+    """
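+    Add a whole-day duration column for every pair of datetime columns.
+    :param dataset: Dataset whose type_collections lists "datetime" columns
+    :return: the same Dataset, with the new columns registered as numeric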
+    """
+    date_cols = dataset.type_collections.get("datetime")
+
+    if len(date_cols) > 1:
+        for i in range(len(date_cols)):
+            for j in range(i + 1, len(date_cols)):
+                col_name = f"duration({date_cols[i]}-{date_cols[j]})"
+                dataset.df[col_name] = (
+                    (dataset.df[date_cols[i]] - dataset.df[date_cols[j]])
+                    .astype("timedelta64[D]")
+                    .astype(int)
+                )
+                dataset.column_type_map[col_name] = "numeric"
+                dataset.type_collections["numeric"].append(col_name)
+
+    return dataset

From 744f495c37b0cdb993c6ded45327e21b06070709 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Frederik=20Peter=20H=C3=B8ngaard?=
 <mail@frederikhoengaard.com>
Date: Sat, 20 May 2023 23:22:39 +0200
Subject: [PATCH 6/8] crude categorical ordinal encoding by frequency

---
 python/src/lazylearn/lazylearn.py             | 18 +++++---
 python/src/lazylearn/pipeline/pipeline.py     | 16 ++++++-
 .../preprocessing/encoding/encoders.py        | 45 ++++++++++++++++++-
 .../preprocessing/imputation/__init__.py      |  0
 .../random_forest_steps/__init__.py           |  0
 .../random_forest_steps/regressor_step.py     | 17 +++++++
 .../models/randomforest/randomforest.py       | 31 ++++++++++---
 .../lazylearn/strategies/strategy_builder.py  |  1 +
 8 files changed, 111 insertions(+), 17 deletions(-)
 create mode 100644 python/src/lazylearn/preprocessing/imputation/__init__.py
 create mode 100644 python/src/lazylearn/regression/models/randomforest/random_forest_steps/__init__.py
 create mode 100644 python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py
 create mode 100644 python/src/lazylearn/strategies/strategy_builder.py

diff --git a/python/src/lazylearn/lazylearn.py b/python/src/lazylearn/lazylearn.py
index d0cc518..52df770 100644
--- a/python/src/lazylearn/lazylearn.py
+++ b/python/src/lazylearn/lazylearn.py
@@ -2,6 +2,7 @@
 from model_selection.splitters import test_train_splitter
 from preprocessing.time.date_processor import date_processor
 from preprocessing.time.duration import duration_builder
+from regression.models.randomforest.randomforest import RandomForestRegressionRunner
 
 
 class LazyLearner:
@@ -11,9 +12,11 @@ def __init__(self, random_state=None):
         self.models = None
         self.leaderboard = None
         self.random_state = random_state
+        self.target = None
 
     def create_project(self, data, target, task="infer"):
         # ingest data
+        self.target = target
         self.dataset = Ingestion().run(data)
 
         if task == "infer":
@@ -30,15 +33,16 @@ def create_project(self, data, target, task="infer"):
 
         # split partitions
 
-        self.dataset = test_train_splitter(self.dataset, random_state=self.random_state)
+        self.dataset = test_train_splitter(
+            self.dataset, random_state=self.random_state
+        )  # noqa
 
         # set modelling configurations
 
     def run_autopilot(self):
-        raise NotImplementedError
+        simple_random_forest = RandomForestRegressionRunner(
+            target=self.target, dataset=self.dataset
+        )
+        simple_random_forest.fit()
 
-        # preprocess
-
-        # train
-
-        # eval
+        return simple_random_forest
diff --git a/python/src/lazylearn/pipeline/pipeline.py b/python/src/lazylearn/pipeline/pipeline.py
index a395eed..2e69e14 100644
--- a/python/src/lazylearn/pipeline/pipeline.py
+++ b/python/src/lazylearn/pipeline/pipeline.py
@@ -1,7 +1,7 @@
 from typing import List
 
 from models.models import Dataset
-from pandas import DataFrame
+from pandas import DataFrame, Series
 
 
 class Pipeline:
@@ -21,6 +21,9 @@ class PipelineStep:
     def apply(self, pipeline: Pipeline):
         pass
 
+    def fit(self, pipeline: Pipeline):
+        pass
+
 
 class IngestionPipeline(Pipeline):
     def __init__(self):
@@ -44,8 +47,19 @@ def response(self):
 class ModelPipeline(Pipeline):
     def __init__(self):
         super().__init__()
+        self._is_fitted = False
+        self.feature_list: list = []
+
+    def fit(self):
+        [step.fit(self) for step in self._steps]
+        self._is_fitted = True
 
 
 class RegressionPipeline(ModelPipeline):
     def __init__(self):
         super().__init__()
+        self.train_features_df: DataFrame = None
+        self.train_targets: Series = None
+        self.holdout_features_df: DataFrame = None
+        self.holdout_targets: Series = None
+        self.holdout_score: float = None
diff --git a/python/src/lazylearn/preprocessing/encoding/encoders.py b/python/src/lazylearn/preprocessing/encoding/encoders.py
index a206bc0..6126cdf 100644
--- a/python/src/lazylearn/preprocessing/encoding/encoders.py
+++ b/python/src/lazylearn/preprocessing/encoding/encoders.py
@@ -1,15 +1,56 @@
+from models.models import Dataset
+from pipeline.pipeline import ModelPipeline
+
+
 class OrdinalConverter:
     def __init__(
         self,
+        cat_vars: list,
         max_cardinality: int = None,
         min_support: int = 5,
         other_category: bool = True,
         method: str = "freq",
     ):
+        self.cat_vars = cat_vars
         self.card_max = max_cardinality
         self.min_support = min_support
         self.other_category = other_category
         self.method = method
+        self.cat_freqs = {}
+        self.cat_maps = {}
+
+    def fit(self, pipeline: ModelPipeline):
+        for var in self.cat_vars:
+            pipeline.train_features_df = self.convert(pipeline.train_features_df, var)
+            pipeline.feature_list.append(var)
+
+    def convert(self, df, col_name):
+        """
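+        Replace each value of a column by its frequency in that column.
+        :param df: DataFrame holding the column; it is modified in place
+        :param col_name: name of the categorical column to encode
+        :return: df, with values rarer than min_support encoded as -1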
+        """
+        if self.method == "freq":
+            self.cat_freqs[col_name] = {}
+            for item in df[col_name].tolist():
+                if item in self.cat_freqs[col_name]:
+                    self.cat_freqs[col_name][item] += 1
+                else:
+                    self.cat_freqs[col_name][item] = 1
+
+            freq_pairs = sorted(
+                [(key, val) for key, val in self.cat_freqs[col_name].items()],
+                key=lambda x: x[1],
+            )
+            print(freq_pairs)
+            self.cat_maps[col_name] = {key: val for key, val in freq_pairs}
 
-    def convert(self, df, col):
-        pass
+            df[col_name] = df[col_name].apply(
+                lambda x: self.cat_maps[col_name][x]
+                if self.cat_maps[col_name][x] >= self.min_support
+                else -1
+            )
+            return df
+        else:
+            raise ValueError("Unsupported encoding method, try [freq]")
diff --git a/python/src/lazylearn/preprocessing/imputation/__init__.py b/python/src/lazylearn/preprocessing/imputation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/regression/models/randomforest/random_forest_steps/__init__.py b/python/src/lazylearn/regression/models/randomforest/random_forest_steps/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py b/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py
new file mode 100644
index 0000000..dd62ca7
--- /dev/null
+++ b/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py
@@ -0,0 +1,17 @@
+from pipeline.pipeline import PipelineStep, RegressionPipeline
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_absolute_error
+
+
+class RandomForestRegressorStep(PipelineStep):
+    def __init__(self):
+        self.regressor = RandomForestRegressor()
+
+    def fit(self, pipeline: RegressionPipeline):
+        self.regressor.fit(X=pipeline.train_features_df, y=pipeline.train_targets)
+
+        #y_hat = self.regressor.predict(X=pipeline.holdout_features_df)
+        #pipeline.holdout_score = mean_absolute_error(pipeline.holdout_targets, y_hat)
+
+    def predict(self, pipeline: RegressionPipeline):
+        raise NotImplementedError
\ No newline at end of file
diff --git a/python/src/lazylearn/regression/models/randomforest/randomforest.py b/python/src/lazylearn/regression/models/randomforest/randomforest.py
index 4a78978..cad9c61 100644
--- a/python/src/lazylearn/regression/models/randomforest/randomforest.py
+++ b/python/src/lazylearn/regression/models/randomforest/randomforest.py
@@ -1,15 +1,32 @@
 from models.models import Dataset
 from pipeline.pipeline import RegressionPipeline
+from preprocessing.encoding.encoders import OrdinalConverter
+from regression.models.randomforest.random_forest_steps.regressor_step import (
+    RandomForestRegressorStep,
+)
+from sklearn.ensemble import RandomForestRegressor
 
 
-class RandomForestRegressionPipeline(RegressionPipeline):
-    def __init__(self):
-        self.target = None
-        self.dataset: Dataset = None
+class RandomForestRegressionRunner:
+    def __init__(self, target, dataset):
+        self.target = target
+        self.dataset: Dataset = dataset
+        self.pipeline = RegressionPipeline()
 
-    def run(self):
+        self.pipeline.train_features_df = self.dataset.partitions["train"].copy()
+        self.pipeline.train_targets = self.dataset.partitions["train"][target]
+        self.pipeline.holdout_features_df = self.dataset.partitions["test"].copy()
+        self.pipeline.holdout_targets = self.dataset.partitions["test"][target]
+
+    def fit(self):
         # preprocess numeric vars
+        cat_vars = self.dataset.type_collections["categorical"]
+
+        self.pipeline.add(OrdinalConverter(cat_vars=cat_vars))
+
+        # self.pipeline.add(RandomForestRegressorStep())
 
-        # preprocess categorical vars
+        self.pipeline.fit()
 
-        pass
+    def predict(self):
+        raise NotImplementedError
diff --git a/python/src/lazylearn/strategies/strategy_builder.py b/python/src/lazylearn/strategies/strategy_builder.py
new file mode 100644
index 0000000..b94ab4c
--- /dev/null
+++ b/python/src/lazylearn/strategies/strategy_builder.py
@@ -0,0 +1 @@
+from lazylearn import LazyLearner

From cd2c4ffc9d981dee126cf7c1b88a869abb58693d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Frederik=20Peter=20H=C3=B8ngaard?=
 <mail@frederikhoengaard.com>
Date: Sat, 20 May 2023 23:23:20 +0200
Subject: [PATCH 7/8] linting

---
 .../randomforest/random_forest_steps/regressor_step.py      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py b/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py
index dd62ca7..57c45ee 100644
--- a/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py
+++ b/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py
@@ -10,8 +10,8 @@ def __init__(self):
     def fit(self, pipeline: RegressionPipeline):
         self.regressor.fit(X=pipeline.train_features_df, y=pipeline.train_targets)
 
-        #y_hat = self.regressor.predict(X=pipeline.holdout_features_df)
-        #pipeline.holdout_score = mean_absolute_error(pipeline.holdout_targets, y_hat)
+        # y_hat = self.regressor.predict(X=pipeline.holdout_features_df)
+        # pipeline.holdout_score = mean_absolute_error(pipeline.holdout_targets, y_hat)
 
     def predict(self, pipeline: RegressionPipeline):
-        raise NotImplementedError
\ No newline at end of file
+        raise NotImplementedError

From 35acb802d6038907606f333a6a9114c6cbda1def Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Frederik=20Peter=20H=C3=B8ngaard?=
 <mail@frederikhoengaard.com>
Date: Sun, 21 May 2023 01:13:03 +0200
Subject: [PATCH 8/8] basic pipeline functional

---
 python/src/lazylearn/lazylearn.py             | 23 ++++++++++--
 python/src/lazylearn/pipeline/pipeline.py     | 11 ++++++
 .../preprocessing/encoding/encoders.py        | 36 +++++++++++++++----
 .../random_forest_steps/regressor_step.py     | 22 +++++++-----
 .../models/randomforest/randomforest.py       | 22 ++++++++----
 .../lazylearn/strategies/strategy_builder.py  |  1 -
 6 files changed, 90 insertions(+), 25 deletions(-)

diff --git a/python/src/lazylearn/lazylearn.py b/python/src/lazylearn/lazylearn.py
index 52df770..5b91328 100644
--- a/python/src/lazylearn/lazylearn.py
+++ b/python/src/lazylearn/lazylearn.py
@@ -2,7 +2,10 @@
 from model_selection.splitters import test_train_splitter
 from preprocessing.time.date_processor import date_processor
 from preprocessing.time.duration import duration_builder
-from regression.models.randomforest.randomforest import RandomForestRegressionRunner
+from regression.models.randomforest.randomforest import (  # noqa
+    RandomForestRegressionRunner,
+)
+from sklearn.metrics import mean_absolute_error
 
 
 class LazyLearner:
@@ -40,9 +43,25 @@ def create_project(self, data, target, task="infer"):
         # set modelling configurations
 
     def run_autopilot(self):
+        """
+        TODO: Everything here must be abstracted away into strategies
+        TODO: such that several models are run and their scores are added to
+        TODO: the leaderboard
+
+        :return:
+        """
+
         simple_random_forest = RandomForestRegressionRunner(
-            target=self.target, dataset=self.dataset
+            target=self.target,
+            dataset=self.dataset,
+            random_state=self.random_state,  # noqa
         )
         simple_random_forest.fit()
 
+        # get holdout scores
+        simple_random_forest.predict(self.dataset.partitions["test"])
+        simple_random_forest.pipeline.holdout_score = mean_absolute_error(
+            self.dataset.partitions["test"][self.target],
+            simple_random_forest.pipeline.tmp_pred,
+        )
         return simple_random_forest
diff --git a/python/src/lazylearn/pipeline/pipeline.py b/python/src/lazylearn/pipeline/pipeline.py
index 2e69e14..c26ec72 100644
--- a/python/src/lazylearn/pipeline/pipeline.py
+++ b/python/src/lazylearn/pipeline/pipeline.py
@@ -24,6 +24,9 @@ def apply(self, pipeline: Pipeline):
     def fit(self, pipeline: Pipeline):
         pass
 
+    def predict(self, pipeline: Pipeline):
+        pass
+
 
 class IngestionPipeline(Pipeline):
     def __init__(self):
@@ -49,11 +52,19 @@ def __init__(self):
         super().__init__()
         self._is_fitted = False
         self.feature_list: list = []
+        self.tmp_test = None
+        self.tmp_pred = None
+        self.target = None
 
     def fit(self):
         [step.fit(self) for step in self._steps]
         self._is_fitted = True
 
+    def predict(self):
+        assert self._is_fitted
+        [step.predict(self) for step in self._steps]
+        return self.tmp_pred
+
 
 class RegressionPipeline(ModelPipeline):
     def __init__(self):
diff --git a/python/src/lazylearn/preprocessing/encoding/encoders.py b/python/src/lazylearn/preprocessing/encoding/encoders.py
index 6126cdf..4d6b4e7 100644
--- a/python/src/lazylearn/preprocessing/encoding/encoders.py
+++ b/python/src/lazylearn/preprocessing/encoding/encoders.py
@@ -1,4 +1,4 @@
-from models.models import Dataset
+from pandas import DataFrame
 from pipeline.pipeline import ModelPipeline
 
 
@@ -21,15 +21,25 @@ def __init__(
 
     def fit(self, pipeline: ModelPipeline):
         for var in self.cat_vars:
-            pipeline.train_features_df = self.convert(pipeline.train_features_df, var)
+            pipeline.train_features_df = self.convert(
+                pipeline.train_features_df, var
+            )  # noqa
             pipeline.feature_list.append(var)
 
-    def convert(self, df, col_name):
+    def convert(self, df: DataFrame, col_name: str) -> DataFrame:
         """
+        Encodes a categorical column ordinally.
+        Currently only the "freq" method is supported,
+        and it encodes a value with an integer id by
+        increasing frequency i.e. more frequent values
+        receive a higher encoding
 
-        :param df:
-        :param col_name:
-        :return:
+        Note that this should only be done on the training
+        data!
+
+        :param df: pandas DataFrame of features
+        :param col_name: column to consider
+        :return: transformed DataFrame
         """
         if self.method == "freq":
             self.cat_freqs[col_name] = {}
@@ -43,7 +53,7 @@ def convert(self, df, col_name):
                 [(key, val) for key, val in self.cat_freqs[col_name].items()],
                 key=lambda x: x[1],
             )
-            print(freq_pairs)
+
             self.cat_maps[col_name] = {key: val for key, val in freq_pairs}
 
             df[col_name] = df[col_name].apply(
@@ -54,3 +64,15 @@ def convert(self, df, col_name):
             return df
         else:
             raise ValueError("Unsupported encoding method, try [freq]")
+
+    def predict(self, pipeline: ModelPipeline):
+        """Encode tmp_test like fit(): rare -> -1, unseen -> -2."""
+        df = pipeline.tmp_test
+        for var in self.cat_vars:
+            freqs = self.cat_maps[var]
+            df[var] = df[var].apply(
+                lambda x: -2 if x not in freqs  # noqa
+                else freqs[x] if freqs[x] >= self.min_support
+                else -1
+            )
+        pipeline.tmp_test = df
diff --git a/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py b/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py
index 57c45ee..2217332 100644
--- a/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py
+++ b/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py
@@ -1,17 +1,23 @@
 from pipeline.pipeline import PipelineStep, RegressionPipeline
 from sklearn.ensemble import RandomForestRegressor
-from sklearn.metrics import mean_absolute_error
 
 
 class RandomForestRegressorStep(PipelineStep):
-    def __init__(self):
-        self.regressor = RandomForestRegressor()
+    def __init__(self, random_state=None):
+        self.regressor = RandomForestRegressor(random_state=random_state)
 
     def fit(self, pipeline: RegressionPipeline):
-        self.regressor.fit(X=pipeline.train_features_df, y=pipeline.train_targets)
-
-        # y_hat = self.regressor.predict(X=pipeline.holdout_features_df)
-        # pipeline.holdout_score = mean_absolute_error(pipeline.holdout_targets, y_hat)
+        pipeline.feature_list = [
+            item for item in pipeline.feature_list if item != pipeline.target
+        ]
+        print("Fitting RandomForestRegressor")
+        self.regressor.fit(
+            X=pipeline.train_features_df[pipeline.feature_list],
+            y=pipeline.train_targets,
+        )  # noqa
+        print("RandomForestRegressor fitted!")
 
     def predict(self, pipeline: RegressionPipeline):
-        raise NotImplementedError
+        pipeline.tmp_pred = self.regressor.predict(
+            X=pipeline.tmp_test[pipeline.feature_list]
+        )
diff --git a/python/src/lazylearn/regression/models/randomforest/randomforest.py b/python/src/lazylearn/regression/models/randomforest/randomforest.py
index cad9c61..2ba5aa3 100644
--- a/python/src/lazylearn/regression/models/randomforest/randomforest.py
+++ b/python/src/lazylearn/regression/models/randomforest/randomforest.py
@@ -4,29 +4,37 @@
 from regression.models.randomforest.random_forest_steps.regressor_step import (
     RandomForestRegressorStep,
 )
-from sklearn.ensemble import RandomForestRegressor
 
 
 class RandomForestRegressionRunner:
-    def __init__(self, target, dataset):
+    def __init__(self, target, dataset, random_state=42):
         self.target = target
         self.dataset: Dataset = dataset
+        self.random_state = random_state
         self.pipeline = RegressionPipeline()
+        self.pipeline.target = target
 
-        self.pipeline.train_features_df = self.dataset.partitions["train"].copy()
+        self.pipeline.train_features_df = self.dataset.partitions[
+            "train"
+        ].copy()  # noqa
         self.pipeline.train_targets = self.dataset.partitions["train"][target]
-        self.pipeline.holdout_features_df = self.dataset.partitions["test"].copy()
+        self.pipeline.holdout_features_df = self.dataset.partitions[
+            "test"
+        ].copy()  # noqa
         self.pipeline.holdout_targets = self.dataset.partitions["test"][target]
 
     def fit(self):
         # preprocess numeric vars
         cat_vars = self.dataset.type_collections["categorical"]
+        num_vars = self.dataset.type_collections["numeric"]
+        self.pipeline.feature_list.extend(num_vars)
 
         self.pipeline.add(OrdinalConverter(cat_vars=cat_vars))
 
-        # self.pipeline.add(RandomForestRegressorStep())
+        self.pipeline.add(RandomForestRegressorStep(self.random_state))
 
         self.pipeline.fit()
 
-    def predict(self):
-        raise NotImplementedError
+    def predict(self, features):
+        self.pipeline.tmp_test = features.copy()  # keep caller's frame intact
+        return self.pipeline.predict()
diff --git a/python/src/lazylearn/strategies/strategy_builder.py b/python/src/lazylearn/strategies/strategy_builder.py
index b94ab4c..e69de29 100644
--- a/python/src/lazylearn/strategies/strategy_builder.py
+++ b/python/src/lazylearn/strategies/strategy_builder.py
@@ -1 +0,0 @@
-from lazylearn import LazyLearner