2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "lazylearn"
version = "0.0.1"
version = "0.0.2"
authors = [
{ name="Frederik P. Høngaard", email="mail@frederikhoengaard.com" },
]
@@ -1,4 +1,6 @@
+import pandas as pd
from errors.errors import DataSourceError
+from ingestion.utils.csv import csv_check
from pandas import DataFrame
from pipeline.pipeline import IngestionPipeline, PipelineStep

@@ -17,5 +19,8 @@ def apply(self, pipeline: IngestionPipeline):

        if isinstance(pipeline.raw_data, DataFrame):
            pipeline.df = pipeline.raw_data
+        # check if raw data is a path to a csv file and read it into a DataFrame
+        elif csv_check(pipeline.raw_data):
+            pipeline.df = pd.read_csv(pipeline.raw_data)
        else:
            raise DataSourceError
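In effect, the step now accepts two kinds of raw data: an in-memory DataFrame is used as-is, while a string path is read via pd.read_csv once csv_check confirms it points at a CSV; anything else raises DataSourceError. A hypothetical sketch (the path is made up, and csv_check is still a stub at this point in the PR):

pipeline = IngestionPipeline()
pipeline.raw_data = "data/houses.csv"  # hypothetical file
# after this step's apply() runs: pipeline.df == pd.read_csv("data/houses.csv")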
@@ -1,10 +1,14 @@
+import numpy as np
import pandas as pd
-from pandas import Series
+from pandas import DataFrame, Series
from pipeline.pipeline import IngestionPipeline
from tqdm import tqdm


class ColumnTypeInterpreter:
+    def __init__(self):
+        self.df: DataFrame = None
+
    def apply(self, pipeline: IngestionPipeline):
        """
        This method is responsible for inferring the
@@ -23,6 +27,10 @@ def apply(self, pipeline: IngestionPipeline):
        )  # noqa

        pipeline.column_type_map = column_types
+        if "unknown" in pipeline.column_type_map.values():
+            pipeline.needs_type_map = True
+
+        pipeline.type_collections = self.build_type_collections(column_types)

    def analyze_column(self, column: Series):
        """
@@ -33,16 +41,22 @@ def analyze_column(self, column: Series):
        values = column.tolist()
        types = [type(value) for value in values]

+        column_type = None
+
        if self.categorical_test(values):
-            return "categorical"
+            column_type = "categorical"
+        elif self.numeric_test(types) and self.id_check(types, values):
+            column_type = "id"
        elif self.numeric_test(types):
-            return "numeric"
+            column_type = "numeric"
-        elif self.datetime_check(column):
-            return "datetime"
-        else:
-            return "object"
+
+        if self.datetime_check(column) and not self.numeric_test(types):
+            column_type = "datetime"
+
+        if column_type is None:
+            column_type = "unknown"
+
+        return column_type

    @staticmethod
    def categorical_test(values: list):

@@ -72,15 +86,66 @@ def numeric_test(types: list):
        :param types: list of type objects
        :return: True if column is numeric, False otherwise
        """
-        return all([item == float or item == int for item in set(types)])
+        return all(
+            [
+                item == float or item == int
+                for item in set(types)
+                if item is not None  # noqa
+            ]
+        )

    @staticmethod
    def string_test(types: set):
        raise NotImplementedError

    def datetime_check(self, column: Series):
-        try:
-            self.df[column.name] = pd.to_datetime(column)
+        """
+        Check whether a column holds datetime values.
+
+        :param column: pandas Series to inspect
+        :return: True if the column is (or can be parsed as) datetime
+        """
+        col_name = str(column.name)
+
+        # if type of column is actually datetime
+        if self.df[col_name].dtype.type == np.datetime64:
            return True
-        except Exception as e:  # noqa
-            return False
+
+        # if date or time is in column name and can be cast as date
+        if "date" in col_name.lower() or "time" in col_name.lower():
+            try:
+                self.df[col_name] = pd.to_datetime(self.df[col_name])
+                return True
+            except Exception as e:  # noqa
+                pass
+
+        # if format of values looks like dates
+
+        return False

    def id_check(self, types, values):
        """
        Check whether an integer column is a unique identifier.

        :param types: list of type objects for the column's values
        :param values: list of the column's values
        :return: True if all values are ints and unique across the column
        """
        return all(
            [item == int for item in set(types) if item is not None]
        ) and len(  # noqa
            set(values)
        ) == len(
            self.df
        )

    @staticmethod
    def build_type_collections(column_type_map):
        collections = {}

        for data_type in ["datetime", "numeric", "categorical"]:
            collections[data_type] = [
                col
                for col in column_type_map
                if column_type_map[col] == data_type  # noqa
            ]

        return collections
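A quick illustration of the new precedence (categorical, then id, then numeric, then datetime, else unknown). This is a sketch, not part of the PR; the exact labels also depend on categorical_test, whose implementation is collapsed out of this diff:

import pandas as pd

interpreter = ColumnTypeInterpreter()
interpreter.df = pd.DataFrame(
    {
        "user_id": list(range(1000)),  # unique ints
        "price": [i * 1.5 for i in range(1000)],  # floats
        "signup_date": pd.date_range("2021-01-01", periods=1000).astype(str),
    }
)
for col in interpreter.df.columns:
    print(col, interpreter.analyze_column(interpreter.df[col]))
# expected: user_id -> "id", price -> "numeric", signup_date -> "datetime"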
2 changes: 2 additions & 0 deletions python/src/lazylearn/ingestion/utils/csv.py
@@ -0,0 +1,2 @@
def csv_check(path):
    raise NotImplementedError
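This stub is called by the ingestion step above, so ingesting a CSV path will raise until it is filled in. One possible implementation, purely as an assumption of the intended behavior:

import os


def csv_check(path) -> bool:
    # True only for a string path that points at an existing .csv file
    return (
        isinstance(path, str)
        and path.lower().endswith(".csv")
        and os.path.isfile(path)
    )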
59 changes: 54 additions & 5 deletions python/src/lazylearn/lazylearn.py
@@ -1,18 +1,67 @@
from ingestion.ingestion_pipeline import Ingestion
+from model_selection.splitters import test_train_splitter
+from preprocessing.time.date_processor import date_processor
+from preprocessing.time.duration import duration_builder
+from regression.models.randomforest.randomforest import (  # noqa
+    RandomForestRegressionRunner,
+)
+from sklearn.metrics import mean_absolute_error


class LazyLearner:
-    def __init__(self):
+    def __init__(self, random_state=None):
        self.dataset = None
        self.task = None
        self.models = None
        self.leaderboard = None
+        self.random_state = random_state
+        self.target = None

    def create_project(self, data, target, task="infer"):
        # ingest data
-        ingestion_response = Ingestion().run(data)  # noqa
+        self.target = target
+        self.dataset = Ingestion().run(data)

        # preprocess
+        if task == "infer":
+            # if target is numeric then regression, else classification
+            if self.dataset.column_type_map[target] == "numeric":
+                self.task = "regression"
+            else:
+                self.task = "classification"
+
+        # process dates
+        self.dataset = date_processor(self.dataset)
+        self.dataset = duration_builder(self.dataset)

        # split partitions
+        self.dataset = test_train_splitter(
+            self.dataset, random_state=self.random_state
+        )  # noqa
+
+        # set modelling configurations
+
+    # train
+    def run_autopilot(self):
+        """
+        TODO: Everything here must be abstracted away into strategies
+        TODO: such that several models are run and their scores are added to
+        TODO: the leaderboard
+
+        :return:
+        """
+        simple_random_forest = RandomForestRegressionRunner(
+            target=self.target,
+            dataset=self.dataset,
+            random_state=self.random_state,  # noqa
+        )
+        simple_random_forest.fit()
+
+        # eval
+        # get holdout scores
+        simple_random_forest.predict(self.dataset.partitions["test"])
+        simple_random_forest.pipeline.holdout_score = mean_absolute_error(
+            self.dataset.partitions["test"][self.target],
+            simple_random_forest.pipeline.tmp_pred,
+        )
+        return simple_random_forest
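Hypothetical end-to-end usage of the new API (illustrative only; the file name "houses.csv" and target "price" are made up, and a CSV path only works once csv_check is implemented):

learner = LazyLearner(random_state=42)
learner.create_project(data="houses.csv", target="price", task="infer")
model = learner.run_autopilot()
print(model.pipeline.holdout_score)  # mean absolute error on the holdout partition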
17 changes: 17 additions & 0 deletions python/src/lazylearn/model_selection/splitters.py
@@ -0,0 +1,17 @@
from models.models import Dataset
from sklearn.model_selection import train_test_split


def test_train_splitter(dataset: Dataset, random_state=None) -> Dataset:
    train_partition, test_partition = train_test_split(
        dataset.df, test_size=0.2, random_state=random_state
    )

    dataset.partitions["test"] = test_partition
    dataset.partitions["train"] = train_partition

    return dataset


def cv_splitter(dataset: Dataset) -> Dataset:
    return dataset
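A short usage sketch (assuming a Dataset produced by the ingestion pipeline): the split is a fixed 80/20, and cv_splitter is still a no-op placeholder.

dataset = test_train_splitter(dataset, random_state=42)
train_df = dataset.partitions["train"]  # 80% of rows
test_df = dataset.partitions["test"]    # 20% of rows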
11 changes: 10 additions & 1 deletion python/src/lazylearn/models/models.py
@@ -2,11 +2,20 @@


class Dataset:
-    def __init__(self, df: DataFrame, column_type_map: dict):
+    def __init__(
+        self,
+        df: DataFrame,
+        column_type_map: dict,
+        summary_stats: dict,
+        type_collections: dict,
+    ):
        self.name = None
        self.description = None
        self.df = df
        self.column_type_map = column_type_map
+        self.summary_stats = summary_stats
+        self.type_collections = type_collections
+        self.partitions: dict = {}

    def save(self):
        raise NotImplementedError
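For reference, a Dataset is normally built by IngestionPipeline.response(); constructing one by hand (hypothetical values, given some DataFrame df) looks like:

ds = Dataset(
    df=df,
    column_type_map={"price": "numeric"},
    summary_stats={},
    type_collections={"numeric": ["price"]},
)
ds.partitions  # starts empty; filled in by test_train_splitter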
46 changes: 44 additions & 2 deletions python/src/lazylearn/pipeline/pipeline.py
@@ -1,7 +1,7 @@
from typing import List

from models.models import Dataset
-from pandas import DataFrame
+from pandas import DataFrame, Series


class Pipeline:
@@ -21,6 +21,12 @@ class PipelineStep:
    def apply(self, pipeline: Pipeline):
        pass

+    def fit(self, pipeline: Pipeline):
+        pass
+
+    def predict(self, pipeline: Pipeline):
+        pass


class IngestionPipeline(Pipeline):
    def __init__(self):

@@ -29,6 +35,42 @@ def __init__(self):
        self.df: DataFrame = None
        self.column_type_map: dict = None
        self.summary_stats: dict = {}
+        self.needs_type_map: bool = False
+        self.type_collections: dict = None

    def response(self):
-        return Dataset(df=self.df, column_type_map=self.column_type_map)
+        return Dataset(
+            df=self.df,
+            column_type_map=self.column_type_map,
+            summary_stats=self.summary_stats,
+            type_collections=self.type_collections,
+        )


class ModelPipeline(Pipeline):
    def __init__(self):
        super().__init__()
        self._is_fitted = False
        self.feature_list: list = []
        self.tmp_test = None
        self.tmp_pred = None
        self.target = None

    def fit(self):
        [step.fit(self) for step in self._steps]
        self._is_fitted = True

    def predict(self):
        assert self._is_fitted
        [step.predict(self) for step in self._steps]
        return self.tmp_pred


class RegressionPipeline(ModelPipeline):
    def __init__(self):
        super().__init__()
        self.train_features_df: DataFrame = None
        self.train_targets: Series = None
        self.holdout_features_df: DataFrame = None
        self.holdout_targets: Series = None
        self.holdout_score: float = None
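To see how ModelPipeline dispatches over its steps, here is a minimal illustrative step (hypothetical, not part of the PR; it assumes _steps holds the step instances and that RegressionPipeline's train/holdout frames are populated by earlier steps):

class MeanImputerStep(PipelineStep):
    def fit(self, pipeline: RegressionPipeline):
        # learn column means on the training partition, then fill gaps
        self.means = pipeline.train_features_df.mean(numeric_only=True)
        pipeline.train_features_df = pipeline.train_features_df.fillna(self.means)

    def predict(self, pipeline: RegressionPipeline):
        # reuse the means learned during fit on the holdout features
        pipeline.holdout_features_df = pipeline.holdout_features_df.fillna(self.means)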