8 changes: 4 additions & 4 deletions .github/workflows/verify.yaml
@@ -4,7 +4,7 @@ on:
  pull_request:

env:
  PYTHONPATH: ./python/src/main/
  PYTHONPATH: ./python/src/lazylearn/

jobs:
  testing:
@@ -54,12 +54,12 @@ jobs:

      - name: black
        run: |
          python -m black --check python/src/main/
          python -m black --check python/src/lazylearn/

      - name: isort
        run: |
          python -m isort python/src/main/ --multi-line 3 --profile black --check
          python -m isort python/src/lazylearn/ --multi-line 3 --profile black --check

      - name: flake8
        run: |
          python -m flake8 python/src/main/
          python -m flake8 python/src/lazylearn/
3 changes: 3 additions & 0 deletions .gitignore
@@ -13,6 +13,9 @@ __pycache__/
# JetBrains
.idea

# local
notebooks/

# Distribution / packaging
.Python
build/
3 changes: 3 additions & 0 deletions Pipfile
@@ -6,6 +6,9 @@ verify_ssl = true
[packages]
loguru = "==0.6.*"
pandas = "==1.5.*"
scikit-learn = "*"
tqdm = "*"
jupyter = "*"

[dev-packages]
black = "==23.*"
1,027 changes: 1,024 additions & 3 deletions Pipfile.lock

Large diffs are not rendered by default.

27 changes: 22 additions & 5 deletions README.md
@@ -1,12 +1,29 @@
# lazy-learn

---
<img width="500" src="doc/logo/transparent_small.png">

## About

lazy-learn is a high-level Python interface for automated machine learning (AutoML). While there are many AutoML libraries available each typically solves a niche area of the overall ML pipeline without providing a covering and approachable end-to-end system.
**lazy-learn** is a high-level Python interface for automated machine learning (AutoML). While there are many AutoML libraries available, each typically solves a niche area of the overall ML pipeline without providing a comprehensive and approachable end-to-end system.

The aim of lazy-learn is exactly that. Given a dataset, lazy-learn will analyse the types and distributions of its attributes, preprocess, feature-engineer, and ultimately train models to be used for further evaluation or inference.

## Usage

Using lazy-learn revolves around the `LazyLearner` class. You can think of it as a project: it is the wrapper for any experiment within lazy-learn.

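A minimal session might look like this sketch (hypothetical usage: the import path follows this PR's module layout, the CSV path and target column are placeholders, and only the ingestion stage of `create_project` is implemented so far):

```
import pandas as pd

from lazylearn import LazyLearner

# Load any tabular dataset into a pandas DataFrame.
df = pd.read_csv("my_dataset.csv")  # placeholder path

# A LazyLearner wraps a single experiment ("project").
learner = LazyLearner()
learner.create_project(data=df, target="price", task="infer")
```
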
## Installation

### Dependencies

lazy-learn requires:

- pandas
- scikit-learn

### User Installation
```
pip install lazy-learn
```

## Help and Support
### Documentation

### Citation
Binary file added doc/logo/grayscale_transparent.png
Binary file added doc/logo/original.png
Binary file added doc/logo/transparent.png
Binary file added doc/logo/transparent_small.png
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -8,7 +8,7 @@ version = "0.0.1"
authors = [
    { name="Frederik P. Høngaard", email="mail@frederikhoengaard.com" },
]
description = "A small example package"
description = "lazy-learn is a high-level Python interface for automated machine learning (AutoML) for the lazy data scientist. While there are many AutoML libraries available, each typically solves a niche area of the overall ML pipeline without providing a comprehensive and approachable end-to-end system. lazy-learn aims to provide the fastest and most approachable route to building baseline models."
readme = "README.md"
requires-python = ">=3.7"
classifiers = [
@@ -18,4 +18,4 @@ classifiers = [
]

[project.urls]
"Homepage" = "https://github.com/pypa/sampleproject"
"Homepage" = "https://github.com/frederikhoengaard/lazy-learn"
Empty file.
2 changes: 2 additions & 0 deletions python/src/lazylearn/errors/errors.py
@@ -0,0 +1,2 @@
class DataSourceError(Exception):
    """Raised when an incompatible object is passed as a data source."""
32 changes: 32 additions & 0 deletions python/src/lazylearn/ingestion/ingestion_pipeline.py
@@ -0,0 +1,32 @@
from ingestion.ingestion_pipeline_steps.data_parser_step import DataSourceParser  # noqa
from ingestion.ingestion_pipeline_steps.interpreter_step import (  # noqa
    ColumnTypeInterpreter,
)
from ingestion.ingestion_pipeline_steps.summary_stats_step import (  # noqa
    SummaryStatistics,
)
from pipeline.pipeline import IngestionPipeline


class Ingestion:
    def __init__(self):
        pass

    def run(self, data):
        """
        Run the ingestion pipeline on a raw data source: parse it
        into a DataFrame, infer column types and compute summary
        statistics.

        :param data: raw data source, e.g. a pandas DataFrame
        :return: Dataset wrapping the parsed frame and its column types
        """
        pipeline = IngestionPipeline()
        pipeline.raw_data = data

        pipeline.add(DataSourceParser())

        pipeline.add(ColumnTypeInterpreter())

        pipeline.add(SummaryStatistics())

        pipeline.run()

        return pipeline.response()
Empty file.
@@ -0,0 +1,21 @@
from errors.errors import DataSourceError
from pandas import DataFrame
from pipeline.pipeline import IngestionPipeline, PipelineStep


class DataSourceParser(PipelineStep):
    def apply(self, pipeline: IngestionPipeline):
        """
        This method is responsible for parsing the raw data
        source from its parent pipeline into a DataFrame
        object.

        :param pipeline: parent IngestionPipeline
        :return:
        """
        assert pipeline.raw_data is not None

        if isinstance(pipeline.raw_data, DataFrame):
            pipeline.df = pipeline.raw_data
        else:
            raise DataSourceError
@@ -0,0 +1,86 @@
import pandas as pd
from pandas import Series
from pipeline.pipeline import IngestionPipeline
from tqdm import tqdm


class ColumnTypeInterpreter:
    def apply(self, pipeline: IngestionPipeline):
        """
        This method is responsible for inferring the
        types of the columns of the project dataset.

        :param pipeline: parent IngestionPipeline
        :return:
        """
        self.df = pipeline.df
        columns = pipeline.df.columns
        column_types = {}

        for column_name in tqdm(columns):
            column_types[column_name] = self.analyze_column(
                pipeline.df[column_name]
            )  # noqa

        pipeline.column_type_map = column_types

    def analyze_column(self, column: Series):
        """
        Classify a single column by testing it as categorical,
        numeric and datetime, in that order.

        :param column: pandas Series to classify
        :return: one of "categorical", "numeric", "datetime" or "object"
        """
        values = column.tolist()
        types = [type(value) for value in values]

        if self.categorical_test(values):
            return "categorical"

        elif self.numeric_test(types):
            return "numeric"

        elif self.datetime_check(column):
            return "datetime"
        else:
            return "object"

    @staticmethod
    def categorical_test(values: list):
        """
        Tests whether a column is of categorical type.
        This is decided as the case if the number of unique values is
        less than 5% of the total number of values in the column.

        :param values: list of values of any type
        :return: True if column is categorical, False otherwise
        """
        n_total = len(values)
        n_unique = len(set(values))
        percentage_unique = n_unique / n_total

        if percentage_unique < 0.05:
            return True
        return False
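
    # Illustrative example (hypothetical data): a column of 1,000 values
    # with 12 distinct entries is 1.2% unique and is classed as
    # categorical, while 200 distinct entries (20% unique) would fall
    # through to the numeric and datetime tests instead.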

    @staticmethod
    def numeric_test(types: list):
        """
        Tests whether a column is of numeric type.
        This is decided as the case if all values
        of a column are either float or int.

        :param types: list of type objects
        :return: True if column is numeric, False otherwise
        """
        return all([item == float or item == int for item in set(types)])

    @staticmethod
    def string_test(types: set):
        raise NotImplementedError

    def datetime_check(self, column: Series):
        try:
            self.df[column.name] = pd.to_datetime(column)
            return True
        except Exception as e:  # noqa
            return False
@@ -0,0 +1,22 @@
from pipeline.pipeline import IngestionPipeline, PipelineStep


class SummaryStatistics(PipelineStep):
    def apply(self, pipeline: IngestionPipeline):
        """
        This step computes summary statistics for
        numeric attributes in the dataset.

        :param pipeline: parent IngestionPipeline
        :return:
        """
        numeric_attributes = [
            column
            for column in pipeline.column_type_map
            if pipeline.column_type_map[column] == "numeric"
        ]

        for attr in numeric_attributes:
            pipeline.summary_stats[attr] = (
                pipeline.df[attr].describe().to_dict()
            )  # noqa
18 changes: 18 additions & 0 deletions python/src/lazylearn/lazylearn.py
@@ -0,0 +1,18 @@
from ingestion.ingestion_pipeline import Ingestion


class LazyLearner:
    def __init__(self):
        self.dataset = None

    def create_project(self, data, target, task="infer"):
        # ingest data
        ingestion_response = Ingestion().run(data)  # noqa

        # preprocess

        # set modelling configurations

        # train

        # eval
10 changes: 9 additions & 1 deletion python/src/lazylearn/models/models.py
@@ -1,7 +1,15 @@
from pandas import DataFrame


class Dataset:
    def __init__(self):
    def __init__(self, df: DataFrame, column_type_map: dict):
        self.name = None
        self.description = None
        self.df = df
        self.column_type_map = column_type_map

    def save(self):
        raise NotImplementedError


class Model:
34 changes: 34 additions & 0 deletions python/src/lazylearn/pipeline/pipeline.py
@@ -0,0 +1,34 @@
from typing import List

from models.models import Dataset
from pandas import DataFrame


class Pipeline:
    def __init__(self):
        self._has_run: bool = False
        self._steps: List[PipelineStep] = []

    def add(self, pipeline_step):
        self._steps.append(pipeline_step)

    def run(self):
        [step.apply(self) for step in self._steps]
        self._has_run = True


class PipelineStep:
    def apply(self, pipeline: Pipeline):
        pass


class IngestionPipeline(Pipeline):
    def __init__(self):
        super().__init__()
        self.raw_data = None
        self.df: DataFrame = None
        self.column_type_map: dict = None
        self.summary_stats: dict = {}

    def response(self):
        return Dataset(df=self.df, column_type_map=self.column_type_map)
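
This pipeline layer is the extension point for new stages: a step subclasses `PipelineStep` and mutates shared pipeline state in `apply`. A hypothetical sketch (the `RowCounter` step is illustrative only, not part of this PR):

```
from pipeline.pipeline import IngestionPipeline, PipelineStep


class RowCounter(PipelineStep):  # hypothetical example step
    def apply(self, pipeline: IngestionPipeline):
        # Store the ingested row count alongside the summary statistics.
        pipeline.summary_stats["n_rows"] = len(pipeline.df)
```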
Empty file.
Empty file.
15 changes: 15 additions & 0 deletions python/src/lazylearn/preprocessing/encoding/encoders.py
@@ -0,0 +1,15 @@
class OrdinalConverter:
    def __init__(
        self,
        max_cardinality: int = None,
        min_support: int = 5,
        other_category: bool = True,
        method: str = "freq",
    ):
        self.card_max = max_cardinality
        self.min_support = min_support
        self.other_category = other_category
        self.method = method

    def convert(self, df, col):
        pass
@@ -0,0 +1,12 @@
from ingestion.ingestion_pipeline_steps.data_parser_step import DataSourceParser  # noqa
from pipeline.pipeline import IngestionPipeline
from sklearn.datasets import load_iris


def test_iris_okay():
    pipeline = IngestionPipeline()
    pipeline.raw_data = load_iris(return_X_y=True, as_frame=True)[0]
    pipeline.add(DataSourceParser())
    pipeline.run()

    assert pipeline.raw_data.equals(pipeline.df)
@@ -0,0 +1,19 @@
from ingestion.ingestion_pipeline_steps.interpreter_step import (  # noqa
    ColumnTypeInterpreter,
)
from pipeline.pipeline import IngestionPipeline
from sklearn.datasets import load_iris


def test_iris_types_numeric():
    pipeline = IngestionPipeline()
    pipeline.df = load_iris(return_X_y=True, as_frame=True)[0]
    pipeline.add(ColumnTypeInterpreter())
    pipeline.run()

    assert pipeline.column_type_map == {
        "sepal length (cm)": "numeric",
        "sepal width (cm)": "numeric",
        "petal length (cm)": "numeric",
        "petal width (cm)": "numeric",
    }