8 changes: 4 additions & 4 deletions .github/workflows/verify.yaml
@@ -4,7 +4,7 @@ on:
  pull_request:

env:
  PYTHONPATH: ./python/src/main/
  PYTHONPATH: ./python/src/lazylearn/

jobs:
  testing:
@@ -54,12 +54,12 @@ jobs:

      - name: black
        run: |
          python -m black --check python/src/main/
          python -m black --check python/src/lazylearn/

      - name: isort
        run: |
          python -m isort python/src/main/ --multi-line 3 --profile black --check
          python -m isort python/src/lazylearn/ --multi-line 3 --profile black --check

      - name: flake8
        run: |
          python -m flake8 python/src/main/
          python -m flake8 python/src/lazylearn/
3 changes: 3 additions & 0 deletions .gitignore
@@ -13,6 +13,9 @@ __pycache__/
# JetBrains
.idea

# local
notebooks/

# Distribution / packaging
.Python
build/
3 changes: 3 additions & 0 deletions Pipfile
@@ -6,6 +6,9 @@ verify_ssl = true
[packages]
loguru = "==0.6.*"
pandas = "==1.5.*"
scikit-learn = "*"
tqdm = "*"
jupyter = "*"

[dev-packages]
black = "==23.*"
1,027 changes: 1,024 additions & 3 deletions Pipfile.lock

Large diffs are not rendered by default.

27 changes: 22 additions & 5 deletions README.md
@@ -1,12 +1,29 @@
# lazy-learn

---
<img width="500" src="doc/logo/transparent_small.png">

## About

lazy-learn is a high-level Python interface for automated machine learning (AutoML). While there are many AutoML libraries available each typically solves a niche area of the overall ML pipeline without providing a covering and approachable end-to-end system.
**lazy-learn** is a high-level Python interface for automated machine learning (AutoML). While there are many AutoML libraries available, each typically solves a niche area of the overall ML pipeline without providing a comprehensive and approachable end-to-end system.

The aim of lazy-learn is exactly that. Given a dataset, lazy-learn will analyse the types and distributions of its attributes, preprocess, feature-engineer, and ultimately train models to be used for further evaluation or inference.

## Usage

Using lazy-learn revolves around the `LazyLearner` class. You can think of it as a project: it is the wrapper for any experiment within lazy-learn.

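A minimal session might look like this sketch (hypothetical usage: the import path follows this PR's module layout, the CSV path and target column are placeholders, and only the ingestion stage of `create_project` is implemented so far):

```
import pandas as pd

from lazylearn import LazyLearner

# Load any tabular dataset into a pandas DataFrame.
df = pd.read_csv("my_dataset.csv")  # placeholder path

# A LazyLearner wraps a single experiment ("project").
learner = LazyLearner()
learner.create_project(data=df, target="price", task="infer")
```
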
## Installation

### Dependencies

lazy-learn requires:

- pandas
- scikit-learn

### User Installation
```
pip install lazy-learn
```

## Help and Support
### Documentation

### Citation
Binary file added doc/logo/grayscale_transparent.png
Binary file added doc/logo/original.png
Binary file added doc/logo/transparent.png
Binary file added doc/logo/transparent_small.png
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -8,7 +8,7 @@ version = "0.0.1"
authors = [
    { name="Frederik P. Høngaard", email="mail@frederikhoengaard.com" },
]
description = "A small example package"
description = "lazy-learn is a high-level Python interface for automated machine learning (AutoML) for the lazy data scientist. While there are many AutoML libraries available, each typically solves a niche area of the overall ML pipeline without providing a comprehensive and approachable end-to-end system. lazy-learn aims to provide the fastest and most approachable route to building baseline models."
readme = "README.md"
requires-python = ">=3.7"
classifiers = [
@@ -18,4 +18,4 @@ classifiers = [
]

[project.urls]
"Homepage" = "https://github.com/pypa/sampleproject"
"Homepage" = "https://github.com/frederikhoengaard/lazy-learn"
Empty file.
2 changes: 2 additions & 0 deletions python/src/lazylearn/errors/errors.py
@@ -0,0 +1,2 @@
class DataSourceError(Exception):
    """Raised when an incompatible object is passed as a data source."""
32 changes: 32 additions & 0 deletions python/src/lazylearn/ingestion/ingestion_pipeline.py
@@ -0,0 +1,32 @@
from ingestion.ingestion_pipeline_steps.data_parser_step import DataSourceParser  # noqa
from ingestion.ingestion_pipeline_steps.interpreter_step import (  # noqa
    ColumnTypeInterpreter,
)
from ingestion.ingestion_pipeline_steps.summary_stats_step import (  # noqa
    SummaryStatistics,
)
from pipeline.pipeline import IngestionPipeline


class Ingestion:
    def __init__(self):
        pass

    def run(self, data):
        """
        Run the ingestion pipeline on a raw data source: parse it
        into a DataFrame, infer column types and compute summary
        statistics.

        :param data: raw data source, e.g. a pandas DataFrame
        :return: Dataset wrapping the parsed frame and its column types
        """
        pipeline = IngestionPipeline()
        pipeline.raw_data = data

        pipeline.add(DataSourceParser())

        pipeline.add(ColumnTypeInterpreter())

        pipeline.add(SummaryStatistics())

        pipeline.run()

        return pipeline.response()
Empty file.
@@ -0,0 +1,21 @@
from errors.errors import DataSourceError
from pandas import DataFrame
from pipeline.pipeline import IngestionPipeline, PipelineStep


class DataSourceParser(PipelineStep):
    def apply(self, pipeline: IngestionPipeline):
        """
        This method is responsible for parsing the raw data
        source from its parent pipeline into a DataFrame
        object.

        :param pipeline: parent IngestionPipeline
        :return:
        """
        assert pipeline.raw_data is not None

        if isinstance(pipeline.raw_data, DataFrame):
            pipeline.df = pipeline.raw_data
        else:
            raise DataSourceError
@@ -0,0 +1,86 @@
import pandas as pd
from pandas import Series
from pipeline.pipeline import IngestionPipeline
from tqdm import tqdm


class ColumnTypeInterpreter:
    def apply(self, pipeline: IngestionPipeline):
        """
        This method is responsible for inferring the
        types of the columns of the project dataset.

        :param pipeline: parent IngestionPipeline
        :return:
        """
        self.df = pipeline.df
        columns = pipeline.df.columns
        column_types = {}

        for column_name in tqdm(columns):
            column_types[column_name] = self.analyze_column(
                pipeline.df[column_name]
            )  # noqa

        pipeline.column_type_map = column_types

    def analyze_column(self, column: Series):
        """
        Classify a single column by testing it as categorical,
        numeric and datetime, in that order.

        :param column: pandas Series to classify
        :return: one of "categorical", "numeric", "datetime" or "object"
        """
        values = column.tolist()
        types = [type(value) for value in values]

        if self.categorical_test(values):
            return "categorical"

        elif self.numeric_test(types):
            return "numeric"

        elif self.datetime_check(column):
            return "datetime"
        else:
            return "object"

    @staticmethod
    def categorical_test(values: list):
        """
        Tests whether a column is of categorical type.
        This is decided as the case if the number of unique values is
        less than 5% of the total number of values in the column.

        :param values: list of values of any type
        :return: True if column is categorical, False otherwise
        """
        n_total = len(values)
        n_unique = len(set(values))
        percentage_unique = n_unique / n_total

        if percentage_unique < 0.05:
            return True
        return False
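
    # Illustrative example (hypothetical data): a column of 1,000 values
    # with 12 distinct entries is 1.2% unique and is classed as
    # categorical, while 200 distinct entries (20% unique) would fall
    # through to the numeric and datetime tests instead.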

    @staticmethod
    def numeric_test(types: list):
        """
        Tests whether a column is of numeric type.
        This is decided as the case if all values
        of a column are either float or int.

        :param types: list of type objects
        :return: True if column is numeric, False otherwise
        """
        return all([item == float or item == int for item in set(types)])

    @staticmethod
    def string_test(types: set):
        raise NotImplementedError

    def datetime_check(self, column: Series):
        try:
            self.df[column.name] = pd.to_datetime(column)
            return True
        except Exception as e:  # noqa
            return False
@@ -0,0 +1,22 @@
from pipeline.pipeline import IngestionPipeline, PipelineStep


class SummaryStatistics(PipelineStep):
    def apply(self, pipeline: IngestionPipeline):
        """
        This step computes summary statistics for
        numeric attributes in the dataset.

        :param pipeline: parent IngestionPipeline
        :return:
        """
        numeric_attributes = [
            column
            for column in pipeline.column_type_map
            if pipeline.column_type_map[column] == "numeric"
        ]

        for attr in numeric_attributes:
            pipeline.summary_stats[attr] = (
                pipeline.df[attr].describe().to_dict()
            )  # noqa
18 changes: 18 additions & 0 deletions python/src/lazylearn/lazylearn.py
@@ -0,0 +1,18 @@
from ingestion.ingestion_pipeline import Ingestion


class LazyLearner:
    def __init__(self):
        self.dataset = None

    def create_project(self, data, target, task="infer"):
        # ingest data
        ingestion_response = Ingestion().run(data)  # noqa

        # preprocess

        # set modelling configurations

        # train

        # eval
10 changes: 9 additions & 1 deletion python/src/lazylearn/models/models.py
@@ -1,7 +1,15 @@
from pandas import DataFrame


class Dataset:
    def __init__(self):
    def __init__(self, df: DataFrame, column_type_map: dict):
        self.name = None
        self.description = None
        self.df = df
        self.column_type_map = column_type_map

    def save(self):
        raise NotImplementedError


class Model:
34 changes: 34 additions & 0 deletions python/src/lazylearn/pipeline/pipeline.py
@@ -0,0 +1,34 @@
from typing import List

from models.models import Dataset
from pandas import DataFrame


class Pipeline:
    def __init__(self):
        self._has_run: bool = False
        self._steps: List[PipelineStep] = []

    def add(self, pipeline_step):
        self._steps.append(pipeline_step)

    def run(self):
        [step.apply(self) for step in self._steps]
        self._has_run = True


class PipelineStep:
    def apply(self, pipeline: Pipeline):
        pass


class IngestionPipeline(Pipeline):
    def __init__(self):
        super().__init__()
        self.raw_data = None
        self.df: DataFrame = None
        self.column_type_map: dict = None
        self.summary_stats: dict = {}

    def response(self):
        return Dataset(df=self.df, column_type_map=self.column_type_map)
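
This pipeline layer is the extension point for new stages: a step subclasses `PipelineStep` and mutates shared pipeline state in `apply`. A hypothetical sketch (the `RowCounter` step is illustrative only, not part of this PR):

```
from pipeline.pipeline import IngestionPipeline, PipelineStep


class RowCounter(PipelineStep):  # hypothetical example step
    def apply(self, pipeline: IngestionPipeline):
        # Store the ingested row count alongside the summary statistics.
        pipeline.summary_stats["n_rows"] = len(pipeline.df)
```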
Empty file.
Empty file.
15 changes: 15 additions & 0 deletions python/src/lazylearn/preprocessing/encoding/encoders.py
@@ -0,0 +1,15 @@
class OrdinalConverter:
    def __init__(
        self,
        max_cardinality: int = None,
        min_support: int = 5,
        other_category: bool = True,
        method: str = "freq",
    ):
        self.card_max = max_cardinality
        self.min_support = min_support
        self.other_category = other_category
        self.method = method

    def convert(self, df, col):
        pass
@@ -0,0 +1,12 @@
from ingestion.ingestion_pipeline_steps.data_parser_step import DataSourceParser  # noqa
from pipeline.pipeline import IngestionPipeline
from sklearn.datasets import load_iris


def test_iris_okay():
    pipeline = IngestionPipeline()
    pipeline.raw_data = load_iris(return_X_y=True, as_frame=True)[0]
    pipeline.add(DataSourceParser())
    pipeline.run()

    assert pipeline.raw_data.equals(pipeline.df)
@@ -0,0 +1,19 @@
from ingestion.ingestion_pipeline_steps.interpreter_step import (  # noqa
    ColumnTypeInterpreter,
)
from pipeline.pipeline import IngestionPipeline
from sklearn.datasets import load_iris


def test_iris_types_numeric():
    pipeline = IngestionPipeline()
    pipeline.df = load_iris(return_X_y=True, as_frame=True)[0]
    pipeline.add(ColumnTypeInterpreter())
    pipeline.run()

    assert pipeline.column_type_map == {
        "sepal length (cm)": "numeric",
        "sepal width (cm)": "numeric",
        "petal length (cm)": "numeric",
        "petal width (cm)": "numeric",
    }