2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "lazylearn"
version = "0.0.1"
version = "0.0.2"
authors = [
{ name="Frederik P. Høngaard", email="mail@frederikhoengaard.com" },
]
@@ -1,4 +1,6 @@
+import pandas as pd
from errors.errors import DataSourceError
+from ingestion.utils.csv import csv_check
from pandas import DataFrame
from pipeline.pipeline import IngestionPipeline, PipelineStep

@@ -17,5 +19,8 @@ def apply(self, pipeline: IngestionPipeline):

        if isinstance(pipeline.raw_data, DataFrame):
            pipeline.df = pipeline.raw_data
+        # check if raw data is a path to a csv file and read it into a DataFrame
+        elif csv_check(pipeline.raw_data):
+            pipeline.df = pd.read_csv(pipeline.raw_data)
        else:
            raise DataSourceError
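In effect, the step now accepts two kinds of raw data: an in-memory DataFrame is used as-is, while a string path is read via pd.read_csv once csv_check confirms it points at a CSV; anything else raises DataSourceError. A hypothetical sketch (the path is made up, and csv_check is still a stub at this point in the PR):

pipeline = IngestionPipeline()
pipeline.raw_data = "data/houses.csv"  # hypothetical file
# after this step's apply() runs: pipeline.df == pd.read_csv("data/houses.csv")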
@@ -1,10 +1,14 @@
+import numpy as np
import pandas as pd
-from pandas import Series
+from pandas import DataFrame, Series
from pipeline.pipeline import IngestionPipeline
from tqdm import tqdm


class ColumnTypeInterpreter:
+    def __init__(self):
+        self.df: DataFrame = None
+
    def apply(self, pipeline: IngestionPipeline):
        """
        This method is responsible for inferring the
@@ -23,6 +27,10 @@ def apply(self, pipeline: IngestionPipeline):
        )  # noqa

        pipeline.column_type_map = column_types
+        if "unknown" in pipeline.column_type_map.values():
+            pipeline.needs_type_map = True
+
+        pipeline.type_collections = self.build_type_collections(column_types)

    def analyze_column(self, column: Series):
        """
@@ -33,16 +41,22 @@ def analyze_column(self, column: Series):
        values = column.tolist()
        types = [type(value) for value in values]

+        column_type = None
+
        if self.categorical_test(values):
-            return "categorical"
+            column_type = "categorical"
+        elif self.numeric_test(types) and self.id_check(types, values):
+            column_type = "id"
        elif self.numeric_test(types):
-            return "numeric"
+            column_type = "numeric"
-        elif self.datetime_check(column):
-            return "datetime"
-        else:
-            return "object"
+
+        if self.datetime_check(column) and not self.numeric_test(types):
+            column_type = "datetime"
+
+        if column_type is None:
+            column_type = "unknown"
+
+        return column_type

    @staticmethod
    def categorical_test(values: list):

@@ -72,15 +86,66 @@ def numeric_test(types: list):
        :param types: list of type objects
        :return: True if column is numeric, False otherwise
        """
-        return all([item == float or item == int for item in set(types)])
+        return all(
+            [
+                item == float or item == int
+                for item in set(types)
+                if item is not None  # noqa
+            ]
+        )

    @staticmethod
    def string_test(types: set):
        raise NotImplementedError

    def datetime_check(self, column: Series):
-        try:
-            self.df[column.name] = pd.to_datetime(column)
+        """
+        Check whether a column holds datetime values.
+
+        :param column: pandas Series to inspect
+        :return: True if the column is (or can be parsed as) datetime
+        """
+        col_name = str(column.name)
+
+        # if type of column is actually datetime
+        if self.df[col_name].dtype.type == np.datetime64:
            return True
-        except Exception as e:  # noqa
-            return False
+
+        # if date or time is in column name and can be cast as date
+        if "date" in col_name.lower() or "time" in col_name.lower():
+            try:
+                self.df[col_name] = pd.to_datetime(self.df[col_name])
+                return True
+            except Exception as e:  # noqa
+                pass
+
+        # if format of values looks like dates
+
+        return False

    def id_check(self, types, values):
        """
        Check whether an integer column is a unique identifier.

        :param types: list of type objects for the column's values
        :param values: list of the column's values
        :return: True if all values are ints and unique across the column
        """
        return all(
            [item == int for item in set(types) if item is not None]
        ) and len(  # noqa
            set(values)
        ) == len(
            self.df
        )

    @staticmethod
    def build_type_collections(column_type_map):
        collections = {}

        for data_type in ["datetime", "numeric", "categorical"]:
            collections[data_type] = [
                col
                for col in column_type_map
                if column_type_map[col] == data_type  # noqa
            ]

        return collections
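A quick illustration of the new precedence (categorical, then id, then numeric, then datetime, else unknown). This is a sketch, not part of the PR; the exact labels also depend on categorical_test, whose implementation is collapsed out of this diff:

import pandas as pd

interpreter = ColumnTypeInterpreter()
interpreter.df = pd.DataFrame(
    {
        "user_id": list(range(1000)),  # unique ints
        "price": [i * 1.5 for i in range(1000)],  # floats
        "signup_date": pd.date_range("2021-01-01", periods=1000).astype(str),
    }
)
for col in interpreter.df.columns:
    print(col, interpreter.analyze_column(interpreter.df[col]))
# expected: user_id -> "id", price -> "numeric", signup_date -> "datetime"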
2 changes: 2 additions & 0 deletions python/src/lazylearn/ingestion/utils/csv.py
@@ -0,0 +1,2 @@
def csv_check(path):
    raise NotImplementedError
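This stub is called by the ingestion step above, so ingesting a CSV path will raise until it is filled in. One possible implementation, purely as an assumption of the intended behavior:

import os


def csv_check(path) -> bool:
    # True only for a string path that points at an existing .csv file
    return (
        isinstance(path, str)
        and path.lower().endswith(".csv")
        and os.path.isfile(path)
    )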
59 changes: 54 additions & 5 deletions python/src/lazylearn/lazylearn.py
@@ -1,18 +1,67 @@
from ingestion.ingestion_pipeline import Ingestion
+from model_selection.splitters import test_train_splitter
+from preprocessing.time.date_processor import date_processor
+from preprocessing.time.duration import duration_builder
+from regression.models.randomforest.randomforest import (  # noqa
+    RandomForestRegressionRunner,
+)
+from sklearn.metrics import mean_absolute_error


class LazyLearner:
-    def __init__(self):
+    def __init__(self, random_state=None):
        self.dataset = None
        self.task = None
        self.models = None
        self.leaderboard = None
+        self.random_state = random_state
+        self.target = None

    def create_project(self, data, target, task="infer"):
        # ingest data
-        ingestion_response = Ingestion().run(data)  # noqa
+        self.target = target
+        self.dataset = Ingestion().run(data)

        # preprocess
+        if task == "infer":
+            # if target is numeric then regression, else classification
+            if self.dataset.column_type_map[target] == "numeric":
+                self.task = "regression"
+            else:
+                self.task = "classification"
+
+        # process dates
+        self.dataset = date_processor(self.dataset)
+        self.dataset = duration_builder(self.dataset)

        # split partitions
+        self.dataset = test_train_splitter(
+            self.dataset, random_state=self.random_state
+        )  # noqa
+
+        # set modelling configurations
+
+    # train
+    def run_autopilot(self):
+        """
+        TODO: Everything here must be abstracted away into strategies
+        TODO: such that several models are run and their scores are added to
+        TODO: the leaderboard
+
+        :return:
+        """
+        simple_random_forest = RandomForestRegressionRunner(
+            target=self.target,
+            dataset=self.dataset,
+            random_state=self.random_state,  # noqa
+        )
+        simple_random_forest.fit()
+
+        # eval
+        # get holdout scores
+        simple_random_forest.predict(self.dataset.partitions["test"])
+        simple_random_forest.pipeline.holdout_score = mean_absolute_error(
+            self.dataset.partitions["test"][self.target],
+            simple_random_forest.pipeline.tmp_pred,
+        )
+        return simple_random_forest
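Hypothetical end-to-end usage of the new API (illustrative only; the file name "houses.csv" and target "price" are made up, and a CSV path only works once csv_check is implemented):

learner = LazyLearner(random_state=42)
learner.create_project(data="houses.csv", target="price", task="infer")
model = learner.run_autopilot()
print(model.pipeline.holdout_score)  # mean absolute error on the holdout partition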
17 changes: 17 additions & 0 deletions python/src/lazylearn/model_selection/splitters.py
@@ -0,0 +1,17 @@
from models.models import Dataset
from sklearn.model_selection import train_test_split


def test_train_splitter(dataset: Dataset, random_state=None) -> Dataset:
    train_partition, test_partition = train_test_split(
        dataset.df, test_size=0.2, random_state=random_state
    )

    dataset.partitions["test"] = test_partition
    dataset.partitions["train"] = train_partition

    return dataset


def cv_splitter(dataset: Dataset) -> Dataset:
    return dataset
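A short usage sketch (assuming a Dataset produced by the ingestion pipeline): the split is a fixed 80/20, and cv_splitter is still a no-op placeholder.

dataset = test_train_splitter(dataset, random_state=42)
train_df = dataset.partitions["train"]  # 80% of rows
test_df = dataset.partitions["test"]    # 20% of rows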
11 changes: 10 additions & 1 deletion python/src/lazylearn/models/models.py
@@ -2,11 +2,20 @@


class Dataset:
-    def __init__(self, df: DataFrame, column_type_map: dict):
+    def __init__(
+        self,
+        df: DataFrame,
+        column_type_map: dict,
+        summary_stats: dict,
+        type_collections: dict,
+    ):
        self.name = None
        self.description = None
        self.df = df
        self.column_type_map = column_type_map
+        self.summary_stats = summary_stats
+        self.type_collections = type_collections
+        self.partitions: dict = {}

    def save(self):
        raise NotImplementedError
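For reference, a Dataset is normally built by IngestionPipeline.response(); constructing one by hand (hypothetical values, given some DataFrame df) looks like:

ds = Dataset(
    df=df,
    column_type_map={"price": "numeric"},
    summary_stats={},
    type_collections={"numeric": ["price"]},
)
ds.partitions  # starts empty; filled in by test_train_splitter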
46 changes: 44 additions & 2 deletions python/src/lazylearn/pipeline/pipeline.py
@@ -1,7 +1,7 @@
from typing import List

from models.models import Dataset
-from pandas import DataFrame
+from pandas import DataFrame, Series


class Pipeline:
@@ -21,6 +21,12 @@ class PipelineStep:
    def apply(self, pipeline: Pipeline):
        pass

+    def fit(self, pipeline: Pipeline):
+        pass
+
+    def predict(self, pipeline: Pipeline):
+        pass


class IngestionPipeline(Pipeline):
    def __init__(self):

@@ -29,6 +35,42 @@ def __init__(self):
        self.df: DataFrame = None
        self.column_type_map: dict = None
        self.summary_stats: dict = {}
+        self.needs_type_map: bool = False
+        self.type_collections: dict = None

    def response(self):
-        return Dataset(df=self.df, column_type_map=self.column_type_map)
+        return Dataset(
+            df=self.df,
+            column_type_map=self.column_type_map,
+            summary_stats=self.summary_stats,
+            type_collections=self.type_collections,
+        )


class ModelPipeline(Pipeline):
    def __init__(self):
        super().__init__()
        self._is_fitted = False
        self.feature_list: list = []
        self.tmp_test = None
        self.tmp_pred = None
        self.target = None

    def fit(self):
        [step.fit(self) for step in self._steps]
        self._is_fitted = True

    def predict(self):
        assert self._is_fitted
        [step.predict(self) for step in self._steps]
        return self.tmp_pred


class RegressionPipeline(ModelPipeline):
    def __init__(self):
        super().__init__()
        self.train_features_df: DataFrame = None
        self.train_targets: Series = None
        self.holdout_features_df: DataFrame = None
        self.holdout_targets: Series = None
        self.holdout_score: float = None
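To see how ModelPipeline dispatches over its steps, here is a minimal illustrative step (hypothetical, not part of the PR; it assumes _steps holds the step instances and that RegressionPipeline's train/holdout frames are populated by earlier steps):

class MeanImputerStep(PipelineStep):
    def fit(self, pipeline: RegressionPipeline):
        # learn column means on the training partition, then fill gaps
        self.means = pipeline.train_features_df.mean(numeric_only=True)
        pipeline.train_features_df = pipeline.train_features_df.fillna(self.means)

    def predict(self, pipeline: RegressionPipeline):
        # reuse the means learned during fit on the holdout features
        pipeline.holdout_features_df = pipeline.holdout_features_df.fillna(self.means)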