diff --git a/pyproject.toml b/pyproject.toml
index 425f362..a45714a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "lazylearn"
-version = "0.0.1"
+version = "0.0.2"
 authors = [
     { name="Frederik P. Høngaard", email="mail@frederikhoengaard.com" },
 ]
diff --git a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py
index 42f323a..19608ad 100644
--- a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py
+++ b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py
@@ -1,4 +1,6 @@
+import pandas as pd
 from errors.errors import DataSourceError
+from ingestion.utils.csv import csv_check
 from pandas import DataFrame
 from pipeline.pipeline import IngestionPipeline, PipelineStep
 
@@ -17,5 +19,8 @@ def apply(self, pipeline: IngestionPipeline):
 
         if isinstance(pipeline.raw_data, DataFrame):
             pipeline.df = pipeline.raw_data
+        # check if raw data is a path to a csv file and read it into a DataFrame
+        elif csv_check(pipeline.raw_data):
+            pipeline.df = pd.read_csv(pipeline.raw_data)
         else:
             raise DataSourceError
diff --git a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py
index 10f9434..6bdcd31 100644
--- a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py
+++ b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py
@@ -1,10 +1,14 @@
+import numpy as np
 import pandas as pd
-from pandas import Series
+from pandas import DataFrame, Series
 from pipeline.pipeline import IngestionPipeline
 from tqdm import tqdm
 
 
 class ColumnTypeInterpreter:
+    def __init__(self):
+        self.df: DataFrame = None
+
     def apply(self, pipeline: IngestionPipeline):
         """
         This method is responsible for inferring the
@@ -23,6 +27,10 @@ def apply(self, pipeline: IngestionPipeline):
         )  # noqa
         pipeline.column_type_map = column_types
 
+        if "unknown" in pipeline.column_type_map.values():
+            pipeline.needs_type_map = True
+
+        pipeline.type_collections = self.build_type_collections(column_types)
 
     def analyze_column(self, column: Series):
         """
@@ -33,16 +41,22 @@ def analyze_column(self, column: Series):
         values = column.tolist()
         types = [type(value) for value in values]
 
-        if self.categorical_test(values):
-            return "categorical"
+        column_type = None
 
+        if self.categorical_test(values):
+            column_type = "categorical"
+        elif self.numeric_test(types) and self.id_check(types, values):
+            column_type = "id"
         elif self.numeric_test(types):
-            return "numeric"
+            column_type = "numeric"
 
-        elif self.datetime_check(column):
-            return "datetime"
-        else:
-            return "object"
+        if self.datetime_check(column) and not self.numeric_test(types):
+            column_type = "datetime"
+
+        if column_type is None:
+            column_type = "unknown"
+
+        return column_type
 
     @staticmethod
     def categorical_test(values: list):
@@ -72,15 +86,66 @@ def numeric_test(types: list):
         :param types: list of type objects
         :return: True if column is numeric, False otherwise
         """
-        return all([item == float or item == int for item in set(types)])
+        return all(
+            [
+                item == float or item == int
+                for item in set(types)
+                if item is not None  # noqa
+            ]
+        )
 
     @staticmethod
     def string_test(types: set):
         raise NotImplementedError
 
     def datetime_check(self, column: Series):
-        try:
-            self.df[column.name] = pd.to_datetime(column)
+        """
+        Heuristically test whether a column holds datetime values.
+
+        :param column: pandas Series to inspect
+        :return: True if the column is datetime-like, False otherwise
+        """
+        col_name = str(column.name)
+
+        # if type of column is actually datetime
+        if self.df[col_name].dtype.type == np.datetime64:
             return True
-        except Exception as e:  # noqa
-            return False
+
+        # if date or time is in column name and can be cast as date
+        if "date" in col_name.lower() or "time" in col_name.lower():
+            try:
+                self.df[col_name] = pd.to_datetime(self.df[col_name])
+                return True
+            except Exception:
+                pass
+
+        # TODO: check whether the format of the values looks like dates
+
+        return False
+
+    def id_check(self, types, values):
+        """
+        Flags integer columns whose values are all distinct,
+        i.e. columns that look like row identifiers.
+
+        :param types: list of type objects
+        :param values: list of column values
+        :return: True if the column looks like an id, False otherwise
+        """
+        all_ints = all(item == int for item in set(types) if item is not None)
+        return all_ints and len(set(values)) == len(self.df)
+
+    @staticmethod
+    def build_type_collections(column_type_map):
+        collections = {}
+
+        for data_type in ["datetime", "numeric", "categorical"]:
+            collections[data_type] = [
+                col
+                for col in column_type_map
+                if column_type_map[col] == data_type  # noqa
+            ]
+
+        return collections
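
For reference, build_type_collections is a pure function of the inferred type map, so its behaviour can be shown directly (column names invented). Note that "id" and "unknown" columns fall into none of the three collections and are thereby kept out of downstream preprocessing:

    >>> ColumnTypeInterpreter.build_type_collections(
    ...     {"price": "numeric", "city": "categorical",
    ...      "signup_date": "datetime", "uid": "id"}
    ... )
    {'datetime': ['signup_date'], 'numeric': ['price'], 'categorical': ['city']}
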
diff --git a/python/src/lazylearn/ingestion/utils/__init__.py b/python/src/lazylearn/ingestion/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/ingestion/utils/csv.py b/python/src/lazylearn/ingestion/utils/csv.py
new file mode 100644
index 0000000..22d44fb
--- /dev/null
+++ b/python/src/lazylearn/ingestion/utils/csv.py
@@ -0,0 +1,2 @@
+def csv_check(path):
+    raise NotImplementedError
diff --git a/python/src/lazylearn/lazylearn.py b/python/src/lazylearn/lazylearn.py
index a6d620a..5b91328 100644
--- a/python/src/lazylearn/lazylearn.py
+++ b/python/src/lazylearn/lazylearn.py
@@ -1,18 +1,67 @@
 from ingestion.ingestion_pipeline import Ingestion
+from model_selection.splitters import test_train_splitter
+from preprocessing.time.date_processor import date_processor
+from preprocessing.time.duration import duration_builder
+from regression.models.randomforest.randomforest import (  # noqa
+    RandomForestRegressionRunner,
+)
+from sklearn.metrics import mean_absolute_error
 
 
 class LazyLearner:
-    def __init__(self):
+    def __init__(self, random_state=None):
         self.dataset = None
+        self.task = None
+        self.models = None
+        self.leaderboard = None
+        self.random_state = random_state
+        self.target = None
 
     def create_project(self, data, target, task="infer"):
         # ingest data
-        ingestion_response = Ingestion().run(data)  # noqa
+        self.target = target
+        self.dataset = Ingestion().run(data)
 
-        # preprocess
+        if task == "infer":
+            # if target is numeric then regression, else classification
+            if self.dataset.column_type_map[target] == "numeric":
+                self.task = "regression"
+            else:
+                self.task = "classification"
+
+        # process dates
+        self.dataset = date_processor(self.dataset)
+        self.dataset = duration_builder(self.dataset)
+
+        # split partitions
+        self.dataset = test_train_splitter(
+            self.dataset, random_state=self.random_state
+        )  # noqa
 
         # set modelling configurations
 
-        # train
+    def run_autopilot(self):
+        """
+        TODO: Everything here must be abstracted away into strategies
+        TODO: such that several models are run and their scores are
+        TODO: added to the leaderboard
+
+        :return: the fitted runner, with its holdout score attached
+        """
+        simple_random_forest = RandomForestRegressionRunner(
+            target=self.target,
+            dataset=self.dataset,
+            random_state=self.random_state,  # noqa
+        )
+        simple_random_forest.fit()
 
-        # eval
+        # get holdout scores
+        simple_random_forest.predict(self.dataset.partitions["test"])
+        simple_random_forest.pipeline.holdout_score = mean_absolute_error(
+            self.dataset.partitions["test"][self.target],
+            simple_random_forest.pipeline.tmp_pred,
+        )
+
+        return simple_random_forest
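
A minimal end-to-end sketch of the API above, assuming the package root is on sys.path (as the in-repo imports like `from ingestion...` do); the frame and column names are invented for illustration:

    import pandas as pd
    from lazylearn import LazyLearner

    df = pd.DataFrame(
        {
            "rooms": [2, 3, 4, 2, 3] * 20,
            "city": ["a", "b", "a", "c", "b"] * 20,
            "price": [100.0 + i for i in range(100)],
        }
    )

    learner = LazyLearner(random_state=42)
    learner.create_project(df, target="price")  # task="infer" -> regression
    model = learner.run_autopilot()
    print(model.pipeline.holdout_score)  # MAE on the 20% holdout partition
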
diff --git a/python/src/lazylearn/model_selection/__init__.py b/python/src/lazylearn/model_selection/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/model_selection/splitters.py b/python/src/lazylearn/model_selection/splitters.py
new file mode 100644
index 0000000..7773deb
--- /dev/null
+++ b/python/src/lazylearn/model_selection/splitters.py
@@ -0,0 +1,17 @@
+from models.models import Dataset
+from sklearn.model_selection import train_test_split
+
+
+def test_train_splitter(dataset: Dataset, random_state=None) -> Dataset:
+    train_partition, test_partition = train_test_split(
+        dataset.df, test_size=0.2, random_state=random_state
+    )
+
+    dataset.partitions["test"] = test_partition
+    dataset.partitions["train"] = train_partition
+
+    return dataset
+
+
+def cv_splitter(dataset: Dataset) -> Dataset:
+    return dataset
diff --git a/python/src/lazylearn/models/models.py b/python/src/lazylearn/models/models.py
index d826534..dfdf2e3 100644
--- a/python/src/lazylearn/models/models.py
+++ b/python/src/lazylearn/models/models.py
@@ -2,11 +2,20 @@
 
 
 class Dataset:
-    def __init__(self, df: DataFrame, column_type_map: dict):
+    def __init__(
+        self,
+        df: DataFrame,
+        column_type_map: dict,
+        summary_stats: dict,
+        type_collections: dict,
+    ):
         self.name = None
         self.description = None
         self.df = df
         self.column_type_map = column_type_map
+        self.summary_stats = summary_stats
+        self.type_collections = type_collections
+        self.partitions: dict = {}
 
     def save(self):
         raise NotImplementedError
diff --git a/python/src/lazylearn/pipeline/pipeline.py b/python/src/lazylearn/pipeline/pipeline.py
index e7256f2..c26ec72 100644
--- a/python/src/lazylearn/pipeline/pipeline.py
+++ b/python/src/lazylearn/pipeline/pipeline.py
@@ -1,7 +1,7 @@
 from typing import List
 
 from models.models import Dataset
-from pandas import DataFrame
+from pandas import DataFrame, Series
 
 
 class Pipeline:
@@ -21,6 +21,12 @@ class PipelineStep:
     def apply(self, pipeline: Pipeline):
         pass
 
+    def fit(self, pipeline: Pipeline):
+        pass
+
+    def predict(self, pipeline: Pipeline):
+        pass
+
 
 class IngestionPipeline(Pipeline):
     def __init__(self):
@@ -29,6 +35,42 @@ def __init__(self):
         self.df: DataFrame = None
         self.column_type_map: dict = None
         self.summary_stats: dict = {}
+        self.needs_type_map: bool = False
+        self.type_collections: dict = None
 
     def response(self):
-        return Dataset(df=self.df, column_type_map=self.column_type_map)
+        return Dataset(
+            df=self.df,
+            column_type_map=self.column_type_map,
+            summary_stats=self.summary_stats,
+            type_collections=self.type_collections,
+        )
+
+
+class ModelPipeline(Pipeline):
+    def __init__(self):
+        super().__init__()
+        self._is_fitted = False
+        self.feature_list: list = []
+        self.tmp_test = None
+        self.tmp_pred = None
+        self.target = None
+
+    def fit(self):
+        for step in self._steps:
+            step.fit(self)
+        self._is_fitted = True
+
+    def predict(self):
+        assert self._is_fitted
+        for step in self._steps:
+            step.predict(self)
+        return self.tmp_pred
+
+
+class RegressionPipeline(ModelPipeline):
+    def __init__(self):
+        super().__init__()
+        self.train_features_df: DataFrame = None
+        self.train_targets: Series = None
+        self.holdout_features_df: DataFrame = None
+        self.holdout_targets: Series = None
+        self.holdout_score: float = None
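
The PipelineStep contract above is all a model step needs to implement: fit and predict each receive the owning pipeline and mutate its state. A sketch of a hypothetical custom step (name and logic invented, not part of the diff):

    from pipeline.pipeline import PipelineStep, RegressionPipeline


    class DropConstantFeatures(PipelineStep):
        def fit(self, pipeline: RegressionPipeline):
            # keep only features with more than one observed training value
            pipeline.feature_list = [
                col
                for col in pipeline.feature_list
                if pipeline.train_features_df[col].nunique() > 1
            ]

        def predict(self, pipeline: RegressionPipeline):
            pass  # nothing to do at inference time
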
diff --git a/python/src/lazylearn/preprocessing/encoding/encoders.py b/python/src/lazylearn/preprocessing/encoding/encoders.py
index a206bc0..4d6b4e7 100644
--- a/python/src/lazylearn/preprocessing/encoding/encoders.py
+++ b/python/src/lazylearn/preprocessing/encoding/encoders.py
@@ -1,15 +1,78 @@
+from pandas import DataFrame
+from pipeline.pipeline import ModelPipeline
+
+
 class OrdinalConverter:
     def __init__(
         self,
+        cat_vars: list,
         max_cardinality: int = None,
         min_support: int = 5,
        other_category: bool = True,
         method: str = "freq",
     ):
+        self.cat_vars = cat_vars
         self.card_max = max_cardinality
         self.min_support = min_support
         self.other_category = other_category
         self.method = method
+        self.cat_freqs = {}
+        self.cat_maps = {}
+
+    def fit(self, pipeline: ModelPipeline):
+        for var in self.cat_vars:
+            pipeline.train_features_df = self.convert(
+                pipeline.train_features_df, var
+            )  # noqa
+            pipeline.feature_list.append(var)
+
+    def convert(self, df: DataFrame, col_name: str) -> DataFrame:
+        """
+        Encodes a categorical column by value frequency.
+        Currently only the "freq" method is supported: each
+        value is mapped to its number of occurrences in the
+        training data, so more frequent values receive a larger
+        encoding, and values seen fewer than min_support times
+        are mapped to -1.
+
+        Note that this should only be done on the training
+        data!
+
+        :param df: pandas DataFrame of features
+        :param col_name: column to consider
+        :return: transformed DataFrame
+        """
+        if self.method == "freq":
+            self.cat_freqs[col_name] = {}
+            for item in df[col_name].tolist():
+                if item in self.cat_freqs[col_name]:
+                    self.cat_freqs[col_name][item] += 1
+                else:
+                    self.cat_freqs[col_name][item] = 1
+
+            freq_pairs = sorted(
+                [(key, val) for key, val in self.cat_freqs[col_name].items()],
+                key=lambda x: x[1],
+            )
+
+            self.cat_maps[col_name] = {key: val for key, val in freq_pairs}
+
+            df[col_name] = df[col_name].apply(
+                lambda x: self.cat_maps[col_name][x]
+                if self.cat_maps[col_name][x] >= self.min_support
+                else -1
+            )
+            return df
+        else:
+            raise ValueError("Unsupported encoding method, try [freq]")
+
+    def predict(self, pipeline: ModelPipeline):
+        df = pipeline.tmp_test
+
+        for var in self.cat_vars:
+            # values never seen in training are encoded as -2
+            df[var] = df[var].apply(
+                lambda x: self.cat_maps[var][x]
+                if x in self.cat_maps[var]
+                else -2  # noqa
+            )
 
-    def convert(self, df, col):
-        pass
+        pipeline.tmp_test = df
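
How the "freq" method behaves on a toy column (values and min_support chosen for illustration): "a" and "b" are replaced by their training frequencies, "c" falls below min_support and maps to -1, and at predict time values never seen in training map to -2:

    >>> import pandas as pd
    >>> from preprocessing.encoding.encoders import OrdinalConverter
    >>> enc = OrdinalConverter(cat_vars=["city"], min_support=2)
    >>> frame = pd.DataFrame({"city": ["a", "a", "a", "b", "b", "c"]})
    >>> enc.convert(frame, "city")["city"].tolist()
    [3, 3, 3, 2, 2, -1]
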
diff --git a/python/src/lazylearn/preprocessing/imputation/__init__.py b/python/src/lazylearn/preprocessing/imputation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/preprocessing/time/__init__.py b/python/src/lazylearn/preprocessing/time/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/preprocessing/time/date_processor.py b/python/src/lazylearn/preprocessing/time/date_processor.py
new file mode 100644
index 0000000..aed4f3d
--- /dev/null
+++ b/python/src/lazylearn/preprocessing/time/date_processor.py
@@ -0,0 +1,36 @@
+from models.models import Dataset
+
+
+def date_processor(dataset: Dataset) -> Dataset:
+    """
+    Expands each datetime variable into categorical
+    year, month, week and day features.
+
+    :param dataset: Dataset object with date features
+    :return: Dataset object with categorical date
+             features
+    """
+    new_categorical_cols = []
+
+    for date_column in dataset.type_collections["datetime"]:
+        dataset.df[f"{date_column}_year"] = (
+            dataset.df[date_column].dt.isocalendar().year
+        )
+        dataset.df[f"{date_column}_month"] = dataset.df[date_column].dt.month
+        dataset.df[f"{date_column}_week"] = (
+            dataset.df[date_column].dt.isocalendar().week
+        )
+        dataset.df[f"{date_column}_day"] = (
+            dataset.df[date_column].dt.isocalendar().day
+        )  # noqa
+
+        new_categorical_cols.append(f"{date_column}_year")
+        new_categorical_cols.append(f"{date_column}_month")
+        new_categorical_cols.append(f"{date_column}_week")
+        new_categorical_cols.append(f"{date_column}_day")
+
+    for cat in new_categorical_cols:
+        dataset.column_type_map[cat] = "categorical"
+        dataset.type_collections["categorical"].append(cat)
+
+    return dataset
diff --git a/python/src/lazylearn/preprocessing/time/duration.py b/python/src/lazylearn/preprocessing/time/duration.py
new file mode 100644
index 0000000..b21d05e
--- /dev/null
+++ b/python/src/lazylearn/preprocessing/time/duration.py
@@ -0,0 +1,24 @@
+from models.models import Dataset
+
+
+def duration_builder(dataset: Dataset) -> Dataset:
+    """
+    Adds a numeric duration feature (in whole days) for
+    every pair of datetime columns in the dataset.
+
+    :param dataset: Dataset object with date features
+    :return: Dataset object with added duration features
+    """
+    date_cols = dataset.type_collections.get("datetime", [])
+
+    if len(date_cols) > 1:
+        for i in range(len(date_cols)):
+            for j in range(i + 1, len(date_cols)):
+                col_name = f"duration({date_cols[i]}-{date_cols[j]})"
+                dataset.df[col_name] = (
+                    dataset.df[date_cols[i]] - dataset.df[date_cols[j]]
+                ).dt.days
+                dataset.column_type_map[col_name] = "numeric"
+                dataset.type_collections["numeric"].append(col_name)
+
+    return dataset
diff --git a/python/src/lazylearn/regression/__init__.py b/python/src/lazylearn/regression/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/regression/models/__init__.py b/python/src/lazylearn/regression/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/regression/models/randomforest/__init__.py b/python/src/lazylearn/regression/models/randomforest/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/regression/models/randomforest/random_forest_steps/__init__.py b/python/src/lazylearn/regression/models/randomforest/random_forest_steps/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py b/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py
new file mode 100644
index 0000000..2217332
--- /dev/null
+++ b/python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py
@@ -0,0 +1,23 @@
+from pipeline.pipeline import PipelineStep, RegressionPipeline
+from sklearn.ensemble import RandomForestRegressor
+
+
+class RandomForestRegressorStep(PipelineStep):
+    def __init__(self, random_state=None):
+        self.regressor = RandomForestRegressor(random_state=random_state)
+
+    def fit(self, pipeline: RegressionPipeline):
+        # the target must never appear among the features
+        pipeline.feature_list = [
+            item for item in pipeline.feature_list if item != pipeline.target
+        ]
+        print("Fitting RandomForestRegressor")
+        self.regressor.fit(
+            X=pipeline.train_features_df[pipeline.feature_list],
+            y=pipeline.train_targets,
+        )  # noqa
+        print("RandomForestRegressor fitted!")
+
+    def predict(self, pipeline: RegressionPipeline):
+        pipeline.tmp_pred = self.regressor.predict(
+            X=pipeline.tmp_test[pipeline.feature_list]
+        )
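
Combined effect of the two helpers, assuming a frame with two datetime columns (data invented): date_processor adds <col>_year/_month/_week/_day as categoricals, and duration_builder adds one numeric day-count column per pair of datetime columns:

    import pandas as pd
    from models.models import Dataset
    from preprocessing.time.date_processor import date_processor
    from preprocessing.time.duration import duration_builder

    frame = pd.DataFrame(
        {
            "start": pd.to_datetime(["2023-01-01", "2023-02-01"]),
            "end": pd.to_datetime(["2023-01-11", "2023-03-01"]),
        }
    )
    dataset = Dataset(
        df=frame,
        column_type_map={"start": "datetime", "end": "datetime"},
        summary_stats={},
        type_collections={
            "datetime": ["start", "end"],
            "numeric": [],
            "categorical": [],
        },
    )
    dataset = duration_builder(date_processor(dataset))
    print(dataset.df["duration(start-end)"].tolist())  # [-10, -28]
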
diff --git a/python/src/lazylearn/regression/models/randomforest/randomforest.py b/python/src/lazylearn/regression/models/randomforest/randomforest.py
new file mode 100644
index 0000000..2ba5aa3
--- /dev/null
+++ b/python/src/lazylearn/regression/models/randomforest/randomforest.py
@@ -0,0 +1,40 @@
+from models.models import Dataset
+from pipeline.pipeline import RegressionPipeline
+from preprocessing.encoding.encoders import OrdinalConverter
+from regression.models.randomforest.random_forest_steps.regressor_step import (
+    RandomForestRegressorStep,
+)
+
+
+class RandomForestRegressionRunner:
+    def __init__(self, target, dataset, random_state=42):
+        self.target = target
+        self.dataset: Dataset = dataset
+        self.random_state = random_state
+        self.pipeline = RegressionPipeline()
+        self.pipeline.target = target
+
+        self.pipeline.train_features_df = self.dataset.partitions[
+            "train"
+        ].copy()  # noqa
+        self.pipeline.train_targets = self.dataset.partitions["train"][target]
+        self.pipeline.holdout_features_df = self.dataset.partitions[
+            "test"
+        ].copy()  # noqa
+        self.pipeline.holdout_targets = self.dataset.partitions["test"][target]
+
+    def fit(self):
+        # gather the categorical and numeric variables to model on
+        cat_vars = self.dataset.type_collections["categorical"]
+        num_vars = self.dataset.type_collections["numeric"]
+        self.pipeline.feature_list.extend(num_vars)
+
+        self.pipeline.add(OrdinalConverter(cat_vars=cat_vars))
+
+        self.pipeline.add(
+            RandomForestRegressorStep(random_state=self.random_state)
+        )
+
+        self.pipeline.fit()
+
+    def predict(self, features):
+        self.pipeline.tmp_test = features
+        return self.pipeline.predict()
diff --git a/python/src/lazylearn/regression/models/xgboost/__init__.py b/python/src/lazylearn/regression/models/xgboost/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/strategies/__init__.py b/python/src/lazylearn/strategies/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/lazylearn/strategies/strategy_builder.py b/python/src/lazylearn/strategies/strategy_builder.py
new file mode 100644
index 0000000..e69de29
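
The runner can also be driven directly, assuming `dataset` is an ingested Dataset whose partitions were populated by test_train_splitter and whose target column is "price" (names invented):

    runner = RandomForestRegressionRunner(target="price", dataset=dataset)
    runner.fit()
    predictions = runner.predict(dataset.partitions["test"])
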