In [None]:
import re
import unicodedata

from pathlib import Path
from typing import Optional

import pandas as pd
from pycaret.regression import (compare_models, finalize_model, load_model,
                                save_model, setup)



In [None]:
def slugify(value, words_sep='-'):
    """Convert ``value`` into a lowercase ASCII slug.

    The input is stringified, transliterated to ASCII, lowercased, stripped of
    every character that is not a word character, whitespace or the
    separator, and finally has each run of whitespace/separators collapsed
    into a single ``words_sep``.

    Args:
        value: Anything convertible with ``str()``.
        words_sep: String placed between words. Its characters are treated
            literally, so regex metacharacters such as ``^`` or ``\\`` are
            safe to use.

    Returns:
        The slug, with leading/trailing ``-``, ``_`` and ``words_sep``
        characters removed.
    """
    # NFKD has no decomposition for the Polish "ł"/"Ł", so the ASCII encoding
    # step below would drop them entirely; map them to "l" by hand first.
    text = str(value).replace('ł', 'l').replace('Ł', 'l')
    text = (
        unicodedata.normalize("NFKD", text)
        .encode("ascii", "ignore")
        .decode("ascii")
        .lower()
    )
    # Escape the separator before placing it inside a character class:
    # unescaped, '^' turns the second class into a negated one and '\\' makes
    # the pattern invalid.
    sep_class = re.escape(words_sep)
    text = re.sub(r"[^\w\s%s]" % sep_class, "", text)
    # A callable replacement keeps re.sub from interpreting backslashes in
    # words_sep as template escapes.
    text = re.sub(r"[%s\s]+" % sep_class, lambda _match: words_sep, text)
    return text.strip("-_" + words_sep)


def prepare_regression_model(
        df: pd.DataFrame,
        model_name: str,
        target: str,
        session_id: Optional[int] = None,
        ignore_features: Optional[list[str]] = None,
        **kwargs,
):
    """Return a finalized regression model, training it only if not cached.

    If ``<model_name>.pkl`` already exists the model is loaded with
    ``load_model``. Otherwise ``setup`` is run on ``df``, the result of
    ``compare_models`` is finalized, saved under ``model_name`` and returned.

    Args:
        df: Training data passed to ``setup``.
        model_name: Base name (without ``.pkl``) used for saving/loading.
        target: Name of the target column passed to ``setup``.
        session_id: Seed passed to ``setup``; 123 is used when ``None``.
        ignore_features: Optional column names forwarded to ``setup`` as
            ``ignore_features``; omitted from the call when ``None``.
        **kwargs: Any further keyword arguments forwarded to ``setup``.

    Returns:
        The loaded or freshly trained, finalized model.
    """
    # The cache check assumes save_model() writes to "<model_name>.pkl"
    # -- TODO confirm against the installed PyCaret version.
    model_path = Path(f'{model_name}.pkl')
    if model_path.exists():
        # NOTE(review): the ".pkl" suffix suggests a pickled object; only
        # load model files that come from a trusted source.
        return load_model(model_name)

    # Forward optional arguments only when the caller supplied them. Building
    # the dict up front keeps it defined even when every option is None.
    setup_kws = {
        kw_key: kw_val
        for kw_key, kw_val in [('ignore_features', ignore_features)]
        if kw_val is not None
    }
    setup(
        df,
        target=target,
        # "is None" rather than "or" so that an explicit seed of 0 is kept.
        session_id=123 if session_id is None else session_id,
        **setup_kws,
        **kwargs,
    )
    best_model = compare_models()
    final_model = finalize_model(best_model)
    save_model(final_model, model_name)
    return final_model


In [None]:
# Load the raw report and derive snake_case column names from its headers.
df = pd.read_csv('world-happiness-report.csv')
columns = df.columns.to_list()
new_columns = [slugify(c, '_') for c in columns]

# Same data as `df`, but with the slugified column names.
df2 = df.copy()
df2.columns = new_columns

# Keep only the 2023 rows, drop the now-constant 'year' column and index the
# result by country.
df3 = (
    df2.loc[df2['year'] == 2023, [c for c in new_columns if c != 'year']]
    .set_index(['country_name'])
)

df3.to_csv('world-happiness-report_2023.csv')

# NOTE(review): 'country_name' was moved into the index above, so it is no
# longer a column of df3 when passed as ignore_features -- confirm this is
# intended.
prepare_regression_model(df3, 'world_happines_regression_pipeline', 'happiness_score',
                         ignore_features=['country_name'])