
# `Skore` - an abstraction to ease data science projects

In [None]:
import os

# Set both process-wide flags in a single call. Presumably they are read by
# polars and by the tokenizers backend respectively -- TODO confirm.
os.environ.update(
    {
        "POLARS_ALLOW_FORKING_THREAD": "1",
        "TOKENIZERS_PARALLELISM": "true",
    }
)


Let's open a skore project in which we will be able to store artifacts from our
experiments.

In [None]:
import skore

# Open the on-disk project used to store the artifacts of this notebook.
# `if_exists` is given as an explicit string policy: "load" reuses a project
# that is already present at this path instead of raising.
# NOTE(review): confirm the accepted values against the installed skore version.
my_project = skore.Project("../data/my_project", if_exists="load")

In [None]:
from skrub.datasets import fetch_employee_salaries

# Fetch the dataset bundle once, then pull out its `X` and `y` attributes
# as the feature table and the target used by the rest of the notebook.
datasets = fetch_employee_salaries()
df = datasets.X
y = datasets.y

In [None]:
from skrub import TableReport

# Build a report for the `employee_salaries` table of the fetched bundle.
table_report = TableReport(datasets.employee_salaries)
# Trailing bare expression: the notebook shows the report as the cell output.
table_report


Let's model our problem.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from skrub import MinHashEncoder, TableVectorizer

# First step: table vectorizer whose `high_cardinality` encoder is a
# MinHashEncoder.
vectorizer = TableVectorizer(high_cardinality=MinHashEncoder())
# Second step: random forest regressor with 20 estimators and at most
# 40 leaf nodes.
forest = RandomForestRegressor(n_estimators=20, max_leaf_nodes=40)

# Chain both steps; the trailing expression displays the pipeline.
model = make_pipeline(vectorizer, forest)
model


`skore` provides a couple of tools to ease the evaluation of a model:

In [None]:
from skore import CrossValidationReport

# Wrap the pipeline and the data in a cross-validation report.
# `cv_splitter=5` is presumably a 5-fold split and `n_jobs=4` the number of
# parallel workers -- confirm against the skore documentation.
report = CrossValidationReport(
    estimator=model,
    X=df,
    y=y,
    cv_splitter=5,
    n_jobs=4,
)

In [None]:
# Call the report's built-in help; its output appears below the cell.
report.help()

In [None]:
import time

# Time the first R2 computation. `perf_counter` is a monotonic,
# high-resolution clock intended for measuring elapsed time; the wall clock
# returned by `time.time` can jump if the system clock is adjusted.
start = time.perf_counter()
score = report.metrics.r2()
end = time.perf_counter()
print(f"Time taken: {end - start:.2f} seconds")

In [None]:
# Display the value returned by the R2 call timed in the previous cell.
score

In [None]:
# Time the same R2 call a second time so the printed duration can be compared
# with the first run (presumably faster if the report caches results --
# confirm). `perf_counter` is the monotonic clock meant for elapsed-time
# measurements, which matters for very short durations.
start = time.perf_counter()
score = report.metrics.r2()
end = time.perf_counter()
print(f"Time taken: {end - start:.2f} seconds")

In [None]:
# Display the value returned by the second R2 call.
score

In [None]:
import time

# Time the RMSE computation with the monotonic `perf_counter` clock, which is
# the clock intended for measuring elapsed time (unlike the adjustable wall
# clock behind `time.time`).
start = time.perf_counter()
score = report.metrics.rmse()
end = time.perf_counter()
print(f"Time taken: {end - start:.2f} seconds")

In [None]:
# Display the value returned by the RMSE call timed in the previous cell.
score

In [None]:
# Ask the report to cache its predictions (`n_jobs=4`, presumably four
# parallel workers -- confirm).
report.cache_predictions(n_jobs=4)

In [None]:
# Store the cross-validation report in the project under a human-readable key.
my_project.put("Random Forest model report", report)

In [None]:
# Read the report back from the project under the same key and rebind
# `report` to the retrieved object.
report = my_project.get("Random Forest model report")
# Call the built-in help on the retrieved report.
report.help()

In [None]:
# Summarize the metrics, aggregated as mean and std.
# `indicator_favorability=True` presumably adds an indication of whether
# higher or lower is better for each metric -- confirm against skore docs.
report.metrics.report_metrics(aggregate=["mean", "std"], indicator_favorability=True)

In [None]:
# Bound to a descriptive name rather than `display`, which would shadow
# IPython's built-in `display` helper for the rest of the notebook session.
prediction_error_display = report.metrics.prediction_error()
# Plot the "actual_vs_predicted" kind of the prediction-error display.
prediction_error_display.plot(kind="actual_vs_predicted")

In [None]:
# Show the collection of estimator reports held by the cross-validation report.
report.estimator_reports_

In [None]:
# Take the first entry of `estimator_reports_` and call its built-in help.
estimator_report = report.estimator_reports_[0]
estimator_report.help()

In [None]:
# Plot the "actual_vs_predicted" prediction-error display for this single
# estimator report.
estimator_report.metrics.prediction_error().plot(kind="actual_vs_predicted")

In [None]:
# One permutation-importance table per estimator report, collected here.
list_df = []

# Use a dedicated loop variable: reusing `estimator_report` here would rebind
# the notebook-level `estimator_report` (the first entry, selected in an
# earlier cell) to the last entry once the loop finishes.
for idx, split_report in enumerate(report.estimator_reports_):
    importance = split_report.feature_importance.feature_permutation(n_jobs=-1)
    # Drop the outer index level and transpose (presumably leaving one column
    # per feature -- confirm against the layout returned by skore).
    importance = importance.droplevel(level=0).T
    # Tag every row with the position of the report it came from.
    importance["model"] = idx
    list_df.append(importance)

In [None]:
import pandas as pd
# Stack the per-report importance tables collected in `list_df` into one frame.
df_concat = pd.concat(list_df)
# Trailing expression: show the first rows as the cell output.
df_concat.head()

In [None]:
import plotly.express as px

# Horizontal box plot of the concatenated importances, coloured by the
# "model" column. No sample dataset is loaded here: binding one to `df` would
# overwrite the feature table the cross-validation report was built from.
fig = px.box(df_concat, color="model", orientation="h")
fig.show()


## Conclusions

**Vision**
- Develop tooling to create data science artifacts
- Help follow good practices for the problem at hand
- Help teams collaborate on data science projects

**Wrap-up**
- Provide tools to evaluate predictive models
- Make some internal magic to reduce user friction
- Allow for persistence of artifacts

**Roadmap**
- Cover multiple aspects of the data science life cycle: data, model, etc.
- Help create artifacts dedicated to the problem at hand and the model
- Reduce the complexity related to code