In [1]:
from __future__ import annotations

import sys
from typing import Protocol

import numpy as np
import polars as pl
import requests
from loguru import logger

# Route log output through a single stderr sink that prints only the message text.
# Calling `remove()` without an id clears every registered handler, so re-running this
# cell neither raises (as `remove(0)` does once handler 0 is gone) nor stacks up
# duplicate sinks that would print each message several times.
logger.remove()
logger.add(sys.stderr, format="{message}")

1

In [2]:
# Load the two splits from parquet files located next to the notebook.
train_df = pl.read_parquet("./train.parquet")
test_df = pl.read_parquet("./test.parquet")

In [3]:
# Display the training frame: the feature columns plus the target column.
train_df

cement,blast furnace slag,fly ash,water,superplasticizer,coarse aggregate,fine aggregate,age,concrete compressive strength
f64,f64,f64,f64,f64,f64,f64,i64,f64
425.0,106.3,0.0,153.5,16.5,852.1,887.1,91,65.196851
250.0,0.0,95.69,191.84,5.33,948.9,857.2,3,8.48745
300.0,0.0,0.0,184.0,0.0,1075.0,795.0,7,15.575263
190.34,0.0,125.18,161.85,9.88,1088.1,802.59,28,28.468464
153.0,145.0,113.0,178.0,8.0,867.0,824.0,28,26.227667
…,…,…,…,…,…,…,…,…
298.2,0.0,107.0,209.7,11.1,879.6,744.2,28,31.875165
380.0,95.0,0.0,228.0,0.0,932.0,594.0,270,41.151375
159.0,209.0,161.0,201.0,7.0,848.0,669.0,28,30.88163
152.0,178.0,139.0,168.0,18.0,944.0,695.0,28,36.349175


In [4]:
# Display the test frame: the same feature columns, without the target column.
test_df

cement,blast furnace slag,fly ash,water,superplasticizer,coarse aggregate,fine aggregate,age
f64,f64,f64,f64,f64,f64,f64,i64
491.0,26.0,123.0,201.0,3.93,822.0,699.0,3
173.0,116.0,0.0,192.0,0.0,946.8,856.8,3
173.54,50.05,173.53,164.77,6.47,1006.2,793.5,56
317.9,0.0,126.5,209.7,5.7,860.5,736.6,28
425.0,106.3,0.0,153.5,16.5,852.1,887.1,56
…,…,…,…,…,…,…,…
252.0,0.0,0.0,185.0,0.0,1111.0,784.0,28
362.6,189.0,0.0,164.9,11.6,944.7,755.8,7
167.95,42.08,163.83,121.75,5.72,1058.7,780.11,100
516.0,0.0,0.0,162.0,8.26,801.0,802.0,28


In [5]:
# Scatter the target against cement content in the training data.
train_df.plot.scatter(x="cement", y="concrete compressive strength")

%opts magic unavailable (pyparsing cannot be imported)
%compositor magic unavailable (pyparsing cannot be imported)


In [6]:
# Scatter the target against water content in the training data.
train_df.plot.scatter(x="water", y="concrete compressive strength")

In [7]:
# Scatter the target against sample age in the training data.
train_df.plot.scatter(x="age", y="concrete compressive strength")

In [8]:
# Scatter the target against superplasticizer content in the training data.
train_df.plot.scatter(x="superplasticizer", y="concrete compressive strength")

In [9]:
# Ask the scoring server's users/create route for a token tied to `username`;
# the token is sent along with every later submission.
endpoint = "http://192.168.0.190:8000"
username = "daniel"
token_response = requests.post(f"{endpoint}/users/create/{username}", timeout=30)
# Fail with the HTTP error itself rather than a KeyError / JSON decode error below.
token_response.raise_for_status()
token = token_response.json()["token"]

In [10]:
# Hold out the first `n_eval` training rows for local evaluation and train on the rest.
# NOTE: `train_df` is rebound from itself, so running this cell again without reloading
# the data first would carve another `n_eval` rows off the already-shortened frame.
n_eval = 100
eval_df, train_df = train_df[:n_eval], train_df[n_eval:]

In [11]:
# helper functions


class Model(Protocol):
    """Structural type for a predictor: one feature row in, one prediction out.

    Every caller in this notebook passes a row taken from ``DataFrame.iter_rows()``
    (optionally sliced), i.e. a tuple of numbers in column order — not a dict.
    """

    def __call__(self, d: tuple[float, ...]) -> float: ...


def evaluate(model: Model) -> float:
    """Return the mean squared error of ``model`` on the held-out ``eval_df``.

    The target column is removed by name before rows are handed to the model, so
    the result does not depend on the target happening to be the last column.

    Args:
        model: Callable mapping one feature row (a tuple in column order) to a
            predicted compressive strength.

    Returns:
        Mean of the squared differences between the true target and the predictions.
    """
    target = "concrete compressive strength"
    features = eval_df.drop(target)
    predicted = pl.Series("predicted", [model(row) for row in features.iter_rows()])
    return ((eval_df[target] - predicted) ** 2).mean()


def generate_test_predictions(model: Model) -> list[float]:
    """Apply ``model`` to each row of ``test_df`` and collect the results in row order."""
    predictions = []
    for row in test_df.iter_rows():
        predictions.append(model(row))
    return predictions


def submit_predictions(
    model_name: str,
    predictions: list[float],
    timeout: float = 30.0,
) -> float:
    """Post test-set predictions to the scoring server and return the reported score.

    Args:
        model_name: Label sent as the ``"model"`` field of the submission.
        predictions: One prediction per test row, in ``test_df`` row order.
        timeout: Seconds to wait for the server before giving up, so a stalled
            server cannot hang the notebook indefinitely.

    Returns:
        The ``"score"`` field of the server's JSON reply.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.post(
        f"{endpoint}/predictions/submit",
        json={"token": token, "model": model_name, "predictions": predictions},
        timeout=timeout,
    )
    # Surface HTTP failures directly instead of a KeyError on a body without "score".
    response.raise_for_status()
    return response.json()["score"]

In [12]:
def first_model(_):
    """Baseline predictor: ignore the input row and return the mean training target.

    The mean is read from the notebook-level ``train_df`` each time the model is called.
    """
    strength = train_df["concrete compressive strength"]
    return strength.mean()

In [13]:
# Score the constant-mean baseline on the held-out evaluation rows (mean squared error).
eval_score = evaluate(first_model)
logger.info(f"Score on evaluation dataset: {eval_score:.3f}")

Score on evaluation dataset: 249.555


In [14]:
# Predict on the test rows with the baseline and send the predictions to the scoring
# server; every run of this cell posts a new submission.
first_predictions = generate_test_predictions(first_model)
test_score = submit_predictions("my first model", first_predictions)
logger.info(f"Score on test dataset: {test_score:.3f}")

Score on test dataset: 315.806


In [16]:
def create_linear_model(weights: np.ndarray, bias: float = 0) -> Model:
    """Build a predictor that returns ``dot(x, weights) + bias`` for one feature row.

    Args:
        weights: One coefficient per feature, in the same order as the row values.
        bias: Constant added to the weighted sum.

    Returns:
        A callable taking a single row (any sequence numpy can convert to an array).
    """

    def linear_model(x):
        row = np.array(x)
        return np.dot(row, weights) + bias

    return linear_model


# Random-search baseline: draw a handful of random weight vectors and keep whichever
# linear model scores best (lowest MSE) on the evaluation split.
SEED = 0
N_RANDOM_MODELS = 10
WEIGHT_SCALE = 0.02  # weights are drawn uniformly from [0, WEIGHT_SCALE)
n_features = len(train_df.columns) - 1  # every column except the target

# A seeded generator keeps the draws, and therefore the logged scores, repeatable
# across kernel restarts.
rng = np.random.default_rng(SEED)

best_score = np.inf
best_iteration = None
best_model = None
for i in range(N_RANDOM_MODELS):
    weights = WEIGHT_SCALE * rng.random(n_features)
    random_linear_model = create_linear_model(weights)
    eval_score = evaluate(random_linear_model)
    if eval_score < best_score:
        best_score = eval_score
        best_iteration = i
        best_model = random_linear_model

    logger.info(
        f"Score on evaluation dataset for random linear model {i}: {eval_score:.3f}",
    )
logger.info(
    f"Best score on evaluation dataset was {best_score:.3f} from iteration {best_iteration}",
)

Score on evaluation dataset for random linear model 0: 252.295
Score on evaluation dataset for random linear model 1: 804.418
Score on evaluation dataset for random linear model 2: 619.243
Score on evaluation dataset for random linear model 3: 271.843
Score on evaluation dataset for random linear model 4: 263.901
Score on evaluation dataset for random linear model 5: 629.507
Score on evaluation dataset for random linear model 6: 605.189
Score on evaluation dataset for random linear model 7: 698.084
Score on evaluation dataset for random linear model 8: 251.898
Score on evaluation dataset for random linear model 9: 901.591
Best score on evaluation dataset was 251.898 from iteration 8


In [17]:
# Predict on the test rows with the best random linear model and submit the result;
# every run of this cell posts a new submission.
random_predictions = generate_test_predictions(best_model)
test_score = submit_predictions("my random model", random_predictions)
logger.info(f"Score on test dataset: {test_score:.3f}")

Score on test dataset: 318.495


In [25]:
# Least-squares fit of a linear model with intercept: a column of ones is appended to
# the feature matrix, so the last solved parameter plays the role of the bias.
target_column = "concrete compressive strength"
train_features = train_df.drop(target_column).to_numpy()
intercept_column = np.ones(len(train_df))
A = np.c_[train_features, intercept_column]

lstsq_result = np.linalg.lstsq(A, train_df[target_column], rcond=None)
optimal_params = lstsq_result[0]  # first element of the result is the solution vector
weights, bias = optimal_params[:-1], optimal_params[-1]

optimal_linear_model = create_linear_model(weights, bias)
eval_score = evaluate(optimal_linear_model)
logger.info(f"Score on evaluation dataset: {eval_score:.3f}")

Score on evaluation dataset: 92.024


In [26]:
# Predict on the test rows with the least-squares model and submit the result;
# every run of this cell posts a new submission.
optimal_linear_predictions = generate_test_predictions(optimal_linear_model)
test_score = submit_predictions("my linear model", optimal_linear_predictions)
logger.info(f"Score on test dataset: {test_score:.3f}")

Score on test dataset: 106.545


In [28]:
# Pair each feature name with the matching entry of `weights` as currently bound
# (the least-squares coefficients when the cells are run in order).
{column: weight for column, weight in zip(test_df.columns, weights)}

{'cement': 0.11503418582849749,
 'blast furnace slag': 0.09527426324280391,
 'fly ash': 0.08066572942390958,
 'water': -0.17229612209687223,
 'superplasticizer': 0.25637078808473646,
 'coarse aggregate': 0.010301708036094467,
 'fine aggregate': 0.012633711580344383,
 'age': 0.10788867006052093}