In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from matplotlib import pyplot as plt
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split

In [None]:
# Load the dataset from the relative path ../data/data.csv into a DataFrame.
df = pd.read_csv("../data/data.csv")

In [None]:
# Preview the first 11 rows.
df.head(11)

In [None]:
# Per column: True when the column contains at least one missing value.
df.isna().any()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Descriptive summary statistics of the dataset.
df.describe()

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of `df`.
f, ax = plt.subplots(nrows=1, figsize=(8, 6))
sns.heatmap(data=df.corr(), annot=True, ax=ax)

# GMDH model

![GMDH](../assets/Structure-of-the-GMDH-algorithm.jpg)

In [None]:

class GMDH:
    """Simplified, GMDH-inspired additive polynomial regressor.

    Every layer is an ordinary least-squares fit of the *current residual* on a
    feature matrix. The first layer uses the inputs plus a bias column; each
    following layer appends the element-wise squares of all existing columns
    (so the column count doubles per layer). The final prediction is the sum of
    the outputs of all accepted layers.

    Parameters
    ----------
    max_layers : int
        Upper bound on the number of layers ``fit`` may add.
    stopping_threshold : float
        Minimum decrease of the normalized squared error (see
        ``_calculate_performance``) that a new layer must achieve to be kept.
        The first layer is always kept.
    """

    def __init__(self, max_layers=10, stopping_threshold=0.01):
        self.max_layers = max_layers
        self.stopping_threshold = stopping_threshold
        # Training design matrix of every accepted layer.
        self.layers = []
        # Least-squares coefficients of every accepted layer, shape (n_columns, 1).
        self.weights = []
        # Normalized squared error of the cumulative prediction after each layer.
        self.performance = []

    @staticmethod
    def _add_bias(X):
        """Return ``X`` with a leading column of ones (the intercept term)."""
        ones_column = np.ones((len(X), 1))
        return np.concatenate((ones_column, X), axis=1)

    @staticmethod
    def _calculate_performance(predictions, targets):
        """Return the squared error normalized by the target's total variation.

        The value is ``sum((pred - y)^2) / sum((y - mean(y))^2)``: 0 means a
        perfect fit, 1 means no better than predicting the mean.

        When the targets are constant the denominator is zero; in that case the
        result is 0.0 for an exact fit and ``inf`` otherwise instead of NaN.
        """
        # sum of squared forecast errors
        numerator = float(np.sum((predictions - targets) ** 2))
        # sum of squared deviations of the target variable from its mean
        denominator = float(np.sum((targets - np.mean(targets)) ** 2))
        if denominator == 0.0:
            return 0.0 if numerator == 0.0 else float("inf")
        return numerator / denominator

    def fit(self, X, y):
        """Fit the layers on ``X`` (2-D, samples x features) and target ``y``.

        Any state left from a previous ``fit`` call is discarded first, so the
        model can be refitted safely. Layers are added until ``max_layers`` is
        reached or until a new layer lowers the normalized error of the
        cumulative prediction by less than ``stopping_threshold``.
        """
        # Start from a clean slate: without this a second fit() would stack on
        # top of (or be blocked by) the layers of the previous fit.
        self.layers = []
        self.weights = []
        self.performance = []

        features = self._add_bias(np.asarray(X, dtype=float))
        target = np.asarray(y, dtype=float).reshape(-1, 1)
        fitted = np.zeros_like(target)  # sum of the accepted layers' outputs
        residual = target

        for _ in range(self.max_layers):
            weights = np.linalg.lstsq(features, residual, rcond=None)[0]
            candidate = fitted + np.dot(features, weights)
            # Always measured against the original target so that values of
            # different layers are comparable.
            performance = self._calculate_performance(candidate, target)

            if self.performance:
                improvement = self.performance[-1] - performance
                # Written as "not >=" so that a NaN improvement also stops.
                if not improvement >= self.stopping_threshold:
                    break

            self.layers.append(features)
            self.weights.append(weights)
            self.performance.append(performance)

            fitted = candidate
            residual = target - fitted
            # Next layer: existing columns plus their element-wise squares.
            features = np.concatenate((features, features ** 2), axis=1)

    def predict(self, X):
        """Return the 1-D sum of all layer outputs for the samples in ``X``.

        ``X`` must have the same number of columns as the data used in ``fit``.
        A model without fitted layers returns zeros.
        """
        X_augmented = self._add_bias(np.asarray(X, dtype=float))
        pred = np.zeros(len(X_augmented))  # accumulation of partial forecasts

        for weights in self.weights:
            pred += np.dot(X_augmented, weights).ravel()
            # Mirror the feature expansion performed in fit().
            X_augmented = np.concatenate((X_augmented, X_augmented ** 2), axis=1)

        return pred


# Splitting the data into train and test sets

In [None]:
# Convert under a new name so that `df` stays a DataFrame; rebinding `df` to an
# ndarray made this cell fail on re-run (ndarray has no `to_numpy`).
data = df.to_numpy()
X = data[:, 1:]  # Features: every column except the first
y = data[:, 0]  # Target variable 'y': the first column

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Number of samples in each of the four splits.
len(X_train), len(X_test), len(y_train), len(y_test)

# Training

In [None]:
# Train a GMDH model with its default hyper-parameters on the training split.
gmdh = GMDH()
gmdh.fit(X_train, y_train)

# Evaluate Model

In [None]:
def get_merged(actual, pred):
    """Pair actual and predicted values in one table and print its first rows.

    Parameters
    ----------
    actual, pred : array-like
        Observed and predicted values of equal length.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``actual`` and ``pred`` and a fresh 0..n-1 index.
    """
    merged = pd.DataFrame({"actual": actual, "pred": pred})
    # reset_index returns a new frame; it has no effect unless rebound.
    merged = merged.reset_index(drop=True)
    print("\n", merged.head())
    return merged

In [None]:
def average_approximation_error(actual, pred):
    """Print and return the mean absolute difference between actual and pred.

    Parameters
    ----------
    actual, pred : array-like
        Observed and predicted values; both are converted to float arrays, so
        plain lists are accepted as well.

    Returns
    -------
    float
        ``mean(|actual - pred|)``.
    """
    actual_values = np.asarray(actual, dtype=float)
    pred_values = np.asarray(pred, dtype=float)
    # Named `error` so the local no longer shadows the function itself.
    error = np.mean(np.abs(actual_values - pred_values))
    print("Average Approximation Error:", error)
    return error

In [None]:
def prediction_plot(actual, pred, title=None, actual_name=None, pred_name=None):
    """Show an interactive line chart of actual vs. predicted values.

    Both series are drawn against their sample position (0..n-1) in a single
    plot.

    Parameters
    ----------
    actual, pred : array-like
        Observed and predicted values.
    title : str, optional
        Figure title.
    actual_name, pred_name : str, optional
        Legend labels of the two traces.
    """
    fig = make_subplots(rows=1, cols=1)
    tr1 = go.Scatter(
        x=np.arange(len(actual)),
        y=actual,
        mode="lines+markers",
        name=actual_name,
    )
    tr2 = go.Scatter(
        x=np.arange(len(pred)),
        y=pred,
        mode="lines+markers",
        name=pred_name,
    )
    # add_trace for both traces: append_trace is deprecated in plotly.
    fig.add_trace(tr1, row=1, col=1)
    fig.add_trace(tr2, row=1, col=1)
    fig.update_layout(title=title)
    fig.show()

In [None]:
def evaluate_model(X, y, title=None, actual_name=None, pred_name=None, model=None):
    """Predict on ``X`` and report the fit against ``y``.

    Plots actual vs. predicted values, prints the mean absolute error and
    prints the first rows of the actual/predicted table.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``model.predict``.
    y : array-like
        Observed target values for ``X``.
    title, actual_name, pred_name : str, optional
        Forwarded to ``prediction_plot``.
    model : object with ``predict``, optional
        Model to evaluate. Defaults to the notebook-level ``gmdh`` so existing
        calls keep working.
    """
    if model is None:
        model = gmdh  # fall back to the model trained earlier in the notebook
    y_pred = model.predict(X)
    prediction_plot(
        y, y_pred, title=title, actual_name=actual_name, pred_name=pred_name
    )
    average_approximation_error(y, y_pred)
    get_merged(y, y_pred)

In [None]:
# The plot shows the actual target against its prediction, so label the traces
# accordingly (the features X_train are never plotted).
evaluate_model(X_train, y_train, "Train Data", "y_train (actual)", "y_train (predicted)")

In [None]:
# The plot shows the actual target against its prediction, so label the traces
# accordingly (the features X_test are never plotted).
evaluate_model(X_test, y_test, "Test Data", "y_test (actual)", "y_test (predicted)")

In [None]:
# Refit on the complete dataset with a fresh instance, so no layers from the
# earlier training run on the train split are carried over.
gmdh = GMDH()
gmdh.fit(X, y)
evaluate_model(X, y, "Initial Data", "y (actual)", "y (predicted)")