In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt


In [None]:
# Load the raw dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("../data/data.csv")

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(5)

In [None]:
# Per-column flag: True when the column contains at least one missing value.
df.isna().any()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Count of distinct values in each column.
df.nunique()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the columns.
df.describe()

In [None]:
# Annotated heatmap of the pairwise column correlations of `df`.
f, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(data=df.corr(), ax=ax, annot=True)

![GMDH](../assets/Structure-of-the-GMDH-algorithm.jpg)

Image source: https://www.researchgate.net/figure/Structure-of-the-GMDH-algorithm_fig1_331395382

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split


class GMDH:
    """Simplified GMDH-style regressor built from stacked least-squares layers.

    Layer 0 is an ordinary least-squares fit on the inputs plus an intercept
    column. Every further layer doubles the design matrix by appending the
    element-wise squares of all current columns and is fitted to the residual
    left by the layers before it. A prediction is the sum of all layer outputs.

    Parameters
    ----------
    max_layers : int
        Upper bound on the number of fitted layers.
    stopping_threshold : float
        Minimum gain in training R^2 a new layer must bring to be kept. The
        first layer is always kept.

    Attributes
    ----------
    layers : list of ndarray
        Training design matrix of every kept layer.
    w : list of ndarray
        Weight column vector of every kept layer.
    rs : list of float
        Cumulative R^2 on the training data after each kept layer.
    """

    def __init__(self, max_layers=10, stopping_threshold=0.01):
        self.max_layers = max_layers
        self.stopping_threshold = stopping_threshold
        self.layers = []
        self.w = []
        self.rs = []

    @staticmethod
    def func(X_train):
        """Return ``X_train`` as a float matrix with a leading column of ones."""
        X_train = np.asarray(X_train, dtype=float)
        return np.concatenate((np.ones((len(X_train), 1)), X_train), axis=1)

    @staticmethod
    def _expand(layer_in):
        """Append the element-wise squares of every column (doubles the width)."""
        # Column powers reach 2**k at layer k, so unscaled inputs can overflow
        # when many layers are used.
        return np.concatenate((layer_in, layer_in ** 2), axis=1)

    def fit(self, X_train, y_train):
        """Fit the layers on ``X_train`` (n_samples, n_features) and ``y_train``.

        Any previously fitted state is discarded, so calling ``fit`` again
        retrains from scratch instead of stacking onto the old layers.
        """
        # Reset so a refit does not accumulate layers from an earlier call.
        self.layers = []
        self.w = []
        self.rs = []

        layer_in = self.func(X_train)
        # np.asarray lets lists / pandas Series work, not only ndarrays.
        residual = np.asarray(y_train, dtype=float).reshape(-1, 1)
        # R^2 of every layer is measured against the ORIGINAL target's spread,
        # which makes the values comparable from layer to layer.
        total_ss = float(np.sum((residual - residual.mean()) ** 2))

        for _ in range(self.max_layers):
            # For rank-deficient designs (the ones column gets duplicated by
            # squaring) lstsq returns the minimum-norm solution.
            w = np.linalg.lstsq(layer_in, residual, rcond=None)[0]
            new_residual = residual - np.dot(layer_in, w)

            if total_ss > 0:
                rs = 1.0 - float(np.sum(new_residual ** 2)) / total_ss
            else:
                # Constant target: R^2 is undefined (0/0); use 1.0 when the
                # fit reproduces it and 0.0 otherwise instead of storing NaN.
                rs = 1.0 if np.allclose(new_residual, 0.0) else 0.0

            # Drop the candidate layer and stop once it no longer improves
            # R^2 by at least the threshold.
            if self.rs and rs - self.rs[-1] < self.stopping_threshold:
                break

            self.layers.append(layer_in)
            self.w.append(w)
            self.rs.append(rs)

            if total_ss == 0:
                # Nothing besides the constant level is left to explain.
                break

            layer_in = self._expand(layer_in)
            residual = new_residual

    def predict(self, X_test):
        """Return a 1-D array of predictions for ``X_test`` (n_samples, n_features).

        An unfitted model has no layers and therefore predicts zeros.
        """
        layer_in = self.func(X_test)
        pred = np.zeros(len(layer_in))

        for index, w in enumerate(self.w):
            if index:
                # Rebuild the same squared-feature expansion used while fitting;
                # skipped for layer 0 and never computed past the last layer.
                layer_in = self._expand(layer_in)
            pred += np.dot(layer_in, w).ravel()
        return pred


In [None]:
# Work on an independent NumPy copy restricted to the first 20 rows.
df_copy = df.to_numpy()[:20].copy()

# Column 0 holds the target; every remaining column is a feature.
y = df_copy[:, 0]
X = df_copy[:, 1:]

# 60/40 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [None]:
# Train a model with default settings on the training split, then predict the held-out rows.
gmdh = GMDH()
gmdh.fit(X_train, y_train)
y_pred = gmdh.predict(X_test)

In [None]:
def average_approximation_error(actual, pred):
    """Print and return the mean absolute difference between ``actual`` and ``pred``.

    Both inputs are flattened to 1-D float arrays first, so a column vector
    and a flat vector of the same length are compared element by element
    instead of being broadcast against each other into a matrix.

    Parameters
    ----------
    actual : array-like
        Observed values.
    pred : array-like
        Predicted values; same number of elements as ``actual`` (a single
        value is broadcast against the other argument).

    Returns
    -------
    float
        ``mean(|actual - pred|)``. This is an absolute error, not a percentage.

    Raises
    ------
    ValueError
        If the inputs hold different numbers of elements and neither is a
        single value.
    """
    actual = np.asarray(actual, dtype=float).ravel()
    pred = np.asarray(pred, dtype=float).ravel()
    if actual.size != pred.size and 1 not in (actual.size, pred.size):
        raise ValueError(
            f"actual and pred must have the same number of elements, "
            f"got {actual.size} and {pred.size}"
        )

    error = np.mean(np.abs(actual - pred))
    print("Average Approximation Error:", error)
    return error

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go


def pred_plot(actual, pred, title=None):
    """Show actual and predicted values as two overlaid lines+markers traces.

    Parameters
    ----------
    actual : sequence of numbers
        Observed values, plotted against their positional index.
    pred : sequence of numbers
        Predicted values, plotted against their positional index.
    title : str, optional
        Figure title.

    The figure is rendered with ``fig.show()``; nothing is returned.
    """
    fig = make_subplots(rows=1, cols=1)
    for name, values in (("actual", actual), ("pred", pred)):
        # add_trace is used for both series; Figure.append_trace is deprecated
        # in favour of add_trace with row/col.
        fig.add_trace(
            go.Scatter(
                x=np.arange(len(values)),
                y=values,
                mode='lines+markers',
                name=name,
            ),
            row=1,
            col=1,
        )
    fig.layout.title = title
    fig.show()

In [None]:
# Held-out targets versus the model's predictions for the same rows.
pred_plot(y_test, y_pred, "Test data")

In [None]:
# Error of the predictions on the held-out test rows.
average_approximation_error(y_test, y_pred)

In [None]:
# Refit a fresh model on all of X / y and predict those same rows.
# NOTE: this is an in-sample fit, so the error below is not a generalization estimate.
# It also rebinds `gmdh`, replacing the model fitted on the training split.
gmdh = GMDH()
gmdh.fit(X, y)
y_pred_2 = gmdh.predict(X)

In [None]:
# All selected rows versus the in-sample predictions of the refitted model.
pred_plot(y, y_pred_2, "Initial data")

In [None]:
# In-sample error of the refitted model on the rows it was trained on.
average_approximation_error(y, y_pred_2)