In [8]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures

In [9]:
def fit_model(x, y, x_labels, max_degrees, test_proportion=0.25):
    # train test split not necessary because cross_val_scores uses k-fold validation
    best_degree = 1
    model = LinearRegression()
    mean_score = -9e9
    for d in range(1, max_degrees + 1):
        if d == 1:
            p_x = x
        else:
            p_features = PolynomialFeatures(degree=d)
            p_x = p_features.fit_transform(x)
        scores = cross_val_score(model, p_x, y, cv=10)
        new_mean = np.mean(scores)
        print(f'Score of {d}-degree model: {new_mean}')
        if new_mean < mean_score:
            p_features = PolynomialFeatures(degree=d - 1)
            p_x = p_features.fit_transform(x)
            labels = p_features.get_feature_names_out(x_labels)
            print(f'Best model has {best_degree} degrees, returning\n')
            return model, best_degree, labels, p_x, y
        else:
            mean_score = new_mean
            best_degree = d
            model.fit(p_x, y)
            print(f'Coefficients: {model.coef_}')


def get_equation(x, y, labels, max_degrees, tolerance=1e-2, test_proportion=0.25) -> dict:
    model, _, feature_labels, _, _ = fit_model(x, y, labels, max_degrees, test_proportion=test_proportion)
    feature_labels = feature_labels[model.coef_ > tolerance]
    coefs = model.coef_[model.coef_ > tolerance]
    equation_dict = {}
    print(f'Equation:')
    for i in range(len(coefs[:-1])):
        print(f'{round(coefs[i], 2)}{feature_labels[i]}', end=' + ')
        equation_dict[coefs[i]] = feature_labels[i]
    print(f'{round(coefs[-1], 2)}{feature_labels[-1]}')
    equation_dict[coefs[-1]] = feature_labels[-1]
    return equation_dict

In [10]:
data = pd.read_csv('data.csv')
print(data.head(100))
print(data.describe())
data.dropna()
data = data[data.x1 != -9999]
data = data[data.train == 1]
# d1 = d1[d1.x5 != 0]
print(data.describe())

            x1          x2          x3  x4         x5  train           y
0   -99.551155 -133.218630 -1198.56910   0   0.000000      1  3380.71610
1  -180.243940 -147.405440 -1450.67760   0   0.000000      1  4018.73290
2   122.136280 -107.456430   283.78690   0   0.000000      1  -613.12384
3   -28.301346   36.367268   580.37982   1   0.000000      1  1097.56800
4   125.517690  -49.982388   730.12872   1 -77.617165      1  1306.41890
..         ...         ...         ...  ..        ...    ...         ...
95   99.546051   88.671188   113.14629   0 -12.639590      1  -211.31779
96  126.946690  -99.317421  2365.04420   0 -80.305984      1 -7251.75100
97  -81.563179    1.796087 -1503.62920   1   0.000000      1 -3206.00590
98  -13.232832  -12.581193  -510.46417   0   0.000000      1  1511.17150
99   39.381275  -90.421219  1562.74020   0   0.000000      1 -4614.83590

[100 rows x 7 columns]
                x1          x2           x3          x4          x5  \
count   277.000000  277.00000

In [11]:
x_labels = ['x1', 'x2', 'x3', 'x4', 'x5']
x = data[x_labels]
y = data.y
equation_values = get_equation(x, y, x_labels, 3)

Score of 1-degree model: 0.14101940148052367
Coefficients: [  2.65403109   1.64178089  -1.32318228 467.93836321   8.22218225]
Score of 2-degree model: 0.9999297867189878
Coefficients: [ 0.00000000e+00  1.99927257e+00  5.91044677e-03 -3.00179687e+00
 -5.68664220e+00  5.08618782e+00 -1.09717671e-04 -8.71451926e-05
 -1.24879211e-05 -3.00216858e-02 -9.24880153e-04  1.47344141e-04
 -3.55764835e-07 -3.37094761e-02 -5.69390534e-05  9.94054215e-07
  5.00595453e+00  3.03272395e-05 -5.68664220e+00 -1.91575566e-01
  4.92005446e-05]
Score of 3-degree model: 0.999788318314114
Best model has 2 degrees, returning

Equation:
2.0x1 + 5.09x5 + 5.01x3 x4
