In [82]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score

In [83]:
#df.index = df.index.astype('int')

In [84]:
df = pd.read_csv('data_model.csv', index_col = 0)
df.sample(5)

Unnamed: 0,T1,F1,F2,F3,F4,F5,F6,T2,T3,F7,F9,F10
731.0,1.321616,1915.201632,994.009249,108.137608,22.391299,219.133791,494.872471,66.482383,2431.7703,93.822161,5.226207,51.915444
282.0,2.483168,1949.480232,610.664035,99.790787,22.070878,364.341247,696.886542,73.498093,3052.430194,313.982345,5.301232,51.273989
987.0,4.39773,1991.707864,962.135964,106.179379,23.404661,355.176984,531.152675,75.101888,3579.265523,132.360669,9.330933,56.349177
571.0,2.884592,1978.408952,1200.961642,160.177823,22.220097,320.850106,407.546925,71.543911,2587.231985,165.019036,3.929925,51.783868
273.0,1.704097,1981.385336,948.47605,101.388796,17.59799,254.051211,798.628523,73.833984,2099.729904,134.477174,10.046999,46.842509


1. Упражнения на всей выборке

In [85]:
X, y = df.drop(['T2'], axis = 1).values, df['T2'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state = 42)

#res= clf._predict_proba_lr(X_test,Y_test)

X_train.shape

(655, 11)

In [86]:
scaler = StandardScaler()

In [87]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [88]:
X_train_scaled.shape

(655, 11)

In [89]:
X_test_scaled.shape

(281, 11)

In [92]:
def train_validate_report(model, X_train_scaled, y_train, X_test_scaled, y_test, feature_names):
    
    model.fit(X_train_scaled, y_train)
    
    print('MAE:', mean_absolute_error(y_test, model.predict(X_test_scaled)))
    print('MSE:', mean_squared_error(y_test, model.predict(X_test_scaled)))
    print('RMSE:', np.sqrt(mean_squared_error(y_test, model.predict(X_test_scaled))))
    
    print(pd.DataFrame(model.coef_, feature_names, columns = ['coef']).sort_values(by = 'coef', ascending = False))

In [93]:
lasso = Lasso(random_state = 17)
ridge = Ridge(random_state = 17)
lasso_cv = LassoCV(random_state = 17)
ridge_cv = RidgeCV()

In [94]:
train_validate_report(lasso, X_train_scaled, y_train, X_test_scaled, y_test, feature_names = df.columns[:-1])

MAE: 2.3986344930417918
MSE: 8.765901841522709
RMSE: 2.960726573245613
    coef
T1  -0.0
F1  -0.0
F2   0.0
F3  -0.0
F4   0.0
F5   0.0
F6   0.0
T2  -0.0
T3   0.0
F7  -0.0
F9   0.0


In [95]:
train_validate_report(ridge, X_train_scaled, y_train, X_test_scaled, y_test, feature_names = df.columns[:-1])

MAE: 2.4182609859719615
MSE: 8.776976606130868
RMSE: 2.9625962610742063
        coef
F2  0.174437
F4  0.169266
F6  0.135203
T3  0.127073
F9  0.096131
F5  0.014689
F1 -0.061166
T2 -0.064422
F7 -0.072487
T1 -0.111564
F3 -0.168502


In [96]:
train_validate_report(lasso_cv, X_train_scaled, y_train, X_test_scaled, y_test, feature_names = df.columns[:-1])

MAE: 2.3986344930417918
MSE: 8.765901841522709
RMSE: 2.960726573245613
            coef
F2  6.508788e-17
T1 -0.000000e+00
F1 -0.000000e+00
F3 -0.000000e+00
F4  0.000000e+00
F5  0.000000e+00
F6  0.000000e+00
T2 -0.000000e+00
T3  0.000000e+00
F7 -0.000000e+00
F9  0.000000e+00


In [97]:
train_validate_report(ridge_cv, X_train_scaled, y_train, X_test_scaled, y_test, feature_names = df.columns[:-1])

MAE: 2.417859597558593
MSE: 8.774790315131943
RMSE: 2.9622272558215283
        coef
F2  0.171922
F4  0.166719
F6  0.133023
T3  0.125163
F9  0.094671
F5  0.014611
F1 -0.059813
T2 -0.063016
F7 -0.071267
T1 -0.109882
F3 -0.165828


2. На важных признаках

In [98]:
df_1 = df.drop(['T2', 'F1', 'F2', 'T1', 'F5', 'T3', 'F10'], axis = 1)
X, y = df_1.values, df['T2'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state = 42)

#res= clf._predict_proba_lr(X_test,Y_test)

X_train.shape

(655, 5)

In [99]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [100]:
def train_validate_report(model, X_train_scaled, y_train, X_test_scaled, y_test, feature_names):
    
    model.fit(X_train_scaled, y_train)
    
    print('MAE:', mean_absolute_error(y_test, model.predict(X_test_scaled)))
    print('MSE:', mean_squared_error(y_test, model.predict(X_test_scaled)))
    print('RMSE:', np.sqrt(mean_squared_error(y_test, model.predict(X_test_scaled))))
    
    print(pd.DataFrame(model.coef_, feature_names, columns = ['coef']).sort_values(by = 'coef', ascending = False))

In [101]:
train_validate_report(ridge_cv, X_train_scaled, y_train, X_test_scaled, y_test, feature_names = df_1.columns[:])

MAE: 2.3920933436025504
MSE: 8.666974694942374
RMSE: 2.9439726043124743
        coef
F4  0.153404
F6  0.127902
F7  0.120786
F9 -0.065374
F3 -0.153397


Второй вариант

In [102]:
df_2 = df.drop(['T2', 'F1', 'T1', 'F5', 'F10'], axis = 1)
X, y = df_2.values, df['T2'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state = 42)

#res= clf._predict_proba_lr(X_test,Y_test)

X_train.shape

(655, 7)

In [103]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [104]:
def train_validate_report(model, X_train_scaled, y_train, X_test_scaled, y_test, feature_names):
    
    model.fit(X_train_scaled, y_train)
    
    print('MAE:', mean_absolute_error(y_test, model.predict(X_test_scaled)))
    print('MSE:', mean_squared_error(y_test, model.predict(X_test_scaled)))
    print('RMSE:', np.sqrt(mean_squared_error(y_test, model.predict(X_test_scaled))))
    
    print(pd.DataFrame(model.coef_, feature_names, columns = ['coef']).sort_values(by = 'coef', ascending = False))

In [105]:
train_validate_report(ridge_cv, X_train_scaled, y_train, X_test_scaled, y_test, feature_names = df_2.columns[:])

MAE: 2.4091779168014202
MSE: 8.765646670396103
RMSE: 2.9606834802788535
        coef
F2  0.173727
F4  0.159520
F6  0.125949
F7  0.118288
T3 -0.058801
F9 -0.075542
F3 -0.161315
