In [185]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score

In [186]:
#df.index = df.index.astype('int')

In [187]:
df = pd.read_csv('data_model.csv', index_col = 0)
df.sample(5)

Unnamed: 0,T1,F1,F2,F3,F4,F5,F6,T2,T3,F7,F8,F9,F10
69.0,3.120022,2042.280283,600.769621,88.413013,23.085422,280.643182,324.279402,69.526015,2164.388936,138.224721,0.0,4.795862,37.404762
847.0,1.047844,1966.142448,656.251716,63.652302,26.516469,282.836264,824.488816,70.119479,2124.0335,262.349622,1.0,12.656475,59.474331
120.0,4.37943,1899.029321,329.386054,79.833232,20.063223,308.076725,34.547697,74.519119,1994.674603,237.571116,0.0,7.567723,57.061357
1016.0,3.745862,1914.629424,680.683701,110.9791,25.922635,309.796388,628.36455,76.030555,2466.925422,152.18472,1.0,8.05702,47.067229
116.0,3.660787,2125.206209,856.379707,120.193115,21.546764,224.657476,270.457644,69.702994,2953.366748,132.056503,0.0,5.680511,70.744709


1. Упражнения на всей выборке

In [137]:
X, y = df.drop(['T2', 'F8'], axis = 1).values, df['T2'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state = 42)

#res= clf._predict_proba_lr(X_test,Y_test)

X_train.shape

(700, 11)

In [138]:
scaler = StandardScaler()

In [139]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [140]:
X_train_scaled.shape

(700, 11)

In [141]:
X_test_scaled.shape

(300, 11)

In [144]:
def train_validate_report(model, X_train_scaled, y_train, X_test_scaled, y_test, feature_names):
    
    model.fit(X_train_scaled, y_train)
    
    print('MAE:', mean_absolute_error(y_test, model.predict(X_test_scaled)))
    print('MSE:', mean_squared_error(y_test, model.predict(X_test_scaled)))
    print('RMSE:', np.sqrt(mean_squared_error(y_test, model.predict(X_test_scaled))))
    
    print(pd.DataFrame(model.coef_, feature_names, columns = ['coef']).sort_values(by = 'coef', ascending = False))

In [145]:
lasso = Lasso(random_state = 17)
ridge = Ridge(random_state = 17)
lasso_cv = LassoCV(random_state = 17)
ridge_cv = RidgeCV()

In [146]:
train_validate_report(lasso, X_train_scaled, y_train, X_test_scaled, y_test, feature_names = df.columns[:-2])

MAE: 2.48753392905233
MSE: 9.795852789184766
RMSE: 3.1298327094566516
    coef
T1  -0.0
F1  -0.0
F2   0.0
F3  -0.0
F4   0.0
F5   0.0
F6   0.0
T2  -0.0
T3   0.0
F7  -0.0
F8   0.0


In [147]:
train_validate_report(ridge, X_train_scaled, y_train, X_test_scaled, y_test, feature_names = df.columns[:-2])

MAE: 2.5397187674448998
MSE: 9.982885207684015
RMSE: 3.1595704150539223
        coef
F5  0.193070
F8  0.177161
F4  0.154569
T3  0.080399
F2  0.070457
F6  0.050609
T1 -0.050995
F7 -0.089641
T2 -0.093018
F3 -0.247469
F1 -0.272758


In [148]:
train_validate_report(lasso_cv, X_train_scaled, y_train, X_test_scaled, y_test, feature_names = df.columns[:-2])

MAE: 2.4986573193164485
MSE: 9.854880448632043
RMSE: 3.139248389126296
        coef
F5  0.018813
T1 -0.000000
F2  0.000000
F4  0.000000
F6  0.000000
T2 -0.000000
T3  0.000000
F7 -0.000000
F8  0.000000
F3 -0.041016
F1 -0.085611


In [149]:
train_validate_report(ridge_cv, X_train_scaled, y_train, X_test_scaled, y_test, feature_names = df.columns[:-2])

MAE: 2.5386124244104926
MSE: 9.976550440494263
RMSE: 3.1585677831090253
        coef
F5  0.190394
F8  0.174091
F4  0.152425
T3  0.079486
F2  0.069748
F6  0.049578
T1 -0.050046
F7 -0.088185
T2 -0.091147
F3 -0.243582
F1 -0.268778


2. На важных признаках

In [150]:
df_1 = df.drop(['T2', 'F1', 'F4', 'F5', 'F6', 'F7', 'F8', 'F10'], axis = 1)
X, y = df_1.values, df['T2'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state = 42)

#res= clf._predict_proba_lr(X_test,Y_test)

X_train.shape

(700, 5)

In [151]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [152]:
def train_validate_report(model, X_train_scaled, y_train, X_test_scaled, y_test, feature_names):
    
    model.fit(X_train_scaled, y_train)
    
    print('MAE:', mean_absolute_error(y_test, model.predict(X_test_scaled)))
    print('MSE:', mean_squared_error(y_test, model.predict(X_test_scaled)))
    print('RMSE:', np.sqrt(mean_squared_error(y_test, model.predict(X_test_scaled))))
    
    print(pd.DataFrame(model.coef_, feature_names, columns = ['coef']).sort_values(by = 'coef', ascending = False))

In [154]:
train_validate_report(ridge_cv, X_train_scaled, y_train, X_test_scaled, y_test, feature_names = df_1.columns[:])

MAE: 2.4997155480865203
MSE: 9.766813443813003
RMSE: 3.1251901452252473
        coef
F2  0.085454
T1 -0.026483
F9 -0.071299
T3 -0.074269
F3 -0.204957


Второй вариант

In [188]:
df_2 = df.drop(['T2', 'F1', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F10'], axis = 1)
X, y = df_2.values, df['T2'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state = 42)

#res= clf._predict_proba_lr(X_test,Y_test)

X_train.shape

(700, 4)

In [192]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [193]:
def train_validate_report(model, X_train_scaled, y_train, X_test_scaled, y_test, feature_names):
    
    model.fit(X_train_scaled, y_train)
    
    print('MAE:', mean_absolute_error(y_test, model.predict(X_test_scaled)))
    print('MSE:', mean_squared_error(y_test, model.predict(X_test_scaled)))
    print('RMSE:', np.sqrt(mean_squared_error(y_test, model.predict(X_test_scaled))))
    
    print(pd.DataFrame(model.coef_, feature_names, columns = ['coef']).sort_values(by = 'coef', ascending = False))

In [195]:
train_validate_report(ridge_cv, X_train_scaled, y_train, X_test_scaled, y_test, feature_names = df_2.columns[:])

MAE: 2.494562642301039
MSE: 9.799681642766378
RMSE: 3.1304443203427814
        coef
F2  0.079329
T1 -0.027006
T3 -0.052587
F9 -0.070446
