In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score

In [2]:
#df.index = df.index.astype('int')

In [3]:
df = pd.read_csv('data_model.csv', index_col = 0)
df.sample(5)

Unnamed: 0,T1,F1,F2,F3,F4,F5,F6,T2,T3,F7,F8,F9,F10
668.0,2.452951,1910.204905,6.607548,96.354916,19.727737,306.017334,6.098845,74.976179,2201.923561,161.213976,1.0,8.985718,53.927701
411.0,1.273273,1915.642767,6.159758,93.290977,16.704108,212.833056,5.81845,71.838552,2478.484767,149.807602,0.0,4.996654,54.513716
538.0,0.857381,2038.89846,6.264702,105.128512,19.795821,218.491902,5.874891,72.406178,2313.676959,289.349833,0.0,10.740492,49.158838
988.0,3.106634,1884.734983,6.336061,160.433234,26.809241,354.325741,5.507743,72.812556,2135.779172,275.412755,1.0,6.54466,51.481758
85.0,4.45712,2017.204418,6.888765,99.675033,25.722449,292.101685,5.707974,73.782828,2546.351962,192.158908,0.0,4.764229,70.495158


1. Упражнения на всей выборке

In [4]:
X, y = df.drop(['T2'], axis = 1).values, df['T2'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state = 42)

#res= clf._predict_proba_lr(X_test,Y_test)

X_train.shape

(700, 12)

In [5]:
scaler = StandardScaler()

In [6]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
X_train_scaled.shape

(700, 12)

In [8]:
X_test_scaled.shape

(300, 12)

In [9]:
def train_validate_report(model, X_train_scaled, y_train, X_test_scaled, y_test, feature_names):
    
    model.fit(X_train_scaled, y_train)
    
    print('MAE:', mean_absolute_error(y_test, model.predict(X_test_scaled)))
    print('MSE:', mean_squared_error(y_test, model.predict(X_test_scaled)))
    print('RMSE:', np.sqrt(mean_squared_error(y_test, model.predict(X_test_scaled))))
    
    print(pd.DataFrame(model.coef_, feature_names, columns = ['coef']).sort_values(by = 'coef', ascending = False))

In [10]:
lasso = Lasso(random_state = 17)
ridge = Ridge(random_state = 17)
lasso_cv = LassoCV(random_state = 17)
ridge_cv = RidgeCV()

In [11]:
train_validate_report(lasso, X_train_scaled, y_train, X_test_scaled, y_test, feature_names = df.columns[:-1])

MAE: 2.48753392905233
MSE: 9.795852789184766
RMSE: 3.1298327094566516
    coef
T1  -0.0
F1  -0.0
F2   0.0
F3  -0.0
F4   0.0
F5   0.0
F6   0.0
T2  -0.0
T3   0.0
F7   0.0
F8  -0.0
F9   0.0


In [12]:
train_validate_report(ridge, X_train_scaled, y_train, X_test_scaled, y_test, feature_names = df.columns[:-1])

MAE: 2.546202843552639
MSE: 10.02153840291383
RMSE: 3.165681348922192
        coef
F5  0.188343
F9  0.165729
F4  0.146320
F7  0.108069
F2  0.102731
F6  0.085467
T3  0.081850
T1 -0.047325
F8 -0.088611
T2 -0.094883
F3 -0.250175
F1 -0.266187


In [13]:
train_validate_report(lasso_cv, X_train_scaled, y_train, X_test_scaled, y_test, feature_names = df.columns[:-1])

MAE: 2.48753392905233
MSE: 9.795852789184766
RMSE: 3.1298327094566516
            coef
T1 -0.000000e+00
F2  0.000000e+00
F3 -0.000000e+00
F4  0.000000e+00
F5  0.000000e+00
F6  0.000000e+00
T2 -0.000000e+00
T3  0.000000e+00
F7  0.000000e+00
F8 -0.000000e+00
F9  0.000000e+00
F1 -4.060244e-17


In [14]:
train_validate_report(ridge_cv, X_train_scaled, y_train, X_test_scaled, y_test, feature_names = df.columns[:-1])

MAE: 2.5449691960009964
MSE: 10.014862362231069
RMSE: 3.1646267334760143
        coef
F5  0.185782
F9  0.162930
F4  0.144362
F7  0.107279
F2  0.101667
F6  0.084251
T3  0.080909
T1 -0.046447
F8 -0.087189
T2 -0.092972
F3 -0.246248
F1 -0.262363


2. На важных признаках

In [33]:
X, y = df.drop(['T2', 'F8', 'T1', 'T3', 'F2', 'F10'], axis = 1).values, df['T2'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state = 42)

#res= clf._predict_proba_lr(X_test,Y_test)

X_train.shape

(700, 7)

In [34]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [35]:
def train_validate_report(model, X_train_scaled, y_train, X_test_scaled, y_test, feature_names):
    
    model.fit(X_train_scaled, y_train)
    
    print('MAE:', mean_absolute_error(y_test, model.predict(X_test_scaled)))
    print('MSE:', mean_squared_error(y_test, model.predict(X_test_scaled)))
    print('RMSE:', np.sqrt(mean_squared_error(y_test, model.predict(X_test_scaled))))
    
    print(pd.DataFrame(model.coef_, feature_names, columns = ['coef']).sort_values(by = 'coef', ascending = False))

In [36]:
train_validate_report(ridge_cv, X_train_scaled, y_train, X_test_scaled, y_test, feature_names = df.columns[:-6])

MAE: 2.5223434603308914
MSE: 9.908531029832895
RMSE: 3.147781922216483
        coef
F3  0.187369
F2  0.140532
F4  0.089511
F5  0.074440
F6 -0.080634
F1 -0.228844
T1 -0.245288
