In [2]:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, OneHotEncoder, OrdinalEncoder, MaxAbsScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

In [3]:
RANDOM_STATE = 42

In [4]:
dataset = load_boston()
X = pd.DataFrame(dataset.data)
X.columns = dataset.feature_names
y = dataset.target

1. Разделите выборку на обучающую и тестовую в отношении 80%/20%

In [5]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=RANDOM_STATE)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((404, 13), (102, 13), (404,), (102,))

2. Обучите стандартную регрессию, а также Ridge и  Lasso и параметрами по умолчанию и выведите их R2 на тестовой выборке

In [6]:
%%time
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 24.29
R2: 0.67
Wall time: 888 ms


In [7]:
%%time
rr = Ridge()
rr.fit(X_train, y_train)
y_pred = rr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 24.48
R2: 0.67
Wall time: 175 ms


In [8]:
%%time
ll = Lasso()
ll.fit(X_train, y_train)
y_pred = ll.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 24.41
R2: 0.67
Wall time: 29.4 ms


3. Для Ridge и Lasso подберите коэффициент регуляризации(используйте GridSearchCV, RidgeCV, LassoCV) в пределах от $10^{-5}$ до $10^5$ (по степеням 10). Посчитайте R2 на тестовой выборке по лучшим моделям и сравните с предыдущими результатами. Напишите как изменился результат

In [9]:
%%time
estimator = Ridge()
params = {'alpha': np.logspace(-5, 5, 11)}
gr = GridSearchCV(estimator, params)
gr.fit(X_train, y_train)
y_pred = gr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 24.29
R2: 0.67
Wall time: 158 ms


In [10]:
%%time
estimator = Lasso()
params = {'alpha': np.logspace(-5, 5, 11)}
gl = GridSearchCV(estimator, params)
gl.fit(X_train, y_train)
y_pred = gl.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 24.29
R2: 0.67
Wall time: 152 ms


In [11]:
%%time
lc = LassoCV(alphas=np.logspace(-5, 5, 11))
lc.fit(X_train, y_train)
y_pred = lc.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 24.29
R2: 0.67
Wall time: 48.8 ms


4. Проведите масштабирование выборки(используйте Pipeline, StandardScaler, MinMaxScaler), посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [12]:
%%time
rc = RidgeCV(alphas=np.logspace(-5, 5, 11))
rc.fit(X_train, y_train)
y_pred = rc.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 24.29
R2: 0.67
Wall time: 103 ms


In [None]:
# От примененной регуляризации результат практически не изменился

4. Проведите масштабирование выборки(используйте Pipeline, StandardScaler, MinMaxScaler), посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [13]:
%%time
ls = Pipeline([('scaler', StandardScaler()), ('LinearRegression', LinearRegression())])
ls.fit(X_train, y_train)
y_pred = ls.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 24.29
R2: 0.67
Wall time: 5.98 ms


In [14]:
%%time
lm = Pipeline([('scaler', MinMaxScaler()), ('model', LinearRegression())])
lm.fit(X_train, y_train)
y_pred = lm.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 24.29
R2: 0.67
Wall time: 4.96 ms


In [15]:
%%time
rs = Pipeline([('scaler', StandardScaler()), ('model', Ridge())])
rs.fit(X_train, y_train)
y_pred = rs.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 24.31
R2: 0.67
Wall time: 4.99 ms


In [16]:
%%time
rs = Pipeline([('scaler', MinMaxScaler()), ('model', Ridge())])
rs.fit(X_train, y_train)
y_pred = rs.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 23.73
R2: 0.68
Wall time: 3.99 ms


In [17]:
%%time
lss = Pipeline([('scaler', StandardScaler()), ('model', Lasso())])
lss.fit(X_train, y_train)
y_pred = lss.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 27.58
R2: 0.62
Wall time: 4.99 ms


In [18]:
%%time
lsm = Pipeline([('scaler', MinMaxScaler()), ('model', Lasso())])
lsm.fit(X_train, y_train)
y_pred = lsm.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 54.46
R2: 0.26
Wall time: 4.96 ms


In [None]:
# В случае с Lasso масштабирование ухудшило результат. В остальном без существенных изменений

5. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [19]:
%%time
estimator = Pipeline([('scaler', StandardScaler()), ('model', Ridge())])
params = {'model__alpha': np.logspace(-5, 5, 11)}
grs = GridSearchCV(estimator, params)
grs.fit(X_train, y_train)
y_pred = grs.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 24.31
R2: 0.67
Wall time: 245 ms


In [20]:
%%time
estimator = Pipeline([('scaler', MinMaxScaler()), ('model', Ridge())])
params = {'model__alpha': np.logspace(-5, 5, 11)}
grm = GridSearchCV(estimator, params)
grm.fit(X_train, y_train)
y_pred = grm.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 24.20
R2: 0.67
Wall time: 232 ms


In [21]:
%%time
estimator = Pipeline([('scaler', StandardScaler()), ('model', Lasso())])
params = {'model__alpha': np.logspace(-5, 5, 11)}
lrs = GridSearchCV(estimator, params)
lrs.fit(X_train, y_train)
y_pred = lrs.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 24.29
R2: 0.67
Wall time: 227 ms


In [22]:
%%time
estimator = Pipeline([('scaler', MinMaxScaler()), ('model', Lasso())])
params = {'model__alpha': np.logspace(-5, 5, 11)}
lrs = GridSearchCV(estimator, params)
lrs.fit(X_train, y_train)
y_pred = lrs.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 24.29
R2: 0.67
Wall time: 216 ms


In [None]:
#Применение масштабирования c последующим подбором параметров не улучшило модель

6. Добавьте попарные произведения признаков и их квадраты (используйте PolynomialFeatures) на масштабированных признаках, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [23]:
%%time
estimator = Pipeline([('scaler', StandardScaler()), ('poly', PolynomialFeatures()), ('model', Ridge())])
params = {'model__alpha': np.logspace(-5, 5, 11)}
grs = GridSearchCV(estimator, params)
grs.fit(X_train, y_train)
y_pred = grs.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 13.34
R2: 0.82
Wall time: 317 ms


In [24]:
%%time
estimator = Pipeline([('scaler', MinMaxScaler()), ('poly', PolynomialFeatures()), ('model', Ridge())])
params = {'model__alpha': np.logspace(-5, 5, 11)}
grs = GridSearchCV(estimator, params)
grs.fit(X_train, y_train)
y_pred = grs.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

MSE: 11.00
R2: 0.85
Wall time: 272 ms


In [25]:
%%time
estimator = Pipeline([('scaler', StandardScaler()), ('poly', PolynomialFeatures()), ('model', Lasso())])
params = {'model__alpha': np.logspace(-5, 5, 11)}
grs = GridSearchCV(estimator, params)
grs.fit(X_train, y_train)
y_pred = grs.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


MSE: 13.77
R2: 0.81
Wall time: 669 ms


  model = cd_fast.enet_coordinate_descent(


In [26]:
%%time
estimator = Pipeline([('scaler', MinMaxScaler()), ('poly', PolynomialFeatures()), ('model', Lasso())])
params = {'model__alpha': np.logspace(-5, 5, 11)}
grs = GridSearchCV(estimator, params)
grs.fit(X_train, y_train)
y_pred = grs.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.2f}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


MSE: 11.80
R2: 0.84
Wall time: 604 ms


  model = cd_fast.enet_coordinate_descent(


In [None]:
#R2 заметно увеличился, максимально до 0.85

7. Подберите наилучшую модель (используйте Pipeline, GridSearchSCV) подбирая тип регуляризации (L1,L2), коэффициент регуляризации, метод масштабирования и степень полинома в PolynomialFeatures. Выведите итоговые параметры и результат R2. Напишите как изменился R2 по сравнению с предыдущими экспериментами

In [28]:
%%time
steps = [('scaler', StandardScaler()),
         ('polynomial', PolynomialFeatures(include_bias=False)),
         ('model', Ridge())]
params = {'scaler': [StandardScaler(), MinMaxScaler()],
          'polynomial__degree': [2, 3, 4],
          'model': [Ridge(), Lasso()],
          'model__alpha': np.logspace(-5, 5, 11)
         }
pipe = Pipeline(steps=steps)
search_model = GridSearchCV(pipe, param_grid=params, n_jobs=2)

search_model.fit(X_train, y_train)

best_params = search_model.best_params_

score = search_model.score(X_test, y_test)


print(best_params)
print(score)

{'model': Lasso(alpha=0.001), 'model__alpha': 0.001, 'polynomial__degree': 3, 'scaler': MinMaxScaler()}
0.84497340051315
Wall time: 17.1 s


  model = cd_fast.enet_coordinate_descent(


http://archive.ics.uci.edu/ml/datasets/Adult

In [29]:
link = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv'
data = pd.read_csv(link, header=None)

In [30]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


8. Разделите выборку на признаки и целевую переменную(колонка со зачениями {<=50K,>50K}). Замените целевую переменную на числовые значения.

In [31]:
X = data.loc[:,0:13]
y = data[14]
y = pd.Series(np.where(y == '<=50K', 0, 1))
y

0        0
1        0
2        0
3        0
4        0
        ..
48837    0
48838    0
48839    0
48840    0
48841    1
Length: 48842, dtype: int32

9. Выясните, присутствуют ли в данных пропуски. Заполните их самыми частыми значениями (испольуйте SimpleImputer)

In [32]:
X.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
dtype: int64

In [33]:
y.isnull().sum()

0

10. Выберите колонки с числовыми и категориальными переменными.

In [34]:
X.dtypes

0      int64
1     object
2      int64
3     object
4      int64
5     object
6     object
7     object
8     object
9     object
10     int64
11     int64
12     int64
13    object
dtype: object

In [35]:
categorialscol = X.select_dtypes(include="object").columns
numberscol = X.select_dtypes(exclude="object").columns

In [38]:
categorialscol = list(X.select_dtypes('object').columns)
C = categorialscol
N = numberscol

11. Создайте пайплайн по обработке колонок(используйте OneHotEncoder,MinMaxScaler).

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((39073, 14), (9769, 14), (39073,), (9769,))

In [40]:
numeric_preprocessor = Pipeline(steps=[("scaler", MinMaxScaler()),])

categorical_preprocessor = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])

preprocessor = ColumnTransformer([("categorical", categorical_preprocessor, C),
                                  ("numerical", numeric_preprocessor, N)])

pipe = make_pipeline(preprocessor, LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))

In [42]:
%%time
pipe.fit(X_train,y_train)

Wall time: 674 ms


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  [1, 3, 5, 6, 7, 8, 9, 13]),
                                                 ('numerical',
                                                  Pipeline(steps=[('scaler',
                                                                   MinMaxScaler())]),
                                                  Int64Index([0, 2, 4, 10, 11, 12], dtype='int64'))])),
                ('logisticregression',
                 LogisticRegression(max_iter=1000, random_state=42))])

12. Посчитайте метрики accuracy и f1_score на предсказании только самого частого класса в целевой переменной.

In [43]:
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
y_pred_d = dummy_clf.predict(X_test)

In [44]:
accuracy = accuracy_score(y_test, y_pred_d)
f1 = f1_score(y_test, y_pred_d, average=None)[0]
print(accuracy)
print(f1)

0.7589313133381104
0.8629459349356923


13. Посчитайте cross_val_score по алгоритмам LogisticRegression, SVC, LinearSVC по метрикам accuracy и f1_score.
Напишите удалось ли превзойти предыдущий результат.

In [45]:
numeric_preprocessor = Pipeline(steps=[("scaler", MinMaxScaler()),])

categorical_preprocessor = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])

preprocessor = ColumnTransformer([("categorical", categorical_preprocessor, C),
                                  ("numerical", numeric_preprocessor, N)])

pipeLR = make_pipeline(preprocessor, LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))
pipeSVC = make_pipeline(preprocessor, SVC(random_state=RANDOM_STATE))
pipeLSVC = make_pipeline(preprocessor, LinearSVC(random_state=RANDOM_STATE))

In [46]:
%%time
pipe = pipeLR
cvs_acc = cross_val_score(pipe, X, y, scoring='accuracy')
cvs_f1 = cross_val_score(pipe, X, y, scoring='f1')

print(f"accuracyLR: {cvs_acc}")
print(f"F1LR: {cvs_f1}")

accuracyLR: [0.84839799 0.84829563 0.85647011 0.85145373 0.8495086 ]
F1LR: [0.64996455 0.65129412 0.66539379 0.65882906 0.6529745 ]
Wall time: 7.87 s


In [47]:
%%time
pipe = pipeSVC
cvs_acc = cross_val_score(pipe, X, y, cv=5, scoring='accuracy')
cvs_f1 = cross_val_score(pipe, X, y, cv=5, scoring='f1')

print(f"accuracyLR: {cvs_acc}")
print(f"F1LR: {cvs_f1}")

accuracyLR: [0.8362166  0.8362166  0.84490172 0.83855446 0.84398034]
F1LR: [0.60687961 0.61759082 0.6280383  0.61602143 0.63188406]
Wall time: 7min 41s


In [48]:
%%time
pipe = pipeLSVC
cvs_acc = cross_val_score(pipe, X, y, cv=5, scoring='accuracy')
cvs_f1 = cross_val_score(pipe, X, y, cv=5, scoring='f1')

print(f"accuracyLR: {cvs_acc}")
print(f"F1LR: {cvs_f1}")

accuracyLR: [0.85065002 0.85167366 0.85728911 0.85176085 0.8531941 ]
F1LR: [0.65137395 0.65736581 0.66458133 0.65589354 0.65986717]
Wall time: 6 s


In [None]:
#Accuracy увеличился, а F1 снизился

14. Можно заметить что в данных присутствуют значения '?', замените их самыми частыми значениями (испольуйте SimpleImputer)

In [49]:
SImodifyer = SimpleImputer(missing_values='?', strategy='most_frequent')
modifyedX = pd.DataFrame(SImodifyer.fit_transform(X), columns=X.columns).astype(X.dtypes.to_dict())

15. Посчитайте cross_val_score на новых данных. Напишите удалось ли улучшить результат.

In [50]:
%%time
pipe = pipeLR
cvs_acc = cross_val_score(pipe, modifyedX, y, scoring='accuracy')
cvs_f1 = cross_val_score(pipe, modifyedX, y, scoring='f1')

print(f"accuracyLR: {cvs_acc}")
print(f"F1LR: {cvs_f1}")

accuracyLR: [0.84819326 0.84901218 0.85606061 0.85063473 0.84858722]
F1LR: [0.64648391 0.65204058 0.66571564 0.65500118 0.649443  ]
Wall time: 7.75 s


In [51]:
%%time
pipe = pipeSVC
cvs_acc = cross_val_score(pipe, modifyedX, y, cv=5, scoring='accuracy')
cvs_f1 = cross_val_score(pipe, modifyedX, y, cv=5, scoring='f1')

print(f"accuracyLR: {cvs_acc}")
print(f"F1LR: {cvs_f1}")

accuracyLR: [0.8382639  0.83539769 0.84285422 0.83865684 0.84275184]
F1LR: [0.60949086 0.61178175 0.62257192 0.61410382 0.62664074]
Wall time: 7min 35s


In [52]:
%%time
pipe = pipeLSVC
cvs_acc = cross_val_score(pipe, modifyedX, y, cv=5, scoring='accuracy')
cvs_f1 = cross_val_score(pipe, modifyedX, y, cv=5, scoring='f1')

print(f"accuracyLR: {cvs_acc}")
print(f"F1LR: {cvs_f1}")

accuracyLR: [0.84911455 0.85044529 0.85421785 0.85042998 0.8499181 ]
F1LR: [0.64396135 0.65272165 0.65703276 0.65172825 0.6492823 ]
Wall time: 6.51 s


In [None]:
#Результат без особых изменений

16. Посчитайте cross_val_score, если просто удалить значения '?'. Напишите как изменился результат

In [53]:
QMdroper = (X != '?').all(axis=1)
dropedX = X[QMdroper]
dropedy = y[QMdroper]


In [54]:
%%time
pipe = pipeLR
cvs_acc = cross_val_score(pipe, dropedX, dropedy, scoring='accuracy')
cvs_f1 = cross_val_score(pipe, dropedX, dropedy, scoring='f1')

print(f"accuracyLR: {cvs_acc}")
print(f"F1LR: {cvs_f1}")

accuracyLR: [0.84389165 0.84477612 0.85404688 0.84586466 0.84564352]
F1LR: [0.65442976 0.65638767 0.67213115 0.65933529 0.65801078]
Wall time: 6.42 s


In [55]:
%%time
pipe = pipeSVC
cvs_acc = cross_val_score(pipe, dropedX, dropedy, cv=5, scoring='accuracy')
cvs_f1 = cross_val_score(pipe, dropedX, dropedy, cv=5, scoring='f1')

print(f"accuracyLR: {cvs_acc}")
print(f"F1LR: {cvs_f1}")

accuracyLR: [0.83283582 0.83150912 0.84265812 0.83270677 0.83878815]
F1LR: [0.61663286 0.62127237 0.63855728 0.62032622 0.6367713 ]
Wall time: 6min 32s


In [56]:
%%time
pipe = pipeLSVC
cvs_acc = cross_val_score(pipe, dropedX, dropedy, cv=5, scoring='accuracy')
cvs_f1 = cross_val_score(pipe, dropedX, dropedy, cv=5, scoring='f1')

print(f"accuracyLR: {cvs_acc}")
print(f"F1LR: {cvs_f1}")

accuracyLR: [0.84610282 0.84754008 0.85393631 0.84730208 0.84763379]
F1LR: [0.65544554 0.66092943 0.6701623  0.6602706  0.6612586 ]
Wall time: 5.69 s


In [None]:
#В целом уменьшилось время, но результат заметно не улучшился

 17. Посчитайте cross_val_score для RandomForestClassifier,GradientBoostingClassifier. Напишите как изменился результат и какой вывод можно из этого сделать.

In [57]:
pipeRFC = make_pipeline(preprocessor, RandomForestClassifier(random_state=RANDOM_STATE))
pipeGBC = make_pipeline(preprocessor, GradientBoostingClassifier(random_state=RANDOM_STATE))

In [58]:
%%time
pipe = pipeRFC
cvs_acc = cross_val_score(pipe, dropedX, dropedy, cv=5, scoring='accuracy')
cvs_f1 = cross_val_score(pipe, dropedX, dropedy, cv=5, scoring='f1')

print(f"accuracyLR: {cvs_acc}")
print(f"F1LR: {cvs_f1}")

accuracyLR: [0.84997236 0.84776119 0.85338346 0.84807607 0.84564352]
F1LR: [0.67512569 0.67017964 0.67955534 0.67238913 0.66570881]
Wall time: 3min 57s


In [59]:
%%time
pipe = pipeGBC
cvs_acc = cross_val_score(pipe, dropedX, dropedy, cv=5, scoring='accuracy')
cvs_f1 = cross_val_score(pipe, dropedX, dropedy, cv=5, scoring='f1')

print(f"accuracyLR: {cvs_acc}")
print(f"F1LR: {cvs_f1}")

accuracyLR: [0.85925926 0.8604754  0.86764706 0.8630031  0.86432994]
F1LR: [0.67698554 0.68323293 0.69472073 0.69109948 0.68944571]
Wall time: 39.3 s


In [None]:
#Результат улучшился. Лучшая модель: GradientBoostingClassifier

18. Подберите наилучшую модель, подбирая методы обработки колонок - масштабирование признаков, кодирование признаков и заполнение пропусков. Параметры алгоритмов оставьте по умолчанию. Выведите итоговые параметры и результат accuracy и f1_score.

In [66]:
preprocessor_base = ColumnTransformer([('cat', Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), C),
                                       ('num', Pipeline([('scaler', MinMaxScaler())]), N)
                                       ])

encoders = [OneHotEncoder(handle_unknown='ignore'),
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
            ]

scalers = [StandardScaler(), MinMaxScaler()]

models = [LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
          SVC(random_state=RANDOM_STATE),
          LinearSVC(random_state=RANDOM_STATE),
          RandomForestClassifier(random_state=RANDOM_STATE),
          GradientBoostingClassifier(random_state=RANDOM_STATE)
          ]

search_model = Pipeline(steps=[('preprocessor', preprocessor_base), ('classifier', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))])

params = {
          'preprocessor__cat__encoder': encoders,
          'preprocessor__num': scalers,
          'classifier': models,
              
          }

gridsearch = GridSearchCV(search_model, param_grid=params, scoring=('accuracy', 'f1'), refit='f1')

In [67]:
%%time
gridsearch.fit(X, y)



Wall time: 23min 58s


GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('encoder',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         [1, 3,
                                                                          5, 6,
                                                                          7, 8,
                                                                          9,
                                                                          13]),
                                                                        ('num',
                                   

In [69]:
best_params = gridsearch.best_params_
print(best_params)

{'classifier': GradientBoostingClassifier(random_state=42), 'preprocessor__cat__encoder': OneHotEncoder(handle_unknown='ignore'), 'preprocessor__num': StandardScaler()}


In [70]:
gs_accuracy = accuracy_score(y_test, gridsearch.predict(X_test))
gs_f1 = f1_score(y_test, gridsearch.predict(X_test))
print(gs_accuracy)
print(gs_f1)

0.8695874705701709
0.6919729206963249
