In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
X = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
              [1, 1, 2, 1, 3, 0, 5, 10, 1, 2],
              [500, 700, 750, 600, 1450,
               800, 1500, 2000, 450, 1000],
              [1, 1, 2, 1, 2, 
               1, 3, 3, 1, 2]], dtype = np.float64)
y = np.array([0, 0, 1, 0, 1,
              0, 1, 0, 1, 1], dtype = np.float64)
X

array([[1.00e+00, 1.00e+00, 1.00e+00, 1.00e+00, 1.00e+00, 1.00e+00,
        1.00e+00, 1.00e+00, 1.00e+00, 1.00e+00],
       [1.00e+00, 1.00e+00, 2.00e+00, 1.00e+00, 3.00e+00, 0.00e+00,
        5.00e+00, 1.00e+01, 1.00e+00, 2.00e+00],
       [5.00e+02, 7.00e+02, 7.50e+02, 6.00e+02, 1.45e+03, 8.00e+02,
        1.50e+03, 2.00e+03, 4.50e+02, 1.00e+03],
       [1.00e+00, 1.00e+00, 2.00e+00, 1.00e+00, 2.00e+00, 1.00e+00,
        3.00e+00, 3.00e+00, 1.00e+00, 2.00e+00]])

In [9]:
def calc_std_feat(x):
    res = (x - x.mean()) / x.std()
    return res

X_st = X.copy()
X_st[2, :] = calc_std_feat(X[2, :])
X_st

array([[ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ],
       [ 1.        ,  1.        ,  2.        ,  1.        ,  3.        ,
         0.        ,  5.        , 10.        ,  1.        ,  2.        ],
       [-0.97958969, -0.56713087, -0.46401617, -0.77336028,  0.97958969,
        -0.36090146,  1.08270439,  2.11385144, -1.08270439,  0.05155735],
       [ 1.        ,  1.        ,  2.        ,  1.        ,  2.        ,
         1.        ,  3.        ,  3.        ,  1.        ,  2.        ]])

In [4]:
def sigmoid(z):
    res = 1 / (1 + np.exp(-z))
    return res

def calc_logloss(y, y_pred):
    err = - np.mean(y * np.log(y_pred) + (1.0 - y) * np.log(1.0 - y_pred))
    err = np.sum(err)
    return err

In [5]:
def eval_model(X, y, iterations, alpha=1e-4):
    np.random.seed(42)
    W = np.random.randn(X.shape[0])
    n = X.shape[1]
    for i in range(1, iterations+1):
        z = np.dot(W, X)
        y_pred = sigmoid(z)
        err = calc_logloss(y, y_pred)
        W -= alpha * (1/n * np.dot((y_pred - y), X.T))
        if i % (iterations / 10) == 0:
            print(i, W, err)
    return W

### 1*. Измените функцию calc_logloss так, чтобы нули по возможности не попадали в np.log.  

#### Исключим нулевые значения в самой формуле

In [7]:

def calc_logloss(y, y_pred):
    err = - np.mean(y * np.log(y_pred, where=(y_pred!=0)) + (1.0 - y) * np.log(1.0 - y_pred, where=(1-y_pred!=0)))
    err = np.sum(err)
    return err

### 2. Подберите аргументы функции eval_model для логистической регрессии таким образом, чтобы log loss был минимальным.

In [12]:
def eval_model_b(X, y, iterations, alpha=1e-4):
    np.random.seed(42)
    W = np.random.randn(X.shape[0])
    n = X.shape[1]
    for i in range(1, iterations+1):
        y_pred = sigmoid(np.dot(W, X))
        err = calc_logloss(y, y_pred)
        W -= alpha * (1/n * np.dot((y_pred - y), X.T))
    return W, err, alpha, iterations

In [13]:
#Подберем параметры

best_params = None
err_min = 1000
i = 0
for iterations in [3000, 4000, 5000, 10000]:
    for alpha in [1e-3, 1e-4, 1e-5, 1e-2]:
        params = W, err, alpha, iterations = eval_model_b(X_st, y, iterations=iterations, alpha=alpha)
        if err < err_min:
            err_min = err
            best_params = params

print(f'best params: iterations = {best_params[-1]}, alpha = {best_params[-2]}')

best params: iterations = 10000, alpha = 0.01


### 3. Создайте функцию calc_pred_proba, возвращающую предсказанную вероятность класса 1 (на вход подаются W, который уже посчитан функцией eval_model и X, на выходе - массив y_pred_proba).

In [14]:
W = eval_model(X_st, y, iterations=10000, alpha=0.01)

1000 [-0.29764618 -0.72670545  1.061634    1.39666497] 0.5233765331724747
2000 [-0.64214517 -0.81531815  1.11599369  1.72618787] 0.4994117173889704
3000 [-0.95627497 -0.87648865  1.10170056  2.00552673] 0.4812997915130281
4000 [-1.25395556 -0.91702026  1.05407927  2.24381099] 0.4663484120925518
5000 [-1.53718107 -0.94412619  0.98701591  2.45304376] 0.4534151021622755
6000 [-1.80709894 -0.96248191  0.90882535  2.64121123] 0.44193777801400824
7000 [-2.0646379  -0.97512726  0.82480965  2.81363531] 0.4316064579082126
8000 [-2.31061284 -0.98407392  0.73840308  2.97395036] 0.422228649670574
9000 [-2.54576745 -0.99066711  0.65182927  3.12469136] 0.41367061625436863
10000 [-2.77079473 -0.99580928  0.56650766  3.2676589 ] 0.4058305387773311


In [15]:
def calc_pred_proba(X, W):
    y_pred_proba = np.floor(sigmoid(np.dot(W, X)))
    return y_pred_proba

In [16]:
calc_pred_proba(X, W)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

### 4. Создайте функцию calc_pred, возвращающую предсказанный класс (на вход подаются W, который уже посчитан функцией eval_model и X, на выходе - массив y_pred).

In [17]:
def calc_pred(W, X):
    m = X.shape[1]
    y_pred = np.zeros((1, m), dtype = np.int64)
    y_pred_proba = sigmoid(W.T @ X)
    
    for i in range(len(y_pred_proba)):
        if (y_pred_proba[i] > 0.5): 
            y_pred[:, i] = 1
        elif (y_pred_proba[i] <= 0.5):
            y_pred[:, i] = 0
    return y_pred

In [18]:
y_pred = calc_pred(W, X)
y_pred

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int64)

### 5. Посчитайте Accuracy, матрицу ошибок, точность и полноту, а также F1 score.

In [19]:
#Accuracy

def accuracy_(y_pred, y):
    return len(y_pred[y == y_pred]) / len(y_pred)

In [20]:
accuracy_(y_pred, y)

5.0

In [21]:
#Correlation matrix

def correlation_matrix(y_pred, y):
    TP = len(y_pred[(y == y_pred) & (y_pred == 1)])
    FN = len(y_pred[(y != y_pred) & (y_pred == 0)])
    FP = len(y_pred[(y != y_pred) & (y_pred == 1)])
    TN = len(y_pred[(y == y_pred) & (y_pred == 0)])
    corr_matrix = pd.DataFrame.from_dict(
        {'y_pred': ['a(x) = +1', 'a(x) = -1'], 
         'y = +1': [TP, FN], 
         'y = -1': [FP, TN]}).set_index('y_pred')
    return corr_matrix

In [22]:
correlation_matrix(y_pred, y)

Unnamed: 0_level_0,y = +1,y = -1
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
a(x) = +1,5,5
a(x) = -1,0,0


In [25]:
#Precision:

def precision_(y_pred, y):
    return len(y_pred[(y == y_pred) & (y_pred == 1)]) / (
        len(y_pred[(y == y_pred) & (y_pred == 1)]) + len(y_pred[(y != y_pred) & (y_pred == 1)]))

In [26]:
precision_(y_pred, y)

0.5

In [23]:
#Recall:

def recall_(y_pred, y):
    return len(y_pred[(y == y_pred) & (y_pred == 1)]) / (
        len(y_pred[(y == y_pred) & (y_pred == 1)]) + len(y_pred[(y != y_pred) & (y_pred == 0)]))

In [24]:
recall_(y_pred, y)

1.0

In [27]:
#F1-score:

def F1_(y_pred, y):
    return 2 * precision_(y_pred, y) * recall_(y_pred, y) / (precision_(y_pred, y) + recall_(y_pred, y))

In [28]:
F1_(y_pred, y)

0.6666666666666666

### 6. Могла ли модель переобучиться? Почему?

Модель могла переобучиться и переобучилась. Основная причина: мало данных и данные не разбиваются на тестовые и тренировочные.

### 7*. Создайте функции eval_model_l1 и eval_model_l2 с применением L1 и L2 регуляризаций соответственно.

In [29]:
#L1
def eval_model_L1(X, y, iterations, alpha=1e-4, lambda_=1e-7):
    np.random.seed(42)
    W = np.random.randn(X.shape[0])
    n = X.shape[1]
    for i in range(1, iterations+1):
        y_pred = sigmoid(np.dot(W, X))
        err = calc_logloss(y, y_pred)
        W -= alpha * (1/n * np.dot((y_pred - y), X.T)  + lambda_ * np.sign(W))
        if i % (iterations / 10) == 0:
            print(i, W, err)
    return W

In [30]:
eval_model_L1(X, y, iterations=1000)

100 [ 0.49569806 -0.14048654  0.04073807  1.52293299] 2.2257319691335975
200 [ 0.49526381 -0.14103981  0.0411002   1.52387308] 2.491782941583723
300 [ 0.49482628 -0.14159343  0.04120265  1.52481175] 2.931062420395471
400 [ 0.4943932  -0.14214766  0.04110748  1.52575145] 2.500867647441929
500 [ 0.49395493 -0.14270628  0.03881607  1.52668614] 1.697865967405266
600 [ 0.49352149 -0.14325529  0.04108632  1.52762938] 2.4767531319905194
700 [ 0.49307614 -0.14382578  0.03294043  1.52855277] 1.1424018521848542
800 [ 0.49264485 -0.14437748  0.03412377  1.52949502] 1.2143454962629951
900 [ 0.49221953 -0.14491686  0.04121522  1.53044826] 2.7262876755118226
1000 [ 0.49177729 -0.14547924  0.0370982   1.53137892] 1.4656478916271243


array([ 0.49177729, -0.14547924,  0.0370982 ,  1.53137892])

In [37]:
#L2

def eval_model_L2(X, y, iterations, alpha=1e-5, lambda_=1e-8):
    np.random.seed(42)
    W = np.random.randn(X.shape[0])
    n = X.shape[1]
    for i in range(1, iterations+1):
        y_pred = sigmoid(np.dot(W, X))
        err = calc_logloss(y, y_pred)
        W -= alpha * (1/n * np.dot((y_pred - y), X.T) + lambda_ * W)
        if i % (iterations / 10) == 0:
            print(i, W, err)
    return W

In [38]:
eval_model_L2(X, y, iterations=600)

60 [ 0.49641415 -0.1390443   0.37168854  1.52260986] 0.0
120 [ 0.49611415 -0.1398243   0.09568854  1.52218986] 0.0
180 [ 0.49598337 -0.14011824 -0.00414988  1.52206505] 0.8479472941023294
240 [ 0.49594801 -0.14013575 -0.00414985  1.52210757] 0.847937119868023
300 [ 0.49591265 -0.14015326 -0.00414983  1.52215009] 0.8479269462131617
360 [ 0.49587729 -0.14017077 -0.0041498   1.52219261] 0.8479167731376714
420 [ 0.49584193 -0.14018828 -0.00414978  1.52223513] 0.8479066006414764
480 [ 0.49580657 -0.14020578 -0.00414975  1.52227765] 0.8478964287245034
540 [ 0.49577121 -0.14022328 -0.00414972  1.52232017] 0.847886257386677
600 [ 0.49573584 -0.14024078 -0.0041497   1.52236269] 0.8478760866279222


array([ 0.49573584, -0.14024078, -0.0041497 ,  1.52236269])