# Логистическая регрессия. Практика



In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, roc_curve, roc_auc_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from matplotlib import pyplot as plt

In [2]:
# Suppress warnings: every warning category is ignored from this point on.
import warnings
warnings.filterwarnings("ignore")

В качестве экспериментальных данных возьмем датасет о доходах граждан в различных странах [Adult Income](https://archive.ics.uci.edu/ml/datasets/Adult).
Далее сделаем необходимую предобработку:

In [3]:
# The file is read with explicit column names, so they are listed here
# in the order in which they are assigned to the columns.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'salary',
]
adult_df = pd.read_csv('./data/adult.data', names=column_names)

In [4]:
# Print a summary of the frame: columns, non-null counts and dtypes.
adult_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [5]:
# Preview the first rows of the raw data.
adult_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


Избавиться от лишних признаков:

In [6]:
# Remove the 'native-country' column from the frame in place.
adult_df.drop(columns=['native-country'], inplace=True)

Сконвертировать целевой столбец в бинарные значения:

In [7]:
# Binary target: 0 for rows whose salary is exactly ' <=50K' (the leading
# space is part of the compared value), 1 for every other row.
is_high_income = adult_df['salary'] != ' <=50K'
adult_df['salary'] = is_high_income.astype('int32')

Сделать one-hot encoding для некоторых признаков:

In [8]:
# Categorical columns that are replaced by one-hot (dummy) columns.
categorical_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
]
adult_df = pd.get_dummies(adult_df, columns=categorical_columns)

In [9]:
# Preview the first rows after one-hot encoding.
adult_df.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,...,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Female,sex_ Male
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0


Нормализовать нуждающиеся в этом признаки:

In [10]:
# Numeric columns to standardise. The list is defined once so that the
# columns that are read and the columns that are written back cannot drift
# apart.
numeric_columns = [
    'age',
    'education-num',
    'hours-per-week',
    'fnlwgt',
    'capital-gain',
    'capital-loss',
]

a_features = adult_df[numeric_columns].values

# Z-score per column: subtract the column mean, divide by the column
# standard deviation (numpy's default, i.e. ddof=0).
norm_features = (a_features - a_features.mean(axis=0)) / a_features.std(axis=0)

# Plain column assignment replaces the columns with the float results, so the
# integer -> float dtype change does not rely on how `.loc` handles in-place
# casting (NOTE(review): newer pandas releases deprecate that implicit upcast
# — confirm against the installed version).
adult_df[numeric_columns] = norm_features

In [11]:
# Preview the first rows after normalisation.
adult_df.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,...,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Female,sex_ Male
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1,0.837109,-1.008707,1.134739,-0.14592,-0.21666,-2.222153,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
2,-0.042642,0.245079,-0.42006,-0.14592,-0.21666,-0.035429,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,1.057047,0.425801,-1.197459,-0.14592,-0.21666,-0.035429,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,-0.775768,1.408176,1.134739,-0.14592,-0.21666,-0.035429,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0


Разбить таблицу данных на матрицу признаков X и вектор целевой переменной y:

In [12]:
# Feature matrix: every column except the target, kept in the frame's own
# column order. Building the list through a set would make the order depend
# on string hashing, which is not stable between interpreter runs, so the
# columns of X could not be mapped back to features reproducibly.
feature_columns = [column for column in adult_df.columns if column != 'salary']
X = adult_df[feature_columns].values
# Target vector.
y = adult_df['salary'].values

Добавить фиктивный столбец единиц (bias линейной модели):

In [13]:
# Prepend a column of ones to X (the bias term of the linear model).
# NOTE(review): sklearn's LogisticRegression is documented to fit its own
# intercept by default — confirm that this extra column is still wanted.
ones_column = np.ones((X.shape[0], 1))
X = np.hstack([ones_column, X])
# Number of columns in X, including the column of ones.
m = X.shape[1]

### Задание

> Постройте модель логистической регрессии при помощи sklearn. Используйте параметры по умолчанию, обучите на всей выборке и посчитайте F1 score.

#### Решение:

In [14]:
# Logistic regression with default parameters, fitted on the full data set.
model = LogisticRegression()

model.fit(X, y)

LogisticRegression()

In [15]:
# Predicted class labels for the same X the model was fitted on.
y_predicted = model.predict(X)

F1 score:

In [16]:
# F1 score of the predictions against y, rounded to two decimals.
print(np.round(f1_score(y, y_predicted), 2))

0.66


### Задание

> Посчитайте confusion matrix для классификатора из предыдущей задачи. Для получения матрицы можно воспользоваться функцией `sklearn.metrics.confusion_matrix(y_true, y_pred)`.

In [17]:
# Recompute the predictions of the fitted model on X.
y_predicted = model.predict(X)

In [18]:
# Confusion matrix, called as confusion_matrix(y_true, y_pred).
print(confusion_matrix(y, y_predicted))

[[23028  1692]
 [ 3128  4713]]


### Задание

> Постройте ROC-кривую и посчитайте **ROC-AUC** для классификатора из предыдущей задачи.

In [19]:
# Re-create the default classifier and fit it again on the same X and y.
model = LogisticRegression()

model.fit(X, y)

LogisticRegression()

In [20]:
# Class probabilities for every row; column 1 is used as the score of the
# positive class (salary == 1).
y_pred_proba_pack = model.predict_proba(X)
positive_proba = y_pred_proba_pack[:, 1]

# The task asks for the ROC curve itself, not only the area under it:
# compute the false/true positive rates over all thresholds and plot them.
fpr, tpr, _ = roc_curve(y, positive_proba)
plt.plot(fpr, tpr, label='ROC curve')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linestyle='--', label='diagonal')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()

# Area under the ROC curve, rounded to two decimals. Kept as the last
# expression so that it remains the cell's displayed value.
np.round(roc_auc_score(y, positive_proba), 2)

0.91

### Задание

> Постройте модель логистической регрессии при помощи sklearn без регуляризации. Чему равен F1 score?

In [21]:
# The task asks for a model *without* regularisation. The `model` fitted
# earlier was created with default arguments (sklearn documents an L2 penalty
# as the default), so a separate estimator with the penalty switched off is
# fitted here; `model` itself is left untouched.
# NOTE(review): 'none' is the spelling accepted by scikit-learn < 1.2, the
# release line that still provides `load_boston` imported at the top of this
# notebook. Newer releases expect penalty=None — confirm against the
# installed version.
model_no_reg = LogisticRegression(penalty='none')
model_no_reg.fit(X, y)

y_pred = model_no_reg.predict(X)

# F1 score on the training data, rounded to two decimals.
np.round(f1_score(y, y_pred), 2)

0.66

### Задание

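> Переберите коэффициенты l2-регуляризации от 0.01 до 1 с шагом 0.01 и определите, на каком из них модель логистической регрессии из sklearn даёт наибольший F1 score. Обратите внимание: в sklearn регуляризация задаётся параметром `C` — величиной, обратной силе регуляризации.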

In [22]:
# Candidate values for the `C` argument: 0.01, 0.02, ... in steps of 0.01,
# with 1.01 as the exclusive upper bound.
# NOTE(review): sklearn documents `C` as the inverse of the regularisation
# strength — confirm this is what the task means by "coefficient".
c_list = np.arange(0.01, 1.01, 0.01)

# Best F1 seen so far and the C that produced it; -1 means "nothing yet".
max_f1, max_c = -1, -1

for c in c_list:
    # Fit an L2-penalised model for this C and score it on the training data.
    LRC = LogisticRegression(C=c, penalty='l2')
    LRC.fit(X, y)
    y_pred = LRC.predict(X)
    f1 = f1_score(y, y_pred)

    # Strict comparison: on ties the earliest (smallest) C is kept.
    if f1 > max_f1:
        max_f1, max_c = f1, c

print(max_c)

0.65


### Задание

> Замените в столбце `native-country` страны, у которых меньше ста записей, на `other`, поменяйте этот столбец на dummy-переменные, обучите классификатор на всей выборке и посчитайте F1 score.

In [23]:
# Re-read the raw file so that 'native-country', dropped earlier, is
# available again. Column names are passed explicitly, in file order.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'salary',
]
adult_df = pd.read_csv('./data/adult.data', names=column_names)

In [24]:
# Number of rows per country.
country_counts = adult_df['native-country'].value_counts()

# Boolean frame "country has fewer than 100 rows". The column name is set
# explicitly because the name of the Series returned by value_counts()
# differs between pandas versions ('native-country' before 2.0, 'count'
# from 2.0 on), which would break a lookup by column name.
native_countries_les_100 = (country_counts < 100).to_frame(name='native-country')

# Countries with fewer than 100 rows.
native_countries_les_100_list = country_counts[country_counts < 100].index.to_list()

# Replace every such country by the single category 'other'.
adult_df.loc[adult_df['native-country'].isin(native_countries_les_100_list), 'native-country'] = 'other'

In [25]:
# Binary target: 0 for rows whose salary is exactly ' <=50K' (the leading
# space is part of the compared value), 1 for every other row.
is_high_income = adult_df['salary'] != ' <=50K'
adult_df['salary'] = is_high_income.astype('int32')

In [26]:
# Categorical columns that are replaced by one-hot (dummy) columns; this
# time 'native-country' is encoded as well.
categorical_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]
adult_df = pd.get_dummies(adult_df, columns=categorical_columns)

In [27]:
# Numeric columns to standardise. The list is defined once so that the
# columns that are read and the columns that are written back cannot drift
# apart.
numeric_columns = [
    'age',
    'education-num',
    'hours-per-week',
    'fnlwgt',
    'capital-gain',
    'capital-loss',
]

a_features = adult_df[numeric_columns].values

# Z-score per column: subtract the column mean, divide by the column
# standard deviation (numpy's default, i.e. ddof=0).
norm_features = (a_features - a_features.mean(axis=0)) / a_features.std(axis=0)

# Plain column assignment replaces the columns with the float results, so the
# integer -> float dtype change does not rely on how `.loc` handles in-place
# casting (NOTE(review): newer pandas releases deprecate that implicit upcast
# — confirm against the installed version).
adult_df[numeric_columns] = norm_features

In [28]:
# Feature matrix: every column except the target, kept in the frame's own
# column order. Building the list through a set would make the order depend
# on string hashing, which is not stable between interpreter runs, so the
# columns of X could not be mapped back to features reproducibly.
feature_columns = [column for column in adult_df.columns if column != 'salary']
X = adult_df[feature_columns].values
# Target vector.
y = adult_df['salary'].values

In [29]:
# Prepend a column of ones to X (the bias term of the linear model).
# NOTE(review): sklearn's LogisticRegression is documented to fit its own
# intercept by default — confirm that this extra column is still wanted.
ones_column = np.ones((X.shape[0], 1))
X = np.hstack([ones_column, X])
# Number of columns in X, including the column of ones.
m = X.shape[1]

In [30]:
# Default logistic regression fitted on the feature matrix that now includes
# the 'native-country' dummy columns.
model = LogisticRegression()

model.fit(X, y)

LogisticRegression()

In [31]:
# Predict on the same X the model was fitted on.
y_pred = model.predict(X)

# F1 score of those predictions, rounded to two decimals.
np.round(f1_score(y, y_pred), 2)

0.66