# Логистическая регрессия в бинарной классификации

In [1]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
%matplotlib inline

## Подготовка даннных

Данные по заявкам на грант ученых. Содержит 6000 записей, 38 признаков. 
Источник: https://www.kaggle.com/c/unimelb

Будем предсказывать, одобрят ли заявку на грант.

In [2]:
raw_data = pd.read_csv('data.csv')
print(raw_data.shape)
raw_data.head()

(6000, 39)


Unnamed: 0,Grant.Status,Sponsor.Code,Grant.Category.Code,Contract.Value.Band...see.note.A,RFCD.Code.1,RFCD.Percentage.1,RFCD.Code.2,RFCD.Percentage.2,RFCD.Code.3,RFCD.Percentage.3,...,Dept.No..1,Faculty.No..1,With.PHD.1,No..of.Years.in.Uni.at.Time.of.Grant.1,Number.of.Successful.Grant.1,Number.of.Unsuccessful.Grant.1,A..1,A.1,B.1,C.1
0,1,21A,50A,A,230202.0,50.0,230203.0,30.0,230204.0,20.0,...,3098.0,31.0,Yes,>=0 to 5,2.0,0.0,0.0,4.0,2.0,0.0
1,1,4D,10A,D,320801.0,100.0,0.0,0.0,0.0,0.0,...,2553.0,25.0,Yes,>=0 to 5,3.0,1.0,0.0,2.0,0.0,0.0
2,0,,,,320602.0,50.0,321004.0,30.0,321015.0,20.0,...,2813.0,25.0,,Less than 0,1.0,5.0,0.0,7.0,2.0,0.0
3,0,51C,20C,A,291503.0,60.0,321402.0,40.0,0.0,0.0,...,2553.0,25.0,,more than 15,2.0,1.0,5.0,6.0,9.0,1.0
4,0,24D,30B,,380107.0,100.0,0.0,0.0,0.0,0.0,...,2923.0,25.0,,Less than 0,0.0,2.0,0.0,0.0,0.0,0.0


Видно, что есть не только численные данные. Так же есть пропущенные значения, их нужно обработать.

Выделим целевой признак:

In [3]:
X = raw_data.drop('Grant.Status', axis=1)
y = raw_data['Grant.Status']

Разобьем выборку на две части: в одной будут вещетсвенные признаки, в другой - категориальные:

In [4]:
numeric_columns = ['RFCD.Percentage.1', 'RFCD.Percentage.2', 'RFCD.Percentage.3', 
                   'RFCD.Percentage.4', 'RFCD.Percentage.5',
                   'SEO.Percentage.1', 'SEO.Percentage.2', 'SEO.Percentage.3',
                   'SEO.Percentage.4', 'SEO.Percentage.5',
                   'Year.of.Birth.1', 'Number.of.Successful.Grant.1', 'Number.of.Unsuccessful.Grant.1']
categorical_columns = list(set(X.columns.values.tolist()) - set(numeric_columns))

Для того, чтобы использовать логистическую регрессию, нужно представить все признаки вещественными числами.
Заменим пропущенные значения: у числовых - средним и нулевым, у категориальных - строкой 'NaN'.
Затем применим one-hot enocding к категориальным признакам.

In [5]:
X_real_mean = X[numeric_columns].fillna(X[numeric_columns].mean())
X_real_zero = X[numeric_columns].fillna(0.0)
X_categorical = X[categorical_columns].fillna('NA').astype(str)

Так как кодирование категориальных признаков не считает никаких параметров на данных, можно применить его сразу ко всему датасету:

In [6]:
from sklearn.feature_extraction import DictVectorizer as DV
encoder = DV(sparse=False)
X_categorical_encod = encoder.fit_transform(X_categorical.T.to_dict().values())

Разобьем выборку на обучающую и тестовую:

In [7]:
from sklearn.model_selection import train_test_split
X_train_real_mean, X_test_real_mean, y_train, y_test = train_test_split(X_real_mean, y, 
                                                                        test_size=0.3, random_state=0)
X_train_real_zero, X_test_real_zero = train_test_split(X_real_zero, test_size=0.3, random_state=0)
X_train_cat_encd, X_test_cat_encd = train_test_split(X_categorical_encod,
                                                                     test_size=0.3, random_state=0)

### Выбор способа заполенния пропущенных значений

Подобрав параметры по сетке (с помощью кросс-валидации по 3 фолдам), обучим линейную модель. На кросс-валидации будем использовать accuracy для ускорения вычислений. Выберем с помощью метрики качества AUC ROC наиболее успешный способ замены пропущенных данных.

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

regressor = LogisticRegression()
param_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
cv = 3

X_train_zero_cat = np.hstack((X_train_real_zero, X_train_cat_encd))
X_test_zero_cat = np.hstack((X_test_real_zero, X_test_cat_encd))
X_train_mean_cat = np.hstack((X_train_real_mean, X_train_cat_encd))
X_test_mean_cat = np.hstack((X_test_real_mean, X_test_cat_encd))

grid_cv = GridSearchCV(regressor, param_grid=param_grid, cv=cv, scoring='accuracy')
grid_cv.fit(X_train_zero_cat, y_train)
zeros_auc_score = roc_auc_score(y_test, grid_cv.best_estimator_.predict(X_test_zero_cat))

grid_cv.fit(X_train_mean_cat, y_train)
means_auc_score = roc_auc_score(y_test, grid_cv.best_estimator_.predict(X_test_mean_cat))

print("Zeros score: {0}\n Means score{1}".format(zeros_auc_score, means_auc_score))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Zeros score: 0.7427423835397098
 Means score0.7437514771373173


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

def plot_scores(optimizer):
    scores=[]
    for i in range(len(optimizer.cv_results_['params'])):
        scores.append([optimizer.cv_results_['params'][i]['C'], 
                optimizer.cv_results_['mean_test_score'][i],
                optimizer.cv_results_['std_test_score'][i]])
    scores = np.array(scores)
    plt.semilogx(scores[:,0], scores[:,1])
    plt.fill_between(scores[:,0], scores[:,1]-scores[:,2], 
                                  scores[:,1]+scores[:,2], alpha=0.3)
    plt.show()
    
def write_answer_1(auc_1, auc_2):
    auc = (auc_1 + auc_2)/2
    with open("preprocessing_lr_answer1.txt", "w") as fout:
        fout.write(str(auc))
        
param_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
cv = 3
estimator = LogisticRegression()

X_zeros_cat = np.hstack((X_train_real_zero, X_train_cat_encd))
X_zeros_cat_test = np.hstack((X_test_real_zero, X_test_cat_encd))
X_means_cat = np.hstack((X_train_real_mean, X_train_cat_encd))
X_means_cat_test = np.hstack((X_test_real_mean, X_test_cat_encd))

grid_cv = GridSearchCV(estimator, param_grid=param_grid, scoring='accuracy', cv=cv)
grid_cv.fit(X_zeros_cat, y_train)
plot_scores(grid_cv)

zeros_auc = roc_auc_score(y_test, grid_cv.best_estimator_.predict(X_zeros_cat_test))

grid_cv = GridSearchCV(estimator, param_grid=param_grid, scoring='accuracy', cv=cv)
grid_cv.fit(X_means_cat, y_train)
plot_scores(grid_cv)
means_auc = roc_auc_score(y_test, grid_cv.best_estimator_.predict(X_means_cat_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 