In [20]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, f1_score, precision_score, recall_score, confusion_matrix

In [2]:
cancer = load_breast_cancer()

In [3]:
X = cancer.data           #features
y = cancer.target         #target attributes

In [4]:
df_x = pd.DataFrame(X, columns = cancer.feature_names)
df_y = pd.Series(y)

In [13]:
#train_test_datasets split
train_x, test_x, train_y, test_y = train_test_split(df_x, df_y, test_size = 0.2, random_state = 42)

In [6]:
log_regr = LogisticRegression(max_iter = 10000) 

param = [{
    'C' : [0.001, 0.01, 0.1, 1, 10, 100],                 #for differnt values of C 
    'multi_class' : ['auto', 'ovr', 'multinomial'],       #for different logistic regression types
    }]


In [7]:
log_reg_grid = GridSearchCV(log_regr, param, cv = 5, scoring = 'accuracy')

In [8]:
log_reg_grid.fit(df_x, df_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=10000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [0.001, 0.01, 0.1, 1, 10, 100],
                          'multi_class': ['auto', 'ovr', 'multinomial']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [9]:
#best parameter
log_reg_grid.best_params_

{'C': 100, 'multi_class': 'multinomial'}

In [10]:
#best score
log_reg_grid.best_score_

0.9613569321533924

In [11]:
#training the best estimator

best_model = log_reg_grid.best_estimator_

In [14]:
best_model.fit(train_x, train_y)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
best_pred = best_model.predict(test_x)

In [26]:
#Accuracy for validation set

round(accuracy_score(test_y, best_pred), 4)

0.9825

In [16]:
#Accuracy for training set

round(accuracy_score(train_y, best_model.predict(train_x)), 4)

0.9846

In [19]:
#For logistic Regression, We calculate log_loss

print(log_loss(test_y, best_pred))
print('Not too bad!')

0.6059574735498409
Not too bad!


In [22]:
#Confusion Matrix

confusion_matrix(test_y, best_pred)

array([[41,  2],
       [ 0, 71]], dtype=int64)

In [23]:
#Precision Score
#precision = TP/(TP + FP)

precision_score(test_y, best_pred)

0.9726027397260274

In [24]:
#Recall score
#recall = TP/(TP + FN)

recall_score(test_y, best_pred)

1.0

In [25]:
#f1_score = 2*(precision * recall)/(precision + recall)
f1_score(test_y, best_pred)

0.9861111111111112

## These are some of the basic analysis in Logistic Regression Model

## THANK YOU!