In [1]:
import warnings
# Silence every warning category for the rest of the session so the cell
# outputs below stay short.
# NOTE(review): a blanket "ignore" also hides any deprecation or convergence
# warnings that later cells might raise — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
# Load the breast-cancer dataset as a (features, labels) pair.
from sklearn.datasets import load_breast_cancer
# return_X_y is passed by keyword: recent scikit-learn releases only accept it
# as a keyword argument, and the keyword form also works on older releases.
X, y = load_breast_cancer(return_X_y=True)

In [3]:
# Standardise the features; gradient descent converges faster on scaled inputs.
# NOTE(review): the scaler is fitted on the full dataset here, before the
# train/test split performed in a later cell, so held-out rows contribute to
# the scaling statistics.
import pandas as pd
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X)
X_scaled = pd.DataFrame(sc.transform(X))
# Wrap the labels in a Series so they can be inspected with pandas helpers.
y = pd.Series(y)

In [4]:
# Peek at the first five labels.
y.head()

0    0
1    0
2    0
3    0
4    0
dtype: int32

In [5]:
# Peek at the first five rows of the standardised feature table.
X_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,1.097064,-2.073335,1.269934,0.984375,1.568466,3.283515,2.652874,2.532475,2.217515,2.255747,...,1.88669,-1.359293,2.303601,2.001237,1.307686,2.616665,2.109526,2.296076,2.750622,1.937015
1,1.829821,-0.353632,1.685955,1.908708,-0.826962,-0.487072,-0.023846,0.548144,0.001392,-0.868652,...,1.805927,-0.369203,1.535126,1.890489,-0.375612,-0.430444,-0.146749,1.087084,-0.24389,0.28119
2,1.579888,0.456187,1.566503,1.558884,0.94221,1.052926,1.363478,2.037231,0.939685,-0.398008,...,1.51187,-0.023974,1.347475,1.456285,0.527407,1.082932,0.854974,1.955,1.152255,0.201391
3,-0.768909,0.253732,-0.592687,-0.764464,3.283553,3.402909,1.915897,1.451707,2.867383,4.910919,...,-0.281464,0.133984,-0.249939,-0.550021,3.394275,3.893397,1.989588,2.175786,6.046041,4.93501
4,1.750297,-1.151816,1.776573,1.826229,0.280372,0.53934,1.371011,1.428493,-0.00956,-0.56245,...,1.298575,-1.46677,1.338539,1.220724,0.220556,-0.313395,0.613179,0.729259,-0.868353,-0.3971


In [6]:
#Gradient descent can be used in different models that have a loss function
from sklearn.linear_model import SGDClassifier
# All three models share the same constant learning rate, so only the loss
# function differs between them. random_state is fixed (same seed as the
# grid-search estimator further down) so the cross-validation scores are
# reproducible from run to run; without it SGD shuffles differently each time.
svm = SGDClassifier(loss='hinge', learning_rate='constant', eta0=0.5, random_state=7)
perc = SGDClassifier(loss='perceptron', learning_rate='constant', eta0=0.5, random_state=7)
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss'; 'log'
# is kept to match the library version this notebook was recorded with.
logreg = SGDClassifier(loss='log', learning_rate='constant', eta0=0.5, random_state=7)

In [7]:
# Compare the mean 4-fold cross-validation score of the three models.
from sklearn.model_selection import cross_val_score
for label, clf in (("svm", svm),
                   ("perceptron", perc),
                   ("logistic regression", logreg)):
    fold_scores = cross_val_score(clf, X_scaled, y, cv=4)
    print(label + "'s 4-fold score:", fold_scores.mean())

svm's 4-fold score: 0.964837978922486
perceptron's 4-fold score: 0.9630897271742342
logistic regression's 4-fold score: 0.9595809120457007


Hyper-parameter tuning using GridSearchCV

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

Homework: Apply GridSearchCV and find the best model for this dataset

In [8]:
# Hold out a test set; the fixed seed keeps the split identical across runs.
from sklearn.model_selection import train_test_split, GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    random_state=0,
)

In [9]:
# Candidate hyper-parameter values for the search. The grid is deliberately
# non-exhaustive: it is meant to give an idea of which values / ranges of
# values produce a better model to test or deploy.
params = {
    'loss': ['hinge', 'log', 'perceptron'],
    'alpha': [0.001, 0.01, 0.1, 1],
    'l1_ratio': [0, 0.5, 1],
    'learning_rate': ['constant', 'invscaling', 'optimal', 'adaptive'],
    'eta0': [0.001, 0.01, 0.1, 1],
    'power_t': [0.5, 1, 2],
}
# Search the grid with 4-fold cross-validation on the training split only;
# the elastic-net penalty and the seed are fixed for every candidate.
grid_model = GridSearchCV(
    estimator=SGDClassifier(penalty='elasticnet', random_state=7),
    param_grid=params,
    cv=4,
)
grid_model.fit(X_train, y_train)

GridSearchCV(cv=4, error_score=nan,
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='elasticnet', power_t=0.5,
                                     random_state=7, shuffle=True, tol=0.001,
                                     validation_fraction=0.1, verbose=0,
                                     warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [0.001, 0.01, 0.1, 1],
                         'eta0': [0.001, 0.01, 0.1, 1], 'l1_ratio': [0, 0.5, 1],
                         'learning_rate': ['constant', 'invscaling

In [10]:
#Best parameter settings out of the ones we provided in the param_grid
# Evaluates to a dict keyed by the parameter names used in `params`.
grid_model.best_params_

{'alpha': 0.001,
 'eta0': 1,
 'l1_ratio': 0,
 'learning_rate': 'invscaling',
 'loss': 'hinge',
 'power_t': 0.5}

In [13]:
#K-fold score within the training set of the best model
# This comes from the cv=4 search fitted on X_train / y_train, so the test
# split plays no part in it.
grid_model.best_score_

0.985959266443308

In [25]:
#let's create a model to test
# The assignment does not copy: `model` and `grid_model.best_estimator_`
# name the same estimator object.
model = grid_model.best_estimator_

In [26]:
#training on the same set
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ should
# already be fitted on X_train / y_train, so this call looks redundant —
# confirm before removing. fit() returns the estimator, which is what the
# cell displays.
model.fit(X_train,y_train)

SGDClassifier(alpha=0.001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=1, fit_intercept=True,
              l1_ratio=0, learning_rate='invscaling', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None,
              penalty='elasticnet', power_t=0.5, random_state=7, shuffle=True,
              tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [27]:
#Test score
# Score of the selected model on the held-out test split.
model.score(X_test,y_test)

0.958041958041958

In [11]:
#this score is available from the gridsearch result as well
# The recorded output is identical to the value returned by model.score on
# the same test split.
grid_model.score(X_test,y_test)

0.958041958041958

In [28]:
#the best linear model's internal attributes (coefficient)
# The recorded output is a 1 x 30 array: one weight per input feature column.
model.coef_

array([[-0.60542523, -1.00908524, -0.58855176, -0.55234968, -0.3747195 ,
         0.21625396, -0.73399178, -0.88468022, -0.86472191,  0.76518157,
        -1.44124182,  0.00259349, -1.01940038, -1.01473144,  0.61505159,
         0.91271862,  0.22891639, -0.6290292 ,  0.30912311,  1.36839691,
        -0.9062199 , -1.14978527, -0.81175239, -0.79210491, -0.84479946,
        -0.6697836 , -0.95133703, -1.40036032, -0.42834154, -0.07212208]])

In [29]:
#model's intercept
# The recorded output is a one-element array holding the bias term.
model.intercept_

array([0.89908697])