# Logistic Regression 

In [123]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from lib.feature_selection import feature_reduction
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

npf = pd.read_csv("data/npf_train.csv")
npf.iloc[:,5:]

data = feature_reduction(npf)
X = data.iloc[:,2:]
y2 = data['class2']
y4 = data['class4']
X_norm = (X - X.min())/(X.max()-X.min())
X_all = npf.iloc[:,5:]

# define dataset
X_train, X_test, y_train, y_test = train_test_split(
    X_norm, y2, stratify = y2,test_size=0.33, random_state=42)

In [124]:

# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear','sag','saga']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01,0.001]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.870538 using {'C': 100, 'penalty': 'l2', 'solver': 'sag'}
0.864050 (0.060183) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.864050 (0.060183) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.864050 (0.060183) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.870538 (0.055204) with: {'C': 100, 'penalty': 'l2', 'solver': 'sag'}
0.869427 (0.054256) with: {'C': 100, 'penalty': 'l2', 'solver': 'saga'}
0.860681 (0.059690) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.860681 (0.059690) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.861792 (0.059480) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.858530 (0.059861) with: {'C': 10, 'penalty': 'l2', 'solver': 'sag'}
0.859642 (0.059692) with: {'C': 10, 'penalty': 'l2', 'solver': 'saga'}
0.857276 (0.052943) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.857276 (0.052943) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.857276 (0.052943) with: {'C': 1.0, 'penalty': '



In [125]:

clf =LogisticRegression(C=100,penalty='l2',solver='sag',random_state=1,max_iter=1000).fit(X_train, y_train)

from sklearn.metrics import accuracy_score
accuracy_score(clf.predict(X_test),y_test)

0.8223684210526315

In [126]:
X_train, X_test, y_train, y_test = train_test_split(
    X_norm, y4, stratify = y4,test_size=0.33, random_state=42)

# define models and parameters
model = LogisticRegression()
solvers = ['liblinear','saga']
penalty = ['l1']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.643871 using {'C': 1.0, 'penalty': 'l1', 'solver': 'saga'}
0.610179 (0.079188) with: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
0.592652 (0.074000) with: {'C': 100, 'penalty': 'l1', 'solver': 'saga'}
0.609068 (0.065555) with: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
0.611111 (0.070406) with: {'C': 10, 'penalty': 'l1', 'solver': 'saga'}
0.629642 (0.053354) with: {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
0.643871 (0.047035) with: {'C': 1.0, 'penalty': 'l1', 'solver': 'saga'}
0.591577 (0.044042) with: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
0.573118 (0.041230) with: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
0.254839 (0.011257) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
0.500323 (0.022121) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'saga'}




In [127]:

# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs','sag','saga']
penalty = ['none']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.590502 using {'C': 10, 'penalty': 'none', 'solver': 'saga'}
0.571111 (0.085374) with: {'C': 100, 'penalty': 'none', 'solver': 'newton-cg'}
0.581971 (0.067817) with: {'C': 100, 'penalty': 'none', 'solver': 'lbfgs'}
0.585233 (0.063554) with: {'C': 100, 'penalty': 'none', 'solver': 'sag'}
0.589427 (0.075669) with: {'C': 100, 'penalty': 'none', 'solver': 'saga'}
0.571111 (0.085374) with: {'C': 10, 'penalty': 'none', 'solver': 'newton-cg'}
0.581971 (0.067817) with: {'C': 10, 'penalty': 'none', 'solver': 'lbfgs'}
0.584158 (0.066137) with: {'C': 10, 'penalty': 'none', 'solver': 'sag'}
0.590502 (0.075766) with: {'C': 10, 'penalty': 'none', 'solver': 'saga'}
0.571111 (0.085374) with: {'C': 1.0, 'penalty': 'none', 'solver': 'newton-cg'}
0.581971 (0.067817) with: {'C': 1.0, 'penalty': 'none', 'solver': 'lbfgs'}
0.588530 (0.065921) with: {'C': 1.0, 'penalty': 'none', 'solver': 'sag'}
0.590502 (0.075766) with: {'C': 1.0, 'penalty': 'none', 'solver': 'saga'}
0.571111 (0.085374) with: {'C': 0



In [128]:

# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs','sag','saga']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.625305 using {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.586201 (0.062897) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.593907 (0.066549) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.587312 (0.072189) with: {'C': 100, 'penalty': 'l2', 'solver': 'sag'}
0.592616 (0.065908) with: {'C': 100, 'penalty': 'l2', 'solver': 'saga'}
0.612222 (0.068454) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.611183 (0.067511) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.612222 (0.065871) with: {'C': 10, 'penalty': 'l2', 'solver': 'sag'}
0.610036 (0.069694) with: {'C': 10, 'penalty': 'l2', 'solver': 'saga'}
0.625305 (0.049059) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.625305 (0.049059) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.625305 (0.049059) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'sag'}
0.625305 (0.049059) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'saga'}
0.599283 (0.051598) with: {'C': 0.1, 'penalty': 'l2',

In [129]:
clf =LogisticRegression(C=1,penalty='l1',solver='saga',random_state=1,max_iter=1000).fit(X_train, y_train)

from sklearn.metrics import accuracy_score
accuracy_score(clf.predict(X_test),y_test)

0.6513157894736842