## Prediction with Logistic Regression

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,KFold
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,\
recall_score,roc_curve,auc

import expectation_reflection as ER
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from function import split_train_test,make_data_balance

In [2]:
# Seed NumPy's global random number generator so that any later step relying
# on it gives the same result each time the notebook is run from a fresh kernel.
np.random.seed(1)

First, we choose which dataset(s) to analyze and load the corresponding processed data.

In [3]:
# Names of the datasets to process; each name is used as a sub-directory of
# ../data when the processed file is loaded.
data_list = ['1paradox']
# Alternative: read the full list of dataset names from a text file instead.
#data_list = np.loadtxt('data_list.txt',dtype='str')

print(data_list)

['1paradox']


In [4]:
def read_data(data_id):
    """Load one processed dataset and return a scaled train/test split.

    Parameters
    ----------
    data_id : int
        Position of the dataset name inside the module-level ``data_list``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Features and labels of a 50/50 split. The features are min-max
        scaled with a scaler that is fitted on the training half only.

    Notes
    -----
    The file ``../data/<name>/data_processed.dat`` is read as a numeric
    table whose last column is the label. Class counts are printed before
    and after ``make_data_balance``, and the rows are shuffled with a fixed
    seed ahead of the split.
    """
    name = data_list[data_id]
    print('data_name:',name)

    table = np.loadtxt('../data/%s/data_processed.dat'%name)
    features, labels = table[:,:-1], table[:,-1]
    print(np.unique(labels,return_counts=True))

    # Rebalance the classes with the project helper, then report the new counts.
    features, labels = make_data_balance(features,labels)
    print(np.unique(labels,return_counts=True))

    features, labels = shuffle(features, labels, random_state=1)
    X_train,X_test,y_train,y_test = train_test_split(
        features, labels, test_size=0.5, random_state=1)

    # Learn the scaling range from the training half only, then apply it to
    # both halves so no information from the test half leaks into the scaler.
    scaler = MinMaxScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test), y_train, y_test

In [5]:
# Index into data_list of the dataset to analyse (0 selects the first entry).
data_id = 0
# Load that dataset and get the scaled train/test features and labels.
X_train,X_test,y_train,y_test = read_data(data_id)

data_name: 1paradox
(array([0., 1.]), array([169,  60]))
(array([0., 1.]), array([60, 60]))


In [6]:
# Estimator: a linear classifier trained by stochastic gradient descent.
# loss='log' makes it a logistic regression; loss='hinge' would make it an SVM.
# (Earlier experiments used LogisticRegression with a saga solver and a C grid
# of np.logspace(0, 4, 10); those variants are no longer part of this cell.)
# NOTE(review): newer scikit-learn releases name this loss 'log_loss' —
# confirm against the installed version before upgrading.
model = SGDClassifier(loss='log', max_iter=1000, tol=0.001)

# Regularization penalty: only the elastic net is searched.
penalty = ['elasticnet']

# Overall regularization strength.
alpha = [0.001, 0.01, 0.1, 1.0, 10., 100.]

# Elastic-net mixing weight between the L2 (0.0) and L1 (1.0) penalties.
l1_ratio = [0., 0.2, 0.4, 0.6, 0.8, 1.0]

# Hyper-parameter grid handed to the grid search.
hyper_parameters = {
    'penalty': penalty,
    'alpha': alpha,
    'l1_ratio': l1_ratio,
}

In [7]:
# Create grid search using 3-fold cross validation.
# The `iid` argument is deliberately not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises a TypeError.
# On releases that still accepted it, omitting it selects the same default.
clf = GridSearchCV(model, hyper_parameters, cv=3)

In [8]:
# Fit grid search
# The search is run on the training split only; the test split is not used here.
# The returned object is what later cells query through `.best_estimator_`.
best_model = clf.fit(X_train, y_train)

In [9]:
# Report the hyper-parameters of the winning estimator.
# The penalty is not printed because the grid only contains 'elasticnet'.
best_params = best_model.best_estimator_.get_params()
print('Best alpha:', best_params['alpha'])
print('Best l1_ratio:', best_params['l1_ratio'])

Best alpha: 0.1
Best l1_ratio: 0.8
