# **Logistic Regression**
 ## Table of contents
>1. [Dataset](#Dataset)
>2. [Logistic Regression](#Logistic_Regression)

## Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn import metrics

## Dataset <a name="Dataset"></a>

In [2]:
X_train = pd.read_csv('./data/X_train.csv').to_numpy()
y_train = pd.read_csv('./data/y_train.csv').to_numpy()
X_test = pd.read_csv('./data/X_test.csv').to_numpy()
y_test = pd.read_csv('./data/y_test.csv').to_numpy()

In [3]:
X_train.shape

(111, 605)

## Logistic Regression <a name="Logistic_Regression"></a>

In [4]:
def logistic_regression(X_train, y_train, X_test, y_test, ratio):  
    # Apply scale datasets
    # create model scale ==> standard scaler
    scaler = StandardScaler()   
    # fit dataset to model scale
    X_train_scl = scaler.fit_transform(X_train)
    X_test_scl = scaler.fit_transform(X_test)

    # convert matrix 2d -> 1d (by flatten method)    
    y_train = y_train.ravel()
    y_test = y_test.ravel()

    clf = LogisticRegression(random_state=0, penalty='elasticnet', l1_ratio=ratio,solver="saga" , n_jobs = -1)
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    y_score = clf.predict_proba(X_test)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score[:,1], pos_label=2)
    auc = metrics.auc(fpr, tpr)
    
    accuracy = metrics.accuracy_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred).ravel()
    confusion = (tn, fp, fn, tp)
    
    return clf, fpr, tpr, thresholds, auc, accuracy, recall, confusion

In [5]:
ratio_arr = []
n_feature_to_select_arr = []
auc_arr = []
accuracy_arr = []
recall_arr = []
for i in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
    clf, fpr, tpr, thresholds, auc, accuracy, recall,confusion = logistic_regression(X_train, y_train, X_test, y_test, i)
    features_index = np.arange(0, X_train.shape[1], 1, dtype=int)
    coef = clf.coef_[0]
    # find new feature 
    count = 0
    for k in range(0, X_train.shape[1]-1, 1):
        if(coef[k] == 0):
            count = count + 1
    n_feature_to_select = len(features_index)- count  
    
    ratio_arr.append(i)
    n_feature_to_select_arr.append(n_feature_to_select)
    auc_arr.append(auc)
    accuracy_arr.append(accuracy)
    recall_arr.append(recall)
#     print("l1_ratio = ", i)
#     print("auc = ", auc)
#     print("n_feature_to_select = ", len(features_index)- count)



In [6]:
print(ratio_arr)

[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]


In [7]:
print(n_feature_to_select_arr)

[476, 428, 307, 274, 222, 204, 144, 133, 123, 97]


In [8]:
print(auc_arr)

[0.40625, 0.41145833333333337, 0.421875, 0.42708333333333337, 0.4375, 0.4322916666666667, 0.4427083333333333, 0.4427083333333333, 0.453125, 0.4583333333333333]


In [9]:
print(accuracy_arr)

[0.4642857142857143, 0.5, 0.5, 0.4642857142857143, 0.4642857142857143, 0.4642857142857143, 0.5, 0.5357142857142857, 0.5357142857142857, 0.5357142857142857]


In [10]:
print(recall_arr)

[0.3333333333333333, 0.4166666666666667, 0.4166666666666667, 0.4166666666666667, 0.4166666666666667, 0.4166666666666667, 0.5, 0.5, 0.5, 0.5]


In [11]:
# l1_ratio =  1
clf, fpr, tpr, thresholds, auc, accuracy, recall, confusion = logistic_regression(X_train, y_train, X_test, y_test, 1)



In [12]:
coef = clf.coef_[0]
features_index = np.arange(0, X_train.shape[1], 1, dtype=int)
for i in range(0, X_train.shape[1]-1, 1):
    if(coef[i] == 0):
        index = np.argwhere(features_index==i)
        features_index = np.delete(features_index,index)

In [13]:
len(features_index)

97

In [14]:
features_index

array([  7,   8,  15,  26,  30,  31,  34,  38,  41,  46,  54,  68,  69,
        72,  78, 103, 106, 107, 108, 110, 112, 114, 115, 122, 124, 125,
       129, 132, 153, 154, 155, 157, 158, 162, 172, 214, 221, 224, 226,
       230, 232, 233, 234, 236, 237, 239, 240, 241, 243, 247, 250, 266,
       268, 269, 272, 279, 306, 314, 317, 324, 326, 327, 335, 336, 342,
       358, 380, 401, 406, 407, 421, 434, 435, 439, 440, 445, 458, 480,
       483, 485, 486, 492, 506, 515, 516, 517, 546, 547, 550, 551, 553,
       564, 567, 574, 575, 582, 604])

In [15]:
confusion

(6, 6, 7, 9)

In [16]:
tpr

array([0.    , 0.0625, 0.0625, 0.3125, 0.3125, 0.375 , 0.375 , 0.5   ,
       0.5   , 0.625 , 0.625 , 0.75  , 0.75  , 1.    ])

In [17]:
print(fpr.tolist())

[0.0, 0.0, 0.25, 0.25, 0.3333333333333333, 0.3333333333333333, 0.4166666666666667, 0.4166666666666667, 0.5, 0.5, 0.75, 0.75, 1.0, 1.0]


In [18]:
fpr

array([0.        , 0.        , 0.25      , 0.25      , 0.33333333,
       0.33333333, 0.41666667, 0.41666667, 0.5       , 0.5       ,
       0.75      , 0.75      , 1.        , 1.        ])