# **Logistic Regression**
 ## Table of contents
>1. [Dataset](#Dataset)
>2. [Logistic Regression](#Logistic_Regression)

## Library

In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn import metrics

## Dataset <a name="Dataset"></a>

In [17]:
X_train = pd.read_csv('./data/X_train.csv').to_numpy()[1:,1:]
y_train = pd.read_csv('./data/y_train.csv').to_numpy()[1:,1:].astype('int')
X_test = pd.read_csv('./data/X_test.csv').to_numpy()[1:,1:]
y_test = pd.read_csv('./data/y_test.csv').to_numpy()[1:,1:].astype('int')

In [18]:
X_train

array([[0, 0, 0, ..., 1, 1, 2],
       [0, 0, 0, ..., 1, 1, 1],
       [0, 1, 1, ..., 0, 2, 1],
       ...,
       [1, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 1, 2],
       [0, 0, 0, ..., 0, 1, 2]], dtype=object)

In [19]:
y_train

array([[2],
       [2],
       [2],
       [2],
       [1],
       [2],
       [2],
       [2],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [1],
       [1],
       [1],
       [1],
       [2],
       [2],
       [2],
       [2],
       [1],
       [2],
       [2],
       [1],
       [1],
       [2],
       [2],
       [1],
       [1],
       [2],
       [2],
       [1],
       [2],
       [1],
       [1],
       [2],
       [2],
       [1],
       [2],
       [2],
       [1],
       [2],
       [2],
       [1],
       [1],
       [2],
       [2],
       [2],
       [1],
       [1],
       [1],
       [2],
       [2],
       [1],
       [2],
       [2],
       [2],
       [2],
       [1],
       [1],
       [1],
       [2],
       [2],
       [1],
       [2],
       [2],
       [2],
       [1],
       [1],
       [2],
       [2],
       [1],
       [2],
       [2],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [2],
    

## Logistic Regression <a name="Logistic_Regression"></a>

In [20]:
def logistic_regression(X_train, y_train, X_test, y_test, ratio):  
    # Apply scale datasets
    # create model scale ==> standard scaler
    scaler = StandardScaler()   
    # fit dataset to model scale
    X_train_scl = scaler.fit_transform(X_train)
    X_test_scl = scaler.fit_transform(X_test)

    # convert matrix 2d -> 1d (by flatten method)    
    y_train = y_train.ravel()
    y_test = y_test.ravel()

    clf = LogisticRegression(random_state=0, penalty='elasticnet', l1_ratio=ratio,solver="saga" , n_jobs = -1)
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    y_score = clf.predict_proba(X_test)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score[:,1], pos_label=2)
    auc = metrics.auc(fpr, tpr)
    
    accuracy = metrics.accuracy_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred).ravel()
    confusion = (tn, fp, fn, tp)
    
    return clf, fpr, tpr, thresholds, auc, accuracy, recall, confusion

In [21]:
ratio_arr = []
n_feature_to_select_arr = []
auc_arr = []
accuracy_arr = []
recall_arr = []
for i in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
    clf, fpr, tpr, thresholds, auc, accuracy, recall,confusion = logistic_regression(X_train, y_train, X_test, y_test, i)
    features_index = np.arange(0, X_train.shape[1], 1, dtype=int)
    coef = clf.coef_[0]
    # find new feature 
    count = 0
    for k in range(0, X_train.shape[1]-1, 1):
        if(coef[k] == 0):
            count = count + 1
    n_feature_to_select = len(features_index)- count  
    
    ratio_arr.append(i)
    n_feature_to_select_arr.append(n_feature_to_select)
    auc_arr.append(auc)
    accuracy_arr.append(accuracy)
    recall_arr.append(recall)
#     print("l1_ratio = ", i)
#     print("auc = ", auc)
#     print("n_feature_to_select = ", len(features_index)- count)



In [22]:
print(ratio_arr)

[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]


In [23]:
print(n_feature_to_select_arr)

[122, 116, 110, 94, 93, 90, 87, 77, 70, 69]


In [24]:
print(auc_arr)

[0.44318181818181823, 0.44318181818181823, 0.4545454545454546, 0.4659090909090909, 0.47159090909090906, 0.4829545454545454, 0.4829545454545454, 0.4829545454545454, 0.48863636363636365, 0.47159090909090906]


In [25]:
print(accuracy_arr)

[0.4444444444444444, 0.48148148148148145, 0.5185185185185185, 0.5555555555555556, 0.5555555555555556, 0.5555555555555556, 0.5555555555555556, 0.5555555555555556, 0.5555555555555556, 0.5925925925925926]


In [26]:
print(recall_arr)

[0.36363636363636365, 0.45454545454545453, 0.45454545454545453, 0.5454545454545454, 0.5454545454545454, 0.5454545454545454, 0.5454545454545454, 0.5454545454545454, 0.5454545454545454, 0.5454545454545454]


In [27]:
# l1_ratio =  1
clf, fpr, tpr, thresholds, auc, accuracy, recall, confusion = logistic_regression(X_train, y_train, X_test, y_test, 1)



In [28]:
coef = clf.coef_[0]
features_index = np.arange(0, X_train.shape[1], 1, dtype=int)
for i in range(0, X_train.shape[1]-1, 1):
    if(coef[i] == 0):
        index = np.argwhere(features_index==i)
        features_index = np.delete(features_index,index)

In [29]:
len(features_index)

69

In [30]:
features_index

array([  8,   9,  10,  11,  13,  14,  15,  16,  27,  30,  31,  32,  33,
        34,  35,  37,  39,  40,  41,  44,  46,  47,  48,  49,  51,  52,
        53,  54,  55,  56,  57,  59,  60,  61,  63,  65,  67,  68,  72,
        73,  74,  75,  77,  78,  80,  82,  83,  86,  89,  90,  91,  93,
        96,  97,  98, 100, 101, 113, 115, 116, 117, 118, 119, 120, 121,
       127, 131, 134, 135])

In [31]:
confusion

(6, 5, 6, 10)

In [32]:
tpr

array([0.    , 0.    , 0.0625, 0.0625, 0.125 , 0.125 , 0.25  , 0.25  ,
       0.625 , 0.625 , 0.875 , 0.875 , 1.    ])

In [33]:
print(fpr.tolist())

[0.0, 0.09090909090909091, 0.09090909090909091, 0.18181818181818182, 0.18181818181818182, 0.2727272727272727, 0.2727272727272727, 0.45454545454545453, 0.45454545454545453, 0.7272727272727273, 0.7272727272727273, 1.0, 1.0]


In [34]:
fpr

array([0.        , 0.09090909, 0.09090909, 0.18181818, 0.18181818,
       0.27272727, 0.27272727, 0.45454545, 0.45454545, 0.72727273,
       0.72727273, 1.        , 1.        ])