# In [None]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix,f1_score,roc_curve
import joblib
import os

# Logistic Regression
def _confusion_rates(testY, pred):
    """Return ``(fpr, tpr)`` for binary predictions, with label 1 as positive."""
    # labels=[1, 0] puts the positive class (1: defective) at index 0 and the
    # negative class (0: normal) at index 1, so the flattened matrix reads
    # TP, FN, FP, TN.
    tp, fn, fp, tn = confusion_matrix(testY, pred, labels=[1, 0]).ravel()
    positives = tp + fn
    negatives = fp + tn
    # A class that is absent from testY has an undefined rate; report NaN
    # explicitly instead of relying on a 0/0 division.
    tpr = tp / positives if positives else np.float64("nan")
    fpr = fp / negatives if negatives else np.float64("nan")
    return fpr, tpr


def logisticRegression(trainX, trainY, testX, testY, decision_boundary=0.5,
                       model_path='Logistic.pkl'):
    """Fit a logistic regression, persist it, and evaluate it on the test set.

    The fitted model is written to ``model_path`` with ``joblib.dump``.
    Test-set scores are taken from the second column of ``predict_proba``
    (assumed to be the probability of label 1, i.e. labels are 0/1).

    @Input
        trainX : array-like, training features
        trainY : array-like, training labels (1: defective, 0: normal)
        testX : array-like, test features
        testY : array-like, test labels (1: defective, 0: normal)
        decision_boundary : float between 0 and 1; a sample is predicted as 1
            when its score is strictly greater than this value
        model_path : path the fitted model is saved to
            (default ``'Logistic.pkl'``)
    @Output
        dict with the keys
            "AnomalyScore" : test-set scores rounded to 3 decimals
            "Prediction" : integer array of 0/1 predictions at
                ``decision_boundary``
            "Aucscore" : ROC AUC computed from the scores, as a string
            "Fpr", "Tpr" : false/true positive rate at ``decision_boundary``
            "Fprarray", "Tprarray" : false/true positive rates at the
                thresholds 0.01, 0.02, ..., 0.99 (for plotting the ROC curve)
    """
    model = LogisticRegression(random_state=0).fit(trainX, trainY)

    # Save the fitted logistic regression model as a pickle file.
    joblib.dump(model, model_path)

    scores = model.predict_proba(testX)[:, 1]
    pred = (scores > decision_boundary).astype(int)
    fpr, tpr = _confusion_rates(testY, pred)

    # FPR/TPR per decision boundary (used to visualise the ROC curve). A
    # separate loop variable keeps the caller's decision_boundary untouched.
    thresholds = np.arange(1, 100) / 100
    sweep = [
        _confusion_rates(testY, (scores > threshold).astype(int))
        for threshold in thresholds
    ]
    fpr_array = np.array([rates[0] for rates in sweep], dtype=np.float64)
    tpr_array = np.array([rates[1] for rates in sweep], dtype=np.float64)

    # AUC is threshold independent, so it is computed from the continuous
    # scores rather than from the thresholded predictions.
    auc = roc_auc_score(testY, scores)

    return {
        "AnomalyScore": np.round(scores, 3),
        "Prediction": pred,
        "Aucscore": str(auc),
        "Fpr": fpr,
        "Tpr": tpr,
        "Fprarray": fpr_array,
        "Tprarray": tpr_array,
    }

def classification_model_load_logistic(model_wd, testX, decision_boundary=0.5):
    """Load a saved logistic regression model and score ``testX`` with it.

    @Input
        model_wd : path of the saved model file. A directory is also
            accepted, in which case ``Logistic.pkl`` inside it is loaded; an
            empty value or ``None`` falls back to ``'Logistic.pkl'`` in the
            current working directory.
        testX : array-like, features to score
        decision_boundary : float between 0 and 1; a sample is predicted as 1
            when its score is strictly greater than this value
    @Output
        dict with the keys
            "AnomalyScore" : second column of ``predict_proba`` rounded to
                3 decimals
            "Prediction" : integer array of 0/1 predictions
    """
    # Honour the caller-supplied location instead of always loading the
    # default file name.
    if not model_wd:
        model_wd = 'Logistic.pkl'
    elif os.path.isdir(model_wd):
        model_wd = os.path.join(model_wd, 'Logistic.pkl')

    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. Only load model files from a trusted source.
    model = joblib.load(model_wd)

    scores = model.predict_proba(testX)[:, 1]
    pred = (scores > decision_boundary).astype(int)

    return {"AnomalyScore": np.round(scores, 3), "Prediction": pred}


# In [None]:
# train_test_split is used below but is not among the imports at the top of
# the file.
from sklearn.model_selection import train_test_split

df = pd.read_csv('/workspace/xnsolution/data/1_Iris_virginica.csv', encoding='euc-kr')

# Encode the labels as integers: 'outlier' -> 1, 'target' -> 0. Any other
# value is left unchanged. Assigning the whole column avoids chained
# assignment (df['label'][...] = ...), which may write to a temporary copy.
label_codes = {'outlier': 1, 'target': 0}
df['label'] = df['label'].map(lambda value: label_codes.get(value, value))

# The first four columns are the features; the fifth column is the target.
X = df.iloc[:, 0:4]
y = df.iloc[:, 4]
y = y.astype('category')

# A fixed seed makes the 70/30 split reproducible between runs.
trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.3, random_state=0)

# `model` holds the evaluation dict returned by logisticRegression (which
# also writes Logistic.pkl); `load_model` holds the scores and predictions
# produced by the model reloaded from that file.
model = logisticRegression(trainX, trainY, testX, testY, decision_boundary=0.5)
load_model = classification_model_load_logistic('Logistic.pkl', testX, decision_boundary=0.5)