## 데이터 불러오기

In [1]:
import numpy as np 
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import time
from sklearn.metrics import accuracy_score

# Silence every Python warning for the rest of the session
# (this also hides deprecation warnings raised by the imported libraries).
warnings.filterwarnings('ignore')

# Remove pandas' display truncation so displayed frames show all columns and rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Column names passed as `usecols` when the CSV files are read below.
# Many names start with a space; they are kept exactly as written.
cols = [' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', 'FIN Flag Count',
       ' SYN Flag Count', ' RST Flag Count', ' PSH Flag Count',
       ' ACK Flag Count', ' URG Flag Count', ' CWE Flag Count',
       ' ECE Flag Count', ' Down/Up Ratio', ' Average Packet Size',
       ' Avg Fwd Segment Size', ' Avg Bwd Segment Size',
       ' Fwd Header Length.1', 'Fwd Avg Bytes/Bulk', ' Fwd Avg Packets/Bulk',
       ' Fwd Avg Bulk Rate', ' Bwd Avg Bytes/Bulk', ' Bwd Avg Packets/Bulk',
       'Bwd Avg Bulk Rate', 'Subflow Fwd Packets', ' Subflow Fwd Bytes',
       ' Subflow Bwd Packets', ' Subflow Bwd Bytes', 'Init_Win_bytes_forward',
       ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward', 'Active Mean', ' Active Std', ' Active Max',
       ' Active Min', 'Idle Mean', ' Idle Std', ' Idle Max', ' Idle Min',' Label']
# Read the five capture CSV files (only the columns listed in `cols`) and stack
# them, in this order, into a single frame. Each file keeps its own row index,
# so index labels repeat across files in the combined frame.
csv_paths = [
    "./Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "./Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "./Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "./Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "./Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
]
df = pd.concat([pd.read_csv(csv_path, usecols=cols) for csv_path in csv_paths])

# 선형 회귀 분석을 통한 DDOS 예측

0. 데이터처리
1. LinearRegression을 통한 학습
2. 계산 및 모델 평가

## 데이터 처리

In [2]:
data = df.copy()

# Hand-picked feature columns used as model input (x); the traffic label is the target (y).
selected_features = [
    ' Bwd Packet Length Std', ' PSH Flag Count', ' min_seg_size_forward',
    ' Min Packet Length', ' ACK Flag Count', ' Bwd Packet Length Min',
    ' Fwd IAT Std', 'Init_Win_bytes_forward', ' Flow IAT Max', ' Bwd Packets/s',
    ' URG Flag Count', 'Bwd IAT Total',
]
x = data[selected_features]
y = data[[' Label']]

# Hold out 20% of the rows for evaluation. The split is seeded so that the
# confusion matrix and scores reported for this first model can be reproduced.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, test_size=0.2, random_state=42)

In [3]:
# Encode the labels numerically: every attack class becomes -1, benign traffic becomes 1.
normal = 'BENIGN'
attack = [
    'DDoS', 'PortScan', 'Bot', 'Infiltration',
    'Web Attack � Brute Force', 'Web Attack � XSS', 'Web Attack � Sql Injection',
]
y_train = y_train.replace(attack, -1).replace(normal, 1)
y_test = y_test.replace(attack, -1).replace(normal, 1)

## 선형 회귀 분석을 통한 학습

In [4]:
# Fit a least-squares linear regressor on the numerically encoded labels.
from sklearn.linear_model import LinearRegression

model = LinearRegression().fit(x_train, y_train)
# Last expression of the cell: shows the fitted estimator, as before.
model

LinearRegression()

## 계산

In [5]:
# Predict a regression score for every held-out flow.
y_pred = model.predict(x_test)
# Collapse the scores to class labels in one vectorised step:
# score > 0 -> 1 (benign), anything else -> -1 (attack).
y_pred = np.where(y_pred > 0, 1, -1)

In [6]:
from sklearn.metrics import confusion_matrix

# Print the four cell counts explicitly, since a plot would not show the exact numbers.
# With the labels encoded as -1 (attack) and 1 (benign), "positive" means label 1.
cf_matrix = confusion_matrix(y_test, y_pred)
(tn, fp), (fn, tp) = cf_matrix
print("True Negetive", tn, "\nTrue Positive", tp)
print("False Negetive", fn, "\nFalse Positive", fp)

True Negetive 34425 
True Positive 166191
False Negetive 8161 
False Positive 23666


## 모델 평가

In [7]:
# Label 1 (BENIGN) is the positive class of the confusion matrix, so:
#   precision - share of flows predicted benign that really are benign
#   recall    - share of truly benign flows that were predicted benign
precision = tp / (tp + fp)
recall = tp / (tp + fn)
print("Recall", recall, "\nPrecision", precision)
# F1 is the harmonic mean of precision and recall.
f1 = 2 * (precision * recall) / (precision + recall)
print("F1 Score", f1)

Recall 0.9531923924015784 
Precision 0.8753482884486745
F1 Score 0.9126133621080204


# 선형회귀분석 성능 향상 시켜보기

## 데이터 전처리에 모든 column 사용

### 학습 및 계산

In [8]:
# Drop every column that contains NaN. `axis` is passed by keyword because
# recent pandas releases no longer accept it positionally. `dropna` returns a
# new frame, so `df` itself is left untouched.
dat = df.dropna(axis=1)
# Linear regression cannot handle inf/NaN, so ' Flow Packets/s' (which contains such values) is not used.
dat = dat.drop(columns=[' Flow Packets/s'])

def testing_all(data, random_state=None):
    """Train a linear-regression detector on every feature column and return its F1.

    The frame is split 80/20, labels are encoded as -1 (attack) / 1 (benign),
    a ``LinearRegression`` is fitted, and its predictions are thresholded at 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ' Label' column; every other column is used as a feature.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. The default (None) keeps the
        original behaviour of drawing a different random split on every call.

    Returns
    -------
    float
        F1 score computed with label 1 (benign) as the positive class.
    """
    # Select features by name instead of assuming the label is the last column.
    x = data.drop(columns=[' Label'])
    y = data[[' Label']]
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=0.8, test_size=0.2, random_state=random_state)

    # Encode labels: attack classes -> -1, benign -> 1.
    attack = ['DDoS', 'PortScan', 'Bot', 'Infiltration', 'Web Attack � Brute Force', 'Web Attack � XSS', 'Web Attack � Sql Injection']
    normal = 'BENIGN'
    y_train = y_train.replace(attack, -1).replace(normal, 1)
    y_test = y_test.replace(attack, -1).replace(normal, 1)

    model = LinearRegression()
    model.fit(x_train, y_train)

    # Threshold the regression output in one vectorised step (> 0 -> 1, else -1).
    y_pred = np.where(model.predict(x_test) > 0, 1, -1)

    cf_matrix = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cf_matrix.ravel()
    # Positive class is label 1 (benign): recall is the share of benign flows
    # recognised as benign, precision the share of "benign" predictions that are correct.
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    f = 2 * (precision * recall) / (precision + recall)

    return f

In [9]:
# Average the all-columns F1 score over 20 independent random splits.
# NOTE(review): the result is stored under the name `sum`, which shadows the
# builtin; the name is kept because the column-ablation cell reads the baseline from it.
n_runs = 20
sum = 0
for run_f1 in (testing_all(dat) for _ in range(n_runs)):
    sum = sum + run_f1
sum = sum / n_runs
print("평균 f1점수:", sum)

평균 f1점수: 0.9544856803365807


정상 이용자를 판단할 확률(precision)이 크게 상승하여 F1 점수도 크게 상승했다.

## column을 1개씩 제외하며 성능 변화 측정

In [10]:
# Drop every column that contains NaN. `axis` is passed by keyword because
# recent pandas releases no longer accept it positionally. `dropna` returns a
# new frame, so `df` itself is left untouched.
data = df.dropna(axis=1)
# Linear regression cannot handle inf/NaN, so ' Flow Packets/s' (which contains such values) is not used.
data = data.drop(columns=[' Flow Packets/s'])

In [12]:
# Leave-one-column-out ablation: retrain the linear model once per feature column,
# each time without that column, and record the resulting F1 score in `change_rate`.
# The '비교값' entry is the baseline: the averaged all-columns F1 that an earlier
# cell stored in `sum`.
# NOTE(review): every iteration draws a fresh random split, so small score
# differences between columns may reflect split noise rather than a column effect.
change_rate = {'비교값': sum }
attack = ['DDoS', 'PortScan', 'Bot', 'Infiltration', 'Web Attack � Brute Force', 'Web Attack � XSS', 'Web Attack � Sql Injection']
normal = 'BENIGN'
# Encode labels: attack classes -> -1, benign -> 1.
y = data[[' Label']].replace(attack,-1).replace(normal,1)

for i in data:
    if i == " Label":
        # The label is the target, not a feature. Without this skip the loop
        # reused the frame left over from the previous column and stored a
        # meaningless ' Label' entry in `change_rate`.
        continue
    # Feature matrix without column `i` (and without the label).
    data1 = data.drop(columns=[i])
    x = data1.drop(columns=[' Label'])
    # Separate the rows used for training from those used for evaluation.
    x_train,x_test,y_train,y_test = train_test_split(x,y,train_size=0.8,test_size=0.2)
    model=LinearRegression()
    model.fit(x_train, y_train)
    # Predict, then threshold in one vectorised step (> 0 -> 1, else -1).
    y_pred = np.where(model.predict(x_test) > 0, 1, -1)
    cf_matrix = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cf_matrix.ravel()
    print(i,"를 제외할 시")
    print ("True Negetive", tn, 
           "\nTrue Positive", tp)
    print ("False Negetive", fn, 
           "\nFalse Positive", fp)
    # Positive class is label 1 (benign), so both metrics describe benign traffic.
    recall = tp/(tp+fn)
    precision = tp/(tp+fp)
    print("Recall", recall, "\nPrecision", precision)
    f = 2 * (precision*recall)/(precision+recall)
    print("F1 Score", f)
    change_rate[i]=f
    print("----------------------------------------------------------")

 Destination Port 를 제외할 시
True Negetive 51193 
True Positive 165531
False Negetive 8745 
False Positive 6974
Recall 0.9498209736280383 
Precision 0.9595721863134402
F1 Score 0.9546716803977151
----------------------------------------------------------
 Flow Duration 를 제외할 시
True Negetive 50820 
True Positive 165611
False Negetive 8612 
False Positive 7400
Recall 0.950569098224689 
Precision 0.9572281531232119
F1 Score 0.9538870041528191
----------------------------------------------------------
 Total Fwd Packets 를 제외할 시
True Negetive 50950 
True Positive 165742
False Negetive 8674 
False Positive 7077
Recall 0.9502683240069718 
Precision 0.9590496415324704
F1 Score 0.954638789292554
----------------------------------------------------------
 Total Backward Packets 를 제외할 시
True Negetive 51030 
True Positive 165436
False Negetive 8766 
False Positive 7211
Recall 0.9496791081617892 
Precision 0.9582326944574768
F1 Score 0.9539367275096656
-------------------------------------------------

In [13]:
# Display the baseline F1 ('비교값') together with the F1 recorded for each left-out column.
change_rate

{'비교값': 0.9544856803365807,
 ' Destination Port': 0.9546716803977151,
 ' Flow Duration': 0.9538870041528191,
 ' Total Fwd Packets': 0.954638789292554,
 ' Total Backward Packets': 0.9539367275096656,
 'Total Length of Fwd Packets': 0.9538137167644544,
 ' Total Length of Bwd Packets': 0.9543991014601272,
 ' Fwd Packet Length Max': 0.9551647946971092,
 ' Fwd Packet Length Min': 0.9546107805168046,
 ' Fwd Packet Length Mean': 0.9540121670683008,
 ' Fwd Packet Length Std': 0.9534725662085728,
 'Bwd Packet Length Max': 0.9556538720771477,
 ' Bwd Packet Length Min': 0.9540581223002083,
 ' Bwd Packet Length Mean': 0.9543152175164332,
 ' Bwd Packet Length Std': 0.9544265248965941,
 ' Flow IAT Mean': 0.9537094619495117,
 ' Flow IAT Std': 0.9548276932629619,
 ' Flow IAT Max': 0.9543430549715148,
 ' Flow IAT Min': 0.9549484099184419,
 'Fwd IAT Total': 0.9537871026392116,
 ' Fwd IAT Mean': 0.9546835983229818,
 ' Fwd IAT Std': 0.9547465241626065,
 ' Fwd IAT Max': 0.9550495835899325,
 ' Fwd IAT Min':

### 성능에 긍정적 영향을 주는 컬럼들로만 학습을 진행

In [14]:
# Keep the columns whose removal lowered F1 by more than 0.0001 relative to the
# baseline, i.e. columns that appear to help. The baseline entry itself,
# ' Label' and 'Fwd Packets/s' are never selected.
baseline_f1 = change_rate['비교값']
new_list = [
    name
    for name, score in change_rate.items()
    if score - baseline_f1 < -0.0001
    and name not in ('Fwd Packets/s', '비교값', ' Label')
]

In [15]:
# Number of columns that passed the filter.
len(new_list)

40

## 학습

In [16]:
def testing_p(lists):
    """Train a linear-regression detector on the given columns of ``df`` and return its F1.

    Rows of the notebook-level frame ``df`` are split 80/20 at random, labels are
    encoded as -1 (attack) / 1 (benign), a ``LinearRegression`` is fitted, and its
    predictions are thresholded at 0. Recall, precision and F1 are printed.

    Parameters
    ----------
    lists : list of str
        Names of the columns of ``df`` to use as features.

    Returns
    -------
    float
        F1 score computed with label 1 (benign) as the positive class.
    """
    # Column selection already produces new frames, so `df` is not copied as a whole.
    x = df[lists]
    y = df[[' Label']]
    # Separate the rows used for training from those used for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2)

    # Encode labels: attack classes -> -1, benign -> 1.
    attack = ['DDoS', 'PortScan', 'Bot', 'Infiltration', 'Web Attack � Brute Force', 'Web Attack � XSS', 'Web Attack � Sql Injection']
    normal = 'BENIGN'
    y_train = y_train.replace(attack, -1).replace(normal, 1)
    y_test = y_test.replace(attack, -1).replace(normal, 1)

    model = LinearRegression()
    model.fit(x_train, y_train)

    # Predict, then threshold in one vectorised step (> 0 -> 1, else -1).
    y_pred = np.where(model.predict(x_test) > 0, 1, -1)

    cf_matrix = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cf_matrix.ravel()
    # Positive class is label 1 (benign): recall is the share of benign flows
    # recognised as benign, precision the share of "benign" predictions that are correct.
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    print("Recall", recall, "\nPrecision", precision)
    f = 2 * (precision * recall) / (precision + recall)
    print("F1 Score", f)
    return f

In [17]:
# Average the F1 of `testing_p` on the selected columns over 20 random splits,
# printing the run number before each run.
# NOTE(review): the accumulator is named `sum`, which shadows the builtin.
n_runs = 20
sum = 0
for count in range(1, n_runs + 1):
    print(count)
    run_f1 = testing_p(new_list)
    sum = sum + run_f1
sum = sum / n_runs
sum

1
Recall 0.9483398224396743 
Precision 0.9405995295350599
F1 Score 0.9444538173166832
2
Recall 0.9450297702779517 
Precision 0.9407357026097095
F1 Score 0.9428778474432087
3
Recall 0.9462118821538197 
Precision 0.94094825126947
F1 Score 0.9435727261024416
4
Recall 0.9479928471537633 
Precision 0.9413244399927153
F1 Score 0.9446468753926461
5
Recall 0.9475037448132184 
Precision 0.9412106769437761
F1 Score 0.9443467268416431
6
Recall 0.9479346152962529 
Precision 0.9403516568667669
F1 Score 0.9441279103026142
7
Recall 0.9475158535118321 
Precision 0.9405225599181191
F1 Score 0.9440062551365173
8
Recall 0.9478113222699608 
Precision 0.9410474877834858
F1 Score 0.9444172946835052
9
Recall 0.9492344467437475 
Precision 0.9404044591593927
F1 Score 0.9447988223748772
10
Recall 0.9471282707638622 
Precision 0.9398354073525643
F1 Score 0.9434677460851071
11
Recall 0.9471422827273249 
Precision 0.9405027407183477
F1 Score 0.9438108348668753
12
Recall 0.9467665999552709 
Precision 0.939932024298

0.9441271124741915

처음보다 성능이 향상했지만 모든 컬럼을 사용할 때 보다는 성능이 떨어진다.

### 성능에 부정적 영향을 주는 컬럼들을 제외한 채 학습을 진행

In [18]:
# Exclude the columns that appear to hurt: keep a column unless removing it
# raised F1 by 0.0001 or more over the baseline. The baseline entry itself,
# ' Label' and 'Fwd Packets/s' are never selected.
baseline_f1 = change_rate['비교값']
new_list = [
    name
    for name, score in change_rate.items()
    if score - baseline_f1 < 0.0001
    and name not in ('Fwd Packets/s', '비교값', ' Label')
]

In [19]:
# Display the names of the selected columns.
new_list

[' Flow Duration',
 ' Total Backward Packets',
 'Total Length of Fwd Packets',
 ' Total Length of Bwd Packets',
 ' Fwd Packet Length Mean',
 ' Fwd Packet Length Std',
 ' Bwd Packet Length Min',
 ' Bwd Packet Length Mean',
 ' Bwd Packet Length Std',
 ' Flow IAT Mean',
 ' Flow IAT Max',
 'Fwd IAT Total',
 ' Fwd IAT Min',
 'Bwd IAT Total',
 ' Bwd IAT Std',
 ' Bwd IAT Max',
 ' Bwd IAT Min',
 ' Bwd PSH Flags',
 ' Bwd URG Flags',
 ' Fwd Header Length',
 ' Bwd Header Length',
 ' Bwd Packets/s',
 ' Min Packet Length',
 ' Packet Length Mean',
 ' Packet Length Variance',
 'FIN Flag Count',
 ' SYN Flag Count',
 ' RST Flag Count',
 ' PSH Flag Count',
 ' ACK Flag Count',
 ' URG Flag Count',
 ' CWE Flag Count',
 ' ECE Flag Count',
 ' Down/Up Ratio',
 ' Average Packet Size',
 ' Avg Fwd Segment Size',
 ' Avg Bwd Segment Size',
 ' Fwd Avg Packets/Bulk',
 ' Fwd Avg Bulk Rate',
 ' Bwd Avg Bytes/Bulk',
 'Subflow Fwd Packets',
 ' Subflow Fwd Bytes',
 ' Subflow Bwd Packets',
 ' Subflow Bwd Bytes',
 'Init_Wi

In [20]:
# Average the F1 of `testing_p` on the selected columns over 20 random splits,
# printing the run number before each run.
# NOTE(review): the accumulator is named `sum`, which shadows the builtin.
n_runs = 20
sum = 0
for count in range(1, n_runs + 1):
    print(count)
    run_f1 = testing_p(new_list)
    sum = sum + run_f1
sum = sum / n_runs
sum

1
Recall 0.9476250646143243 
Precision 0.9393970450081134
F1 Score 0.9434931164133868
2
Recall 0.9480260473242869 
Precision 0.9398853548188584
F1 Score 0.9439381496985599
3
Recall 0.9478338718285692 
Precision 0.9401080354957508
F1 Score 0.9439551458274988
4
Recall 0.948204400039091 
Precision 0.9400872018466273
F1 Score 0.9441283542448943
5
Recall 0.9476872721008355 
Precision 0.9408635866095046
F1 Score 0.9442631017327844
6
Recall 0.9483605568418815 
Precision 0.9403912766054419
F1 Score 0.944359104184724
7
Recall 0.9481129640146626 
Precision 0.9400563091886358
F1 Score 0.9440674480773185
8
Recall 0.948124134204733 
Precision 0.9403568781711419
F1 Score 0.9442245329623937
9
Recall 0.9479432567157501 
Precision 0.9400060295447694
F1 Score 0.9439579584725674
10
Recall 0.9482990152193375 
Precision 0.9397830871689292
F1 Score 0.9440218462478898
11
Recall 0.9477059155059178 
Precision 0.9395736286041725
F1 Score 0.943622251044324
12
Recall 0.9479445446296275 
Precision 0.94018535079375

0.9439884827847538

첫 선형 회귀 분석 모델보다 성능은 향상하였으나 역시 요소를 전부 사용하였을 때 보다는 성능이 낮다.

사실 긍정적인 것으로만 학습하는 것과 부정적인 것을 제외한 채 학습하는 것 중 후자가 칼럼의 개수가 더 많은 것으로 보아, 칼럼이 많을수록 성능이 증가하는 것 같다.

그럼 악영향을 크게 미치는 요소 2-3개만 제거하였을 때는 어떻게 될까 실험해본다.

In [21]:
# Drop only the most harmful columns: keep a column unless removing it raised
# F1 by 0.0011 or more over the baseline. The baseline entry itself,
# ' Label' and 'Fwd Packets/s' are never selected.
baseline_f1 = change_rate['비교값']
new_list = [
    name
    for name, score in change_rate.items()
    if score - baseline_f1 < 0.0011
    and name not in ('Fwd Packets/s', '비교값', ' Label')
]

In [22]:
# Number of columns that passed the filter.
len(new_list)

72

In [23]:
# Average the F1 of `testing_p` on the selected columns over 20 random splits,
# printing the run number before each run.
# NOTE(review): the accumulator is named `sum`, which shadows the builtin.
n_runs = 20
sum = 0
for count in range(1, n_runs + 1):
    print(count)
    run_f1 = testing_p(new_list)
    sum = sum + run_f1
sum = sum / n_runs
sum

1
Recall 0.9531493894040951 
Precision 0.9550402217226053
F1 Score 0.9540938687468944
2
Recall 0.9524069174123414 
Precision 0.9548237487484608
F1 Score 0.9536138017832667
3
Recall 0.9533243671977377 
Precision 0.9533901227755552
F1 Score 0.9533572438528122
4
Recall 0.9530042154351547 
Precision 0.9539739572853488
F1 Score 0.9534888397923389
5
Recall 0.9527089756667851 
Precision 0.9543168771526981
F1 Score 0.9535122485617762
6
Recall 0.9529348851116682 
Precision 0.9540155775972736
F1 Score 0.9534749251335514
7
Recall 0.9537580537913007 
Precision 0.9546719453768253
F1 Score 0.9542147807660214
8
Recall 0.9528745979232485 
Precision 0.9553834744666541
F1 Score 0.9541273869260183
9
Recall 0.9533754862696946 
Precision 0.9548108306900198
F1 Score 0.9540926186443112
10
Recall 0.9523673031833477 
Precision 0.9542600793828586
F1 Score 0.9533127517702822
11
Recall 0.9534253807572679 
Precision 0.954482520897366
F1 Score 0.9539536579554034
12
Recall 0.9536918566289324 
Precision 0.95461750516

0.953745113231976

그럼에도 모든 컬럼을 사용했을 때(0.9545)보다 약 0.07%p 낮은 0.9537로, 소폭의 성능 하락이 있었다.

그럼 이 모델에 배깅을 적용하면 성능이 향상할까?

## 배깅 적용 

In [24]:
# Same preprocessing as the all-columns experiment: drop NaN-containing columns
# and ' Flow Packets/s'. `axis` is passed by keyword because recent pandas
# releases no longer accept it positionally; `dropna` returns a new frame.
data_b = df.dropna(axis=1).drop(columns=[' Flow Packets/s'])
# Every column except the last is a feature; ' Label' is the target.
x = data_b[data_b.columns[0:-1]]
y = data_b[[' Label']]
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size=0.8,test_size=0.2)
# Encode labels: attack classes -> -1, benign -> 1.
attack = ['DDoS', 'PortScan', 'Bot', 'Infiltration', 'Web Attack � Brute Force', 'Web Attack � XSS', 'Web Attack � Sql Injection']
normal = 'BENIGN'
y_train = y_train.replace(attack,-1).replace(normal,1)
y_test = y_test.replace(attack,-1).replace(normal,1)

In [25]:
from sklearn.ensemble import BaggingRegressor

# Bag 15 linear regressors, each fitted on a bootstrap sample of the training rows.
# The wrapped estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# positional form works under either name.
bag_clf = BaggingRegressor(LinearRegression(),
                           n_estimators=15, bootstrap=True,
                           verbose=1, random_state=42)
bagging = bag_clf.fit(x_train, y_train)
y_pred = bagging.predict(x_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s finished


In [26]:
# Threshold the ensemble output in one vectorised step:
# score > 0 -> 1 (benign), anything else -> -1 (attack).
y_pred = np.where(y_pred > 0, 1, -1)
cf_matrix = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cf_matrix.ravel()
# Positive class is label 1 (benign): recall is the share of benign flows
# recognised as benign, precision the share of "benign" predictions that are correct.
recall = tp/(tp+fn)
precision = tp/(tp+fp)
print("Recall", recall, "\nPrecision", precision)
f = 2 * (precision*recall)/(precision+recall)
print("F1 Score", f)

Recall 0.9510380980847493 
Precision 0.9583536062649151
F1 Score 0.9546818381148015


배깅 결과 약간의 성능 향상이 있다.

In [27]:
# Rebuild the data set and split for the larger ensemble. `axis` is passed by
# keyword because recent pandas releases no longer accept it positionally.
data_b = df.dropna(axis=1).drop(columns=[' Flow Packets/s'])
# Every column except the last is a feature; ' Label' is the target.
x = data_b[data_b.columns[0:-1]]
y = data_b[[' Label']]
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size=0.8,test_size=0.2)
# Encode labels: attack classes -> -1, benign -> 1.
attack = ['DDoS', 'PortScan', 'Bot', 'Infiltration', 'Web Attack � Brute Force', 'Web Attack � XSS', 'Web Attack � Sql Injection']
normal = 'BENIGN'
y_train = y_train.replace(attack,-1).replace(normal,1)
y_test = y_test.replace(attack,-1).replace(normal,1)
# Bag 150 linear regressors. The wrapped estimator is passed positionally: its
# keyword was renamed from `base_estimator` to `estimator` in newer scikit-learn
# releases, and the positional form works under either name.
bag_clf = BaggingRegressor(LinearRegression(),
                           n_estimators=150, bootstrap=True,
                           verbose=1, random_state=42)
bagging = bag_clf.fit(x_train, y_train)
y_pred = bagging.predict(x_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 19.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.3s finished


In [28]:
# Threshold the ensemble output in one vectorised step:
# score > 0 -> 1 (benign), anything else -> -1 (attack).
y_pred = np.where(y_pred > 0, 1, -1)
cf_matrix = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cf_matrix.ravel()
# Positive class is label 1 (benign): recall is the share of benign flows
# recognised as benign, precision the share of "benign" predictions that are correct.
recall = tp/(tp+fn)
precision = tp/(tp+fp)
print("Recall", recall, "\nPrecision", precision)
f = 2 * (precision*recall)/(precision+recall)
print("F1 Score", f)

Recall 0.9522899010082914 
Precision 0.9582875897056614
F1 Score 0.9552793313787721


추정기(n_estimators) 개수를 150개로 늘려 실험한 결과, F1이 모든 컬럼을 사용한 기준값(0.9545) 대비 약 0.08%p 상승했다(0.9553).