# A Transfer Learning and Optimized CNN Based Intrusion Detection System for Internet of Vehicles 
This is the code for the paper entitled "**A Transfer Learning and Optimized CNN Based Intrusion Detection System for Internet of Vehicles**" accepted in IEEE International Conference on Communications (IEEE ICC).  
Authors: Li Yang (lyang339@uwo.ca) and Abdallah Shami (Abdallah.Shami@uwo.ca)  
Organization: The Optimized Computing and Communications (OC2) Lab, ECE Department, Western University

**Notebook 1: Data pre-processing**  
Procedures:  
&nbsp; 1): Read the dataset  
&nbsp; 2): Transform the tabular data into images  
&nbsp; 3): Display the transformed images  
&nbsp; 4): Split the training and test set  

## Import libraries

In [18]:
import numpy as np
import pandas as pd
import os
import cv2
import math
import random
import matplotlib.pyplot as plt
import shutil
from sklearn.preprocessing import QuantileTransformer
from PIL import Image
import warnings
warnings.filterwarnings("ignore")

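## Read the Car-Hacking/CAN-Intrusion dataset
The complete Car-Hacking dataset is publicly available at: https://ocslab.hksecurity.net/Datasets/CAN-intrusion-dataset  
In this repository, due to the file size limit of GitHub, we use the 5% subset.  
**Note:** the code cell below actually reads `data/clean/cicids2017/all_data.csv`, whose path and labels (e.g. `DDoS`, `PortScan`, `Heartbleed`) indicate CICIDS2017 flow records rather than the Car-Hacking data described above.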

In [19]:
# Load the cleaned flow records and list the distinct raw class labels.
df = pd.read_csv('data/clean/cicids2017/all_data.csv')
print(df.Label.unique())

['Benign' 'DDoS' 'PortScan' 'Bot' 'Infiltration'
 'Web Attack Ã\x82Â\x96 Brute Force' 'Web Attack Ã\x82Â\x96 XSS'
 'Web Attack Ã\x82Â\x96 Sql Injection' 'FTP-Patator' 'SSH-Patator'
 'DoS slowloris' 'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye'
 'Heartbleed']


In [20]:
# Display the loaded frame (the rendered output elides the middle rows and columns).
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,...,Init Bwd Win Bytes,Fwd Act Data Packets,Fwd Seg Size Min,Active Mean,Active Max,Active Min,Idle Mean,Idle Max,Idle Min,Label
0,3.0,2.0,0.0,12.0,0.0,6.0,6.0,0.000000,0.0,0.0,...,-1.0,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,109.0,1.0,1.0,6.0,6.0,6.0,6.0,0.000000,6.0,6.0,...,256.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,52.0,1.0,1.0,6.0,6.0,6.0,6.0,0.000000,6.0,6.0,...,256.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,34.0,1.0,1.0,6.0,6.0,6.0,6.0,0.000000,6.0,6.0,...,329.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,3.0,2.0,0.0,12.0,0.0,6.0,6.0,0.000000,0.0,0.0,...,-1.0,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2230950,32215.0,4.0,2.0,112.0,152.0,28.0,28.0,0.000000,76.0,76.0,...,-1.0,3.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2230951,324.0,2.0,2.0,84.0,362.0,42.0,42.0,0.000000,181.0,181.0,...,-1.0,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2230952,82.0,2.0,1.0,31.0,6.0,31.0,15.5,21.920311,6.0,6.0,...,0.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2230953,1048635.0,6.0,2.0,192.0,256.0,32.0,32.0,0.000000,128.0,128.0,...,-1.0,5.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


In [21]:
import re

def clean_label(label):
    """Replace the mis-encoded dash in 'Web Attack' labels with an en dash.

    Parameters
    ----------
    label : object
        A raw class label. Only strings containing 'Web Attack' are modified.

    Returns
    -------
    object
        The label with every garbled 'Ã...Â\\x96' sequence replaced by '–'.
        Labels that are not strings (e.g. NaN for a missing value) or that do
        not contain 'Web Attack' are returned unchanged.
    """
    # Guard against non-string values: `'Web Attack' in <float NaN>` raises TypeError.
    if not isinstance(label, str) or 'Web Attack' not in label:
        return label
    # Non-greedy match so that each garbled sequence is replaced on its own;
    # a greedy '.*' would also swallow any text between two occurrences.
    return re.sub(r'Ã.*?Â\x96', '–', label)

# Clean every label, then list the distinct values to confirm the result.
df['Label'] = df['Label'].map(clean_label)
print(df['Label'].unique())

['Benign' 'DDoS' 'PortScan' 'Bot' 'Infiltration'
 'Web Attack – Brute Force' 'Web Attack – XSS'
 'Web Attack – Sql Injection' 'FTP-Patator' 'SSH-Patator' 'DoS slowloris'
 'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye' 'Heartbleed']


In [22]:
# Number of rows for each label currently in df, sorted by label name.
df['Label'].value_counts().sort_index()

Benign                        1894575
Bot                              1431
DDoS                           128014
DoS GoldenEye                   10285
DoS Hulk                       172743
DoS Slowhttptest                 5228
DoS slowloris                    5383
FTP-Patator                      5931
Heartbleed                         11
Infiltration                       36
PortScan                         1956
SSH-Patator                      3219
Web Attack – Brute Force         1470
Web Attack – Sql Injection         21
Web Attack – XSS                  652
Name: Label, dtype: int64

In [23]:
# Label mapping: collapse the 15 fine-grained labels into 6 coarse classes.
label_mapping = {
    'Benign': 'Benign',  # normal traffic
    'DoS Hulk': 'DoS',
    'DDoS': 'DoS',
    'DoS GoldenEye': 'DoS',
    'DoS slowloris': 'DoS',
    'DoS Slowhttptest': 'DoS',
    'PortScan': 'Port-scan',
    'FTP-Patator': 'Brute-force',
    'SSH-Patator': 'Brute-force',
    'Web Attack – Brute Force': 'Web-attacks',
    'Web Attack – XSS': 'Web-attacks',
    'Web Attack – Sql Injection': 'Web-attacks',
    'Bot': 'Botnets',
    'Infiltration': 'Botnets',
    'Heartbleed': 'Botnets'
}

# Fail loudly on labels the mapping does not cover: Series.map() turns values
# that are missing from the dict into NaN without any warning.
coarse_classes = set(label_mapping.values())
unknown_labels = set(df['Label'].dropna().unique()) - set(label_mapping) - coarse_classes
if unknown_labels:
    raise ValueError(
        'Labels without an entry in label_mapping: {}'.format(sorted(unknown_labels, key=str))
    )

# Re-classify the labels. Values that are already coarse class names (e.g. when
# this cell is run a second time) are kept instead of being mapped to NaN.
remapped_labels = df['Label'].map(label_mapping)
df['Label'] = remapped_labels.fillna(df['Label'])

# Check the labels after re-classification (expected: the six coarse class names).
print(df['Label'].unique())


['Benign' 'DoS' 'Port-scan' 'Botnets' 'Web-attacks' 'Brute-force']


In [24]:
# Row count per class, most frequent first.
print(df.Label.value_counts())

Benign         1894575
DoS             321653
Brute-force       9150
Web-attacks       2143
Port-scan         1956
Botnets           1478
Name: Label, dtype: int64


In [25]:
# Maximum number of rows kept for each down-sampled class.
target_samples = 20000


def downsample_class(frame, label, n, seed=42):
    """Return up to `n` randomly chosen rows of `frame` whose 'Label' equals `label`.

    The sample size is capped at the number of rows in the class, because
    DataFrame.sample raises ValueError when asked for more rows than exist
    (sampling without replacement).
    """
    class_rows = frame[frame['Label'] == label]
    return class_rows.sample(n=min(n, len(class_rows)), random_state=seed)


# Down-sample the Benign and DoS classes.
benign_samples = downsample_class(df, 'Benign', target_samples)
dos_samples = downsample_class(df, 'DoS', target_samples)

# Keep every row of the other classes.
brute_force_samples = df[df['Label'] == 'Brute-force']
web_attack_samples = df[df['Label'] == 'Web-attacks']
port_scan_samples = df[df['Label'] == 'Port-scan']
botnet_samples = df[df['Label'] == 'Botnets']

# Combine all per-class samples into the new working frame.
df = pd.concat([benign_samples, dos_samples, brute_force_samples,
                web_attack_samples, port_scan_samples, botnet_samples])

# Show the class distribution of the new dataset and save it as CSV.
print(df['Label'].value_counts())
save_dir = 'data/tabular_6class/cicids2017/'
os.makedirs(save_dir, exist_ok=True)
df.to_csv(os.path.join(save_dir, 'all_data_6class.csv'), index=False)

Benign         20000
DoS            20000
Brute-force     9150
Web-attacks     2143
Port-scan       1956
Botnets         1478
Name: Label, dtype: int64
