# A Transfer Learning and Optimized CNN Based Intrusion Detection System for Internet of Vehicles 
This is the code for the paper entitled "**A Transfer Learning and Optimized CNN Based Intrusion Detection System for Internet of Vehicles**" accepted in IEEE International Conference on Communications (IEEE ICC).  
Authors: Li Yang (lyang339@uwo.ca) and Abdallah Shami (Abdallah.Shami@uwo.ca)  
Organization: The Optimized Computing and Communications (OC2) Lab, ECE Department, Western University

**Notebook 1: Data pre-processing**  
Procedures:  
&nbsp; 1): Read the dataset  
&nbsp; 2): Transform the tabular data into images  
&nbsp; 3): Display the transformed images  
&nbsp; 4): Split the training and test set  

## Import libraries

In [10]:
import numpy as np
import pandas as pd
import os
import cv2
import math
import random
import matplotlib.pyplot as plt
import shutil
from sklearn.preprocessing import QuantileTransformer
from PIL import Image
import warnings
warnings.filterwarnings("ignore")

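## Read the Car-Hacking/CAN-Intrusion dataset
The complete Car-Hacking dataset is publicly available at: https://ocslab.hksecurity.net/Datasets/CAN-intrusion-dataset  
In this repository, due to the file size limit of GitHub, we use the 5% subset.

**Note:** the code cell below actually loads `data/clean/cicids2018/all_data.csv`, whose labels (e.g. `Benign`, `Bot`, `DoS attacks-Hulk`) are not Car-Hacking classes; the description above appears to be carried over from the original notebook.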

In [11]:
# Load the cleaned CSV (the path is relative to the working directory) and
# list the distinct values found in its 'Label' column.
source_csv = 'data/clean/cicids2018/all_data.csv'
df = pd.read_csv(source_csv)
# df['padding'] = 0  # left disabled
distinct_labels = df['Label'].unique()
print(distinct_labels)

['Benign' 'Bot' 'DoS attacks-SlowHTTPTest' 'DoS attacks-Hulk'
 'Brute Force -Web' 'Brute Force -XSS' 'SQL Injection'
 'DDoS attacks-LOIC-HTTP' 'Infilteration' 'DoS attacks-GoldenEye'
 'DoS attacks-Slowloris' 'FTP-BruteForce' 'SSH-Bruteforce'
 'DDOS attack-LOIC-UDP' 'DDOS attack-HOIC']


In [12]:
# Show the loaded frame as the cell output (the rendering is abbreviated with
# "..." rows/columns, so this does not print every record).
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,...,Init Bwd Win Bytes,Fwd Act Data Packets,Fwd Seg Size Min,Active Mean,Active Max,Active Min,Idle Mean,Idle Max,Idle Min,Label
0,141385.0,9.0,7.0,553.0,3773.0,202.0,61.444443,87.534440,1460.0,539.0000,...,119.0,4.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,281.0,2.0,1.0,38.0,0.0,38.0,19.000000,26.870058,0.0,0.0000,...,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,279824.0,11.0,15.0,1086.0,10527.0,385.0,98.727270,129.392500,1460.0,701.8000,...,1047.0,5.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,132.0,2.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0000,...,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,274016.0,9.0,13.0,1285.0,6141.0,517.0,142.777770,183.887730,1460.0,472.3846,...,1047.0,5.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11125105,103022.0,2.0,1.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0000,...,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
11125106,105445.0,2.0,1.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0000,...,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
11125107,733880.0,2.0,2.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0000,...,0.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
11125108,732728.0,2.0,2.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0000,...,0.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


In [13]:
import re

def clean_label(label):
    """Replace a mis-encoded dash in 'Web Attack' labels with an en dash.

    Parameters
    ----------
    label : object
        A single value taken from the 'Label' column.

    Returns
    -------
    object
        For a string containing 'Web Attack': the label with the garbled run
        (from the first 'Ã' through the last 'Â' + U+0096 pair) replaced by
        '–'. Every other value, including non-string values such as NaN or
        None, is returned unchanged.
    """
    # Missing labels are not strings (e.g. float NaN); the substring test
    # below would raise TypeError on them, so pass them through untouched.
    if not isinstance(label, str):
        return label
    if 'Web Attack' in label:
        # Strip the garbled characters while keeping the original attack type.
        # '.*' is greedy: everything between the first 'Ã' and the last
        # 'Â\x96' is swallowed and replaced by a single en dash.
        return re.sub(r'Ã.*Â\x96', '–', label)
    return label

# Run every label through clean_label, then list the distinct labels that remain.
df['Label'] = df['Label'].map(clean_label)
remaining_labels = df['Label'].unique()
print(remaining_labels)

['Benign' 'Bot' 'DoS attacks-SlowHTTPTest' 'DoS attacks-Hulk'
 'Brute Force -Web' 'Brute Force -XSS' 'SQL Injection'
 'DDoS attacks-LOIC-HTTP' 'Infilteration' 'DoS attacks-GoldenEye'
 'DoS attacks-Slowloris' 'FTP-BruteForce' 'SSH-Bruteforce'
 'DDOS attack-LOIC-UDP' 'DDOS attack-HOIC']


In [14]:
# Record count per raw label, ordered by label name. 'Benign' is normal
# traffic; the remaining labels are attack types.
df['Label'].value_counts().sort_index()

Benign                      9805923
Bot                          144534
Brute Force -Web                570
Brute Force -XSS                229
DDOS attack-HOIC             198861
DDOS attack-LOIC-UDP           1730
DDoS attacks-LOIC-HTTP       575364
DoS attacks-GoldenEye         41406
DoS attacks-Hulk             145199
DoS attacks-SlowHTTPTest         55
DoS attacks-Slowloris          9907
FTP-BruteForce                   53
Infilteration                107158
SQL Injection                    80
SSH-Bruteforce                94041
Name: Label, dtype: int64

In [15]:
# Label mapping: collapse the 15 raw labels into 5 coarse traffic classes.
label_mapping = {
    'Benign': 'Benign',  # normal traffic
    'DDoS attacks-LOIC-HTTP': 'DoS',
    'DDOS attack-LOIC-UDP': 'DoS',
    'DDOS attack-HOIC': 'DoS',
    'DoS attacks-GoldenEye': 'DoS',
    'DoS attacks-Slowloris': 'DoS',
    'DoS attacks-Hulk': 'DoS',
    'DoS attacks-SlowHTTPTest': 'DoS',
    'Bot': 'Botnets',
    'Infilteration': 'Botnets',
    'SSH-Bruteforce': 'Brute-force',
    'FTP-BruteForce': 'Brute-force',
    'Brute Force -Web': 'Web-attacks',
    'Brute Force -XSS': 'Web-attacks',
    'SQL Injection': 'Web-attacks',
}

# Series.map() silently turns every value that is not a dict key into NaN, so
# an unexpected label would vanish from all later per-class selections.
# Check first and fail loudly. The coarse class names are accepted as well, so
# re-running this cell on already-relabelled data is harmless.
accepted_labels = set(label_mapping) | set(label_mapping.values())
unexpected_labels = set(df['Label'].dropna().unique()) - accepted_labels
if unexpected_labels:
    raise ValueError(
        f"Labels missing from label_mapping: {sorted(map(str, unexpected_labels))}"
    )

# Relabel. Values that are already coarse class names (and therefore not dict
# keys) come back as NaN from map() and are restored from the current column.
df['Label'] = df['Label'].map(label_mapping).fillna(df['Label'])

# Inspect the relabelled classes, e.g. ['Benign' 'Botnets' 'DoS' ...]
print(df['Label'].unique())


['Benign' 'Botnets' 'DoS' 'Web-attacks' 'Brute-force']


In [16]:
# Class distribution after relabelling, ordered by class name.
coarse_counts = df['Label'].value_counts()
print(coarse_counts.sort_index())

Benign         9805923
Botnets         251692
Brute-force      94094
DoS             972522
Web-attacks        879
Name: Label, dtype: int64


In [17]:
# Number of rows to request per class when down-sampling.
# NOTE(review): the origin of 13461 is not documented in this notebook.
target_samples = 13461

# Helper that down-samples one class without asking for more rows than exist
def safe_sample(df, label, target_samples, random_state=42):
    """Return a random sample of the rows whose 'Label' equals `label`.

    At most `target_samples` rows are drawn; when the class has fewer rows,
    all of them are returned (in sampled order). `random_state` is passed to
    `DataFrame.sample`, so repeated calls give the same rows.
    """
    class_rows = df.loc[df['Label'] == label]
    n_rows = len(class_rows)
    if target_samples < n_rows:
        n_rows = target_samples
    return class_rows.sample(n=n_rows, random_state=random_state)

# Down-sample class by class (each class contributes at most target_samples rows)
benign_samples = safe_sample(df, 'Benign', target_samples)
dos_samples = safe_sample(df, 'DoS', target_samples)
botnet_samples = safe_sample(df, 'Botnets', target_samples)
brute_force_samples = safe_sample(df, 'Brute-force', target_samples)
web_attack_samples = safe_sample(df, 'Web-attacks', target_samples)
# NOTE(review): label_mapping in this notebook has no 'Port-scan' target, so
# this sample is expected to be empty for this data set.
port_scan_samples = safe_sample(df, 'Port-scan', target_samples)

# Per-class samples, listed in the order they are concatenated below
class_samples = {
    'Benign': benign_samples,
    'DoS': dos_samples,
    'Brute-force': brute_force_samples,
    'Web-attacks': web_attack_samples,
    'Port-scan': port_scan_samples,
    'Botnets': botnet_samples,
}

# Say so explicitly when a requested class contributes no rows; otherwise the
# "6class" output silently ends up with fewer classes. print() is used rather
# than warnings.warn() because this notebook suppresses warnings globally.
empty_classes = [name for name, part in class_samples.items() if part.empty]
if empty_classes:
    print(f"Note: no rows found for class(es) {empty_classes}; the saved set has "
          f"{len(class_samples) - len(empty_classes)} classes.")

# Merge all samples
df = pd.concat(list(class_samples.values()))


# Show the class distribution of the new data set
print(df['Label'].value_counts())
save_dir = 'data/tabular_6class/cicids2018/'
os.makedirs(save_dir, exist_ok=True)
df.to_csv(os.path.join(save_dir, 'all_data_6class.csv'), index=False)


Benign         13461
DoS            13461
Brute-force    13461
Botnets        13461
Web-attacks      879
Name: Label, dtype: int64
