In [None]:
from matplotlib import pyplot
from numpy import where
import pandas as pd 
import os 
import json
import numpy as np
import pylab as plt
import time
import zipfile
import seaborn as sns
import random

### Download dataset 

Download only the following zip file: `VeReMi_54000_57600_2022-9-11_19.12.56.zip`

Source: https://mega.nz/folder/z0pnGA4a#WFEUISyS5_maabhcEI7HQA/folder/a1QxhaqC


In [None]:
def unzipfile(filename):
    """Extract every member of the zip archive `filename` into the current working directory."""
    archive = zipfile.ZipFile(filename, mode="r")
    try:
        archive.extractall()
    finally:
        # Always release the file handle, even if extraction fails
        archive.close()

In [None]:
# Extract the dataset archive.
# The archive is expected to sit in the same folder as this notebook;
# if it lives elsewhere, adjust the path passed to unzipfile below.

# unzipfile(filename="VeReMi_50400_54000_2022-9-11_19.12.56.zip")
unzipfile(filename="VeReMi_54000_57600_2022-9-11_19.12.56.zip")
# unzipfile(filename="VeReMi_57600_61200_2022-9-11_19.12.56.zip")

In [None]:
# Root folder of the dataset; it is walked recursively to collect the trace files
path_1 = "VeReMi_54000_57600_2022-9-11_19_12_56"

In [None]:
# Collect the path of every file found (recursively) under the dataset folder.
# os.walk yields entries in an arbitrary, filesystem-dependent order, so the list
# is sorted: the row order of the loaded data -- and therefore the seeded
# train/test split -- then stays the same across machines and runs.
list_of_files = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(path_1)
    for file in files
)
        


In [None]:
# Display the collected file paths
list_of_files

In [None]:
# Each folder has one JSON file for the ground truth; we want to skip it.
# Matching on the file name only keeps this independent of the OS path separator,
# and rebuilding the list (instead of list.remove) makes re-running the cell harmless.
list_of_files = [
    file_path for file_path in list_of_files
    if not os.path.basename(file_path).startswith('traceGroundTruthJSON')
]

In [None]:
# Count the number of lines in each JSON file
nb_each_json = []

for file_path in list_of_files:
    # The context manager closes each handle right away, so no file descriptors
    # are leaked when many files are scanned
    with open(file_path) as f:
        nb_each_json.append(sum(1 for _ in f))

len(list_of_files)

In [None]:
# Load every trace file into its own frame, then concatenate them all at once
# (concatenating inside the loop would re-copy the accumulated data on every iteration).
frames = []
for index, file_path in enumerate(list_of_files):
    with open(file_path) as f:
        # Read all the data in the file: one JSON object per line
        df = pd.DataFrame(json.loads(line) for line in f)
    # Read the following features from the name of the json file (by splitting the file name).
    # os.path.basename is used so this works with any OS path separator and folder depth.
    # labelRec: the label (0 normal, 13 (for this type of attack) attacker)
    # receiver: the receiver ID
    # moduleid: OMNeT++ module number, we don't need it now
    name_parts = os.path.basename(file_path).rsplit('-')
    df['labelRec'] = name_parts[3].rsplit('A')[1]
    df['receiver'] = name_parts[1]
    # df['moduleid'] = name_parts[4]
    frames.append(df)
    print(index)

# pd.concat raises on an empty list, so fall back to an empty frame when no file was found
data = pd.concat(frames) if frames else pd.DataFrame()
    

In [None]:
# Preview the first rows of the combined frame
data.head()

In [None]:
# Keep only the BSMs, i.e. rows where type == 3
# (type == 2 rows are GPS entries for the sent messages; we use the received messages, i.e. BSM).
# .copy() makes bsm an independent frame, so later column assignments do not operate
# on a slice of `data` (avoids pandas' SettingWithCopyWarning).
bsm = data[data.type==3].copy()

In [None]:
# Pos, Spd, Acl, Hed, and their noise features have three axes: x, y, z.
# This function separates their x and y axes (z is always 0 for all of them, so it is not kept).
def clean_dataset(l, droped_features, data):
    """Split vector-valued columns into `_x` / `_y` scalar columns and drop duplicates.

    Parameters:
        l: names of the columns whose cells are sequences; element 0 becomes
           `<name>_x` and element 1 becomes `<name>_y`, then the original column is dropped.
        droped_features: names of additional columns to drop from the result.
        data: input DataFrame. It is not modified.

    Returns:
        A new DataFrame with the split columns appended, duplicate rows removed
        and `droped_features` dropped.
    """
    # Work on a copy so the caller's frame is left untouched (and no
    # SettingWithCopyWarning is raised when `data` is a slice of another frame)
    cleaned = data.copy()

    for feature in l:
        cleaned[feature + '_x'] = cleaned[feature].apply(lambda row: row[0])
        cleaned[feature + '_y'] = cleaned[feature].apply(lambda row: row[1])

    # The sequence-valued columns are dropped before drop_duplicates runs on the remaining columns
    cleaned = cleaned.drop(columns=l)
    cleaned = cleaned.drop_duplicates()
    cleaned = cleaned.drop(columns=droped_features)
    return cleaned


In [None]:
# The features that need to be separated into their different axes:
# each base quantity followed by its noise counterpart
l = [
    base + suffix
    for base in ('pos', 'spd', 'acl', 'hed')
    for suffix in ('', '_noise')
]

In [None]:
# Split the features listed in l into their x and y components; no extra column is dropped
droped_features = list()
bsm = clean_dataset(l=l, droped_features=droped_features, data=bsm)

In [None]:
# Preview the frame after the x / y split
bsm.head()

In [None]:
# Inspect the column dtypes before casting
bsm.dtypes

In [None]:
# Cast the receiver ID and its label to numeric dtypes in a single call
bsm = bsm.astype({'receiver': "float64", 'labelRec': "int64"})
# bsm['sender'] = bsm['sender'].astype("float64")

bsm.head()

In [None]:
# Receiver - Sender matching

# Lookup table with one row per distinct (receiver, labelRec) pair.
# drop_duplicates() returns a new frame, so nothing is modified in place on a
# column slice of bsm (which would raise SettingWithCopyWarning).
labelization = bsm[['receiver','labelRec']].drop_duplicates()
bsm = bsm.drop(columns=['receiver','labelRec'])

In [None]:
# Attach to each message the label recorded for its sender (left join on sender == receiver),
# expose it as 'label' and drop the join key
bsm = (
    bsm.merge(labelization, left_on='sender', right_on='receiver', how='left')
    .rename(columns={'labelRec': 'label'})
    .drop(columns=['receiver'])
)

In [None]:
# Discard every row that has at least one missing value
bsm = bsm.dropna(axis=0, how='any')
bsm.head()

In [None]:
# Summary statistics of the numeric columns
bsm.describe()

In [None]:
# Column names of the frame
bsm.columns

In [None]:
# Column dtypes after the merge and the removal of incomplete rows
bsm.dtypes

In [None]:
# Store the label as an integer
bsm = bsm.astype({'label': "int64"})

In [None]:
# Number of rows in the dataset
len(bsm)

In [None]:
# How many rows carry each label: 0 (normal vehicle) and 13 (attacker)
bsm['label'].value_counts()

# Classification

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from collections import Counter

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

In [None]:
def classification_attack(method,veremi):
    """Train a supervised classifier on the BSM frame and score it on a held-out split.

    Parameters:
        method: 'RF' (random forest) or 'Xgboost'.
        veremi: DataFrame with a 'label' column (0 or 13) plus the columns 'type',
            'rcvTime', 'sendTime', 'sender', 'senderPseudo' and 'messageID', which
            are excluded from the feature set. The frame is not modified.

    Returns:
        Tuple (accuracy, weighted precision, weighted recall, weighted F1,
        fit time in seconds, prediction time in seconds).

    Raises:
        ValueError: if `method` is not one of the supported names.
    """
    # Fail early with a clear message instead of hitting an unbound y_pred after the split
    if method not in ('RF', 'Xgboost'):
        raise ValueError("Unknown method %r; expected 'RF' or 'Xgboost'" % (method,))

    # Our labels are 0 or 13; we need 0-1 for classification algorithms. Convert 13 to 1.
    # The conversion is done on a new Series so the caller's frame keeps its original labels.
    y = veremi['label'].replace(13,1)

    # Feature set X: everything except the label and the identifier / timestamp columns
    X = veremi.drop(columns=['label','type', 'rcvTime', 'sendTime', 'sender', 'senderPseudo', 'messageID'])

    # Data split: 80 % train / 20 % test, stratified on the label
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=0, stratify=y)

    if method=='RF':
        # Random Forest
        model = RandomForestClassifier(n_estimators=50, bootstrap=True, random_state=0)
    else:
        # XGBoost
        model = XGBClassifier(random_state=0)

    # Time the training and the prediction separately
    start = time.time()
    model.fit(X_train, y_train)
    timefit = time.time() - start

    start = time.time()
    y_pred = model.predict(X_test)
    timepred = time.time() - start

    report = classification_report(y_test,y_pred,output_dict=True)

    return(report['accuracy'], report['weighted avg']['precision'], report['weighted avg']['recall'],
          report['weighted avg']['f1-score'], timefit, timepred)


## Outputs and Storage

In [None]:
# Empty results table: one row per classifier, one column per metric / timing
results_classifying = pd.DataFrame(
    columns=['Acc','Pre','Rec','F1s','TimeFit','TimePred'],
    index=['Xgboost','RF'],
)

In [None]:
# Fill one row per classifier. Rows are written through .loc: DataFrame.xs is a
# read-only selector, and assigning through it may write to a temporary copy
# and leave the table unchanged.
results_classifying.loc['Xgboost'] = list(classification_attack('Xgboost',bsm))
print(results_classifying)
results_classifying.loc['RF'] = list(classification_attack('RF',bsm))
print(results_classifying)

results_classifying.to_excel('results_classifying.xlsx')

In [None]:
# Display the classification results table
results_classifying

In [None]:
# Persist the prepared frame (index included) as CSV
bsm.to_csv(path_or_buf='bsm.csv')

In [None]:
### Read the saved data
# The first CSV column is read back as the index; otherwise the index saved with
# the file would reappear as a spurious 'Unnamed: 0' data column.
bsm_copy = pd.read_csv('bsm.csv', index_col=0)
bsm_copy.head()

## Read the saved results
#results_classifying_copy = pd.read_excel('results_classifying.xlsx')
#results_classifying_copy.rename(columns={'Unnamed: 0': 'method'}, inplace=True)
#results_classifying_copy.set_index('method', inplace=True)
#print(results_classifying_copy)

# Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import classification_report
from sklearn.metrics import homogeneity_score, completeness_score
from sklearn.preprocessing import StandardScaler

In [None]:
def clustering_attack(method,veremi):
    """Fit a two-cluster model on the scaled BSM features and score it against the labels.

    Parameters:
        method: 'kMeans' or 'GMM' (Gaussian mixture model).
        veremi: DataFrame with a 'label' column (0 or 13) plus the columns 'type',
            'rcvTime', 'sendTime', 'sender', 'senderPseudo' and 'messageID', which
            are excluded from the feature set. The frame is not modified.

    Returns:
        Tuple (accuracy, weighted precision, weighted recall, weighted F1,
        fit time in seconds, prediction time in seconds).

    Raises:
        ValueError: if `method` is not one of the supported names.
    """
    # Fail early with a clear message instead of hitting an unbound y_pred after the split
    if method not in ('kMeans', 'GMM'):
        raise ValueError("Unknown method %r; expected 'kMeans' or 'GMM'" % (method,))

    # Our labels are 0 or 13; we need 0-1 to compare with the two cluster ids. Convert 13 to 1.
    # The conversion is done on a new Series so the caller's frame keeps its original labels.
    y = veremi['label'].replace(13,1)

    # Feature set X: everything except the label and the identifier / timestamp columns
    X = veremi.drop(columns=['label','type', 'rcvTime', 'sendTime', 'sender', 'senderPseudo', 'messageID'])

    # Data split: 80 % train / 20 % test, stratified on the label
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=0, stratify=y)

    # Fit the scaler on the training data only and reuse its statistics for the test
    # data; re-fitting on the test set would scale the two sets inconsistently.
    scaler = StandardScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)

    if method=='kMeans':
        # k-Means
        model = KMeans(n_clusters=2, random_state=0)
    else:
        # Gaussian Mixture Model
        model = GaussianMixture(n_components=2, random_state=0)

    # Time the fitting and the prediction separately
    start = time.time()
    model.fit(scaled_X_train)
    timefit = time.time() - start

    start = time.time()
    y_pred = model.predict(scaled_X_test)
    timepred = time.time() - start

    # NOTE(review): cluster ids are assigned arbitrarily by the clustering model, so
    # cluster 0 / 1 is not guaranteed to correspond to label 0 / 1. The report below
    # compares them directly -- confirm this is intended or map clusters to labels first.
    report = classification_report(y_test,y_pred,output_dict=True)

    return(report['accuracy'], report['weighted avg']['precision'], report['weighted avg']['recall'],
          report['weighted avg']['f1-score'], timefit, timepred)


## Outputs and Storage

In [None]:
# Empty results table: one row per clustering method, one column per metric / timing
results_cluster = pd.DataFrame(
    columns=['Acc','Pre','Rec','F1s','TimeFit','TimePred'],
    index=['kMeans','GMM'],
)

In [None]:
# Fill one row per clustering method. Rows are written through .loc: DataFrame.xs
# is a read-only selector, and assigning through it may write to a temporary copy
# and leave the table unchanged.
results_cluster.loc['kMeans'] = list(clustering_attack('kMeans',bsm))
print(results_cluster)
results_cluster.loc['GMM'] = list(clustering_attack('GMM',bsm))
print(results_cluster)

results_cluster.to_excel('results_cluster.xlsx')

In [None]:
# Display the clustering results table
results_cluster

# Visualization

In [None]:
# Binarize the label column (13 -> 1), then rebuild the same features / split as used for classification
bsm['label'] = bsm['label'].replace({13: 1})
y = bsm['label']
X = bsm.drop(
    columns=['label','type', 'rcvTime', 'sendTime', 'sender', 'senderPseudo', 'messageID'],
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

In [None]:

# Train an XGBoost classifier with default settings and predict the held-out set
xgb = XGBClassifier()
xgb.fit(X=X_train, y=y_train)
y_pred = xgb.predict(X=X_test)

### Confusion Matrix

In [None]:
# Confusion matrix of the XGBoost predictions, drawn as an annotated heatmap
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

fig, ax = plt.subplots(figsize=(6,6))
sns.heatmap(cm, annot=True, cbar=False, cmap='Blues', fmt='.0f', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()

### AUC (Area Under the Curve) and Error

In [None]:
# Track AUC and error on both the training and the test set after every boosting round
eval_set = [(X_train, y_train), (X_test, y_test)]
eval_metric = ["auc","error"]

# The metrics are passed to the constructor: the `eval_metric` argument of fit()
# was deprecated in XGBoost 1.6 and removed in the 2.x series.
xgb = XGBClassifier(eval_metric=eval_metric)

xgb.fit(X_train, y_train, eval_set=eval_set, verbose=True)

In [None]:
# Per-round metric history; 'validation_0' / 'validation_1' follow the order of eval_set
results = xgb.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(epochs)

In [None]:
# AUC per boosting round for the train and test evaluation sets
fig, ax = pyplot.subplots()
for eval_name, legend_label in (('validation_0', 'Train'), ('validation_1', 'Test')):
    ax.plot(x_axis, results[eval_name]['auc'], label=legend_label)
ax.legend()
ax.set_ylabel('AUC')
ax.set_title('XGBoost AUC')
pyplot.show()

In [None]:
# Classification error per boosting round for the train and test evaluation sets
fig, ax = pyplot.subplots()
for eval_name, legend_label in (('validation_0', 'Train'), ('validation_1', 'Test')):
    ax.plot(x_axis, results[eval_name]['error'], label=legend_label)
ax.legend()
ax.set_ylabel('Error')
ax.set_title('XGBoost Error')
pyplot.show()