In [1]:
import pandas as pd

def transform_dataset(dataset: pd.DataFrame):
    df = dataset.copy()
    df['CAN ID'] = df['CAN ID'].apply(lambda i: int(i, 16))
    df['Flag'] = df['Flag'].apply(lambda x: 0 if x == 'R' else 1)

    for i in range(8):
        df[f'DATA{i}'] = df[f'DATA{i}'].apply(lambda i: int(i, 16))

    for col in df.columns:
        df[col] = pd.to_numeric(df[col])
    
    return df

folder_path = r'C:\Users\sungj\Downloads\Datasets\9) Car-Hacking Dataset\{0}_dataset.csv'
column_names = ['Timestamp', 'CAN ID', 'DLC', 'DATA0', 'DATA1', 'DATA2',
                'DATA3', 'DATA4', 'DATA5', 'DATA6', 'DATA7', 'Flag']
attacks = {}
for attack in ['DoS', 'Fuzzy', 'gear', 'RPM']:
    data = pd.read_csv(folder_path.format(attack), names=column_names)
    data = data.dropna()
    attacks[attack] = data
    
for att, data in attacks.items():
    attacks[att] = transform_dataset(data)
    
combined_dataset = pd.concat(attacks.values()).values

data = pd.DataFrame(combined_dataset, columns=column_names)
display(data.head(10))

Unnamed: 0,Timestamp,CAN ID,DLC,DATA0,DATA1,DATA2,DATA3,DATA4,DATA5,DATA6,DATA7,Flag
0,1478198000.0,790.0,8.0,5.0,33.0,104.0,9.0,33.0,33.0,0.0,111.0,0.0
1,1478198000.0,399.0,8.0,254.0,91.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0
2,1478198000.0,608.0,8.0,25.0,33.0,34.0,48.0,8.0,142.0,109.0,58.0,0.0
3,1478198000.0,672.0,8.0,100.0,0.0,154.0,29.0,151.0,2.0,189.0,0.0,0.0
4,1478198000.0,809.0,8.0,64.0,187.0,127.0,20.0,17.0,32.0,0.0,20.0,0.0
5,1478198000.0,1349.0,8.0,216.0,0.0,0.0,138.0,0.0,0.0,0.0,0.0,0.0
6,1478198000.0,2.0,8.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,17.0,0.0
7,1478198000.0,339.0,8.0,0.0,33.0,16.0,255.0,0.0,255.0,0.0,0.0,0.0
8,1478198000.0,704.0,8.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1478198000.0,304.0,8.0,8.0,128.0,0.0,255.0,49.0,128.0,11.0,127.0,0.0


In [2]:
import numpy as np
from sklearn.decomposition import PCA

data = pd.DataFrame(combined_dataset, columns=column_names)

X = data.drop(columns='Flag', axis=1)

pca = PCA(n_components=6)
X_pca = pca.fit_transform(X)

print(f'Explained variance ratio of each component:\n {pca.explained_variance_ratio_}')
print(f'Shape of original data: {X.shape}')
    
feature_importance = pd.Series(pca.components_[0], index=X.columns)
feature_importance = feature_importance.abs().sort_values(ascending=False)
print("Feature importance ranking:")
print(feature_importance)

Explained variance ratio of each component:
 [9.83618392e-01 1.27623238e-02 1.05220958e-03 7.67651594e-04
 5.12049329e-04 4.75698787e-04]
Shape of original data: (16368810, 11)
Feature importance ranking:
Timestamp    0.999930
CAN ID       0.010314
DATA3        0.004695
DATA7        0.002709
DATA4        0.001480
DATA5        0.000797
DATA2        0.000666
DATA6        0.000415
DATA1        0.000238
DATA0        0.000025
DLC          0.000000
dtype: float64


In [3]:
from sklearn.decomposition import TruncatedSVD

df = data.drop(columns='Flag', axis=1)

tsvd = TruncatedSVD(n_components=6)
df_tsvd = tsvd.fit_transform(df)

print(f'Explained variance ratio of each component:\n {tsvd.explained_variance_ratio_}')
print(f'Shape of original data: {df.shape}')

feature_importance = pd.Series(tsvd.components_[0], index=df.columns)
feature_importance = feature_importance.abs().sort_values(ascending=False)
print("Feature importance ranking:")
print(feature_importance)

Explained variance ratio of each component:
 [9.83482829e-01 1.28692872e-02 1.06874636e-03 7.68779794e-04
 5.18695766e-04 4.76576343e-04]
Shape of original data: (16368810, 11)
Feature importance ranking:
Timestamp    1.000000e+00
CAN ID       4.719238e-07
DATA3        5.324813e-08
DATA5        4.187071e-08
DATA0        3.941343e-08
DATA7        3.666715e-08
DATA4        3.483879e-08
DATA1        3.100053e-08
DATA2        2.741569e-08
DATA6        1.781005e-08
DLC          5.412000e-09
dtype: float64
