In [1]:
import numpy as np
import pandas as pd
import time
import os

from sklearn.utils import shuffle

## Load CAN data

In [2]:
start = time.time()
data = None

data_directory = "/Users/david/Library/CloudStorage/iCloud Drive/Documents/masters/thesis/data"

if not os.path.exists(os.path.join(data_directory, "all_data.csv")):
    print("Reading CSV file 1...")
    ornl_data = pd.read_csv(os.path.join(data_directory, "ornl-data", "can_data.csv"),
        dtype={"captureid": int, "vehicle_id": int, "ts": float, "arbitration_id": object, 
               "dlc": int, "data": object, "make": object, "model": object, "year": int})

    print("Reading CSV file 2...")
    stone_data = pd.read_csv(os.path.join(data_directory, "stone-data", "can_data.csv"),
        dtype={"captureid": int, "vehicle_id": int, "Titsme": float, "arbitration_id": object, 
               "dlc": int, "data": object, "make": object, "model": object, "year": int})

    # rename the columns
    ornl_data.columns = ["Capture", "Vehicle", "Time", "ArbID", "DLC", "Data", "Make", "Model", "Year"]
    stone_data.columns = ["Capture", "Vehicle", "Time", "ArbID", "DLC", "Data", "Make", "Model", "Year"]
    
    # combine the dataframes
    print("Combining dataframes...")
    data = pd.concat([ornl_data, stone_data])
    data = data.reset_index(drop=True)
    
    print("Writing to " + os.path.join(data_directory, "all_data.csv"))
    data.to_csv(os.path.join(data_directory, "all_data.csv"), index=False)

else:
    print("File already exists!")
    data = pd.read_csv(os.path.join(data_directory, "all_data.csv"),
        dtype={"Capture": int, "Vehicle": int, "Time": float, "ArbID": object, 
               "DLC": int, "Data": object, "Make": object, "Model": object, "Year": int})

display(data.head(5))
end = time.time()
print("Elapsed time: {:.2f} seconds".format(end - start))

Reading CSV file 1...


FileNotFoundError: [Errno 2] File b'/Users/david/Library/CloudStorage/iCloud Drive/Documents/masters/thesis/data/ornl-data/can_data.csv' does not exist: b'/Users/david/Library/CloudStorage/iCloud Drive/Documents/masters/thesis/data/ornl-data/can_data.csv'

## Build a metadata table

In [24]:
# how long does this take?
start = time.time()

# number of bytes in each snapshot
chunk_size = 1024

# minimum number of snapshots for each arbID
min_snaps = 1

# used to build a dataframe
captures = []
vehicles = []
arbIDs = []
num_bytes = []
num_snaps = []
valid = []

# for each capture...
for i in np.sort(data["Capture"].unique()):
    capture = data[data["Capture"] == i]
    vehicle = capture["Vehicle"].iloc[0]
    
    # for each arbID for capture i...
    for j in capture["ArbID"].unique():
        arbID = capture[capture["ArbID"] == j]
        bytes_count = np.sum(arbID["DLC"])
        
        captures.append(i)
        vehicles.append(vehicle)
        arbIDs.append(j)
        num_bytes.append(bytes_count)
        num_snaps.append(bytes_count // chunk_size)
                
        # every vehicle/arbID combo needs at least one picture
        valid.append("yes" if bytes_count >= chunk_size * min_snaps else "no")

# build a dataframe
metadata = pd.DataFrame(
    np.hstack((
        np.array(vehicles).reshape(-1,1),
        np.array(captures).reshape(-1,1),
        np.array(arbIDs).reshape(-1,1),
        np.array(num_bytes).reshape(-1,1),
        np.array(num_snaps).reshape(-1,1),
        np.array(valid).reshape(-1,1)
    )),
    
    columns = ["Vehicle", "Capture", "ArbID", "Bytes", "Snapshots", "Valid?"]
)

# update the datatypes
for col in metadata.columns.drop(["ArbID", "Valid?"]):
    metadata[col] = pd.to_numeric(metadata[col])

print("Elapsed time: {:.2f} seconds".format(time.time() - start))

Elapsed time: 179.50 seconds


In [31]:
# sort and show the dataframe
metadata = metadata.sort_values(by=["Vehicle", "Capture", "ArbID"]).reset_index(drop=True)
display(metadata.head(5))

valid = metadata[metadata["Valid?"] == "yes"]
invalid = metadata[metadata["Valid?"] == "no"]

valid_arbs = 0
for i in valid["Vehicle"].unique():
    valid_arbs += len(valid[valid["Vehicle"] == i]["ArbID"].unique())

invalid_arbs = 0
for i in invalid["Vehicle"].unique():
    invalid_arbs += len(invalid[invalid["Vehicle"] == i]["ArbID"].unique())

print("{} valid ArbIDs from {} vehicles".format(
    valid_arbs,
    len(valid["Vehicle"].unique())
))

print("{} invalid ArbIDs from {} vehicles".format(
    invalid_arbs,
    len(invalid["Vehicle"].unique())
))

print("{} snapshots".format(np.sum(metadata[metadata["Valid?"] == "yes"]["Snapshots"])))

Unnamed: 0,Vehicle,Capture,ArbID,Bytes,Snapshots,Valid?
0,1,1,1217,7464,7,yes
1,1,1,2015,440192,429,yes
2,1,1,2024,440192,429,yes
3,1,1,705,214256,209,yes
4,1,1,708,288792,282,yes


1012 valid ArbIDs from 20 vehicles
164 invalid ArbIDs from 12 vehicles
297253 snapshots


In [33]:
total_snaps = np.sum(valid["Snapshots"])

for i in valid["Vehicle"].unique():
    temp = valid[valid["Vehicle"] == i]
    num_arbs = len(temp["ArbID"].unique())
    num_snaps = np.sum(temp["Snapshots"])
    
    print("{} & {} & {} & {:.2f} \% \\\\".format(i, num_arbs, num_snaps, 100. * num_snaps / total_snaps))

1 & 8 & 4440 & 1.49 \% \\
2 & 47 & 6895 & 2.32 \% \\
3 & 49 & 141847 & 47.72 \% \\
4 & 103 & 14633 & 4.92 \% \\
5 & 112 & 43377 & 14.59 \% \\
6 & 79 & 9511 & 3.20 \% \\
7 & 124 & 35142 & 11.82 \% \\
8 & 35 & 5018 & 1.69 \% \\
9 & 21 & 8211 & 2.76 \% \\
101 & 29 & 4102 & 1.38 \% \\
102 & 50 & 1824 & 0.61 \% \\
103 & 62 & 1757 & 0.59 \% \\
104 & 78 & 2182 & 0.73 \% \\
105 & 24 & 3791 & 1.28 \% \\
106 & 28 & 1695 & 0.57 \% \\
107 & 42 & 2198 & 0.74 \% \\
108 & 38 & 3020 & 1.02 \% \\
109 & 26 & 2553 & 0.86 \% \\
110 & 19 & 2974 & 1.00 \% \\
111 & 38 & 2083 & 0.70 \% \\


## Write the snapshots to files

In [23]:
filename = "../snapshots/snapshots.csv"

# delete any snapshot files that already exist
# we do this because we open the file with append,
# but we don't want to append to previously-created files
if os.path.exists(filename):
    os.remove(filename)

# how long does this take?
start = time.time()

# make the big file
f = open(filename, "w+")
f.write("Vehicle,Capture,ArbID")
for i in range(1024):
    f.write(",Data " + str(i + 1))
f.write("\n")

# we only want ArbIDs with at least <min_snaps> snapshots
valid = metadata[metadata["Valid?"] == "yes"]

# for every capture (each is tied to a vehicle)...
for count, capture in enumerate(valid["Capture"].unique()):
    print("Starting capture {} ({}/{})...".format(capture, count + 1, len(valid["Capture"].unique())))
    capture = valid[valid["Capture"] == capture]
    vehicle = capture["Vehicle"].iloc[0]
    
    # for every arbID in this capture...
    for _, row in capture.iterrows():
        # we only care about one arbID from one capture of one vehicle
        can = data[data["Capture"] == row["Capture"]]
        can = can[can["ArbID"] == row["ArbID"]]
        
        # join all hex data values into one big string
        can_data = can["Data"].str.cat(sep="")
        
        # split big string into array of hex bytes
        can_data = [can_data[i:i+2] for i in range(0, len(can_data), 2)]
        
        # convert each hex byte into a three-digit integer
        for i, d in enumerate(can_data):
            can_data[i] = str(int(d, 16)).zfill(3)
        
        # write every set of <chunk_size> bytes to the file
        for i in range(0, len(can_data) - chunk_size, chunk_size):
            f.write("{},{},{},{}\n".format(
                row["Vehicle"],
                row["Capture"],
                row["ArbID"],
                ",".join(can_data[i:i+chunk_size])
            ))

f.close()
elapsed = time.time() - start
print("Elapsed time: {:.2f} seconds ({:.2f} minutes)".format(elapsed, elapsed / 60))

KeyError: 'Capture'