Virtual KITTI Dataloader

In [1]:
import pandas as pd
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt

annotation_folder = r'C:\Arbeitsordner\Abgaben_repo\vkitti_2.0.3_textgt\Scene01\30-deg-right'
data_folder = r'C:\Arbeitsordner\Abgaben_repo\vkitti_2.0.3_rgb\Scene01\30-deg-right\frames\rgb\Camera_0'

pose_df = pd.read_csv(os.path.join(annotation_folder, 'pose.txt'), delim_whitespace=True)
info_df = pd.read_csv(os.path.join(annotation_folder, 'info.txt'), delim_whitespace=True)
bbox_df = pd.read_csv(os.path.join(annotation_folder, 'bbox.txt'), delim_whitespace=True)
colors_df = pd.read_csv(os.path.join(annotation_folder, 'colors.txt'), delim_whitespace=True)
intrinsic_df = pd.read_csv(os.path.join(annotation_folder, 'intrinsic.txt'), delim_whitespace=True)
extrinsic_df = pd.read_csv(os.path.join(annotation_folder, 'extrinsic.txt'), delim_whitespace=True)

# mapping dictionary from labels to colors
label_to_color = {}
for _, row in colors_df.iterrows():
    label_to_color[row['Category']] = (row['r'], row['g'], row['b'])
    print(label_to_color)

# preprocess images
def preprocess_image(image, target_size=(224, 224)):
    image = cv2.resize(image, target_size)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = image.astype(np.float32) / 255.0
    return image

# Match RGB images to labels and 3D bounding boxes
def match_data(rgb_folder, pose_df, info_df, bbox_df, label_to_color):
    data = []
    for root, dirs, files in os.walk(rgb_folder):
        for file in files:
            if file.endswith('.jpg'): 

                frame_id = file.split('_')[1]
                frame_id = int(frame_id.split('.')[0])
                
                # Filter the bounding boxes for the current frame
                frame_bbox = bbox_df[bbox_df['frame'] == frame_id]
                print(frame_bbox)
                
                camera_id = 0 if 'Camera_0' in file else 1
                
                # Get the intrinsic parameters for the current camera
                intrinsic_params = intrinsic_df[(intrinsic_df['frame'] == frame_id) & (intrinsic_df['cameraID'] == camera_id)]
                K = intrinsic_params[['K[0,0]', 'K[1,1]', 'K[0,2]', 'K[1,2]']].values[0]
                
                # Get the extrinsic parameters for the current camera
                extrinsic_params = extrinsic_df[(extrinsic_df['frame'] == frame_id) & (extrinsic_df['cameraID'] == camera_id)]
                R = extrinsic_params[['r1,1', 'r1,2', 'r1,3', 'r2,1', 'r2,2', 'r2,3', 'r3,1', 'r3,2', 'r3,3']].values.reshape(3, 3)
                t = extrinsic_params[['t1', 't2', 't3']].values
                
             
                image_path = os.path.join(root, file)
                image = cv2.imread(image_path)
                
                # check how to get the correct label for the bounding boxes in each image? What file to use to get the labels?
                for idx, row in frame_bbox.iterrows():
                    track_id = row['trackID']
                    label_info = info_df[info_df['trackID'] == track_id]
                    label = label_info['label'].values[0]
                    
                    color = label_to_color.get(label, (255, 255, 255))  
                    
                   
                    
                    data.append({'image': preprocess_image(roi), 'label': label, 'color': color, 'bbox': [obj_left, obj_top, obj_right, obj_bottom]})
                    
    return data

data = match_data(annotation_folder, pose_df, info_df, bbox_df, label_to_color)
print(data)

{'Terrain': (210, 0, 200)}
{'Terrain': (210, 0, 200), 'Sky': (90, 200, 255)}
{'Terrain': (210, 0, 200), 'Sky': (90, 200, 255), 'Tree': (0, 199, 0)}
{'Terrain': (210, 0, 200), 'Sky': (90, 200, 255), 'Tree': (0, 199, 0), 'Vegetation': (90, 240, 0)}
{'Terrain': (210, 0, 200), 'Sky': (90, 200, 255), 'Tree': (0, 199, 0), 'Vegetation': (90, 240, 0), 'Building': (140, 140, 140)}
{'Terrain': (210, 0, 200), 'Sky': (90, 200, 255), 'Tree': (0, 199, 0), 'Vegetation': (90, 240, 0), 'Building': (140, 140, 140), 'Road': (100, 60, 100)}
{'Terrain': (210, 0, 200), 'Sky': (90, 200, 255), 'Tree': (0, 199, 0), 'Vegetation': (90, 240, 0), 'Building': (140, 140, 140), 'Road': (100, 60, 100), 'GuardRail': (250, 100, 255)}
{'Terrain': (210, 0, 200), 'Sky': (90, 200, 255), 'Tree': (0, 199, 0), 'Vegetation': (90, 240, 0), 'Building': (140, 140, 140), 'Road': (100, 60, 100), 'GuardRail': (250, 100, 255), 'TrafficSign': (255, 255, 0)}
{'Terrain': (210, 0, 200), 'Sky': (90, 200, 255), 'Tree': (0, 199, 0), 'Vegetat

KITTI Dataloader

In [1]:
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt  


def load_data(image_dir, label_dir):
    data = []
    image_files = sorted(os.listdir(image_dir))
    
    for image_file in image_files:
        if image_file.endswith('.png'):
            image_path = os.path.join(image_dir, image_file)
            label_path = os.path.join(label_dir, image_file[:-4] + ".txt")
            if os.path.isfile(label_path):
                labels = parse_label(label_path)
                data.append({
                    "image_path": image_path,
                    "labels": labels
                })
    return data

def parse_label(label_file):
    with open(label_file, 'r') as f:
        lines = f.readlines()

    labels = []
    for line in lines:
        line = line.strip()
        label_info = line.split(' ')
        if label_info[0] in ['Car', 'Truck', 'Pedestrian', 'Cyclist']:
            label = {
                "type": label_info[0],
                "truncated": float(label_info[1]),
                "occluded": int(label_info[2]),
                "alpha": float(label_info[3]),
                "bbox": [float(x) for x in label_info[4:8]],
                "dimensions": [float(x) for x in label_info[8:11]],
                "location": [float(x) for x in label_info[11:14]],
                "rotation_y": float(label_info[14]),
                "score": float(label_info[15]) if len(label_info) > 15 else None
            }
            labels.append(label)

    return labels

def visualize_data(data):
    for entry in data:
        image_path = entry["image_path"]
        labels = entry["labels"]
        image = cv2.imread(image_path)
        
        # Plot the image
        plt.figure(figsize=(10, 5))
        plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        plt.axis('off')

        # Plot bounding boxes
        for label in labels:
            bbox = label["bbox"]
            cv2.rectangle(image, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 255, 0), 2)
        
        plt.show()


image_dir = r"C:\Arbeitsordner\Abgaben_repo\Datasets\KITTI\data_object_image_3\training\image_3"
label_dir = r"C:\Arbeitsordner\Abgaben_repo\Datasets\KITTI\data_object_label_2\training\label_2"
data = load_data(image_dir, label_dir)
# visualize_data(data)


In [2]:
data

[{'image_path': 'C:\\Arbeitsordner\\Abgaben_repo\\Datasets\\KITTI\\data_object_image_3\\training\\image_3\\000000.png',
  'labels': [{'type': 'Pedestrian',
    'truncated': 0.0,
    'occluded': 0,
    'alpha': -0.2,
    'bbox': [712.4, 143.0, 810.73, 307.92],
    'dimensions': [1.89, 0.48, 1.2],
    'location': [1.84, 1.47, 8.41],
    'rotation_y': 0.01,
    'score': None}]},
 {'image_path': 'C:\\Arbeitsordner\\Abgaben_repo\\Datasets\\KITTI\\data_object_image_3\\training\\image_3\\000001.png',
  'labels': [{'type': 'Truck',
    'truncated': 0.0,
    'occluded': 0,
    'alpha': -1.57,
    'bbox': [599.41, 156.4, 629.75, 189.25],
    'dimensions': [2.85, 2.63, 12.34],
    'location': [0.47, 1.49, 69.44],
    'rotation_y': -1.56,
    'score': None},
   {'type': 'Car',
    'truncated': 0.0,
    'occluded': 0,
    'alpha': 1.85,
    'bbox': [387.63, 181.54, 423.81, 203.12],
    'dimensions': [1.67, 1.87, 3.69],
    'location': [-16.53, 2.39, 58.49],
    'rotation_y': 1.57,
    'score': None

WAYMO Dataset

In [2]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.16.1-cp311-cp311-win_amd64.whl.metadata (3.5 kB)
Collecting tensorflow-intel==2.16.1 (from tensorflow)
  Downloading tensorflow_intel-2.16.1-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.16.1->tensorflow)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.16.1->tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow-intel==2.16.1->tensorflow)
  Using cached flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.16.1->tensorflow)
  Using cached gast-0.5.4-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.16.1->tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting h5py>=3.10.0 (from tensor

In [3]:
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds

dataset, info = tfds.load('waymo_open_dataset/v1.0', data_dir='gs://waymo_open_dataset_v_1_0_0_individual_files/tensorflow_datasets', with_info=True)

# preprocess the dataset
def preprocess(example):
    image = tf.image.decode_jpeg(example['image'], channels=3)
    return image

dataset = dataset.map(preprocess)

# Plot an image from the dataset
for example in dataset.take(1):
    plt.imshow(example.numpy())
    plt.title("Image from Waymo dataset")
    plt.axis('off')
    plt.show()


ModuleNotFoundError: No module named 'tensorflow_datasets'

# Training
Train on KITTI Dataset

In [6]:
import os
import numpy as np
import cv2
import tensorflow as tf
from sklearn.model_selection import train_test_split


# print the training progress
class PrintProgress(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        print(f"Epoch {epoch+1}/{self.params['epochs']}, Loss: {logs['loss']:.4f}, Accuracy: {logs['accuracy']:.4f}, Validation Loss: {logs['val_loss']:.4f}, Validation Accuracy: {logs['val_accuracy']:.4f}")


class KITTIDataLoader:
    def __init__(self, kitti_image_dir, kitti_label_dir):
        self.image_dir = kitti_image_dir
        self.label_dir = kitti_label_dir

    def load_data(self):
        data = []
        image_files = os.listdir(self.image_dir)
        label_files = os.listdir(self.label_dir)
        label_files = [file for file in label_files if file.endswith('.txt')]

        for label_file in label_files:
            image_file = label_file[:-4] + ".png"
            image_path = os.path.join(self.image_dir, image_file)
            label_path = os.path.join(self.label_dir, label_file)

            if os.path.exists(image_path):
                labels = self.parse_label(label_path)
                data.append({
                    "image_path": image_path,
                    "labels": labels
                })

        return data

    def parse_label(self, label_file):
        with open(label_file, 'r') as f:
            lines = f.readlines()

        labels = []
        for line in lines:
            line = line.strip().split(' ')
            if line[0] in ['Car', 'Truck', 'Pedestrian', 'Cyclist']:
                label = {
                    "type": line[0],
                    "bbox": [float(x) for x in line[4:8]]
                }
                labels.append(label)

        return labels

# Simple CNN model
def create_3d_object_detection_model(input_shape):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(tf.keras.layers.MaxPooling2D((2, 2)))
    model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D((2, 2)))
    model.add(tf.keras.layers.Conv2D(128, (3, 3), activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D((2, 2)))
    model.add(tf.keras.layers.Conv2D(256, (3, 3), activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D((2, 2)))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(512, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.5))
    model.add(tf.keras.layers.Dense(256, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.5))
    model.add(tf.keras.layers.Dense(128, activation='relu'))
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    model.add(tf.keras.layers.Dense(4)) 
    return model

image_dir = r"C:\Arbeitsordner\Abgaben_repo\Datasets\KITTI\data_object_image_3\training\image_3"
label_dir = r"C:\Arbeitsordner\Abgaben_repo\Datasets\KITTI\data_object_label_2\training\label_2"


data_loader = KITTIDataLoader(image_dir, label_dir)

data = data_loader.load_data()

print(data)

# split value? Quite small dataset...
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# what is a suitable input size? 224,224,3?
input_shape = (100, 100, 3)  

model = create_3d_object_detection_model(input_shape)

model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

# trainign 
X_train = []
y_train = []
for entry in train_data:
    image_path = entry["image_path"]
    labels = entry["labels"]
    image = cv2.imread(image_path)
    image = cv2.resize(image, (input_shape[1], input_shape[0])) 
    X_train.append(image)
    # Assuming only one object per image for simplicity??? How should we use multiple labels for an image?
    label = labels[0]
    bbox = label["bbox"]

    y_train.append(bbox)

X_train = np.array(X_train)
y_train = np.array(y_train)

model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[PrintProgress()])

# Testing
X_test = []
y_test = []
for entry in test_data:
    image_path = entry["image_path"]
    labels = entry["labels"]
    image = cv2.imread(image_path)
    image = cv2.resize(image, (input_shape[1], input_shape[0]))  
    X_test.append(image)
    # Assuming only one object per image for simplicity??? How should we use multiple labels for an image?
    label = labels[0]
    bbox = label["bbox"]
   
    y_test.append(bbox)

X_test = np.array(X_test)
y_test = np.array(y_test)

loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

[{'image_path': 'C:\\Arbeitsordner\\Abgaben_repo\\Datasets\\KITTI\\data_object_image_3\\training\\image_3\\000000.png', 'labels': [{'type': 'Pedestrian', 'bbox': [712.4, 143.0, 810.73, 307.92]}]}, {'image_path': 'C:\\Arbeitsordner\\Abgaben_repo\\Datasets\\KITTI\\data_object_image_3\\training\\image_3\\000001.png', 'labels': [{'type': 'Truck', 'bbox': [599.41, 156.4, 629.75, 189.25]}, {'type': 'Car', 'bbox': [387.63, 181.54, 423.81, 203.12]}, {'type': 'Cyclist', 'bbox': [676.6, 163.95, 688.98, 193.93]}]}, {'image_path': 'C:\\Arbeitsordner\\Abgaben_repo\\Datasets\\KITTI\\data_object_image_3\\training\\image_3\\000002.png', 'labels': [{'type': 'Car', 'bbox': [657.39, 190.13, 700.07, 223.39]}]}, {'image_path': 'C:\\Arbeitsordner\\Abgaben_repo\\Datasets\\KITTI\\data_object_image_3\\training\\image_3\\000003.png', 'labels': [{'type': 'Car', 'bbox': [614.24, 181.78, 727.31, 284.77]}]}, {'image_path': 'C:\\Arbeitsordner\\Abgaben_repo\\Datasets\\KITTI\\data_object_image_3\\training\\image_3\\00

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 198ms/step - accuracy: 0.5806 - loss: 33.5920Epoch 1/10, Loss: 12.0954, Accuracy: 0.6855, Validation Loss: 5.9444, Validation Accuracy: 0.8588
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 216ms/step - accuracy: 0.5814 - loss: 33.4292 - val_accuracy: 0.8588 - val_loss: 5.9444
Epoch 2/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 208ms/step - accuracy: 0.8179 - loss: 4.9391Epoch 2/10, Loss: 5.1043, Accuracy: 0.8149, Validation Loss: 6.6492, Validation Accuracy: 0.8588
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 225ms/step - accuracy: 0.8179 - loss: 4.9404 - val_accuracy: 0.8588 - val_loss: 6.6492
Epoch 3/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 204ms/step - accuracy: 0.8390 - loss: 4.5821Epoch 3/10, Loss: 4.6901, Accuracy: 0.8410, Validation Loss: 5.4461, Validation Accuracy: 0.8588
[1m131/131[0m [32m━━━━━━━━━━━━━━

KeyboardInterrupt: 

In [7]:
data

[{'image_path': 'C:\\Arbeitsordner\\Abgaben_repo\\Datasets\\KITTI\\data_object_image_3\\training\\image_3\\000000.png',
  'labels': [{'type': 'Pedestrian', 'bbox': [712.4, 143.0, 810.73, 307.92]}]},
 {'image_path': 'C:\\Arbeitsordner\\Abgaben_repo\\Datasets\\KITTI\\data_object_image_3\\training\\image_3\\000001.png',
  'labels': [{'type': 'Truck', 'bbox': [599.41, 156.4, 629.75, 189.25]},
   {'type': 'Car', 'bbox': [387.63, 181.54, 423.81, 203.12]},
   {'type': 'Cyclist', 'bbox': [676.6, 163.95, 688.98, 193.93]}]},
 {'image_path': 'C:\\Arbeitsordner\\Abgaben_repo\\Datasets\\KITTI\\data_object_image_3\\training\\image_3\\000002.png',
  'labels': [{'type': 'Car', 'bbox': [657.39, 190.13, 700.07, 223.39]}]},
 {'image_path': 'C:\\Arbeitsordner\\Abgaben_repo\\Datasets\\KITTI\\data_object_image_3\\training\\image_3\\000003.png',
  'labels': [{'type': 'Car', 'bbox': [614.24, 181.78, 727.31, 284.77]}]},
 {'image_path': 'C:\\Arbeitsordner\\Abgaben_repo\\Datasets\\KITTI\\data_object_image_3\\tra