In [62]:
from typing import Any, Dict, List, Optional, Tuple, Union
from pathlib import Path

import pandas as pd

In [63]:
def return_list_of_files(
    directory: Union[str, Path],
    extensions: Optional[List[str]] = None,
    return_string: bool = True,
) -> Union[List[str], List[Path]]:
    """Recursively list the files stored below the sub-directories of ``directory``.

    The search uses the glob pattern ``[!__]*/**/[!__]*``, which means:

    * only files located inside at least one sub-directory of ``directory``
      are matched (files sitting directly in ``directory`` are not);
    * first-level sub-directories and final path components whose name starts
      with an underscore are skipped.

    Only regular files are returned; directories are never included, even if
    their name ends with one of the requested extensions.

    Note:
        all_image_extensions = [".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif"]

    Args:
        directory (Union[str, Path]): The directory to search.
        extensions (Optional[List[str]]): Suffixes (including the leading dot,
            compared case-sensitively against ``Path.suffix``) to keep.
            If None, all files are returned. Defaults to None.
        return_string (bool): Whether to return POSIX-style strings instead of
            ``Path`` objects. Defaults to True.

    Returns:
        Union[List[str], List[Path]]: Sorted list of matching files. When
        ``extensions`` is None the paths are resolved to absolute paths;
        otherwise they keep the form of ``directory`` (e.g. stay relative).
    """
    pattern = "[!__]*/**/[!__]*"
    root = Path(directory)

    if extensions is None:
        # The unfiltered listing has always been resolved to absolute paths;
        # kept as-is so existing callers see the same path form.
        candidates = root.resolve().glob(pattern)
    else:
        candidates = (
            path for path in root.glob(pattern) if path.suffix in extensions
        )

    # The glob also yields directories; without this check a folder named
    # e.g. "crops.jpg" would be reported as an image file.
    files = [path for path in candidates if path.is_file()]

    if return_string:
        # Sort after conversion so the order is plain string order.
        return sorted(path.as_posix() for path in files)
    return sorted(files)

In [64]:
# Gather every .jpg/.png/.jpeg file below the COCO image folder as POSIX strings.
train_dir: Path = Path("../datasets/coco2017/image/")
train_images = return_list_of_files(
    directory=train_dir,
    extensions=[".jpg", ".png", ".jpeg"],
    return_string=True,
)


In [65]:
# Sanity check: number of image files found.
len(train_images)

118287

In [66]:
# Gather every .txt label file below the COCO labels folder as POSIX strings.
# NOTE: `train_dir` is rebound here; from this cell on it points at the labels folder.
train_dir: Path = Path("../datasets/coco2017/labels/")
train_labels = return_list_of_files(
    directory=train_dir,
    extensions=[".txt"],
    return_string=True,
)

In [67]:
# Sanity check: number of label files found (differs from the image count).
len(train_labels)

122218

In [68]:
# Order both path lists in place; slice assignment keeps the same list objects.
# (return_list_of_files already sorts when `extensions` is given, so this is a safeguard.)
train_images[:] = sorted(train_images)
train_labels[:] = sorted(train_labels)

In [71]:
# One single-column frame per path list; they are joined on an extracted id later.
df = pd.DataFrame(data={"image_path": train_images})
df_label = pd.DataFrame(data={"annotation_path": train_labels})


In [73]:
# Preview the label frame (pandas truncates the display of long frames).
df_label

Unnamed: 0,annotation_path
0,../datasets/coco2017/labels/train2017/train/00...
1,../datasets/coco2017/labels/train2017/train/00...
2,../datasets/coco2017/labels/train2017/train/00...
3,../datasets/coco2017/labels/train2017/train/00...
4,../datasets/coco2017/labels/train2017/train/00...
...,...
122213,../datasets/coco2017/labels/val2017/0000005813...
122214,../datasets/coco2017/labels/val2017/0000005813...
122215,../datasets/coco2017/labels/val2017/0000005814...
122216,../datasets/coco2017/labels/val2017/0000005816...


In [94]:
# Join key: the last run of digits in the path, kept as a string
# (e.g. ".../000000000009.jpg" -> "000000000009").
df["id"] = df["image_path"].str.extract(pat=r"(\d+)(?!.*\d)", expand=False)

In [96]:
# Join key: the last run of digits in the annotation path, kept as a string so
# that zero padding is preserved.
df_label["id"] = df_label["annotation_path"].str.extract(
    pat=r"(\d+)(?!.*\d)", expand=False
)

In [80]:
# NOTE(review): the saved output below already shows the "data/" prefix because this
# cell was last executed (In [80]) after the prefix-rewrite cell (In [74]); on a fresh
# top-to-bottom run it would show the "../datasets/" prefix instead.
df["image_path"]

0         data/coco2017/image/train2017/000000000009.jpg
1         data/coco2017/image/train2017/000000000025.jpg
2         data/coco2017/image/train2017/000000000030.jpg
3         data/coco2017/image/train2017/000000000034.jpg
4         data/coco2017/image/train2017/000000000036.jpg
                               ...                      
118282    data/coco2017/image/train2017/000000581906.jpg
118283    data/coco2017/image/train2017/000000581909.jpg
118284    data/coco2017/image/train2017/000000581913.jpg
118285    data/coco2017/image/train2017/000000581921.jpg
118286    data/coco2017/image/train2017/000000581929.jpg
Name: image_path, Length: 118287, dtype: object

In [74]:
# Swap every literal "../datasets" in the image paths for "data"
# (plain substring replacement, not a regular expression).
df["image_path"] = df["image_path"].str.replace("../datasets", "data", regex=False)


In [75]:
# Swap every literal "../datasets" in the annotation paths for "data"
# (plain substring replacement, not a regular expression).
annotation_paths = df_label["annotation_path"]
df_label["annotation_path"] = annotation_paths.str.replace(
    "../datasets", "data", regex=False
)

In [76]:
# Preview the image frame after the path prefix rewrite.
df

Unnamed: 0,image_path
0,data/coco2017/image/train2017/000000000009.jpg
1,data/coco2017/image/train2017/000000000025.jpg
2,data/coco2017/image/train2017/000000000030.jpg
3,data/coco2017/image/train2017/000000000034.jpg
4,data/coco2017/image/train2017/000000000036.jpg
...,...
118282,data/coco2017/image/train2017/000000581906.jpg
118283,data/coco2017/image/train2017/000000581909.jpg
118284,data/coco2017/image/train2017/000000581913.jpg
118285,data/coco2017/image/train2017/000000581921.jpg


In [77]:
# Preview the label frame after the path prefix rewrite.
df_label

Unnamed: 0,annotation_path
0,data/coco2017/labels/train2017/train/000000000...
1,data/coco2017/labels/train2017/train/000000000...
2,data/coco2017/labels/train2017/train/000000000...
3,data/coco2017/labels/train2017/train/000000000...
4,data/coco2017/labels/train2017/train/000000000...
...,...
122213,data/coco2017/labels/val2017/000000581317.txt
122214,data/coco2017/labels/val2017/000000581357.txt
122215,data/coco2017/labels/val2017/000000581482.txt
122216,data/coco2017/labels/val2017/000000581615.txt


In [97]:
# Pair each image with its label file through the shared string id.
# The inner join drops images without a label file and label files without an
# image. `validate="one_to_one"` raises pandas.errors.MergeError if an id occurs
# more than once on either side (the label listing spans several sub-folders),
# instead of silently producing duplicated image/label rows.
dfinal = df.merge(df_label, on="id", how="inner", validate="one_to_one")

In [98]:
# Preview the merged image/label frame.
dfinal

Unnamed: 0,image_path,id,annotation_path
0,data/coco2017/image/train2017/000000000009.jpg,000000000009,data/coco2017/labels/train2017/train/000000000...
1,data/coco2017/image/train2017/000000000025.jpg,000000000025,data/coco2017/labels/train2017/train/000000000...
2,data/coco2017/image/train2017/000000000030.jpg,000000000030,data/coco2017/labels/train2017/train/000000000...
3,data/coco2017/image/train2017/000000000034.jpg,000000000034,data/coco2017/labels/train2017/train/000000000...
4,data/coco2017/image/train2017/000000000036.jpg,000000000036,data/coco2017/labels/train2017/train/000000000...
...,...,...,...
117261,data/coco2017/image/train2017/000000581906.jpg,000000581906,data/coco2017/labels/train2017/train/000000581...
117262,data/coco2017/image/train2017/000000581909.jpg,000000581909,data/coco2017/labels/train2017/train/000000581...
117263,data/coco2017/image/train2017/000000581913.jpg,000000581913,data/coco2017/labels/train2017/train/000000581...
117264,data/coco2017/image/train2017/000000581921.jpg,000000581921,data/coco2017/labels/train2017/train/000000581...


In [100]:
# Column order straight after the merge: left-frame columns first, then the label path.
dfinal.columns.tolist()

['image_path', 'id', 'annotation_path']

In [105]:
# Put the join key first, followed by the image path and the annotation path.
dfinal = dfinal.loc[:, ["id", "image_path", "annotation_path"]]


In [106]:
# Preview the final frame with `id` as the first column.
dfinal

Unnamed: 0,id,image_path,annotation_path
0,000000000009,data/coco2017/image/train2017/000000000009.jpg,data/coco2017/labels/train2017/train/000000000...
1,000000000025,data/coco2017/image/train2017/000000000025.jpg,data/coco2017/labels/train2017/train/000000000...
2,000000000030,data/coco2017/image/train2017/000000000030.jpg,data/coco2017/labels/train2017/train/000000000...
3,000000000034,data/coco2017/image/train2017/000000000034.jpg,data/coco2017/labels/train2017/train/000000000...
4,000000000036,data/coco2017/image/train2017/000000000036.jpg,data/coco2017/labels/train2017/train/000000000...
...,...,...,...
117261,000000581906,data/coco2017/image/train2017/000000581906.jpg,data/coco2017/labels/train2017/train/000000581...
117262,000000581909,data/coco2017/image/train2017/000000581909.jpg,data/coco2017/labels/train2017/train/000000581...
117263,000000581913,data/coco2017/image/train2017/000000581913.jpg,data/coco2017/labels/train2017/train/000000581...
117264,000000581921,data/coco2017/image/train2017/000000581921.jpg,data/coco2017/labels/train2017/train/000000581...


In [None]:
# Persist the image/label index next to the notebook (relative path), without the
# row index. NOTE: `id` holds zero-padded strings; pass dtype={"id": str} to
# pd.read_csv when loading this file if the padding must be preserved.
dfinal.to_csv(path_or_buf="coco.csv", index=False)