In [1]:
#@title Install dependencies
from IPython.display import clear_output, display, Image
!pip install timm transformers fairscale pycocoevalcap opencv-python ultralytics

clear_output()

### Imports

In [2]:
import os
import subprocess
import pandas as pd
import re
import cv2
from ultralytics import YOLO

clear_output()

In [3]:
#@title Clone the repository
!git clone https://github.com/xinyu1205/recognize-anything.git

%cd recognize-anything
!git checkout ec6b4241c5036e337a4543838deb9bff4990de97

clear_output()

In [4]:
def download_checkpoints():
    if not os.path.exists('pretrained'):
        os.makedirs('pretrained')

    ram_weights_path = 'pretrained/ram_swin_large_14m.pth'
    if not os.path.exists(ram_weights_path):
        !wget https://huggingface.co/spaces/xinyu1205/Recognize_Anything-Tag2Text/resolve/main/ram_swin_large_14m.pth -O pretrained/ram_swin_large_14m.pth
    else:
        print("RAM weights already downloaded!")

In [5]:
download_checkpoints()
print('weights are downloaded!')

--2024-03-31 19:37:26--  https://huggingface.co/spaces/xinyu1205/Recognize_Anything-Tag2Text/resolve/main/ram_swin_large_14m.pth
Resolving huggingface.co (huggingface.co)... 18.164.174.17, 18.164.174.23, 18.164.174.55, ...
Connecting to huggingface.co (huggingface.co)|18.164.174.17|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: /spaces/xinyu1205/recognize-anything/resolve/main/ram_swin_large_14m.pth [following]
--2024-03-31 19:37:26--  https://huggingface.co/spaces/xinyu1205/recognize-anything/resolve/main/ram_swin_large_14m.pth
Reusing existing connection to huggingface.co:443.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/e6/78/e678f8565485a3f321b1180e4c7e1e18a89a9295028358eedffb98981b37e11a/15c729c793af28b9d107c69f85836a1356d76ea830d4714699fb62e55fcc08ed?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27ram_swin_large_14m.pth%3B+filename%3D%22ram_swin_large_14m.pth%22%3B&E

### Video to Frames

In [6]:
video_path = "/content/left_half.mp4"

output_dir = os.path.join("/content", "extracted_frames")
os.makedirs(output_dir, exist_ok=True)

cap = cv2.VideoCapture(video_path)

if not cap.isOpened():
  print("Error opening video!")
  exit()

fps = int(cap.get(cv2.CAP_PROP_FPS))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

frame_count = 1
actual_count = 0

while True:
  ret, frame = cap.read()

  if not ret:
    print("Can't receive frame (stream end?). Exiting...")
    break

  # Save frame as numbered image
  if frame_count % fps == 0:
    image_path = os.path.join(output_dir, f"{frame_count}.jpg")
    cv2.imwrite(image_path, frame)
    actual_count += 1

  frame_count += 1

  if frame_count > total_frames:
    break

cap.release()

print(f"Extracted {actual_count} frames to {output_dir}")

Extracted 308 frames to /content/extracted_frames


### Inference using RAM

In [7]:
images_dir = "/content/extracted_frames"  # Update with your actual image directory

# Run inference for multiple images using RAM
output = subprocess.check_output([
    "python", "batch_inference.py",
    "--image-dir", images_dir,
    "--pretrained", "pretrained/ram_swin_large_14m.pth",
    "--model-type", "ram"
])

decoded_output = output.decode("utf-8")
image_tags = re.findall(r"'filepath': '.*/extracted_frames/(.*?\.jpg)', 'model_identified_tags': '(.*?)'", decoded_output)

df_RAM = pd.DataFrame(image_tags, columns=['Image', 'RAM_Tags'])
df_RAM.to_csv('/content/Tags_RAM.csv')

### Inference using YOLO

In [13]:
model = YOLO('yolov8n.pt')
source = '/content/extracted_frames'
results = model(source, stream=False, save=True, save_txt=True, project="/content/YOLO_inference")


image 1/308 /content/extracted_frames/1008.jpg: 480x640 2 persons, 5 cars, 13.5ms
image 2/308 /content/extracted_frames/1022.jpg: 480x640 1 person, 5 cars, 7.5ms
image 3/308 /content/extracted_frames/1036.jpg: 480x640 3 persons, 4 cars, 7.5ms
image 4/308 /content/extracted_frames/1050.jpg: 480x640 5 persons, 3 cars, 9.3ms
image 5/308 /content/extracted_frames/1064.jpg: 480x640 5 persons, 3 cars, 7.1ms
image 6/308 /content/extracted_frames/1078.jpg: 480x640 5 persons, 3 cars, 1 backpack, 7.4ms
image 7/308 /content/extracted_frames/1092.jpg: 480x640 4 persons, 4 cars, 7.2ms
image 8/308 /content/extracted_frames/1106.jpg: 480x640 5 persons, 3 cars, 7.2ms
image 9/308 /content/extracted_frames/112.jpg: 480x640 7 persons, 2 cars, 7.1ms
image 10/308 /content/extracted_frames/1120.jpg: 480x640 4 persons, 4 cars, 7.4ms
image 11/308 /content/extracted_frames/1134.jpg: 480x640 5 persons, 3 cars, 7.2ms
image 12/308 /content/extracted_frames/1148.jpg: 480x640 5 persons, 4 cars, 7.3ms
image 13/308 

In [19]:
# Dictionary of class labels
class_labels = {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck',
                8: 'boat', 9: 'traffic light', 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench',
                14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear',
                22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase',
                29: 'frisbee', 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat',
                35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
                40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana',
                47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog',
                53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant', 59: 'bed',
                60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote',
                66: 'keyboard', 67: 'cell phone', 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink',
                72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear',
                78: 'hair drier', 79: 'toothbrush'}

# Path to the labels folder
labels_folder = "/content/YOLO_inference/predict/labels"

def parse_label_file(label_file):
    with open(label_file, 'r') as f:
        lines = f.readlines()
        class_counts = {class_label: 0 for class_label in class_labels.values()}
        for line in lines:
            class_index = int(line.split()[0])
            class_name = class_labels[class_index]
            class_counts[class_name] += 1
    return class_counts

data = []
for label_file in os.listdir(labels_folder):
    if label_file.endswith('.txt'):
        image_name = os.path.splitext(label_file)[0] + '.jpg'
        label_file_path = os.path.join(labels_folder, label_file)
        class_counts = parse_label_file(label_file_path)
        class_counts['Image'] = image_name
        data.append(class_counts)

df_YOLO = pd.DataFrame(data).fillna(0)
df_YOLO = df_YOLO[['Image'] + list(class_labels.values())]

In [20]:
df_YOLO_non_null = df_YOLO.loc[:, (df_YOLO != 0).any(axis=0)]
df_YOLO_non_null.to_csv('/content/Yolo_counts.csv')

In [21]:
# Function to create the tags string
def create_tags_string(row):
    tags_list = []
    for col in df_YOLO.columns[1:]:
        if row[col] > 0:
            tags_list.append(f"{int(row[col])} {col}")
    return ', '.join(tags_list)

df_YOLO['YOLO_Tags'] = df_YOLO.apply(create_tags_string, axis=1)
df_YOLO_tags = df_YOLO[['Image', 'YOLO_Tags']]
df_YOLO_tags.to_csv('/content/Yolo_tags.csv')

### Merge RAM and YOLO

In [22]:
df_final = pd.merge(df_RAM, df_YOLO_tags, on='Image', how='inner')
df_final['Image_Num'] = df_final['Image'].str.replace('.jpg', '').astype(int)
df_final = df_final.sort_values(by='Image_Num')
df_final.drop('Image_Num', axis=1, inplace=True)
df_final.reset_index(inplace=True)
df_final.drop('index', axis=1, inplace=True)
df_final.to_csv('/content/YOLO+RAM.csv')

  df_final['Image_Num'] = df_final['Image'].str.replace('.jpg', '').astype(int)


In [23]:
df_final

Unnamed: 0,Image,RAM_Tags,YOLO_Tags
0,70.jpg,car | city street | cross | crosswalk | person...,"9 person, 5 car"
1,84.jpg,car | city street | cross | crosswalk | person...,"8 person, 4 car, 1 traffic light"
2,98.jpg,car | city street | cross | crosswalk | person...,"8 person, 2 car"
3,112.jpg,city street | cross | crosswalk | person | man...,"7 person, 2 car"
4,126.jpg,boy | city street | cross | crosswalk | person...,"8 person, 4 car, 1 traffic light, 1 skateboard"
...,...,...,...
299,4256.jpg,car | city street | cross | crosswalk | person...,"8 person, 2 car"
300,4270.jpg,blue | car | city street | cross | crosswalk |...,"8 person, 2 car"
301,4284.jpg,blue | car | city street | cross | crosswalk |...,"10 person, 2 car"
302,4298.jpg,car | city street | cross | crosswalk | girl |...,"8 person, 3 car, 2 handbag"
