### Filter IDs by bbox area and movement [Clean Up 1 and Clean Up 2]

In [43]:
import sqlite3
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

def list_ids_to_remove_based_on_area(df, plot=False, bins=30):
    """Remove every ID that has at least one detection in the most common area bin.

    The ``area`` column is split into ``bins`` equal-width histogram bins. The
    bin holding the most rows is located, and every ID with at least one row
    inside that bin is flagged. All rows of a flagged ID are dropped, not only
    the rows that fall inside the bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'id'`` and ``'area'``.
    plot : bool, default False
        When True, show a histogram of ``area`` before filtering.
    bins : int, default 30
        Number of histogram bins, used both for the plot and for the filter.

    Returns
    -------
    filtered_results : pandas.DataFrame
        Rows of ``df`` whose ``id`` was not flagged.
    unique_ids : numpy.ndarray
        The flagged (removed) IDs.
    """
    areas = df['area']

    if plot:
        plt.figure(figsize=(10, 6))
        plt.hist(areas, bins=bins, edgecolor='black')
        plt.title('Histogram of Area')
        plt.xlabel('Area')
        plt.ylabel('Frequency')
        plt.grid(True)
        plt.show()

    hist, bin_edges = np.histogram(areas, bins=bins)

    # Find the bin with the highest frequency
    max_bin_index = np.argmax(hist)
    most_frequent_bin_start = bin_edges[max_bin_index]
    most_frequent_bin_end = bin_edges[max_bin_index + 1]

    # np.histogram uses half-open bins [start, end) except for the last bin,
    # which is closed on both sides. Mirror that here, otherwise rows equal to
    # the maximum area are counted by the histogram but missed by the filter.
    is_last_bin = max_bin_index == len(hist) - 1
    if is_last_bin:
        below_end = areas <= most_frequent_bin_end
    else:
        below_end = areas < most_frequent_bin_end
    most_frequent_areas = df[(areas >= most_frequent_bin_start) & below_end]

    # Get the unique IDs associated with these areas and drop all their rows
    unique_ids = most_frequent_areas['id'].unique()
    filtered_results = df[~df['id'].isin(unique_ids)]
    return filtered_results, unique_ids


def list_ids_to_remove_based_on_movement(df, plot_histogram=False, threshold=100):
    """Drop IDs whose centroid travels less than ``threshold`` in total.

    Rows are ordered by ``id`` and ``frame_number``. For each row the Euclidean
    distance to the previous centroid of the same ID is stored in ``movement``
    (0 for the first row of an ID), and the distances are summed per ID.

    Returns a tuple ``(filtered_df, list_ids_to_remove)``:
    ``filtered_df`` is the sorted frame, including the helper columns
    ``centroid_x_shift``, ``centroid_y_shift`` and ``movement``, without the
    rows of low-movement IDs; ``list_ids_to_remove`` is the list of IDs whose
    total movement is strictly below ``threshold``.
    """
    tracks = df.sort_values(by=['id', 'frame_number'])

    # Previous centroid of the same ID (NaN on the first row of each ID).
    tracks['centroid_x_shift'] = tracks.groupby('id')['centroid_x'].shift(1)
    tracks['centroid_y_shift'] = tracks.groupby('id')['centroid_y'].shift(1)

    delta_x = tracks['centroid_x'] - tracks['centroid_x_shift']
    delta_y = tracks['centroid_y'] - tracks['centroid_y_shift']
    step_length = np.sqrt(delta_x**2 + delta_y**2)
    tracks['movement'] = step_length
    tracks['movement'] = tracks['movement'].fillna(0)

    # One row per ID with the summed step lengths.
    per_id_totals = tracks.groupby('id')['movement'].sum().reset_index()
    per_id_totals = per_id_totals.rename(columns={'movement': 'total_movement'})

    if plot_histogram:
        # Visualize the distribution of total movement
        plt.figure(figsize=(10, 6))
        plt.hist(per_id_totals['total_movement'], bins=30, edgecolor='black')
        plt.title('Distribution of Total Movement per ID')
        plt.xlabel('Total Movement')
        plt.ylabel('Frequency')
        plt.grid(True)
        plt.show()

    is_static = per_id_totals['total_movement'] < threshold
    static_ids = per_id_totals.loc[is_static, 'id']
    list_ids_to_remove = static_ids.to_list()

    keep_mask = ~tracks['id'].isin(static_ids)
    return tracks[keep_mask], list_ids_to_remove


### Execute clean up 1 and 2

In [48]:
# Load the bounding-box CSV for this video.
# NOTE(review): absolute, machine-specific path; consider a configurable base directory.
results = pd.read_csv('/home/diego/mydrive/results/1/3/1/tobalaba_entrada_20240604_1000/tobalaba_entrada_20240604_1000_bbox.csv')

# Baseline: distinct IDs before any clean-up.
initial_list_ids = results['id'].unique()
print('Initial number of IDs:', len(initial_list_ids))

# Clean up 1: remove every ID that has a row in the most populated area-histogram bin.
# `df` holds the surviving rows, `list_ids` the removed IDs.
df,list_ids = list_ids_to_remove_based_on_area(results, plot=False)
print('Number of IDs after removing low area IDs:', len(df['id'].unique()),'and IDs were removed:', len(list_ids))

# Clean up 2: remove IDs whose summed centroid movement is below `threshold`.
# Despite its name, `total_movement_df` is the filtered detection frame (the
# first return value), not the per-ID totals. `list_ids_to_remove` is read
# again by the debug viewer cell further down.
total_movement_df,list_ids_to_remove  = list_ids_to_remove_based_on_movement(df, plot_histogram=False, threshold=100)

print('Number of IDs after removing low movement IDs:', len(total_movement_df['id'].unique()), 'and IDs were removed:', len(list_ids_to_remove))

Initial number of IDs: 3348
Number of IDs after removing low area IDs: 968 and IDs were removed: 2380
Number of IDs after removing low movement IDs: 589 and IDs were removed: 379


### Version 1: simple sequential pass over the whole video (From CSV to IMGs)

In [22]:
import pandas as pd
import cv2
import os
from tqdm import tqdm

def generate_img_by_bbox(csv_path, video_path, img_path, skip_frames=3, show_progress=True, nrows=1000):
    """Crop the bounding boxes listed in a CSV out of a video, one folder per ID.

    The video is decoded sequentially from its first frame up to the largest
    ``frame_number`` found in the CSV. For every frame whose index is a
    multiple of ``skip_frames``, each bounding box listed for that frame is
    cropped and written to
    ``<img_path>/img_generated_1/<id>/<frame>_<x1>_<y1>_<x2>_<y2>.jpg``.

    Parameters
    ----------
    csv_path : str
        CSV with the columns ``frame_number``, ``id``, ``x1``, ``y1``, ``x2``, ``y2``.
    video_path : str
        Video file the frame numbers refer to.
    img_path : str
        Base output directory; ``img_generated_1`` is created inside it.
    skip_frames : int, default 3
        Only frames whose index is a multiple of this value are processed.
    show_progress : bool, default True
        Show a tqdm progress bar counting decoded frames.
    nrows : int or None, default 1000
        Number of CSV rows to read. The default keeps the previously
        hard-coded limit; pass ``None`` to read the whole file.

    Raises
    ------
    ValueError
        If ``skip_frames`` is smaller than 1.
    """
    if skip_frames < 1:
        raise ValueError("skip_frames must be a positive integer")

    # Load CSV data
    df = pd.read_csv(csv_path, nrows=nrows)

    # Create the base folder for generated images
    img_generated_path = os.path.join(img_path, 'img_generated_1')
    os.makedirs(img_generated_path, exist_ok=True)

    if df.empty:
        print("No bounding boxes found in the CSV; nothing to extract.")
        return

    max_frame_number = int(df['frame_number'].max())

    # Group once up front instead of re-filtering the whole frame per video frame.
    rows_by_frame = {frame: rows for frame, rows in df.groupby('frame_number')}

    # Open video file
    cap = cv2.VideoCapture(video_path)
    progress_bar = None
    try:
        if not cap.isOpened():
            print(f"Warning: could not open video {video_path}")
            return

        if show_progress:
            # Frames 0..max_frame_number inclusive are decoded.
            progress_bar = tqdm(total=max_frame_number + 1, desc="Processing frames")

        current_frame = 0
        while current_frame <= max_frame_number:
            ret, frame = cap.read()
            if not ret:
                break

            if current_frame % skip_frames == 0:
                frame_data = rows_by_frame.get(current_frame)
                if frame_data is not None:
                    frame_height, frame_width = frame.shape[:2]

                    # NOTE(review): iterrows() may upcast mixed-dtype rows, so a
                    # numeric id can render as e.g. "5.0" in the folder name.
                    # Kept as-is so existing output folders keep their names.
                    for _, row in frame_data.iterrows():
                        img_id = row['id']
                        x1, y1, x2, y2 = int(row['x1']), int(row['y1']), int(row['x2']), int(row['y2'])

                        # Clamp to the frame: negative indices would wrap around
                        # in NumPy slicing and produce a wrong or empty crop.
                        left, top = max(x1, 0), max(y1, 0)
                        right, bottom = min(x2, frame_width), min(y2, frame_height)
                        if right <= left or bottom <= top:
                            # An empty crop cannot be written by cv2.imwrite.
                            continue

                        cropped_img = frame[top:bottom, left:right]

                        # Create a directory for the current id if it doesn't exist
                        id_folder = os.path.join(img_generated_path, str(img_id))
                        os.makedirs(id_folder, exist_ok=True)

                        # The file name keeps the original CSV coordinates.
                        img_filename = f"{current_frame}_{x1}_{y1}_{x2}_{y2}.jpg"
                        img_save_path = os.path.join(id_folder, img_filename)
                        cv2.imwrite(img_save_path, cropped_img)

            if progress_bar is not None:
                progress_bar.update(1)
            current_frame += 1
    finally:
        # Release the capture and the bar even if decoding or writing fails.
        cap.release()
        if progress_bar is not None:
            progress_bar.close()

    print("Image extraction completed!")

# Example usage: crop the boxes from the filtered bbox CSV out of the source video.
# NOTE(review): absolute, machine-specific paths; consider a configurable base directory.
csv_path = '/home/diego/mydrive/results/1/3/1/tobalaba_entrada_20240604_1000/tobalaba_entrada_20240604_1000_bbox_filtered.csv'
video_path = '/home/diego/mydrive/footage/1/3/1/tobalaba_entrada_20240604_1000.mkv'
img_path = '/home/diego/mydrive/results/1/3/1/tobalaba_entrada_20240604_1000/'
generate_img_by_bbox(csv_path, video_path, img_path, skip_frames=3)

Processing frames: 30711it [01:13, 415.41it/s]                           

Image extraction completed!





### Version 2: optimized with frame seeking, to be tested with mp4 (From CSV to IMGs)

In [20]:
import pandas as pd
import cv2
import os

def generate_img_by_bbox(csv_path, video_path, img_path, skip_frames=3, nrows=1000):
    """Crop the bounding boxes listed in a CSV out of a video by seeking to each frame.

    Each distinct ``frame_number`` in the CSV that is a multiple of
    ``skip_frames`` is visited once: the capture seeks to it, reads the frame,
    and every bounding box listed for that frame is written to
    ``<img_path>/img_generated/<id>/<frame>_<x1>_<y1>_<x2>_<y2>.jpg``.

    Parameters
    ----------
    csv_path : str
        CSV with the columns ``frame_number``, ``id``, ``x1``, ``y1``, ``x2``, ``y2``.
    video_path : str
        Video file the frame numbers refer to.
    img_path : str
        Base output directory; ``img_generated`` is created inside it.
    skip_frames : int, default 3
        Only frame numbers that are a multiple of this value are processed.
    nrows : int or None, default 1000
        Number of CSV rows to read. The default keeps the previously
        hard-coded limit; pass ``None`` to read the whole file.

    Raises
    ------
    ValueError
        If ``skip_frames`` is smaller than 1.
    """
    if skip_frames < 1:
        raise ValueError("skip_frames must be a positive integer")

    # Load CSV data
    df = pd.read_csv(csv_path, nrows=nrows)

    # Open video file
    cap = cv2.VideoCapture(video_path)
    try:
        # Create the base folder for generated images
        img_generated_path = os.path.join(img_path, 'img_generated')
        os.makedirs(img_generated_path, exist_ok=True)

        # Get total number of frames in the video
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Visit each frame once. Iterating over rows would seek to and rewrite
        # the crops of a frame once per bounding box on it.
        for frame_key, frame_data in df.groupby('frame_number'):
            # Skip frames according to the skip_frames parameter
            if frame_key % skip_frames != 0:
                continue

            # Integer frame index, so file names match the sequential version.
            frame_number = int(frame_key)

            # Only enforce the bound when a frame count was reported.
            # NOTE(review): presumably some containers report 0 here; a failed
            # read below still guards against seeking past the end.
            if total_frames > 0 and frame_number >= total_frames:
                continue

            # Seek to the frame
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)

            # Read the frame
            ret, frame = cap.read()
            if not ret:
                print(f"Warning: Could not read frame {frame_number}")
                continue

            # After a successful read the position should be one past the
            # requested frame; otherwise the seek landed elsewhere.
            current_frame = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
            if current_frame != frame_number + 1:
                print(f"Warning: Expected frame {frame_number} but got frame {current_frame-1}")
                continue

            frame_height, frame_width = frame.shape[:2]

            for _, bbox_row in frame_data.iterrows():
                img_id = bbox_row['id']
                x1, y1, x2, y2 = int(bbox_row['x1']), int(bbox_row['y1']), int(bbox_row['x2']), int(bbox_row['y2'])

                # Clamp to the frame: negative indices would wrap around in
                # NumPy slicing and produce a wrong or empty crop.
                left, top = max(x1, 0), max(y1, 0)
                right, bottom = min(x2, frame_width), min(y2, frame_height)
                if right <= left or bottom <= top:
                    # An empty crop cannot be written by cv2.imwrite.
                    continue

                cropped_img = frame[top:bottom, left:right]

                # Create a directory for the current id if it doesn't exist
                id_folder = os.path.join(img_generated_path, str(img_id))
                os.makedirs(id_folder, exist_ok=True)

                # The file name keeps the original CSV coordinates.
                img_filename = f"{frame_number}_{x1}_{y1}_{x2}_{y2}.jpg"
                img_save_path = os.path.join(id_folder, img_filename)
                cv2.imwrite(img_save_path, cropped_img)
    finally:
        # Release the capture even if seeking or writing fails.
        cap.release()

    print("Image extraction completed!")

# Example usage: same inputs as the sequential version, using the seek-based function.
# NOTE(review): absolute, machine-specific paths; consider a configurable base directory.
csv_path = '/home/diego/mydrive/results/1/3/1/tobalaba_entrada_20240604_1000/tobalaba_entrada_20240604_1000_bbox_filtered.csv'
video_path = '/home/diego/mydrive/footage/1/3/1/tobalaba_entrada_20240604_1000.mkv'
img_path = '/home/diego/mydrive/results/1/3/1/tobalaba_entrada_20240604_1000/'
generate_img_by_bbox(csv_path, video_path, img_path, skip_frames=3)

Image extraction completed!


### View images of IDs that have low movement [Clean Up 2 Debugger]

In [None]:
import os
import random
import matplotlib.pyplot as plt
import cv2


# FOR DEBUG IMAGES
def view_imgs_by_list_id(list_ids, img_path_folder, img_size=(50, 50), grid_columns=20):
    """Show one randomly chosen image per ID in a grid, titled with the ID.

    For each ID in ``list_ids`` the folder ``<img_path_folder>/<id>`` is
    looked up. IDs with no folder, an empty folder, or a picked file that
    OpenCV cannot read are skipped.

    Parameters
    ----------
    list_ids : iterable
        IDs to display; each is converted with ``str()`` to form the folder name.
    img_path_folder : str
        Directory containing one sub-folder per ID.
    img_size : tuple of int, default (50, 50)
        Size passed to ``cv2.resize`` for every thumbnail.
    grid_columns : int, default 20
        Number of thumbnails per row.

    Raises
    ------
    ValueError
        If ``grid_columns`` is smaller than 1.
    """
    if grid_columns < 1:
        raise ValueError("grid_columns must be a positive integer")

    images = []
    ids = []

    for img_id in list_ids:
        id_folder = os.path.join(img_path_folder, str(img_id))
        # isdir rather than exists: a plain file with that name cannot be listed.
        if not os.path.isdir(id_folder):
            continue

        img_files = os.listdir(id_folder)
        if not img_files:
            continue

        # Randomly pick one image from the folder
        img_file = random.choice(img_files)
        img_file_path = os.path.join(id_folder, img_file)

        # Read and resize the image
        img = cv2.imread(img_file_path)
        if img is None:
            continue
        img = cv2.resize(img, img_size)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert to RGB for matplotlib

        images.append(img)
        ids.append(img_id)

    if not images:
        # Nothing to draw; avoid rendering an empty grid.
        print("No images found for the given IDs.")
        return

    # Ceiling division: an exact multiple of grid_columns needs no extra row.
    grid_rows = -(-len(images) // grid_columns)

    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, so `.flat` works.
    fig, axes = plt.subplots(grid_rows, grid_columns, figsize=(grid_columns, grid_rows), squeeze=False)
    fig.subplots_adjust(hspace=0.4, wspace=0.4)

    # Plot the images; unused cells are left blank.
    for i, ax in enumerate(axes.flat):
        ax.axis('off')
        if i < len(images):
            ax.imshow(images[i])
            ax.set_title(str(ids[i]), fontsize=8)

    plt.show()

# Example usage: inspect the IDs flagged as low-movement by the clean-up cell above.
# Requires `list_ids_to_remove` to exist already (it is assigned in the clean-up cell).
list_ids = list_ids_to_remove
# NOTE(review): this reads from an 'imgs' folder, while the generator cells in this
# notebook write to 'img_generated_1' / 'img_generated'; confirm the intended folder.
img_path = '/home/diego/mydrive/results/1/3/1/tobalaba_entrada_20240604_1000/imgs'
view_imgs_by_list_id(list_ids, img_path, img_size=(50, 50), grid_columns=20)
