### Get bbox area [Clean Up 1]

In [11]:
import sqlite3
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

def get_db_connection(path_to_db):
    """Open the SQLite database at ``path_to_db`` and return the connection.

    The connection's ``row_factory`` is set to ``sqlite3.Row`` so that rows
    fetched through it can be indexed by column name as well as by position.
    The caller is responsible for closing the returned connection.
    """
    connection = sqlite3.connect(path_to_db)
    connection.row_factory = sqlite3.Row
    return connection


# Input database with the raw detections and output CSV for the cleaned rows.
DB_PATH = '/home/diego/mydrive/results/1/3/1/tobalaba_entrada_20240604_1000/tobalaba_entrada_20240604_1000_bbox.db'
FILTERED_CSV_PATH = '/home/diego/mydrive/results/1/3/1/tobalaba_entrada_20240604_1000/tobalaba_entrada_20240604_1000_bbox_filtered.csv'
AREA_BINS = 30  # number of histogram bins used to find the dominant bbox area

db = get_db_connection(DB_PATH)
try:
    query = "SELECT * from bbox_raw"
    results = pd.read_sql(query, db)
finally:
    # Release the SQLite handle as soon as the table is in memory.
    db.close()

# Index(['id', 'x1', 'y1', 'x2', 'y2', 'centroid_x', 'centroid_y', 'area',
#        'frame_number', 'time_sec', 'time_video', 'overlap',
#        'distance_to_center', 'direction', 'conf_score', 'img_name'],
#       dtype='object')

uniques_ids = results['id'].unique()
print(len(uniques_ids))

hist, bin_edges = np.histogram(results['area'], bins=AREA_BINS)

# Find the bin with the highest frequency
max_bin_index = np.argmax(hist)
most_frequent_bin_start = bin_edges[max_bin_index]
most_frequent_bin_end = bin_edges[max_bin_index + 1]

# Select the rows that fall into the most frequent bin. np.histogram bins are
# half-open [start, end) except the last one, which also includes its right
# edge, so the upper comparison must be inclusive when the last bin wins.
in_most_frequent_bin = results['area'] >= most_frequent_bin_start
if max_bin_index == len(hist) - 1:
    in_most_frequent_bin &= results['area'] <= most_frequent_bin_end
else:
    in_most_frequent_bin &= results['area'] < most_frequent_bin_end
most_frequent_areas = results[in_most_frequent_bin]

# Get the unique IDs associated with these areas
unique_ids = most_frequent_areas['id'].unique()

# Remove every row belonging to an ID that has at least one bbox in that bin
filtered_results = results[~results['id'].isin(unique_ids)]

# to_csv returns None when given a path, so the result is not kept.
filtered_results.to_csv(FILTERED_CSV_PATH, index=False)

3354


### Version 1: simple traversal of the whole video

In [22]:
import pandas as pd
import cv2
import os
from tqdm import tqdm

def generate_img_by_bbox(csv_path, video_path, img_path, skip_frames=3, show_progress=True, nrows=1000):
    """Crop every bounding box listed in a CSV out of a video, reading it sequentially.

    The video is decoded frame by frame from the start up to the largest
    ``frame_number`` found in the CSV. For every frame whose index is a
    multiple of ``skip_frames``, each CSV row with that ``frame_number`` is
    cropped and written to
    ``<img_path>/img_generated_1/<id>/<frame>_<x1>_<y1>_<x2>_<y2>.jpg``.

    Parameters
    ----------
    csv_path : str
        CSV with at least the columns ``id``, ``frame_number``, ``x1``,
        ``y1``, ``x2`` and ``y2``.
    video_path : str
        Video file to read frames from.
    img_path : str
        Base output directory; ``img_generated_1`` is created inside it.
    skip_frames : int, default 3
        Only frames whose index is divisible by this value are processed.
    show_progress : bool, default True
        Show a tqdm progress bar (one tick per decoded frame).
    nrows : int or None, default 1000
        Number of CSV rows to load; ``None`` loads the whole file. The default
        keeps the previous hard-coded limit.

    Raises
    ------
    ValueError
        If ``skip_frames`` is smaller than 1.
    IOError
        If the video cannot be opened.
    """
    if skip_frames < 1:
        raise ValueError("skip_frames must be >= 1")

    # Load CSV data
    df = pd.read_csv(csv_path, nrows=nrows)

    # Create the base folder for generated images
    img_generated_path = os.path.join(img_path, 'img_generated_1')
    os.makedirs(img_generated_path, exist_ok=True)

    max_frame_number = df['frame_number'].max()
    if pd.isna(max_frame_number):
        # Empty CSV (or no usable frame numbers): nothing to crop, so do not
        # decode the video at all.
        print("Image extraction completed!")
        return
    max_frame_number = int(max_frame_number)

    # Index the rows by frame once instead of rescanning the DataFrame for
    # every decoded frame.
    rows_by_frame = {frame: rows for frame, rows in df.groupby('frame_number')}

    # Open video file
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open video: {video_path}")

    progress_bar = None
    if show_progress:
        # Frames 0..max_frame_number inclusive are decoded.
        progress_bar = tqdm(total=max_frame_number + 1, desc="Processing frames")

    try:
        current_frame = 0
        while current_frame <= max_frame_number:
            ret, frame = cap.read()
            if not ret:
                break

            if current_frame % skip_frames == 0:
                frame_data = rows_by_frame.get(current_frame)
                if frame_data is not None:
                    frame_height, frame_width = frame.shape[:2]
                    for _, row in frame_data.iterrows():
                        img_id = row['id']
                        x1, y1, x2, y2 = int(row['x1']), int(row['y1']), int(row['x2']), int(row['y2'])

                        # Clamp to the frame: negative indices would wrap
                        # around in NumPy slicing and crop the wrong region.
                        left, top = max(x1, 0), max(y1, 0)
                        right, bottom = min(x2, frame_width), min(y2, frame_height)
                        if right <= left or bottom <= top:
                            # Empty crop; there is nothing to encode.
                            continue
                        cropped_img = frame[top:bottom, left:right]

                        # Create a directory for the current id if it doesn't exist
                        id_folder = os.path.join(img_generated_path, str(img_id))
                        os.makedirs(id_folder, exist_ok=True)

                        # The file name keeps the CSV coordinates so each
                        # image can be traced back to its row.
                        img_filename = f"{current_frame}_{x1}_{y1}_{x2}_{y2}.jpg"
                        img_save_path = os.path.join(id_folder, img_filename)
                        cv2.imwrite(img_save_path, cropped_img)

            if progress_bar is not None:
                progress_bar.update(1)
            current_frame += 1
    finally:
        # Always free the capture and the bar, even if a crop or write fails.
        cap.release()
        if progress_bar is not None:
            progress_bar.close()

    print("Image extraction completed!")

# Example usage: filtered bbox CSV, source footage and output base directory.
csv_path = '/home/diego/mydrive/results/1/3/1/tobalaba_entrada_20240604_1000/tobalaba_entrada_20240604_1000_bbox_filtered.csv'
video_path = '/home/diego/mydrive/footage/1/3/1/tobalaba_entrada_20240604_1000.mkv'
img_path = '/home/diego/mydrive/results/1/3/1/tobalaba_entrada_20240604_1000/'
generate_img_by_bbox(
    csv_path=csv_path,
    video_path=video_path,
    img_path=img_path,
    skip_frames=3,
)

Processing frames: 30711it [01:13, 415.41it/s]                           

Image extraction completed!





### Version 2: optimized (seeks directly to each frame) — TODO: test with mp4

In [20]:
import pandas as pd
import cv2
import os

def generate_img_by_bbox(csv_path, video_path, img_path, skip_frames=3, nrows=1000):
    """Crop every bounding box listed in a CSV out of a video, seeking per frame.

    Instead of decoding the whole video, this version seeks directly to each
    distinct ``frame_number`` present in the CSV (in ascending order), reads
    that frame once, and writes one crop per CSV row to
    ``<img_path>/img_generated/<id>/<frame>_<x1>_<y1>_<x2>_<y2>.jpg``.

    Parameters
    ----------
    csv_path : str
        CSV with at least the columns ``id``, ``frame_number``, ``x1``,
        ``y1``, ``x2`` and ``y2``.
    video_path : str
        Video file to read frames from.
    img_path : str
        Base output directory; ``img_generated`` is created inside it.
    skip_frames : int, default 3
        Only frame numbers divisible by this value are processed.
    nrows : int or None, default 1000
        Number of CSV rows to load; ``None`` loads the whole file. The default
        keeps the previous hard-coded limit.

    Raises
    ------
    ValueError
        If ``skip_frames`` is smaller than 1.
    IOError
        If the video cannot be opened.
    """
    if skip_frames < 1:
        raise ValueError("skip_frames must be >= 1")

    # Load CSV data
    df = pd.read_csv(csv_path, nrows=nrows)

    # Create the base folder for generated images
    img_generated_path = os.path.join(img_path, 'img_generated')
    os.makedirs(img_generated_path, exist_ok=True)

    # Open video file
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open video: {video_path}")

    try:
        # Get total number of frames in the video
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Visit each distinct frame exactly once. Iterating over CSV rows
        # would seek, decode and rewrite the same frame once per bounding box.
        for frame_key, frame_data in df.groupby('frame_number'):
            frame_number = int(frame_key)

            # Skip frames according to the skip_frames parameter
            if frame_number % skip_frames != 0:
                continue

            # Only trust the bound when the container reports a positive
            # frame count; otherwise rely on the read/position checks below.
            if total_frames > 0 and frame_number >= total_frames:
                continue

            # Seek to the frame
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)

            # Read the frame
            ret, frame = cap.read()
            if not ret:
                print(f"Warning: Could not read frame {frame_number}")
                continue

            # After a successful read the position points at the next frame,
            # so anything other than frame_number + 1 means the seek missed.
            current_frame = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
            if current_frame != frame_number + 1:
                print(f"Warning: Expected frame {frame_number} but got frame {current_frame-1}")
                continue

            frame_height, frame_width = frame.shape[:2]
            for _, bbox_row in frame_data.iterrows():
                img_id = bbox_row['id']
                x1, y1, x2, y2 = int(bbox_row['x1']), int(bbox_row['y1']), int(bbox_row['x2']), int(bbox_row['y2'])

                # Clamp to the frame: negative indices would wrap around in
                # NumPy slicing and crop the wrong region.
                left, top = max(x1, 0), max(y1, 0)
                right, bottom = min(x2, frame_width), min(y2, frame_height)
                if right <= left or bottom <= top:
                    # Empty crop; there is nothing to encode.
                    continue
                cropped_img = frame[top:bottom, left:right]

                # Create a directory for the current id if it doesn't exist
                id_folder = os.path.join(img_generated_path, str(img_id))
                os.makedirs(id_folder, exist_ok=True)

                # The file name keeps the CSV coordinates so each image can
                # be traced back to its row.
                img_filename = f"{frame_number}_{x1}_{y1}_{x2}_{y2}.jpg"
                img_save_path = os.path.join(id_folder, img_filename)
                cv2.imwrite(img_save_path, cropped_img)
    finally:
        # Always free the capture, even if a crop or write fails.
        cap.release()

    print("Image extraction completed!")

# Example usage: filtered bbox CSV, source footage and output base directory.
csv_path = '/home/diego/mydrive/results/1/3/1/tobalaba_entrada_20240604_1000/tobalaba_entrada_20240604_1000_bbox_filtered.csv'
video_path = '/home/diego/mydrive/footage/1/3/1/tobalaba_entrada_20240604_1000.mkv'
img_path = '/home/diego/mydrive/results/1/3/1/tobalaba_entrada_20240604_1000/'
generate_img_by_bbox(
    csv_path=csv_path,
    video_path=video_path,
    img_path=img_path,
    skip_frames=3,
)

Image extraction completed!


### Clean up IDs without movement [Clean Up 2]

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def calculate_movement(df):
    """Return the total centroid path length travelled by each ``id``.

    Rows are ordered by ``id`` and ``frame_number``; within each ``id`` the
    Euclidean distance between consecutive centroids is computed, and those
    distances are summed. The first row of every ``id`` has no predecessor
    and contributes 0. The input DataFrame is not modified.

    Returns a DataFrame with one row per ``id`` and the columns ``id`` and
    ``total_movement``.
    """
    ordered = df.sort_values(by=['id', 'frame_number'])

    # Per-id difference to the previous row (NaN on the first row of each id).
    per_id = ordered.groupby('id')
    delta_x = per_id['centroid_x'].diff()
    delta_y = per_id['centroid_y'].diff()

    step_length = np.sqrt(delta_x**2 + delta_y**2).fillna(0)

    return (
        ordered.assign(movement=step_length)
        .groupby('id')['movement']
        .sum()
        .reset_index(name='total_movement')
    )

# Load the bbox CSV to analyse (img_folder is defined here but not referenced in this cell)
csv_path = '/home/diego/mydrive/results/1/3/1/tobalaba_entrada_20240604_1000/tobalaba_entrada_20240604_1000_bbox_filtered.csv'
img_folder = '/home/diego/mydrive/results/1/3/1/tobalaba_entrada_20240604_1000/imgs'
df = pd.read_csv(csv_path)

# Total centroid movement per id
total_movement_df = calculate_movement(df)

# Histogram of total movement, used to pick the cut-off below
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(total_movement_df['total_movement'], bins=30, edgecolor='black')
ax.set_title('Distribution of Total Movement per ID')
ax.set_xlabel('Total Movement')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

# Cut-off for "low movement" (adjust based on the histogram above)
threshold = 100

# IDs whose total movement is below the cut-off
is_low_movement = total_movement_df['total_movement'] < threshold
low_movement_ids = total_movement_df.loc[is_low_movement, 'id']

# Keep only the rows whose id is not in the low-movement set
filtered_df = df[~df['id'].isin(low_movement_ids)]

# Report how much was removed
print(f"Number of low movement IDs removed: {len(low_movement_ids)}")
print(f"Original DataFrame size: {df.shape}")
print(f"Filtered DataFrame size: {filtered_df.shape}")


### View images that have low movement [Clean Up 2 Viewer]