### Add images to CSV BBOX

In [5]:
import os
import pandas as pd
import glob

def find_image_name(row, image_dict):
    """
    Construct the image name based on row's id and frame_number,
    then check if it exists in the image_dict.
    """
    id_frame_pattern = f"img_{int(row['id'])}_{int(row['frame_number'])}"
    matched_images = [img for img in image_dict.get(int(row['id']), []) if id_frame_pattern in img]
    return matched_images[0] if matched_images else None

def append_image_names(csv_path, base_path):
    # Load the CSV into a DataFrame
    df = pd.read_csv(csv_path, dtype={'id': 'int64','frame_number': 'int64'})

    # Dictionary to hold image names for each id
    image_dict = {}

    # List directories in the base path and filter by those matching the ids in the DataFrame
    for dir_name in os.listdir(base_path):
        dir_path = os.path.join(base_path, dir_name)
        if os.path.isdir(dir_path) and dir_name.isdigit():
            id = int(dir_name)
            # List all images for the current id
            image_dict[id] = [os.path.basename(x) for x in glob.glob(os.path.join(dir_path, "*.png"))]

    # Apply the function to find the matching image name for each row
    df['img_name'] = df.apply(lambda row: find_image_name(row, image_dict), axis=1)
    
    return df


BASE_IMAGEES_PATH = '/home/diego/Documents/yolov7-tracker/imgs_conce'
BASE_FOLDER_NAME = 'logs'
CSV_FILE_PATH = 'conce_bbox.csv'

CSV_FILE_PATH = os.path.join(BASE_FOLDER_NAME, CSV_FILE_PATH)
updated_df = append_image_names(CSV_FILE_PATH, BASE_IMAGEES_PATH)

updated_df.to_csv(CSV_FILE_PATH, index=False)

### Add kfold to images and add label_img, label_direction column

In [6]:
# Re-importing necessary libraries and redefining the function with corrections
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import os
# Re-defining the set_folds function
def set_folds(csv_path, k_folds, n_images):
    df = pd.read_csv(csv_path)

    # Ensure 'img_name' column exists
    if 'img_name' not in df.columns:
        raise ValueError("img_name column doesn't exist in the dataset.")

    # Initialize k_fold column in original df
    df['k_fold'] = np.nan
    df['label_img'] = np.nan
    df['label_direction'] = np.nan

    # Filter rows where 'img_name' is not empty
    df_filtered = df[df['img_name'] != ''].copy()

    # Sort by 'id' and 'frame_number'
    df_filtered.sort_values(by=['id', 'frame_number'], inplace=True)

    # Process each ID separately in filtered df
    for id_value in df_filtered['id'].unique():
        subset = df_filtered[(df_filtered['id'] == id_value) & (df_filtered['img_name'].notna())]

        # Apply KFold or assign all to the same fold if condition is met
        if len(subset) < k_folds * n_images:
            df.loc[subset.index, 'k_fold'] = 0  # Assign all to fold 0 if condition is met
        else:
            # Apply KFold
            kf = KFold(n_splits=k_folds)
            for fold, (_, test_index) in enumerate(kf.split(subset)):
                # Select n_images per fold if specified
                #selected_indices = test_index[:n_images] if n_images < len(test_index) else test_index Selecciona los primeros n_images
                selected_indices = np.random.choice(test_index, min(n_images, len(test_index)), replace=False)
                df.loc[subset.iloc[selected_indices].index, 'k_fold'] = fold
                df.loc[subset.iloc[selected_indices].index, 'label_img'] = 0
    return df

BASE_FOLDER_NAME = 'logs'
CSV_FILE_PATH = 'conce_bbox.csv'
CSV_FILE_PATH = os.path.join(BASE_FOLDER_NAME, CSV_FILE_PATH)
df_with_folds = set_folds(CSV_FILE_PATH, k_folds=5, n_images=3)

df_with_folds.to_csv(CSV_FILE_PATH, index=False)


### CSV to SQL LITE

In [2]:
import pandas as pd
import sqlite3
import os

def convert_csv_to_sqlite(csv_file_path, db_file_path, table_name='bbox_data'):
    # Load the CSV file into a pandas DataFrame
    df = pd.read_csv(csv_file_path)
    
    # Create a connection to the SQLite database
    conn = sqlite3.connect(db_file_path)
    
    # Write the data to a SQLite table
    df.to_sql(table_name, conn, if_exists='replace', index=False)
    
    # Close the connection
    conn.close()

BASE_FOLDER_NAME = 'logs'
CSV_FILE_PATH = 'conce_bbox.csv'

CSV_FILE_PATH = os.path.join(BASE_FOLDER_NAME, CSV_FILE_PATH)

db_file_path = f'{BASE_FOLDER_NAME}/bbox_data.db'
convert_csv_to_sqlite(CSV_FILE_PATH, db_file_path)

### Model Creation

In [29]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier


BASE_FOLDER_NAME = 'logs'
CSV_FILE_PATH = 'updated_conce_bbox_area.csv'
CSV_FILE_PATH = os.path.join(BASE_FOLDER_NAME, CSV_FILE_PATH)

NEW_CSV = os.path.join(BASE_FOLDER_NAME, 'updated_conce_bbox_area_model.csv')

# Load your data
df = pd.read_csv(CSV_FILE_PATH)

# Preprocess your data
# Convert '-' labels to None for easier handling
df['label'] = df['label'].apply(lambda x: None if x == '-' else x).astype(float)

# Separate the dataset into training and prediction sets
train_df = df.dropna(subset=['label'])
predict_df = df[df['label'].isna()]

# Define features and target
features = ['area', 'centroid_x', 'centroid_y', 'frame_number', 'overlap', 'distance_to_center', 'conf_score']
target = 'label'

# Splitting the training data for validation
X_train, X_val, y_train, y_val = train_test_split(train_df[features], train_df[target], test_size=0.2, random_state=42)

# Initialize the Gradient Boosting Classifier
model = GradientBoostingClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Validate the model
val_predictions = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
print(f"Validation Accuracy: {val_accuracy}")

# Predicting on the dataset without labels
predict_features = predict_df[features]
predicted_labels = model.predict(predict_features)
predicted_confidences = model.predict_proba(predict_features).max(axis=1)

# Adding predictions back to the dataframe
predict_df['model_label'] = predicted_labels
predict_df['model_confidence'] = predicted_confidences

# Combine the prediction and training dataframes
final_df = pd.concat([train_df, predict_df], sort=False)

# Save the updated dataframe to a new CSV file
final_df.to_csv(NEW_CSV, index=False)

print("Updated CSV saved successfully.")


Validation Accuracy: 0.9722222222222222


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predict_df['model_label'] = predicted_labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predict_df['model_confidence'] = predicted_confidences


Updated CSV saved successfully.


### TEST Show model results predictions

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import os
import random

# Read the CSV file
BASE_FOLDER_NAME = 'logs'
CSV_FILE_PATH = 'updated_conce_bbox_area.csv'
NEW_CSV = os.path.join(BASE_FOLDER_NAME, 'updated_conce_bbox_area_model.csv')

def find_matching_file_path(directory, filename_start):
    """
    Searches for files in the specified directory that start with the given filename start string.
    
    :param directory: The directory to search within.
    :param filename_start: The initial part of the file name to match.
    :return: The full path to the first matching file, or None if no match is found.
    """
    # Construct the search pattern
    search_pattern = os.path.join(directory, filename_start + "*.png")
    
    # Use glob to find all files matching the pattern
    matching_files = glob.glob(search_pattern)
    
    # Return the first matching file path, if any
    if matching_files:
        return matching_files[0]  # Return full path of the first match
    else:
        return ''  # No match found

df = pd.read_csv(NEW_CSV)

# Filter rows where `model_label` and `model_confidence` are not null
df_filtered = df.dropna(subset=['model_label', 'model_confidence'])

# Filtering rows where frame_number % 3 == 0
#df_filtered = df_filtered[df_filtered['frame_number'] % 3 == 0]

# Base path for the images
base_path = "/home/diego/Documents/yolov7-tracker/imgs_conce"

# Function to construct the file path
def construct_file_path(row):
    return os.path.join(base_path, str(int(row['id'])))

# Apply the function to construct file paths
df_filtered['file_path'] = df_filtered.apply(construct_file_path, axis=1)

# Randomly select 50 images if available, or take the whole dataset if less than 50
sample_size = min(20, len(df_filtered))
sampled_df = df_filtered.sample(n=sample_size)

rows = (sample_size + 4) // 5  # Calculate rows needed for the sample size, adjust the denominator to change columns
cols = 5 if sample_size > 5 else sample_size  # Adjust columns based on sample size

fig, axs = plt.subplots(rows, cols, figsize=(20, 8 * rows))  # Adjust figsize dynamically
axs = axs.flatten()  # Flatten to easily loop over if it's a grid

for i in range(len(axs)):
    if i < sample_size:
        row = sampled_df.iloc[i]
        img_path = find_matching_file_path(row.file_path, f"img_{row.id}_{row.frame_number}")
        if os.path.exists(img_path):
            img = Image.open(img_path)
            axs[i].imshow(img)
            axs[i].set_title(f"ID: {row.id}\nFrame: {row.frame_number}\nLabel: {int(row.model_label)}\nConfidence: {row.model_confidence:.2f}", fontsize=10)
            axs[i].axis('off')
        else:
            axs[i].set_visible(False)
    else:
        axs[i].set_visible(False)  # Hide unused subplots

plt.tight_layout()
plt.show()


### Data process IN/OUT/BAD Feature Engineering

In [9]:
import pandas as pd
import os
import numpy as np

BASE_FOLDER_NAME = 'logs'
CSV_FILE_PATH = 'conce_bbox.csv_alternative.csv'
CSV_FILE_PATH = os.path.join(BASE_FOLDER_NAME, CSV_FILE_PATH)

# Load your data
df = pd.read_csv(CSV_FILE_PATH)

# Ensure the DataFrame is sorted by 'id' and 'frame_number' for correct diff calculations
df.sort_values(by=['id', 'frame_number'], inplace=True)

# Calculate Movement Features (Δx and Δy)
df['delta_x'] = df.groupby('id')['centroid_x'].diff().fillna(0)
df['delta_y'] = df.groupby('id')['centroid_y'].diff().fillna(0)

# Calculate Aggregated Features for each ID
aggregations = {
    'delta_x': ['mean', 'max', 'min', 'std'],
    'delta_y': ['mean', 'max', 'min', 'std']
}
aggregated_features = df.groupby('id').agg(aggregations).reset_index()

# Correct the naming of the aggregated columns
aggregated_features.columns = ['id'] + [f'{var}_{stat}' for var, stats in aggregations.items() for stat in stats]

# Correctly merge aggregated features back to the original dataframe
df = pd.merge(df, aggregated_features, on='id', how='left')

# Calculate Sequence Features (net movement direction)
df['net_movement_x'] = df.groupby('id')['delta_x'].transform('sum')
df['net_movement_y'] = df.groupby('id')['delta_y'].transform('sum')

df.to_csv(CSV_FILE_PATH, index=False)

print("Updated CSV saved successfully.")

Updated CSV saved successfully.


### In vs Out

In [19]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

BASE_FOLDER_NAME = 'logs'
CSV_FILE_PATH = 'conce_bbox.csv_alternative.csv'
NEW_FILE = 'updated_conce_bbox_with_predictions.csv'
CSV_FILE_PATH = os.path.join(BASE_FOLDER_NAME, CSV_FILE_PATH)

# Load the data
df = pd.read_csv(CSV_FILE_PATH)

# Filter out rows with 'BAD' or empty in 'label_direction'
df_filtered = df[(df['label_direction'] == 'IN') | (df['label_direction'] == 'OUT')]

# Correctly encode 'IN' as 1 and 'OUT' as 0
label_encoder = LabelEncoder()
df_filtered['label_direction_encoded'] = label_encoder.fit_transform(df_filtered['label_direction'])  # 'IN' -> 1, 'OUT' -> 0

# Define features (make sure to only include numeric columns and exclude any text columns)
features = [col for col in df_filtered.columns if col not in ['id', 'label_direction', 'label_direction_encoded'] and df_filtered[col].dtype in [np.int64, np.float64]]
target = 'label_direction_encoded'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_filtered[features], df_filtered[target], test_size=0.2, random_state=42)

# Initialize the Gradient Boosting Classifier
model = GradientBoostingClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
predictions = model.predict(X_test)
prediction_probs = model.predict_proba(X_test)[:, 1]  # Probability of being 'IN'

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Test Accuracy: {accuracy}")

# Prepare the entire dataset for prediction
# First, ensure only numeric features are used
df['label_direction_p'] = np.nan  # Initialize column for model predictions
df['label_direction_p_conf'] = np.nan  # Initialize column for prediction confidence

# Predicting on rows needing prediction (assuming 'BAD' or not labeled)
predict_features = df[features]
df['label_direction_p'] = model.predict(predict_features)
df['label_direction_p_conf'] = model.predict_proba(predict_features)[:, 1]  # Probability of being 'IN'

# Mapping numeric predictions back to 'IN' or 'OUT'
df['label_direction_p'] = label_encoder.inverse_transform(df['label_direction_p'].astype(int))

# Save the updated dataframe to a new CSV file
NEW_CSV_PATH = os.path.join(BASE_FOLDER_NAME, NEW_FILE)
df.to_csv(NEW_CSV_PATH, index=False)

print("Updated CSV with predictions saved successfully.")


  df = pd.read_csv(CSV_FILE_PATH)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['label_direction_encoded'] = label_encoder.fit_transform(df_filtered['label_direction'])  # 'IN' -> 1, 'OUT' -> 0


Test Accuracy: 1.0
Updated CSV with predictions saved successfully.


### In|OUT vs BAD

In [20]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

BASE_FOLDER_NAME = 'logs'
CSV_FILE_PATH = 'conce_bbox.csv_alternative.csv'
CSV_FILE_PATH = os.path.join(BASE_FOLDER_NAME, CSV_FILE_PATH)

# Load the original data
df_original = pd.read_csv(CSV_FILE_PATH)

# Create a copy for processing
df = df_original.copy()

# Encode labels: 1 for 'IN' or 'OUT' (good image), 0 for 'BAD' (bad image), keep NaN for now
df['label_encoded'] = df['label_direction'].apply(lambda x: 1 if x in ['IN', 'OUT'] else 0 if x == 'BAD' else np.nan)

# Prepare data for model training (exclude rows with NaN in 'label_encoded')
df_train = df.dropna(subset=['label_encoded'])

# Define features (excluding non-numeric columns and the 'label_direction', 'label_encoded' columns)
features = [col for col in df_train.columns if col not in ['id', 'label_direction', 'label_encoded'] and df_train[col].dtype in [np.int64, np.float64]]
target = 'label_encoded'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_train[features], df_train[target], test_size=0.2, random_state=42)

# Initialize the Gradient Boosting Classifier
model = GradientBoostingClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the entire original dataset (make sure to handle NaNs in features if they exist)
df_original['label_good_dir'] = model.predict(df_original[features].fillna(0))  # Using fillna(0) as an example handling method
df_original['label_good_dir_conf'] = model.predict_proba(df_original[features].fillna(0))[:, 1]  # Confidence of being a good image

# Save the updated dataframe with predictions for the entire dataset to a new CSV file
NEW_CSV_PATH = os.path.join(BASE_FOLDER_NAME, 'updated_conce_bbox_with_good_dir_predictions.csv')
df_original.to_csv(NEW_CSV_PATH, index=False)

print("Updated CSV with good direction predictions saved successfully.")


  df_original = pd.read_csv(CSV_FILE_PATH)


Updated CSV with good direction predictions saved successfully.
