### Model Image Selection Pro

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import joblib  # For saving and loading the model

def train_and_save_model(training_data_path, model_save_path):
    """Train the image-selection classifier and persist it to disk.

    Reads the labelled CSV at ``training_data_path``, keeps only the rows
    that have an ``img_name`` and whose ``label_img`` is 1 or 2, fits a
    ``GradientBoostingClassifier`` on an 80/20 train/validation split, dumps
    the fitted model with joblib and prints the validation accuracy.

    Args:
        training_data_path: Path of the CSV file holding the labelled rows.
        model_save_path: Destination file for the joblib-serialised model.

    Raises:
        AssertionError: If the CSV lacks ``img_name``, the label column or
            any of the feature columns.
    """
    INTEREST_LABEL = 'label_img'
    interest_values = [1, 2]

    features = ['area', 'centroid_x', 'centroid_y', 'frame_number', 'overlap', 'distance_to_center', 'conf_score']
    target = INTEREST_LABEL

    only_train_data = pd.read_csv(training_data_path)

    # Check the schema BEFORE any column is used: the filters below index
    # 'img_name' and the label column, so a late check would surface as a bare
    # KeyError instead of this explicit message.
    required_columns = ['img_name'] + features + [target]
    missing_columns = [col for col in required_columns if col not in only_train_data.columns]
    assert not missing_columns, f"Some required columns are missing: {missing_columns}"

    # Keep only rows that reference an image and carry one of the labels of interest.
    only_train_data = only_train_data.dropna(subset=['img_name'])
    only_train_data = only_train_data[only_train_data[INTEREST_LABEL].isin(interest_values)]

    X_train, X_val, y_train, y_val = train_test_split(only_train_data[features], only_train_data[target], test_size=0.2, random_state=42)

    model = GradientBoostingClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Save the trained model
    joblib.dump(model, model_save_path)

    # Report accuracy on the held-out 20% split.
    val_predictions = model.predict(X_val)
    val_accuracy = accuracy_score(y_val, val_predictions)
    print(f"Validation Accuracy: {val_accuracy}")

def predict_and_save_results(model_weights_path, csv_file_path):
    """Score a bounding-box CSV with a saved model and write the result next to it.

    Adds two columns to the data read from ``csv_file_path``:
    ``model_label_img`` (the predicted label) and ``model_label_conf`` (the
    highest class probability, rounded to two decimals). The result is written
    to ``<csv_file_path without extension>_img_selection_predicted.csv`` and
    the output path is printed.

    Args:
        model_weights_path: Path of the joblib file holding the fitted model.
        csv_file_path: Path of the CSV file to score.

    Raises:
        AssertionError: If the CSV lacks any of the feature columns.
    """
    features = ['area', 'centroid_x', 'centroid_y', 'frame_number', 'overlap', 'distance_to_center', 'conf_score']

    # Load and validate the input first: it is the cheap check, and a bad CSV
    # should be reported before the model is unpickled.
    data = pd.read_csv(csv_file_path)
    missing_columns = [col for col in features if col not in data.columns]
    assert not missing_columns, f"Some required columns for prediction are missing: {missing_columns}"

    # NOTE(security): joblib.load unpickles arbitrary objects; only load model
    # files that come from a trusted source.
    model = joblib.load(model_weights_path)

    feature_matrix = data[features]
    predictions = model.predict(feature_matrix)
    # Confidence = probability of the most likely class for each row.
    predicted_confidences = model.predict_proba(feature_matrix).max(axis=1)

    # Add predictions to the dataframe
    data['model_label_img'] = predictions
    data['model_label_conf'] = predicted_confidences.round(2)

    # Save the modified dataframe to a new CSV file
    predicted_csv_path = os.path.splitext(csv_file_path)[0] + "_img_selection_predicted.csv"
    data.to_csv(predicted_csv_path, index=False)
    print(f"Predictions saved to: {predicted_csv_path}")
    

# train_and_save_model('/home/diego/Documents/yolov7-tracker/mini_models/results/from_sql_bbox.csv', '/home/diego/Documents/yolov7-tracker/mini_models/results/image_selection_model.pkl')

In [2]:
# Score the santos_dumont bounding-box CSV with the previously saved model.
predict_and_save_results(
    model_weights_path='/home/diego/Documents/yolov7-tracker/mini_models/results/image_selection_model.pkl',
    csv_file_path='/home/diego/Documents/yolov7-tracker/logs/santos_dumont_bbox.csv',
)

Predictions saved to: /home/diego/Documents/yolov7-tracker/logs/santos_dumont_bbox_img_selection_predicted.csv


### Model Image Selection

In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


# ---- Configuration ----
BASE_FOLDER_NAME = 'results'
CSV_FILE_NAME = 'conce_bbox.csv'
CSV_FILE_PATH = os.path.join(BASE_FOLDER_NAME, CSV_FILE_NAME)

# Destination of the combined train + prediction export (the save below is currently disabled).
MODEL_RESULT = os.path.join(BASE_FOLDER_NAME, f'model_img_selction{CSV_FILE_NAME}')

INTEREST_LABEL = 'label_img'
interest_values = [1, 2]  # Define the values you're interested in BAD and GOOD images

# ---- Load the data to be scored ----
df = pd.read_csv(CSV_FILE_PATH)

# Keep only the labels of interest; every other label value becomes NaN.
df[INTEREST_LABEL] = df[INTEREST_LABEL].where(df[INTEREST_LABEL].isin(interest_values)).astype(float)

#### DIEGO'S TWEAK ####
# The labelled training data lives in a separate CSV.
# NOTE: CSV_FILE_PATH is re-pointed to the training file from here on.
CSV_FILE_PATH = os.path.join(BASE_FOLDER_NAME, 'from_sql_bbox.csv')
only_train_data = pd.read_csv(CSV_FILE_PATH)
only_train_data = only_train_data.dropna(subset=['img_name'])
only_train_data = only_train_data[only_train_data[INTEREST_LABEL].isin(interest_values)]
#### DIEGO'S TWEAK ####

# Labelled rows with ID lower than 1300. They are NOT used to fit the model
# (only_train_data is); they are only concatenated into final_df below.
train_df = df[(df['id'] < 1300) & df[INTEREST_LABEL].notna()]

# For prediction: rows with ID 1300 and beyond, or rows without a label of interest.
predict_df = df[(df['id'] >= 1300) | df[INTEREST_LABEL].isna()]  ##### WATCH OUT FOR THIS

# Keep only the rows that have an img_name. The explicit copy makes the column
# assignments below act on an independent frame (no SettingWithCopyWarning).
predict_df = predict_df.dropna(subset=['img_name']).copy()

# Define features and target
features = ['area', 'centroid_x', 'centroid_y', 'frame_number', 'overlap', 'distance_to_center', 'conf_score']
target = INTEREST_LABEL

# Split the separate training data into train / validation sets (Diego's tweak)
X_train, X_val, y_train, y_val = train_test_split(only_train_data[features], only_train_data[target], test_size=0.2, random_state=42)

# Initialize the Gradient Boosting Classifier
model = GradientBoostingClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Validate the model
val_predictions = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
print(f"Validation Accuracy: {val_accuracy}")

# Predict on the prediction subset
predict_features = predict_df[features]
predicted_labels = model.predict(predict_features)
predicted_confidences = model.predict_proba(predict_features).max(axis=1)

# Add predictions back to the dataframe
predict_df['model_label_img'] = predicted_labels
predict_df['model_label_conf'] = predicted_confidences

# Combine the prediction and training dataframes
final_df = pd.concat([train_df, predict_df], sort=False)

# Save the updated dataframe to a new CSV file
# final_df.to_csv(MODEL_RESULT, index=False) #-> SAVE


#### PREDICT IN ALL DATA ####
# Independent copy for the same reason as predict_df above.
df_total = df.dropna(subset=['img_name']).copy()
predict_features = df_total[features]
predicted_labels = model.predict(predict_features)
predicted_confidences = model.predict_proba(predict_features).max(axis=1)

# Add predictions back to the dataframe
df_total['model_label_img'] = predicted_labels
df_total['model_label_conf'] = predicted_confidences.round(2)
total = os.path.join(BASE_FOLDER_NAME, f'total_model_img_selction_{CSV_FILE_NAME}')
df_total.to_csv(total, index=False) #-> SAVE

#### PREDICT IN ALL DATA ####


print("Updated CSV saved successfully.")
### NOTE (DIEGO) ###
# All rows without an img_name are removed from this result.

In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


# Training-only variant of the "Model Image Selection" cell: it fits and
# validates the classifier on the labelled training CSV. The prediction and
# CSV-export steps are intentionally disabled here; the live version of that
# code is in the previous cell.

# Defined locally so this cell also runs on a fresh kernel; it used to depend
# on the previous cell having defined this constant.
BASE_FOLDER_NAME = 'results'

INTEREST_LABEL = 'label_img'
interest_values = [1, 2]  # Define the values you're interested in BAD and GOOD images

#### DIEGO'S TWEAK ####
# The labelled training data lives in a separate CSV.
CSV_FILE_PATH = os.path.join(BASE_FOLDER_NAME, 'from_sql_bbox.csv')
only_train_data = pd.read_csv(CSV_FILE_PATH)
only_train_data = only_train_data.dropna(subset=['img_name'])
only_train_data = only_train_data[only_train_data[INTEREST_LABEL].isin(interest_values)]
#### DIEGO'S TWEAK ####

# Define features and target
features = ['area', 'centroid_x', 'centroid_y', 'frame_number', 'overlap', 'distance_to_center', 'conf_score']
target = INTEREST_LABEL

# Split the training data into train / validation sets (Diego's tweak)
X_train, X_val, y_train, y_val = train_test_split(only_train_data[features], only_train_data[target], test_size=0.2, random_state=42)

# Initialize the Gradient Boosting Classifier
model = GradientBoostingClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Validate the model
val_predictions = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
print(f"Validation Accuracy: {val_accuracy}")

### Model Image Selection Test

In [None]:
#### This cell expects final_df = pd.concat([train_df, predict_df], sort=False), in which only the
#### prediction rows carry values in model_label_img and model_label_conf,
# so that the training rows are not counted.
# It also relies on names defined by the "Model Image Selection" cell: os, confusion_matrix,
# ConfusionMatrixDisplay, classification_report, INTEREST_LABEL and interest_values.

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from utils import calculate_confidence_distribution
FOLDER_PATH_IMGS = '/home/diego/Documents/yolov7-tracker/imgs_conce/'


# Filter final_df to include only rows where both 'model_label_img' and INTEREST_LABEL are not null
comparison_df = final_df.dropna(subset=['model_label_img', INTEREST_LABEL])
# Extract model predictions and actual labels
y_pred = comparison_df['model_label_img']
y_true = comparison_df[INTEREST_LABEL]

# Generate the confusion matrix. Passing labels explicitly keeps the matrix
# aligned with display_labels even when one class is absent from the data.
cm = confusion_matrix(y_true, y_pred, labels=interest_values)

# Display the confusion matrix using ConfusionMatrixDisplay
cmd = ConfusionMatrixDisplay(cm, display_labels=interest_values)
cmd.plot(cmap="Blues")
cmd.ax_.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
plt.show()

# Generate the classification report (labels given explicitly so they always match target_names)
report = classification_report(y_true, y_pred, labels=interest_values, target_names=[str(label) for label in interest_values])

print("Classification Report:")
print(report)

# 'comparison_df' holds the actual (INTEREST_LABEL) and predicted ('model_label_img') labels.
# total_wrong_filter: any row whose prediction differs from its label of interest.
# false_negative_filter: actual label 1 predicted as 2.
# false_positive_filter: actual label 2 predicted as 1.
total_wrong_filter = (comparison_df[INTEREST_LABEL].isin(interest_values)) & (comparison_df['model_label_img'] != comparison_df[INTEREST_LABEL])
false_negative_filter = (comparison_df[INTEREST_LABEL] == 1) & (comparison_df['model_label_img'] == 2) # False Negative filter
false_positive_filter = (comparison_df[INTEREST_LABEL] == 2) & (comparison_df['model_label_img'] == 1) # False Positive filter

# Show at most 20 false negatives.
false_negatives = comparison_df[false_negative_filter][:20]

# Define the grid size for plotting
n_rows = 2  # Adjust based on the number of images you want per column
n_cols = (len(false_negatives) + 1) // n_rows

save_path = 'logs/imageSelectionResults.png'  # Specify your desired path and filename

if false_negatives.empty:
    # A 2 x 0 grid cannot be created, so there is nothing to plot or save.
    print("No false negative images to display.")
else:
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 4, n_rows * 4))
    fig.suptitle('False Negative Images')

    # Flatten the axes array for easy indexing
    axes = axes.flatten()

    for i, (index, row) in enumerate(false_negatives.iterrows()):
        # The second '_'-separated token of img_name is the sub-folder name;
        # it is also shown as "ID" in the title, and the third token as "frame".
        name_parts = row['img_name'].split('_')
        img_path = os.path.join(FOLDER_PATH_IMGS, name_parts[1], row['img_name'])
        img = mpimg.imread(img_path)
        axes[i].imshow(img)
        axes[i].set_title(f"ID: {name_parts[1]} frame: {name_parts[2]} \n true: {row['label_img']} pred: {row['model_label_img']} conf: {row['model_label_conf']:.2f}")
        axes[i].axis('off')

    # Hide any empty subplots
    for j in range(len(false_negatives), n_rows * n_cols):
        axes[j].axis('off')

    plt.tight_layout()
    # Save the figure to disk (create the target folder if it does not exist yet)
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    plt.savefig(save_path, dpi=300)  # Adjust DPI for higher resolution images
    plt.show()

calculate_confidence_distribution(comparison_df, label_direction_column='label_img', model_label_direction_column='model_label_img', model_label_direction_conf_column='model_label_conf')

In [5]:
import pandas as pd

# Path to the log file
log_file_path = '/home/diego/Documents/yolov7-tracker/mini_models/parameter_tuning_log.txt'

# Line prefix -> output column for the metrics written as "<name>: <value>%".
_PERCENT_FIELDS = {
    'Rank1:': 'Rank1',
    'Rank5:': 'Rank5',
    'Matches#Rank5:': 'Matches_Rank5',
    'mAP:': 'mAP',
}
_BLOCK_SEPARATOR = '==============================='


def _parse_tuning_log(lines):
    """Turn the lines of the parameter-tuning log into one dict per block.

    A block is the run of lines between separator lines. Empty blocks (for
    example a separator at the very start, or two separators in a row) are
    skipped, and a final block that is not followed by a separator is kept.

    Args:
        lines: Iterable of text lines (an open file works).

    Returns:
        List of dicts with the keys found in each block: K1, K2, LAMBDA,
        Rank1, Rank5, Matches_Rank5, mAP.
    """
    entries = []
    entry = {}
    for line in lines:
        if line.startswith('K1:'):
            # Split the line by ', ' and extract values
            params = line.split(', ')
            entry['K1'] = int(params[0].split(': ')[1])
            entry['K2'] = int(params[1].split(': ')[1])
            entry['LAMBDA'] = float(params[2].split(': ')[1])
        elif line.strip() == _BLOCK_SEPARATOR:
            # End of a block of data: store it (if it has content) and start a new one
            if entry:
                entries.append(entry)
            entry = {}
        else:
            for prefix, column in _PERCENT_FIELDS.items():
                if line.startswith(prefix):
                    entry[column] = float(line.split(': ')[1].replace('%', ''))
                    break
    # Keep the last block even when the log does not end with a separator.
    if entry:
        entries.append(entry)
    return entries


# Open the log file and process each block of text
with open(log_file_path, 'r') as file:
    data = _parse_tuning_log(file)

# Create a DataFrame
df = pd.DataFrame(data)

# Convert the DataFrame to an Excel file
excel_file_path = 'path_to_excel_file.xlsx'
df.to_excel(excel_file_path, index=False)

print(f"Data has been successfully converted to Excel and saved at {excel_file_path}")


Data has been successfully converted to Excel and saved at path_to_excel_file.xlsx
