# Label Frequencies and Analysis

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

In [None]:
# Directory that receives every figure saved by this notebook.
OUTPUT_DIR_PATH = Path('../../output/label_frequencies')
# plt.savefig does not create missing parent directories, so make sure the
# target exists before any cell tries to write into it.
OUTPUT_DIR_PATH.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the binary label table that every analysis cell below works on.
binary_labels_csv = '../../data/dataframes/binary_labels.csv'
binary_df = pd.read_csv(filepath_or_buffer=binary_labels_csv)

In [None]:
# Preview the first five rows; as the cell's last expression the notebook renders it.
binary_df.head(n=5)

## Analyze Label Frequency

### How many labels are there and how many of them are of interest?

In [None]:
# Columns that identify a recording rather than describe a movement label
metadata_columns = ['boulder', 'camera', 'participant', 'repetition']

# Grand total: sum each label column, then add the per-column sums together
total_labels = binary_df.drop(columns=metadata_columns).sum().sum()

# How often 'no_movement_of_interest' is set
no_movement_of_interest_count = binary_df['no_movement_of_interest'].sum()

# Share of 'no_movement_of_interest' among all labels, in percent
percentage_no_movement = (no_movement_of_interest_count / total_labels) * 100

print("Total number of labels:", total_labels)
print("Number of 'no_movement_of_interest' labels:", no_movement_of_interest_count)
print("Percentage of 'no_movement_of_interest': {:.2f}%".format(percentage_no_movement))

# One figure, two panels: absolute counts (left) and relative share (right)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
count_ax, share_ax = ax

# Left panel: bar chart comparing the two counts
bars = count_ax.bar(
    ['Total Labels', 'No Movement of Interest'],
    [total_labels, no_movement_of_interest_count],
    color=['orange', 'teal'],
    width=0.5,  # narrower bars
)
count_ax.set_title('Label Counts', fontsize=14)
count_ax.set_ylabel('Number of Labels', fontsize=12)
# Leave 10% headroom above the tallest bar so its value label fits
count_ax.set_ylim(0, total_labels + 0.1 * total_labels)

# Write each bar's value just above its top edge, centred horizontally
for rect in bars:
    height = rect.get_height()
    x_center = rect.get_x() + rect.get_width()/2
    count_ax.text(x_center, height, int(height), va='bottom', ha='center', fontsize=12)

# Right panel: pie chart of 'no_movement_of_interest' versus everything else
labels = ('No Movement of Interest', 'Other Labels')
sizes = [percentage_no_movement, 100 - percentage_no_movement]
colors = ['teal', 'orange']
explode = (0.1, 0)  # pull the first slice out of the pie

share_ax.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
    textprops={'fontsize': 12},
)
share_ax.set_title('Percentage of No Movement of Interest', fontsize=14)

# Tidy the layout, write the figure to the output directory, then display it
plt.tight_layout()
plt.savefig(OUTPUT_DIR_PATH / "labels_of_interest")
plt.show()

### Label Frequency Over All Boulders

In [None]:
# Sum every label column (metadata columns removed) and list the labels
# from most to least frequent.
metadata_columns = ['boulder', 'camera', 'participant', 'repetition']
label_freq = (
    binary_df
    .drop(columns=metadata_columns)
    .sum()
    .sort_values(ascending=False)
)
print("Label Frequencies:\n", label_freq)

In [None]:
# Columns left out of the plot: recording metadata plus the labels
# 'no_movement_of_interest', 'before_start_position' and 'start_position'
excluded_columns = [
    'boulder', 'camera', 'participant', 'repetition',
    'no_movement_of_interest', 'before_start_position', 'start_position',
]

# Per-label totals, most frequent first, keeping only labels that occur at all
label_frequencies = binary_df.drop(columns=excluded_columns).sum().sort_values(ascending=False)
label_frequencies = label_frequencies.loc[label_frequencies > 0]

# Bar chart with one bar per remaining label
plt.figure(figsize=(12, 8))
bars = plt.bar(label_frequencies.index, label_frequencies.values, color='teal')
plt.title('Label Frequencies', fontsize=14)
plt.ylabel('Frequency: Number of Frames', fontsize=12)
# Slanted, right-aligned tick labels keep the label names readable
plt.xticks(rotation=45, ha="right", fontsize=10)
plt.tight_layout()

# Annotate every bar with its count, placed just above the bar
for rect in bars:
    height = rect.get_height()
    x_center = rect.get_x() + rect.get_width()/2
    plt.text(x_center, height, int(height), ha='center', va='bottom', fontsize=10)

plt.savefig(OUTPUT_DIR_PATH / "label_frequencies")
plt.show()

### Label Frequencies per Boulder

In [None]:
# Plot, for each boulder, how often every movement label of interest occurs.

# Recording metadata that may survive the aggregation below (only if numeric)
metadata_columns = ['camera', 'participant', 'repetition']
# Labels that are deliberately left out of the per-boulder plots
ignored_labels = ['no_movement_of_interest', 'before_start_position', 'start_position']

# Group by 'boulder' and sum up the occurrences of each label.
# numeric_only=True keeps the result independent of the pandas version when a
# metadata column holds strings: older versions silently drop such columns,
# newer ones would concatenate the strings.
boulder_label_frequencies = binary_df.groupby('boulder').sum(numeric_only=True)

# List of boulders
boulders = ['W1', 'W2', 'W3', 'W4']

# Plotting: one panel per boulder on a 2x2 grid
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 12))  # Adjust subplot grid as needed
axes = axes.flatten()  # Flatten the axes array for easier iteration

for i, boulder in enumerate(boulders):
    # Selecting data for each boulder. Metadata columns are dropped leniently
    # because non-numeric ones are already gone after the aggregation; label
    # columns are dropped strictly so a misspelled label name still raises.
    data = boulder_label_frequencies.loc[boulder].drop(metadata_columns, errors='ignore').drop(ignored_labels)
    data = data[data > 0]  # Filter out labels with zero occurrences
    data = data.sort_values(ascending=False)  # Most frequent label first

    # Bar plot for each boulder
    axes[i].bar(data.index, data.values, color='teal')
    axes[i].set_title(f'Label Frequencies for {boulder}', fontsize=14)
    axes[i].set_ylabel('Frequency: Number of Frames', fontsize=12)
    axes[i].tick_params(axis='x', rotation=45)  # Rotate labels for better visibility
    # Right-align the rotated labels so each one ends under its own bar,
    # matching the overall label-frequency plot.
    plt.setp(axes[i].get_xticklabels(), ha='right')

plt.tight_layout()
plt.savefig(OUTPUT_DIR_PATH / "label_frequencies_per_boulder")
plt.show()


In [None]:
# Report, per boulder, how the number of 'no_movement_of_interest' labels
# compares to the number of all other labels, expressed as a percentage.

# Recording metadata that may survive the aggregation below (only if numeric)
metadata_columns = ['camera', 'participant', 'repetition']

# Group by 'boulder' and sum up the occurrences of each label.
# numeric_only=True keeps the result independent of the pandas version when a
# metadata column holds strings: older versions silently drop such columns,
# newer ones would concatenate the strings.
boulder_label_freq = binary_df.groupby('boulder').sum(numeric_only=True)

# List of boulders
boulders = ['W1', 'W2', 'W3', 'W4']

# Calculating and printing the ratio of 'no_movement_of_interest' to other movements per boulder
ratios = {}
for boulder in boulders:
    boulder_totals = boulder_label_freq.loc[boulder]
    no_movement_of_interest_count = boulder_totals['no_movement_of_interest']
    # Metadata is dropped leniently (non-numeric columns are already gone);
    # the label itself is dropped strictly so a missing column fails loudly.
    total_other_labels_count = (
        boulder_totals
        .drop(metadata_columns, errors='ignore')
        .drop('no_movement_of_interest')
        .sum()
    )
    ratio = (no_movement_of_interest_count / total_other_labels_count) if total_other_labels_count != 0 else 0  # Avoid division by zero
    ratios[boulder] = ratio * 100  # Convert ratio to percentage

    print(f"Percentage of 'no_movement_of_interest' to other labels for {boulder}: {ratios[boulder]:.2f}%")
