### Comprehensive Dataset Information Visualisation  &  Splitting Dataset Information

#### This script is used to present comprehensive information about the dataset in YOLO format. The inputs are the paths to the training, validation and test sets for the current dataset and the dataset categories.
#### The output is: a pie chart showing the proportion of each category in each of the three subsets; a table of the number of labelled instances per category and the total for each subset (also printed to the console); and a bar chart showing, for each category, the number of instances and its percentage within each of the three subsets.

In [None]:
import os
import matplotlib.pyplot as plt
import pandas as pd

def _count_yolo_labels(dataset_path, class_names):
    """Count label instances per class in all ``.txt`` files under a directory.

    The directory is walked recursively. In every ``.txt`` file the first
    whitespace-separated field of each non-blank line is read as an integer
    index into *class_names*. Lines whose first field is not an integer, or
    whose index is outside ``0 .. len(class_names) - 1``, are reported on the
    console and skipped; the remaining lines of that file are still counted.

    Args:
        dataset_path: Directory containing the label files.
        class_names: List of class names, indexed by class id.

    Returns:
        A ``(counts, n_files)`` tuple: ``counts`` maps every class name to its
        number of instances, ``n_files`` is the number of ``.txt`` files seen.
    """
    counts = {name: 0 for name in class_names}
    n_classes = len(class_names)
    n_files = 0

    for root, _, files in os.walk(dataset_path):
        for file_name in files:
            if not file_name.endswith('.txt'):
                continue
            n_files += 1
            txt_file_path = os.path.join(root, file_name)

            try:
                with open(txt_file_path, 'r', encoding='utf-8') as file:
                    for line_no, line in enumerate(file, start=1):
                        fields = line.split()
                        if not fields:
                            # Blank line (e.g. trailing newline): nothing to count.
                            continue
                        try:
                            class_id = int(fields[0])
                        except ValueError:
                            print(f"Skipping line {line_no} of {file_name}: "
                                  f"class id {fields[0]!r} is not an integer")
                            continue
                        # Explicit range check: a negative id would otherwise
                        # index from the end of the list and be miscounted.
                        if not 0 <= class_id < n_classes:
                            print(f"Skipping line {line_no} of {file_name}: "
                                  f"class id {class_id} is out of range")
                            continue
                        counts[class_names[class_id]] += 1
            except (OSError, UnicodeDecodeError) as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Error reading file {file_name}: {e}")

    return counts, n_files


def _plot_class_pies(label_counts, dataset_names):
    """Draw one pie chart per dataset showing its class proportions."""
    n_sets = len(label_counts)
    # squeeze=False keeps `axes` two-dimensional even for a single dataset.
    _, axes = plt.subplots(1, n_sets, figsize=(10 * n_sets, 10), squeeze=False)

    for ax, dataset_name, counts in zip(axes[0], dataset_names, label_counts):
        ax.set_title(f'{dataset_name}', fontweight='bold', fontsize=36)

        if sum(counts.values()) == 0:
            # Proportions are undefined without any instance; show a note
            # instead of handing an all-zero series to the pie chart.
            ax.axis('off')
            ax.text(0.5, 0.5, 'No labels found', ha='center', va='center',
                    fontsize=24, fontweight='bold', transform=ax.transAxes)
            continue

        _, texts, autotexts = ax.pie(list(counts.values()), labels=list(counts.keys()),
                                     autopct='%1.1f%%', startangle=140)
        ax.axis('equal')

        for text in texts:
            text.set_fontsize(24)
            text.set_fontweight('bold')
        for autotext in autotexts:
            autotext.set_fontsize(22)
            autotext.set_fontweight('bold')


def _plot_summary_table(summary_df):
    """Render the count table (including its 'Total' column) as a figure."""
    _, ax = plt.subplots(figsize=(12, 4))
    ax.axis('off')
    tbl = ax.table(cellText=summary_df.values, colLabels=summary_df.columns,
                   rowLabels=summary_df.index, loc='center', cellLoc='center')
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(12)
    tbl.scale(1.2, 1.2)

    # Row 0 holds the column headers and column -1 the row labels.
    for key, cell in tbl.get_celld().items():
        if key[0] == 0 or key[1] == -1:
            cell.set_text_props(fontweight='bold', fontsize=14)


def _plot_class_bars(counts_df):
    """Draw a grouped bar chart of counts, labelling each bar with count and share."""
    ax = counts_df.T.plot(kind='bar', figsize=(15, 3), width=0.8)

    plt.xlabel('Emotions', fontweight='bold', fontsize=14)
    plt.ylabel('Number of Instances', fontweight='bold', fontsize=14)
    plt.xticks(rotation=0, fontsize=14)
    plt.yticks(fontsize=14)
    plt.legend(title='Datasets', title_fontsize=13, fontsize=12)

    totals = counts_df.sum(axis=1)
    # One bar container per dataset, in column order of `counts_df.T`; the
    # label position is read from the drawn bar so it stays centred for any
    # number of datasets.
    for set_idx, container in enumerate(ax.containers):
        total = totals.iloc[set_idx]
        for class_idx, bar in enumerate(container.patches):
            value = counts_df.iloc[set_idx, class_idx]
            percentage = (value / total) * 100 if total else 0.0
            ax.annotate(f'{value}\n({percentage:.1f}%)',
                        xy=(bar.get_x() + bar.get_width() / 2, value),
                        xytext=(0, 3), textcoords='offset points',
                        ha='center', va='bottom', fontsize=10, fontweight='bold')


def analyze_yolo_dataset_multiple(paths, dataset_names, class_names=None):
    """Visualise the per-class label distribution of YOLO-format dataset splits.

    For every label directory the class instances are counted, then three
    outputs are produced: a row of pie charts (one per dataset), a table of
    counts with a 'Total' column (also printed to the console), and a grouped
    bar chart annotated with counts and percentages.

    Args:
        paths: Label directories, one per dataset split.
        dataset_names: Display names, one per entry of *paths*.
        class_names: Class names indexed by YOLO class id. Defaults to the
            seven emotion classes used by this script; pass your own list for
            a dataset with different classes.

    Raises:
        ValueError: If *paths* is empty or its length differs from that of
            *dataset_names*.
    """
    if class_names is None:
        class_names = ['surprise', 'fear', 'disgust', 'happiness', 'sadness', 'anger', 'neutral']
    class_names = list(class_names)
    paths = list(paths)
    dataset_names = list(dataset_names)

    # Fail early with a clear message instead of deep inside the plotting code.
    if not paths:
        raise ValueError("At least one dataset path is required.")
    if len(paths) != len(dataset_names):
        raise ValueError(
            f"paths and dataset_names must have the same length "
            f"(got {len(paths)} and {len(dataset_names)})"
        )

    label_counts = []
    for dataset_path, dataset_name in zip(paths, dataset_names):
        counts, n_files = _count_yolo_labels(dataset_path, class_names)
        if n_files == 0:
            # A wrong or missing directory would otherwise yield silent zeros.
            print(f"Warning: no .txt label files found under {dataset_path} ({dataset_name})")
        label_counts.append(counts)

    # pie chart
    _plot_class_pies(label_counts, dataset_names)

    emotion_df = pd.DataFrame(label_counts, index=dataset_names, columns=class_names)
    summary_df = emotion_df.assign(Total=emotion_df.sum(axis=1))
    print(summary_df)

    # table
    _plot_summary_table(summary_df)

    # Displays the pie-chart figure and the table figure together.
    plt.show()

    # bar chart (without the 'Total' column)
    _plot_class_bars(emotion_df)

    plt.show()

# Display names and label directories of the YOLO-format dataset, one entry
# per split. Edit the paths below to point at your own training, validation
# and test label folders (keep both lists in the same order).
dataset_names = ['Train set', 'Valid set', 'Test set']
paths = [
    '/Users/piglet/Desktop/Project_Code/RAFDB/RAFDB工作区/RAFDB_YOLO-detection/train/labels',
    '/Users/piglet/Desktop/Project_Code/RAFDB/RAFDB工作区/RAFDB_YOLO-detection/valid/labels',
    '/Users/piglet/Desktop/Project_Code/RAFDB/RAFDB工作区/RAFDB_YOLO-detection/test/labels',
]
analyze_yolo_dataset_multiple(paths=paths, dataset_names=dataset_names)
