# Exploratory Data Analysis (EDA)

## Emotion Face Classifier Notebook 1

Data load and harmonization

Summaries and visualizations include:
- Skimpy reports
- Barplot of image count by emotion
- Waffle chart of image distribution
- Displaying example images

In [None]:
import os
import pandas as pd
import plotly.express as px

from skimpy import skim_get_figure
from datascifuncs.tidbit_tools import load_json, write_json, check_directory_name

In [None]:
# Directory name handed to `check_directory_name` below.
main_dir = 'EmotionFaceClassifier'
# NOTE(review): presumably verifies (or switches to) a working directory with
# this name so the relative paths used later resolve — confirm against
# datascifuncs.tidbit_tools.check_directory_name.
check_directory_name(main_dir)

In [None]:
from utils.data_ingest_eda import (
    generate_file_dataframe,
    emotion_count_piv,
    piv_stacked_bar,
    waffle_chart,
    show_example_images,
    convert_pixels_to_array,
    create_img
)

## FER2013 Data Harmonization

In [None]:
# Make sure the 'imgs' output directory exists (no error if it already does);
# the skim figures, waffle charts and example-image grids produced later in
# this notebook are saved under it.
os.makedirs('imgs', exist_ok=True)

In [None]:
# Load common dicts from json config file
# Uses `load_json`, the JSON loader imported from datascifuncs.tidbit_tools
# (no `load_config` is imported or defined in this notebook).
common_dicts = load_json('./configs/input_mappings.json')
print(common_dicts.keys())

In [None]:
# Lookup used further down to translate the stringified numeric emotion id of
# each FER2013 row into its emotion label.
emo_dict = common_dicts['emo_dict']

In [None]:
# Emotion -> colour lookup; mapped onto the `emotion` column further down and
# passed to the waffle chart as its `color_dict`.
emo_color_dict = common_dicts['emo_color_dict']

In [None]:
# Read in data
# The path is relative to the current working directory. Later cells use the
# `emotion`, `pixels` and `Usage` columns of this file.
fer2013_path = 'data/fer2013/fer2013.csv'
fer2013 = pd.read_csv(fer2013_path)

In [None]:
# Keep the raw numeric code under `emotion_id` and expose the readable label
# as `emotion`. The ids are cast to str before the lookup in `emo_dict`.
fer2013 = fer2013.rename(columns={'emotion': 'emotion_id'})
emotion_keys = fer2013['emotion_id'].astype(str)
fer2013['emotion'] = emotion_keys.map(emo_dict)

In [None]:
# Pixel data must be converted to np.array
# `convert_pixels_to_array` is applied to each value of the `pixels` column
# and the results are stored in a new `image` column.
fer2013['image'] = fer2013['pixels'].apply(convert_pixels_to_array)

In [None]:
# The raw `Usage` column has 3 values: train, public test, private test.
# The `usage_dict` mapping from the config reduces these to train and test
# only; the result is stored in a new lowercase `usage` column.
usage_dict = common_dicts['usage_dict']
fer2013['usage'] = fer2013['Usage'].map(usage_dict)

In [None]:
# Number the rows within each (usage, emotion) group, starting at 1.
# The value holds no meaning except as an identifier for the image.
usage_emotion_groups = fer2013.groupby(['usage', 'emotion'])
fer2013['emo_count_id'] = usage_emotion_groups.cumcount() + 1

In [None]:
# Hand every FER2013 row to `create_img`, one row at a time.
# NOTE(review): presumably this writes each image to disk using the columns
# built above — confirm against utils.data_ingest_eda.create_img.
for _idx, image_row in fer2013.iterrows():
    create_img(image_row)

## Data Summary and Pivots

In [None]:
# Build a dataframe describing the files under data/fer2013/.
# Later cells rely on it having `emotion`, `train_test_split`, `Filename` and
# `Full Path` columns.
fer = generate_file_dataframe('data/fer2013/')
# Preview the first rows.
fer.head()

In [None]:
# Build the same kind of file dataframe for the second dataset under
# data/frd2020/.
frd = generate_file_dataframe('data/frd2020/')
# Preview the first rows.
frd.head()

In [None]:
# Tag every row with the dataset it came from so the origin is still known
# after the two frames are combined.
for source_tag, frame in (('fer', fer), ('frd', frd)):
    frame['source'] = source_tag

In [None]:
# Stack the two datasets row-wise into one combined frame.
# NOTE: `ignore_index` is not set, so each frame keeps its original index
# labels and the combined index may contain duplicates.
df = pd.concat([fer, frd], axis=0)

In [None]:
# # Save combined data
# combined_df_path = os.path.join('data', 'efc2024.csv')
# df.to_csv(combined_df_path)

In [None]:
# Row/column counts of each source frame and of the combined frame.
for frame in (fer, frd, df):
    print(frame.shape)

In [None]:
# Inspect the column names of the combined frame.
df.columns

In [None]:
# Attach a plotting colour to every row by looking up its emotion label in
# `emo_color_dict`; done for each source frame and for the combined frame.
for frame in (fer, frd, df):
    frame['color'] = frame['emotion'].map(emo_color_dict)

In [None]:
# Number of rows per emotion label in the combined frame.
df['emotion'].value_counts()

In [None]:
# Dataset label -> dataframe. The labels are used as file-name prefixes for
# the outputs generated in the loops below.
df_dict = dict(fer2013=fer, frd2020=frd, efc2024=df)

In [None]:
# Write a skimpy summary figure (basic descriptives) for every dataset to
# imgs/<dataset>_skim.svg.
for dataset_name, frame in df_dict.items():
    skim_get_figure(
        frame,
        save_path=os.path.join('imgs', f'{dataset_name}_skim.svg'),
    )

## Pivots grouped by Emotion and Usage

In [None]:
# Directory for the pivoted count tables. It is the same for every dataset,
# so it is created once up front rather than on each loop iteration.
intermediate_data_dir = os.path.join('data', 'intermediate_data')
os.makedirs(intermediate_data_dir, exist_ok=True)

# For each dataset: build the emotion-by-split count pivot, then produce a
# skim summary, a stacked bar plot and a waffle chart from it, and save the
# pivot itself as CSV.
for df_str, pd_df in df_dict.items():
    piv_df = emotion_count_piv(
        df=pd_df,
        gby_cols=['emotion', 'train_test_split'],
        agg_col='Filename',
        count_cols=['Training', 'Testing']
    )

    # Skim package for a nice overview, saved to save_path below
    out_path = os.path.join('imgs', f'{df_str}_counts_skim.svg')
    skim_get_figure(piv_df, save_path=out_path)

    # Bar plot of emotion image counts
    piv_stacked_bar(df=piv_df, label=df_str)

    # Waffle plot of distribution
    waffle_path = os.path.join('imgs', f'{df_str}_waffle_chart.png')
    waffle_chart(
        df=piv_df, group_col='Emotion', data_col='TotalImages',
        save_path=waffle_path, display=False, total_squares=100,
        color_dict=emo_color_dict
    )

    # Save pivoted data
    piv_save_path = os.path.join(intermediate_data_dir, f'{df_str}_emo_piv.csv')
    piv_df.to_csv(piv_save_path)

## Example Images

In [None]:
# Render example-face grids for every dataset: one grid with a single face
# per emotion and one with three faces per emotion. Each grid is saved to
# ./imgs/<dataset>_examples_<samples>.png.
display_images = False
for df_str, pd_df in df_dict.items():
    for n_samples in (1, 3):
        fig, axes = show_example_images(
            df=pd_df,
            group_col='emotion',
            image_col='Full Path',
            save_path=f'./imgs/{df_str}_examples_{n_samples}.png',
            samples=n_samples,
            display=display_images,
        )