# Exploratory Data Analysis (EDA)

## Emotion Face Classifier Notebook 1

Data load and harmonization

Summary pivot tables of image counts per emotion, for each data source and for the combined data

Visualizations:
- Skimpy reports
- Barplot of image count by emotion
- Waffle chart of image distribution
- Example image displays

In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import pandas as pd
import plotly.express as px

from skimpy import skim_get_figure
from datascifuncs.tidbit_tools import load_json, write_json, print_json, check_directory_name

In [4]:
# Name of the project root folder that the relative paths below
# ('data/', 'imgs', './configs/') are written against.
# NOTE(review): check_directory_name is an external helper; its printed message
# ("Directory set to ... matches target dir string") suggests it verifies, and
# possibly changes, the working directory — confirm against its implementation.
main_dir = 'EmotionFaceClassifier'
check_directory_name(main_dir)

Directory set to /Users/dsl/Documents/GitHub/EmotionFaceClassifier, matches target dir string EmotionFaceClassifier.


True

In [5]:
from utils.eda import (
    convert_pixels_to_array,
    save_image
)

from utils.image_processing import create_image_dataframe

## FER2013 Data Harmonization

Image data is contained in a csv file as a flattened string.

To sync with FRD 2020 data, arrays are extracted, converted into 2-D, and written to greyscale jpg images.

The output directory structure matches that of FRD: a usage folder (Training or Testing) followed by a folder for each emotion type.

In [7]:
# Folder that receives the figures saved later in this notebook;
# exist_ok=True keeps the cell safe to re-run.
os.makedirs('imgs', exist_ok=True)

In [8]:
# Load common dicts from json config file: usage labels, emotion id -> name
# mappings, emotion colours and output column orders (see printed output).
common_dicts = load_json('./configs/input_mappings.json')
# Echo the loaded configuration for reference
print_json(common_dicts)

{
    "usage_dict": {
        "Training": "Training",
        "PublicTest": "Testing",
        "PrivateTest": "Testing"
    },
    "emo_dict": {
        "0": "Angry",
        "1": "Disgust",
        "2": "Fear",
        "3": "Happy",
        "4": "Sad",
        "5": "Surprise",
        "6": "Neutral"
    },
    "frd_emo_dict": {
        "0": "Angry",
        "1": "Fear",
        "2": "Happy",
        "3": "Sad",
        "4": "Surprise",
        "5": "Neutral"
    },
    "emo_color_dict": {
        "Angry": "red",
        "Disgust": "olive",
        "Fear": "black",
        "Happy": "gold",
        "Sad": "blue",
        "Surprise": "darkviolet",
        "Neutral": "slategray"
    },
    "output_col_order": [
        "Emotion",
        "TotalImages",
        "Training",
        "TrainingPerc",
        "PublicTest",
        "PublicTestPerc",
        "PrivateTest",
        "PrivateTestPerc"
    ],
    "frd_output_col_order": [
        "Emotion",
        "TotalImages",
        "Training",


In [9]:
# Select emotion mapping section of json.
# Keys are the emotion ids as strings (JSON object keys); values are the labels.
emo_dict = common_dicts['emo_dict']
print_json(emo_dict)

{
    "0": "Angry",
    "1": "Disgust",
    "2": "Fear",
    "3": "Happy",
    "4": "Sad",
    "5": "Surprise",
    "6": "Neutral"
}


In [10]:
# Select color mappings for emotion categories (emotion label -> named colour);
# used below to tag rows with a colour and to colour the waffle chart.
emo_color_dict = common_dicts['emo_color_dict']
print_json(emo_color_dict)

{
    "Angry": "red",
    "Disgust": "olive",
    "Fear": "black",
    "Happy": "gold",
    "Sad": "blue",
    "Surprise": "darkviolet",
    "Neutral": "slategray"
}


In [11]:
# Read in FER 2013 data.
# The path is relative, so the working directory must contain the data/ folder.
fer2013_path = 'data/fer2013/fer2013.csv'
fer2013 = pd.read_csv(fer2013_path)

In [12]:
# Quick structural check: column labels first, then (rows, columns)
print(fer2013.columns, fer2013.shape, sep='\n')

Index(['emotion', 'pixels', 'Usage'], dtype='object')
(35887, 3)


In [13]:
# List the distinct raw emotion codes in ascending order
print(sorted(pd.unique(fer2013['emotion'])))

[0, 1, 2, 3, 4, 5, 6]


In [14]:
# Map emotion labels to values for clarity and harmonization.
# The raw integer code is kept as 'emotion_id'; 'emotion' then holds the label.
# The rename is guarded so that re-running this cell does not turn the
# already-mapped 'emotion' labels into a second 'emotion_id' column.
if 'emotion_id' not in fer2013.columns:
    fer2013 = fer2013.rename(columns={'emotion': 'emotion_id'})
# emo_dict keys are strings (loaded from JSON), so cast the integer ids first
fer2013['emotion'] = fer2013['emotion_id'].astype(str).map(emo_dict)

In [15]:
# Pixel data must be converted to np.array
# NOTE(review): convert_pixels_to_array comes from utils.eda and is not shown
# here; per the section notes it should yield a 2-D array per image — confirm.
fer2013['image'] = fer2013['pixels'].apply(convert_pixels_to_array)

In [16]:
# The raw 'Usage' column has three values (Training, PublicTest, PrivateTest);
# usage_dict folds both test splits into a single 'Testing' label.
fer2013['usage'] = fer2013['Usage'].map(common_dicts['usage_dict'])

In [17]:
# Number the rows 1..n within every (usage, emotion) group;
# the number is used as the identifier for each image.
fer2013['emo_count_id'] = fer2013.groupby(['usage', 'emotion']).cumcount().add(1)

In [18]:
# Write img arrays to jpg for data harmony.
# save_image is applied once per row; its return values are all None, so the
# trailing semicolon keeps that 35,887-row Series out of the cell output.
fer2013.apply(save_image, axis=1);

0        None
1        None
2        None
3        None
4        None
         ... 
35882    None
35883    None
35884    None
35885    None
35886    None
Length: 35887, dtype: object

## Data Summary and Pivots

In [21]:
# Define the base path where the datasets are located, plus the dataset,
# usage and emotion values that make up the search.
base_path = 'data'
datasets = ['fer2013', 'frd2020']
usages = ['Training', 'Testing']
# The FRD emotion set is used here; it has no 'Disgust' entry.
# Materialized as a list so it matches the sibling entries and is not a live
# view onto the config dict.
emotions = list(common_dicts['frd_emo_dict'].values())

In [None]:
# Keys and candidate values handed to create_image_dataframe
search_structure = dict(dataset=datasets, usage=usages, emotion=emotions)

In [None]:
# Build the combined image table with the helper from utils.image_processing
df = create_image_dataframe(
    base_path=base_path,
    search_structure=search_structure,
)

In [None]:
# Preview the first rows of the combined frame
df.head()

In [27]:
# Row count per source dataset
df['dataset'].value_counts()

dataset
fer2013    35340
frd2020    31338
Name: count, dtype: int64


In [29]:
# Add emotion color tags: each row gets the named colour that
# emo_color_dict assigns to its emotion label.
df['color'] = df['emotion'].map(emo_color_dict)

In [31]:
# Row count per emotion across the combined frame
df['emotion'].value_counts()

emotion
Happy       17978
Neutral     12396
Sad         12154
Fear        10242
Angry        9906
Surprise     4002
Name: count, dtype: int64

In [35]:
# Save combined data
# NOTE(review): to_csv is called with its default index=True, so the frame's
# index is written as the first, unnamed column — confirm downstream readers
# expect that (e.g. read it back with index_col=0).
df_save_path = os.path.join('data', 'combined_data.csv')
df.to_csv(df_save_path)

In [None]:
# skim generates and saves a file with basic descriptives of the combined frame.
# The file name is a plain string literal (no placeholders, so no f-prefix).
save_path = os.path.join('imgs', 'data_skim.svg')
skim_get_figure(df, save_path=save_path)

## Pivots grouped by Emotion and Usage

In [None]:
# NOTE(review): df_dict, emotion_count_piv, piv_stacked_bar and waffle_chart are
# not defined or imported in the visible part of this notebook; they must be in
# scope (verify the import cell) before this cell can run on a fresh kernel.
# NOTE(review): confirm the frames in df_dict carry 'train_test_split' and
# 'Filename' columns — the cells above work with 'usage' and 'emotion'.

# Output folder for the pivot tables; it does not depend on the loop variable,
# so it is created once up front.
intermediate_data_dir = os.path.join('data', 'intermediate_data')
os.makedirs(intermediate_data_dir, exist_ok=True)

# For each labelled frame: pivot image counts by emotion and split, then save
# a skim report, a bar plot, a waffle chart and the pivot table itself.
for df_str, pd_df in df_dict.items():
    piv_df = emotion_count_piv(
        df=pd_df,
        gby_cols=['emotion', 'train_test_split'], 
        agg_col='Filename',
        count_cols=['Training', 'Testing']
    )

    # Skim package for a nice overview, saved to out_path below
    out_path = os.path.join('imgs', f'{df_str}_counts_skim.svg')
    skim_get_figure(piv_df, save_path=out_path)

    # Bar plot of emotion image counts
    piv_stacked_bar(df=piv_df, label=df_str)

    # Waffle plot of distribution, coloured per emotion via emo_color_dict
    waffle_path = os.path.join('imgs', f'{df_str}_waffle_chart.png')
    waffle_chart(
        df=piv_df, group_col='Emotion', data_col='TotalImages', 
        save_path=waffle_path, display=False, total_squares=100,
        color_dict=emo_color_dict
    )

    # Save pivoted data
    piv_save_path = os.path.join(intermediate_data_dir, f'{df_str}_emo_piv.csv')
    piv_df.to_csv(piv_save_path)

## Example Images

In [None]:
# NOTE(review): df_dict and show_example_images are not defined or imported in
# the visible part of this notebook; they must be in scope before this cell
# can run on a fresh kernel.

# Figures are only written to disk, not shown inline
display_images = False

# For every labelled frame, create one image with 1 example face per emotion
# and one with 3 example faces per emotion.
for df_str, pd_df in df_dict.items():
    for sample_count in (1, 3):
        fig, axes = show_example_images(
            df=pd_df, group_col='emotion', image_col='Full Path',
            save_path=f'./imgs/{df_str}_examples_{sample_count}.png',
            samples=sample_count,
            display=display_images)