In [1]:
import os
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import cv2 as cv
from sklearn.cluster import MiniBatchKMeans

# Load data

In [3]:
# File data
csv = '../data/fer_file_data.csv'
df = pd.read_csv(csv)

In [4]:
df.head()

Unnamed: 0,usage,emotion,file
0,train,3,../data/external/fer2013/train/happy/Training_...
1,train,3,../data/external/fer2013/train/happy/Training_...
2,train,3,../data/external/fer2013/train/happy/Training_...
3,train,3,../data/external/fer2013/train/happy/Training_...
4,train,3,../data/external/fer2013/train/happy/Training_...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35328 entries, 0 to 35327
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   usage    35328 non-null  object
 1   emotion  35328 non-null  int64 
 2   file     35328 non-null  object
dtypes: int64(1), object(2)
memory usage: 828.1+ KB


# SIFT

In [36]:
def sift(file):
    image = cv.imread(file)
    gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    sift = cv.SIFT_create()
    keypoints, descriptors = sift.detectAndCompute(gray,None)
    
    return keypoints, descriptors
   

def extractFeatures(df):
    descriptors = []
    emotions = []
    for index, row in df.iterrows():
        kp, desc = sift(row.file)
        if type(desc) != type(None):
            descriptors.append(desc)
            emotions.append(row.emotion)
            
    return descriptors, emotions


def flatFeatures(descriptors):
    all_descriptors = []
    for i in range(len(descriptors)):
        all_descriptors.extend(descriptors[i])
            
    return all_descriptors
        
        
def kMeans(k, all_descriptors):
    batch_size = int(len(all_descriptors) / 3) 
    kmeans = MiniBatchKMeans(n_clusters = k,
                             batch_size = batch_size,
                             init_size = None)
    kmeans.fit(all_descriptors)
    
    return kmeans
        
        
def histograms(k, all_descriptors, descriptors):
    histograms = []
    kmeans = kMeans(k, all_descriptors)
    
    for i in range(len(descriptors)):
        preds = kmeans.predict(descriptors[i].astype('float'))
        hist, bin_edges = np.histogram(a = preds, bins = range(0, k*5))
        histograms.append(hist)
    
    return histograms


def saveHistograms(usage, histograms):
    sift_data = []
    for i in range(len(histograms)):
        vectors = ' '.join(str(x) for x in histograms[i])
        sift_data.append([usage, emotions[i], vectors])

In [37]:
train_descriptors, train_emotions = extractFeatures(df[df['usage']=='train'])
test_descriptors, test_emotions = extractFeatures(df[df['usage']=='test'])

In [38]:
train_descriptors_flat = flatFeatures(train_descriptors)
test_descriptors_flat = flatFeatures(test_descriptors)

In [39]:
print(len(train_descriptors))
print(len(test_descriptors))

28257
7065


In [40]:
print(len(train_emotions))
print(len(test_emotions))

28257
7065


In [41]:
k = len(df.emotion.unique())
train_hist = histograms(k, train_descriptors_flat, train_descriptors)
test_hist = histograms(k, test_descriptors_flat, test_descriptors)

In [42]:
print(np.array(train_hist).shape)
print(np.array(test_hist).shape)

(28257, 29)
(7065, 29)


In [43]:
test_hist[0]

array([4, 3, 5, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0])

# Save Histograms

In [44]:
def combineData(usage, histograms, emotions):
    sift_data = []
    for i in range(len(histograms)):
        vectors = ' '.join(str(x) for x in histograms[i])
        sift_data.append([usage, emotions[i], vectors])
    return sift_data

In [45]:
sift_data1 = combineData('train', train_hist, train_emotions)
sift_data2 = combineData('test', test_hist, test_emotions)
sift_data = sift_data1 + sift_data2

In [46]:
df_sift = pd.DataFrame(sift_data, columns = ['usage', 'emotion', 'histogram'])
df_sift.head()

Unnamed: 0,usage,emotion,histogram
0,train,3,4 3 6 1 1 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,train,3,2 8 4 4 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
2,train,3,2 9 9 5 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
3,train,3,2 3 3 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,train,3,6 14 7 2 1 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...


In [47]:
df_sift.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35322 entries, 0 to 35321
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   usage      35322 non-null  object
 1   emotion    35322 non-null  int64 
 2   histogram  35322 non-null  object
dtypes: int64(1), object(2)
memory usage: 828.0+ KB


In [48]:
df_sift.to_csv('../data/sift_data.csv', index=False)