In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pywt
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split

from scipy.spatial.distance import cdist

import tensorflow as tf
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.python.ops.numpy_ops import np_config
from tensorflow.train import BytesList
from tensorflow.train import Example, Features, Feature
np_config.enable_numpy_behavior()

import librosa as lb

import requests
from tqdm import trange, tqdm
from pathlib import Path
import os

from model import WaveletAE
from utils import get_style_correlation_transform

# IMuse - Image To Music Style Transfer
## Data EDA and Preprocessing

### Image Wiki Art Sentiment Data

In [2]:
wikiart_images = pd.read_csv('../data/image/WikiArt-Emotions/WikiArt-info.tsv', sep='\t')
print(f'{wikiart_images.shape[0]} unique images')
wikiart_images.head()

4119 unique images


Unnamed: 0,ID,Category,Artist,Title,Year,Image URL,Painting Info URL,Artist Info URL
0,58c6237dedc2c9c7dc0de1ae,Impressionism,Charles Courtney Curran,In the Luxembourg Garden,1889,https://uploads3.wikiart.org/00123/images/char...,https://www.wikiart.org/en/charles-courtney-cu...,https://www.wikiart.org/en/charles-courtney-cu...
1,577280dfedc2cb3880f28e76,Neo-Expressionism,Keith Haring,The Marriage of Heaven and Hell,1984,https://uploads1.wikiart.org/images/keith-hari...,https://www.wikiart.org/en/keith-haring/the-ma...,https://www.wikiart.org/en/keith-haring
2,57727f2dedc2cb3880ed5fa9,Post-Impressionism,Jozsef Rippl-Ronai,Uncle Piacsek in front of the Black Sideboard,1906,https://uploads3.wikiart.org/images/j-zsef-rip...,https://www.wikiart.org/en/jozsef-rippl-ronai/...,https://www.wikiart.org/en/jozsef-rippl-ronai
3,58d1240cedc2c94f900fc610,Cubism,Vadym Meller,Monk. For the Play &#39;Mazeppa&#39;,1920,https://uploads2.wikiart.org/00124/images/vady...,https://www.wikiart.org/en/vadym-meller/monk-f...,https://www.wikiart.org/en/vadym-meller
4,57727de7edc2cb3880e91f26,Romanticism,David Wilkie,The Defence of Sarago&#231;a,1828,https://uploads6.wikiart.org/images/david-wilk...,https://www.wikiart.org/en/david-wilkie/the-de...,https://www.wikiart.org/en/david-wilkie


In [3]:
wikiart_images = wikiart_images[['ID', 'Image URL']]
wikiart_images.columns = ['id', 'url']
wikiart_images.head()

Unnamed: 0,id,url
0,58c6237dedc2c9c7dc0de1ae,https://uploads3.wikiart.org/00123/images/char...
1,577280dfedc2cb3880f28e76,https://uploads1.wikiart.org/images/keith-hari...
2,57727f2dedc2cb3880ed5fa9,https://uploads3.wikiart.org/images/j-zsef-rip...
3,58d1240cedc2c94f900fc610,https://uploads2.wikiart.org/00124/images/vady...
4,57727de7edc2cb3880e91f26,https://uploads6.wikiart.org/images/david-wilk...


In [4]:
wikiart_images[(wikiart_images.url.str.endswith('.jpg')) | (wikiart_images.url.str.endswith('.JPG'))].shape[0]

4114

Almost all of the data is .jpg so we can remove the 4 png files in order to keep the consistency.

In [5]:
wikiart_images = wikiart_images[(wikiart_images.url.str.endswith('.jpg')) | (wikiart_images.url.str.endswith('.JPG'))]

In [6]:
def download_img(url, directory, file_id):
    image = requests.get(url).content
    with open(f'{directory}/{file_id}.jpg', 'wb') as handler:
        handler.write(image)

# for img_id, imd_url in tqdm(wikiart_images.values):
#     download_img(imd_url, '../data/image/wikiart', img_id)

### Emotion Votes

In [7]:
arts_emotions = pd.read_csv('../data/image/WikiArt-Emotions/WikiArt-Emotions-All.tsv', sep='\t')
arts_emotions.head()

Unnamed: 0,ID,Style,Category,Artist,Title,Year,Is painting,Face/body,Ave. art rating,Art (image+title): agreeableness,...,TitleOnly: love,TitleOnly: optimism,TitleOnly: pessimism,TitleOnly: regret,TitleOnly: sadness,TitleOnly: shame,TitleOnly: shyness,TitleOnly: surprise,TitleOnly: trust,TitleOnly: neutral
0,58c6237dedc2c9c7dc0de1ae,Modern Art,Impressionism,Charles Courtney Curran,In the Luxembourg Garden,1889,yes,face,2.33,0.036,...,0.155,0.238,0.024,0.012,0.024,0.012,0.0,0.048,0.155,0.0
1,577280dfedc2cb3880f28e76,Modern Art,Neo-Expressionism,Keith Haring,The Marriage of Heaven and Hell,1984,yes,body,0.7,0.0,...,0.2,0.0,0.1,0.0,0.0,0.0,0.0,0.2,0.0,0.0
2,57727f2dedc2cb3880ed5fa9,Modern Art,Post-Impressionism,Jozsef Rippl-Ronai,Uncle Piacsek in front of the Black Sideboard,1906,yes,face,1.6,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.0
3,58d1240cedc2c94f900fc610,Modern Art,Cubism,Vadym Meller,Monk. For the Play &#39;Mazeppa&#39;,1920,yes,face,0.82,0.0,...,0.091,0.091,0.0,0.0,0.091,0.0,0.091,0.182,0.091,0.0
4,57727de7edc2cb3880e91f26,Post Renaissance Art,Romanticism,David Wilkie,The Defence of Sarago&#231;a,1828,yes,face,1.69,0.077,...,0.077,0.231,0.077,0.077,0.154,0.077,0.077,0.154,0.385,0.0


In [8]:
# Keeping the "Image Only" emotions
arts_emotions = arts_emotions[['ID', *arts_emotions.columns[29:49]]]
arts_emotions.columns = [col.split(':')[-1].strip() for col in arts_emotions.columns]
arts_emotions.columns = ['id', *arts_emotions.columns[1:]]
arts_emotions.head()

Unnamed: 0,id,agreeableness,anger,anticipation,arrogance,disagreeableness,disgust,fear,gratitude,happiness,...,love,optimism,pessimism,regret,sadness,shame,shyness,surprise,trust,neutral
0,58c6237dedc2c9c7dc0de1ae,0.06,0.012,0.071,0.024,0.012,0.0,0.012,0.119,0.726,...,0.25,0.274,0.012,0.0,0.131,0.0,0.024,0.024,0.25,0.0
1,577280dfedc2cb3880f28e76,0.0,0.0,0.1,0.0,0.0,0.1,0.3,0.0,0.1,...,0.0,0.2,0.2,0.1,0.2,0.0,0.0,0.5,0.0,0.0
2,57727f2dedc2cb3880ed5fa9,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.1,0.5,...,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.1,0.2,0.0
3,58d1240cedc2c94f900fc610,0.0,0.0,0.091,0.0,0.0,0.091,0.0,0.0,0.091,...,0.0,0.0,0.182,0.0,0.364,0.0,0.0,0.273,0.0,0.0
4,57727de7edc2cb3880e91f26,0.077,0.077,0.077,0.231,0.308,0.308,0.308,0.231,0.154,...,0.077,0.231,0.231,0.077,0.231,0.154,0.077,0.154,0.231,0.0


In [9]:
#Create column with top emotion associated with artwork 
prob_df4 = arts_emotions.loc[:, ('agreeableness', 'anger', 'anticipation','arrogance', 'disagreeableness',
       'disgust', 'fear','gratitude', 'happiness', 'humility', 'love',
       'optimism', 'pessimism','regret', 'sadness','shame', 'shyness',
       'surprise', 'trust','neutral')]
arts_emotions["emotion"] = prob_df4.idxmax(axis = 1)
arts_emotions.head()

Unnamed: 0,id,agreeableness,anger,anticipation,arrogance,disagreeableness,disgust,fear,gratitude,happiness,...,optimism,pessimism,regret,sadness,shame,shyness,surprise,trust,neutral,emotion
0,58c6237dedc2c9c7dc0de1ae,0.06,0.012,0.071,0.024,0.012,0.0,0.012,0.119,0.726,...,0.274,0.012,0.0,0.131,0.0,0.024,0.024,0.25,0.0,happiness
1,577280dfedc2cb3880f28e76,0.0,0.0,0.1,0.0,0.0,0.1,0.3,0.0,0.1,...,0.2,0.2,0.1,0.2,0.0,0.0,0.5,0.0,0.0,surprise
2,57727f2dedc2cb3880ed5fa9,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.1,0.5,...,0.3,0.0,0.0,0.0,0.0,0.0,0.1,0.2,0.0,happiness
3,58d1240cedc2c94f900fc610,0.0,0.0,0.091,0.0,0.0,0.091,0.0,0.0,0.091,...,0.0,0.182,0.0,0.364,0.0,0.0,0.273,0.0,0.0,sadness
4,57727de7edc2cb3880e91f26,0.077,0.077,0.077,0.231,0.308,0.308,0.308,0.231,0.154,...,0.231,0.231,0.077,0.231,0.154,0.077,0.154,0.231,0.0,disagreeableness


In [10]:
arts_emotions.emotion.unique()

array(['happiness', 'surprise', 'sadness', 'disagreeableness', 'fear',
       'trust', 'anticipation', 'humility', 'shame', 'arrogance', 'love',
       'disgust', 'optimism', 'anger', 'pessimism', 'neutral',
       'gratitude', 'agreeableness', 'shyness'], dtype=object)

In [11]:
emotion_quadrants = {
    'happiness': 'Q1',
    'surprise': 'Q1',
    'sadness': 'Q3',
    'disagreeableness': 'Q2',
    'fear': 'Q2',
    'trust': 'Q1',
    'anticipation': 'Q2',
    'humility': 'Q4',
    'shame': 'Q3',
    'arrogance': 'Q2',
    'love': 'Q1',
    'disgust': 'Q2',
    'optimism': 'Q1',
    'anger': 'Q2',
    'pessimism': 'Q3',
    'neutral': 'Q4',
    'gratitude': 'Q1',
    'agreeableness': 'Q4',
    'shyness': 'Q4',
    'happy': 'Q1',
    'sad': 'Q3',
    'tender': 'Q1',
    'high val.': 'Q1',
    'low val.': 'Q2',
    'high ener.': 'Q1',
    'low ener.': 'Q4',
    'high tens.': 'Q2',
    'low tens.': 'Q4',
    'anger high': 'Q2',
    'anger mod.': 'Q3',
    'fear high': 'Q2',
    'fear mod.': 'Q3',
    'happy high': 'Q1',
    'happy mod.': 'Q2',
    'sad high': 'Q3',
    'sad mod.': 'Q4',
    'tender high': 'Q4',
    'tender mod.': 'Q1',
    'valence pos. high': 'Q1',
    'valence pos. mod.': 'Q1',
    'valence neg. mod.': 'Q2',
    'valence neg. high': 'Q2',
    'energy pos. high': 'Q1',
    'energy pos. mod.': 'Q1',
    'energy neg. mod.': 'Q2',
    'energy neg. high': 'Q2',
    'tension pos. high': 'Q2',
    'tension pos. mod.': 'Q1',
    'tension neg. mod.': 'Q3',
    'tension neg. high': 'Q4',
}

In [12]:
def set_quadrant(df):
    df['quadrant'] = 0

    def get_quadrant(row):
        row.quadrant = emotion_quadrants[row.emotion]
        
        return row

    return df.apply(get_quadrant, axis=1)

arts_emotions = set_quadrant(arts_emotions)
arts_emotions = arts_emotions[['id', 'quadrant']]

### Music Data Preparation

In [13]:
osts_set_1 = pd.read_csv('../data/music/OSTs/set1_tracklist.csv', index_col=0)
osts_set_1['set'] = 1
osts_set_2 = pd.read_csv('../data/music/OSTs/set2_tracklist.csv', index_col=0)
osts_set_2['set'] = 2

osts = pd.concat([osts_set_1, osts_set_2])
osts = osts[['Emotion', 'set']]
osts.columns = ['emotion', 'set']
osts['emotion'] = osts.emotion.str.lower()
osts = set_quadrant(osts)

osts.head()

Unnamed: 0_level_0,emotion,set,quadrant
Nro,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,happy,1,Q1
2,happy,1,Q1
3,happy,1,Q1
4,happy,1,Q1
5,happy,1,Q1


In [14]:
songs = pd.read_csv('../data/music/others/annotations.csv')
songs.columns = ['song', 'quadrant']
songs.head()

Unnamed: 0,song,quadrant
0,MT0000004637,Q3
1,MT0000011357,Q2
2,MT0000011975,Q2
3,MT0000040632,Q1
4,MT0000044741,Q3


In [15]:
osts['music'] = '../data/music/OSTs/Set' + osts.set.astype(str) + '/' + osts.index.map('{0:0=3d}'.format) + '.mp3'
songs['music'] = '../data/music/others/' + songs.quadrant + '/' + songs.song + '.mp3'

osts = osts[['music', 'quadrant']]
songs = songs[['music', 'quadrant']]

music_data = pd.concat([osts, songs]).reset_index(drop=True)

### Map Music to Images

In [16]:
def set_related_music(df, already_set_df = None, use_second_set = False):
    df['music'] = ''

    q1_ids = music_data[(music_data.quadrant == 'Q1')].music.values
    q2_ids = music_data[(music_data.quadrant == 'Q2')].music.values
    q3_ids = music_data[(music_data.quadrant == 'Q3')].music.values
    q4_ids = music_data[(music_data.quadrant == 'Q4')].music.values
    
    df.loc[df.quadrant == 'Q1', 'music'] = np.random.choice(
        q1_ids,
        df[df.quadrant == 'Q1'].shape[0],
        replace=q1_ids.shape[0] < df[df.quadrant == 'Q1'].shape[0]
    )
    df.loc[df.quadrant == 'Q2', 'music'] = np.random.choice(
        q2_ids,
        df[df.quadrant == 'Q2'].shape[0],
        replace=q2_ids.shape[0] < df[df.quadrant == 'Q2'].shape[0]
    )
    df.loc[df.quadrant == 'Q3', 'music'] = np.random.choice(
        q3_ids,
        df[df.quadrant == 'Q3'].shape[0],
        replace=q3_ids.shape[0] < df[df.quadrant == 'Q3'].shape[0]
    )
    df.loc[df.quadrant == 'Q4', 'music'] = np.random.choice(
        q4_ids,
        df[df.quadrant == 'Q4'].shape[0],
        replace=q4_ids.shape[0] < df[df.quadrant == 'Q4'].shape[0]
    )
    
    return df

data = set_related_music(arts_emotions)
data['img'] = '../data/image/wikiart/' + data.id + '.jpg'
data.drop(['id'], 1, inplace=True)
data.head()

  data.drop(['id'], 1, inplace=True)


Unnamed: 0,quadrant,music,img
0,Q1,../data/music/OSTs/Set2/055.mp3,../data/image/wikiart/58c6237dedc2c9c7dc0de1ae...
1,Q1,../data/music/OSTs/Set2/098.mp3,../data/image/wikiart/577280dfedc2cb3880f28e76...
2,Q1,../data/music/others/Q1/MT0001703346.mp3,../data/image/wikiart/57727f2dedc2cb3880ed5fa9...
3,Q3,../data/music/others/Q3/MT0009220462.mp3,../data/image/wikiart/58d1240cedc2c94f900fc610...
4,Q2,../data/music/OSTs/Set1/301.mp3,../data/image/wikiart/57727de7edc2cb3880e91f26...


### TF Dataset

In [17]:
x_train, x_test, y_train, y_test = train_test_split(data.music, data.img, test_size=0.10, stratify=data.quadrant, shuffle=True)
train_ds = pd.DataFrame({'x': x_train, 'y': y_train})
test_ds = pd.DataFrame({'x': x_test, 'y': y_test})

train_ds = tf.data.Dataset.from_tensor_slices(tf.convert_to_tensor(train_ds))
test_ds = tf.data.Dataset.from_tensor_slices(tf.convert_to_tensor(test_ds))

In [18]:
SR = 16000              # sampling rate (original or the one to be resampled)
HOP_LENGTH = 256        # hop size of the STFT
N_FFT = 512             # frame size (number of freq bins of the STFT)
N_MELS = 96             # number of mel bands

In [19]:
def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.numpy()]))

def normalized_wt_downsampling(x, wavelet, level):
        LL = pywt.wavedec2(x, wavelet, 'periodization', level)[0]
        LL = LL / np.abs(LL).max()

        return LL

def per_channel_wd(img, level=1, wavelet='haar'):
    r, g, b = tf.unstack(img, axis=2)
    r = normalized_wt_downsampling(r, wavelet, level)
    g = normalized_wt_downsampling(g, wavelet, level)
    b = normalized_wt_downsampling(b, wavelet, level)

    return tf.stack([r, g, b], axis=2)

class DatasetGenerator:
    def __init__(self, max_dim_size = 64, vgg_input_max_size = 512):
        self.max_dim_size = max_dim_size
        self.vgg_input_max_size = vgg_input_max_size
        self.wavelet_ae = WaveletAE()
    
    def load_image(self, img_path):
        self.img_path = img_path
        self.img_bytes = tf.io.read_file(img_path)
        self.img_raw = tf.image.decode_image(self.img_bytes, channels = 3, dtype=tf.float32)
        
        dwt_level = tf.experimental.numpy.log2(tf.reduce_max(self.img_raw.shape) / self.max_dim_size)
        dwt_level = tf.round(dwt_level)
        dwt_level = tf.cast(dwt_level, tf.uint8)
        
        self.img_resized = per_channel_wd(self.img_raw, dwt_level)
        
        ar = self.img_raw.shape[0] / self.img_raw.shape[1]
        if ar > 1:
            size = [self.vgg_input_max_size, int(self.vgg_input_max_size / ar)]
        else:
            size = [int(ar * self.vgg_input_max_size), self.vgg_input_max_size]

        self.img_raw = tf.image.resize(self.img_raw, size)
            
    def load_music(self, music_path):
        audio, sr = lb.load(music_path, sr=SR)

        self.spec = lb.feature.melspectrogram(
            y=audio,
            sr=sr,
            hop_length=HOP_LENGTH,
            n_fft=N_FFT,
            n_mels=N_MELS
        ).T
        
        self.spec = tf.constant(np.log10((1e4 * self.spec) + 1), dtype=tf.float16)
        self.spec_corr = tf.matmul(self.spec, self.spec, transpose_a=True) / self.spec.shape[0]
        
    def get_style_transormations(self):
        feat, _ = self.wavelet_ae.get_features(tf.expand_dims(self.img_raw, 0))
        self.style_ede = self.wavelet_ae.get_style_correlations(tf.expand_dims(self.img_raw, 0))
        
        for i in range(len(self.style_ede)):
            self.style_ede[i] = tf.cast(self.style_ede[i], tf.float16)
    
    def process(self, music, img):
        self.load_image(img)
        self.load_music(music)
        self.get_style_transormations()
    
    def serialize_information(self):
        img_resized = tf.cast(self.img_resized * 255, tf.uint8)
        img_resized = tf.image.encode_jpeg(img_resized)
        
        features = Features(feature = {
          'resized_image': _bytes_feature(img_resized),
          'block1': _bytes_feature(tf.io.serialize_tensor(self.style_ede[0][0])),
          'block2': _bytes_feature(tf.io.serialize_tensor(self.style_ede[1][0])),
          'block3': _bytes_feature(tf.io.serialize_tensor(self.style_ede[2][0])),
          'block4': _bytes_feature(tf.io.serialize_tensor(self.style_ede[3][0])),
            
          'music_spec': _bytes_feature(tf.io.serialize_tensor(self.spec)),
        })

        return Example(features=features).SerializeToString()

In [None]:
datagen = DatasetGenerator()
tf_record_options = tf.io.TFRecordOptions(compression_type = "GZIP")

BASE_DATA_DIR = Path(os.getcwd()).parent  / "data" / "tfrecords"

def write_as_TFRecords(dataset, target_dir, batch_size, datagen):
    dataset = dataset.batch(batch_size)
    dataset_len = len(list(dataset))
    for part_id, data in enumerate(dataset):
        filename = str(target_dir / f"{part_id}.tfrecord")
        with tf.io.TFRecordWriter(filename, options = tf_record_options) as writer:
            for music, image in tqdm(data):
                datagen.process(music.numpy().decode("utf-8"), image.numpy().decode("utf-8"))
                writer.write(datagen.serialize_information())
            writer.close()
            
write_as_TFRecords(dataset = train_ds,
                   target_dir = BASE_DATA_DIR / "train", 
                   batch_size = 1024,
                   datagen = datagen)

  return f(*args, **kwargs)
 27%|█████████████████████▎                                                         | 276/1024 [08:46<23:58,  1.92s/it]