# deep-audiobook-tuner
A system that automatically generates an apt, emotionally pertinent, and unique musical score for an audiobook based on the current narrative, with the aim of enhancing the user experience while being accurate, cost-efficient, and time-saving.

### Dependencies

In [None]:
from glob import glob
import os
import sys
import time

import asyncio
from ipywidgets import FileUpload
from IPython.display import display, clear_output
import keras
import ktrain
import librosa
import shutil
import tensorflow as tf

sys.path.append(os.path.abspath("../"))

from deepaudiobooktuner.audio_analysis import *
from deepaudiobooktuner.ibm_transcription import *
from deepaudiobooktuner.text_analysis import *
from deepaudiobooktuner.music_generation import *
from deepaudiobooktuner.utils import *

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Let TensorFlow allocate GPU memory on demand instead of reserving it all up
# front. Memory growth has to be configured identically on every visible GPU,
# so the flag is set on each device rather than only on the first one.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

## Helper Functions

#### Uploading an audiobook

In [None]:
def wait_for_change(widget, value):
    """Return a future that resolves on the next observed change of ``widget``.

    A one-shot observer is registered on ``widget`` for the trait name(s)
    passed as ``value``. When it fires, the future's result is set to the
    ``new`` attribute of the change notification and the observer removes
    itself.

    Args:
        widget: Object exposing ``observe(handler, names)`` and
            ``unobserve(handler, names)``.
        value: Trait name(s) forwarded unchanged to ``observe``/``unobserve``.

    Returns:
        An ``asyncio.Future`` that can be awaited for the new value.
    """
    pending = asyncio.Future()

    def _resolve_once(change):
        # Publish the new value to whoever awaits the future, then detach so
        # later changes do not try to resolve the same future again.
        pending.set_result(change.new)
        widget.unobserve(_resolve_once, value)

    widget.observe(_resolve_once, value)
    return pending

async def uploadAudiobook():
    """Show an upload widget, wait for a file, and save it to the audiobook folder.

    The coroutine suspends until the widget's ``value`` trait changes, then
    writes the uploaded bytes to ``../assets/audiobooks/<uploaded filename>``
    (the directory is resolved through the project's ``path`` helper).

    Returns:
        The path of the saved audiobook, as a string.
    """
    # Single-file upload widget restricted to .mp3 files
    uploader = FileUpload(accept='.mp3', multiple=False)
    display(uploader)

    # Suspend here until the widget reports a new value
    await wait_for_change(uploader, 'value')

    # NOTE(review): taking the first key and indexing by filename matches the
    # dict-shaped ``value`` of ipywidgets 7.x; confirm against the installed
    # ipywidgets version.
    uploaded_filename = list(uploader.value)[0]
    audiobook_dir = path("../assets/audiobooks")
    audiobook_path = f'{audiobook_dir}/{uploaded_filename}'

    # Persist the uploaded bytes in the audiobook directory
    with open(audiobook_path, 'wb') as output_file:
        file_bytes = uploader.value[uploaded_filename]['content']
        output_file.write(file_bytes)

    print(f"----Saved audiobook at {audiobook_path}")

    return audiobook_path

#### Creating a temporary directory

In [None]:
def createDir(file_path):
    """Create a per-run working directory and collect the asset paths.

    The audiobook's base name (without its extension) and the current
    timestamp are combined into a directory under ``../assets/temp``; a
    ``clips`` sub-directory is created inside it.

    Args:
        file_path: Path of the audiobook; both ``\\`` and ``/`` are treated
            as separators.

    Returns:
        A tuple ``(file_name, paths)`` where ``file_name`` is the audiobook's
        base name without extension and ``paths`` maps asset names to the
        locations produced by the project's ``path`` helper.

    Raises:
        FileExistsError: If the run directory already exists.
    """
    # Split on both separator styles so a Windows-style path also works on a
    # POSIX host, then drop the extension whatever its length.
    base_name = file_path.split("\\")[-1].split("/")[-1]
    file_name = os.path.splitext(base_name)[0]
    creation_time = time.time()
    run_dir = f"../assets/temp/{file_name}-{creation_time}"

    paths = {
        "audio_model": path("../assets/audio_sentiment_data_v2/models/hyperband_tuned_model_final_[0.260879248380661, 0.9069767594337463]/"),
        "pickles": path("../assets/audio_sentiment_data_v2/pickles"),
        "text_model": path("../assets/text_sentiment_data/models/bert_model_2/"),
        "music_model": path("../assets/music_generation/models/MusicTransformerKeyC.pth"),
        "music_data": path("../assets/music_generation/pickles/"),
        "wav_save_path": path(run_dir),
        "clips_save_path": path(f"{run_dir}/clips"),
        "music_samples": path("../assets/music_generation/datasets/vg-midi-annotated"),
    }

    # Creating directories in temp to store the converted wav file and the clips.
    # makedirs also creates a missing ``../assets/temp`` parent on a fresh checkout.
    os.makedirs(paths["wav_save_path"])
    os.mkdir(paths["clips_save_path"])

    print("----Temporary directory created.")

    return file_name, paths

#### Loading assets

In [None]:
def loadAssets(paths):
    """Load every model and service handle the pipeline needs.

    Assets are loaded one after another (audio, text, music, IBM
    transcription) and the time taken by each step is printed.

    Args:
        paths: Mapping that provides the ``"audio_model"``, ``"pickles"``,
            ``"text_model"``, ``"music_model"`` and ``"music_data"`` entries.

    Returns:
        A dict with the keys ``audio_model``, ``audio_scaler``,
        ``audio_classes``, ``text_predictor``, ``text_classes``,
        ``music_data``, ``music_model`` and ``stt``.
    """
    # Audio analyzer: model, scaler and classes
    started = time.time()
    audio_model, audio_scaler, audio_classes = loadAudioAssets(
        model_path=paths["audio_model"],
        pickles_path=paths["pickles"],
    )
    print(f"----Loaded audio model assets. Time taken: {round(time.time() - started, 1)} s")

    # Text analyzer: predictor and its classes
    started = time.time()
    text_predictor = ktrain.load_predictor(paths["text_model"])
    text_classes = text_predictor.get_classes()
    print(f"----Loaded text model assets. Time taken: {round(time.time() - started, 1)} s")

    # Music generation: data first, because the learner is built from it
    started = time.time()
    music_data = load_data(paths['music_data'], 'musicitem_data_save.pkl')
    music_model = music_model_learner(music_data, pretrained_path=paths['music_model'])
    print(f"----Loaded music model assets. Time taken: {round(time.time() - started, 1)} s")

    # IBM transcription service handle
    started = time.time()
    stt = setUpIBM()
    print(f"----Setup IBM transcription service. Time taken: {round(time.time() - started, 1)} s")

    return {
        'audio_model': audio_model,
        'audio_scaler': audio_scaler,
        'audio_classes': audio_classes,
        'text_predictor': text_predictor,
        'text_classes': text_classes,
        'music_data': music_data,
        'music_model': music_model,
        'stt': stt,
    }

#### Performing text and audio sentiment analysis

In [None]:
def sentimentAnalysis(paths, stt, text_predictor, audio_model, audio_scaler, audio_classes):
    """Transcribe every clip and predict one emotion per clip.

    Each ``*.wav`` file in ``paths["clips_save_path"]`` is run through the
    text analyzer and the audio analyzer. The two results are averaged with
    equal weight and the highest-scoring class is converted to a label
    through ``audio_classes.inverse_transform``.

    Args:
        paths: Mapping that provides the ``"clips_save_path"`` entry.
        stt: Transcription service handle forwarded to ``analyzeText``.
        text_predictor: Text sentiment predictor forwarded to ``analyzeText``.
        audio_model: Audio sentiment model forwarded to ``analyzeAudio``.
        audio_scaler: Feature scaler forwarded to ``analyzeAudio``.
        audio_classes: Object whose ``inverse_transform`` maps class indices
            to labels (presumably a fitted label encoder - TODO confirm).

    Returns:
        A tuple ``(transcriptions, emotions)`` of two lists holding one entry
        per clip, in processing order. Both are empty when no clip is found.
    """
    # Imported locally under an unambiguous alias: the notebook does
    # ``from glob import glob``, so ``glob.glob(...)`` would be an attribute
    # lookup on the function and fail.
    from glob import glob as find_files

    transcriptions = []  # Transcriptions of all the segmented clips
    emotions = []  # Emotions of all the segmented clips

    # glob gives no ordering guarantee; sorting makes the processing order
    # deterministic (lexicographic by path) so that "clip N" is stable.
    clip_paths = sorted(find_files(f'{paths["clips_save_path"]}/*.wav'))

    for clip_number, file_name in enumerate(clip_paths, start=1):
        current_time = time.time()
        print(f"\nProcessing clip {clip_number}:")

        # Performing text sentiment analysis
        print("----Text sentiment analysis")
        text_emotions, transcription = analyzeText(
            file_name=file_name, stt=stt, predictor=text_predictor
        )

        # Performing audio sentiment analysis
        print("----Audio sentiment analysis")
        audio_emotions = analyzeAudio(
            file_name=file_name, model=audio_model, scaler=audio_scaler
        )

        # Taking the average of text and audio emotions.
        # Assumes both analyzers return arrays over the same class order - TODO confirm.
        print("----Predicting final emotion")
        weighted_text_emotions = text_emotions * 0.5
        weighted_audio_emotions = audio_emotions * 0.5
        weighted_emotions = weighted_text_emotions + weighted_audio_emotions

        # Picking the dominant emotion and labelling it
        dominant_index = weighted_emotions.argmax()
        dominant_index = dominant_index.astype(int).flatten()
        final_emotion = audio_classes.inverse_transform(dominant_index)

        transcriptions.append(transcription)
        emotions.append(final_emotion)

        print(f"----Clip {clip_number} processed. Time taken: {round(time.time()-current_time, 1)} s")

    return transcriptions, emotions

In [None]:
def musicClipsGeneration(paths, music_model, music_data, songs, music_emotions=None):
    """Generate one music clip per requested emotion and store it in ``songs``.

    Args:
        paths: Mapping that provides the ``"music_samples"`` entry.
        music_model: Music generation model forwarded to ``generateMusic``.
        music_data: Music data forwarded to ``generateMusic``.
        songs: Dict updated in place; each emotion maps to its generated song.
            An existing entry for the same emotion is overwritten.
        music_emotions: Emotions to generate music for. ``None`` (the default)
            means ``['Angry', 'Happy', 'Neutral', 'Sad']``; an empty sequence
            generates nothing.

    Returns:
        The same ``songs`` dict that was passed in.
    """
    # A None sentinel replaces the former list default so that no mutable
    # object is shared between calls.
    if music_emotions is None:
        music_emotions = ['Angry', 'Happy', 'Neutral', 'Sad']

    # Generating music for each emotion
    for music_emotion in music_emotions:
        current_time = time.time()

        # Generating a song
        full_song = generateMusic(
            music_emotion,
            f"{paths['music_samples']}",
            music_model,
            music_data,
        )

        print(f"----generated {music_emotion} clip. Time taken: {round(time.time()-current_time, 1)} s")

        # Adding the song created for the emotion to the dictionary
        songs[music_emotion] = full_song

    return songs

---
## Main Function
**Input:** Audiobook in mp3 format.  

**Procedure:**
- The audiobook is segmented into 30 second clips.
- Each segment is analyzed by the audio and text analyzers.
- A sentiment is predicted for each segment of the audiobook.
- Music clips are generated for the four emotions (Angry, Happy, Neutral and Sad) 

**Output:**
- Audiobook segments, each with its transcription and predicted emotion
- Music clips for each emotion (Angry, Happy, Neutral and Sad)

In [None]:
class deepAudiobookTuner:
    """Prepare an audiobook and hold its analysis and music generation results.

    Construction does the preparatory work: it creates the working directory,
    loads the assets, converts the mp3 to wav and segments it into clips.
    Sentiment analysis and music generation are then run on demand through
    the methods below.

    Attributes set by ``__init__``: ``audiobook_path``, ``file_name``,
    ``paths``, ``assets``, ``wav_file_path`` and ``songs``.
    ``transcriptions`` and ``emotions`` exist only after
    ``analyzeSentiments`` has been called.
    """

    def __init__(self, audiobook_path):
        """Set up the working directory, assets and clips for ``audiobook_path``."""
        self.audiobook_path = audiobook_path

        # Creating a temporary directory to store the segmented audiobook clips and generated music clips
        print("\nCreating temporary directory.")
        self.file_name, self.paths = createDir(audiobook_path)

        # Loading assets.
        print("\nLoading assets.")
        self.assets = loadAssets(self.paths)

        # Converting the mp3 file to a wav file
        print("\nConverting mp3 to wav")
        self.wav_file_path = convertToWav(
            file_name=self.file_name,
            file_path=self.audiobook_path,
            save_path=self.paths["wav_save_path"],
        )

        # Segmenting the audio file into clips (30 seconds each, per the notebook's description)
        print("\nSegmenting audiobook")
        segmentAudioFile(
            file_name=self.file_name,
            file_path=self.wav_file_path,
            save_path=self.paths["clips_save_path"],
        )

        # Generated songs, keyed by emotion
        self.songs = {}

    def analyzeSentiments(self):
        """Run sentiment analysis and store ``transcriptions`` and ``emotions``."""
        print("\n\nPerforming sentiment analysis")
        loaded = self.assets
        self.transcriptions, self.emotions = sentimentAnalysis(
            self.paths,
            loaded['stt'],
            loaded['text_predictor'],
            loaded['audio_model'],
            loaded['audio_scaler'],
            loaded['audio_classes'],
        )

    def generateMusicClips(self):
        """Generate music for the default emotion set of ``musicClipsGeneration``."""
        print("\n\nGenerating music")
        self.songs = musicClipsGeneration(
            paths=self.paths,
            music_model=self.assets['music_model'],
            music_data=self.assets['music_data'],
            songs=self.songs,
        )

    def regenerateMusicClips(self, music_emotions):
        """Generate music again, but only for the given ``music_emotions``."""
        print("\n\nRegenerating music")
        self.songs = musicClipsGeneration(
            music_emotions=music_emotions,
            paths=self.paths,
            music_model=self.assets['music_model'],
            music_data=self.assets['music_data'],
            songs=self.songs,
        )

    def displayOutputs(self):
        """Print each clip's transcription and emotion, then show every song.

        Reads ``self.transcriptions`` and ``self.emotions``, so
        ``analyzeSentiments`` has to be called first.
        """
        print("\nTranscriptions and Emotions:")
        clip_results = zip(self.transcriptions, self.emotions)
        for clip_number, (transcription, emotion) in enumerate(clip_results, start=1):
            print(f"\nClip {clip_number}:")
            print(f"Transcription: {transcription}")
            print(f"Emotion: {emotion}")

        print("\n\nGenerated Music:")
        for music_emotion, song in self.songs.items():
            print(f"\nMusic clip for emotion: {music_emotion}:")
            # Each song exposes a ``stream`` whose ``show('midi')`` renders it
            song.stream.show('midi')

In [None]:
def main(audiobook_path):
    """Run the whole pipeline on one audiobook and print the results.

    Builds a ``deepAudiobookTuner`` for ``audiobook_path``, runs sentiment
    analysis and music generation, reports the total time taken, and then
    displays the outputs.

    Args:
        audiobook_path: Path of the audiobook, passed to ``deepAudiobookTuner``.
    """
    started = time.time()

    pipeline = deepAudiobookTuner(audiobook_path)
    pipeline.analyzeSentiments()
    pipeline.generateMusicClips()

    # Measured before the outputs are displayed, so display time is excluded
    elapsed = round(time.time() - started, 1)
    print(f"\n\nJob complete. Total time taken: {elapsed} s")
    print("\n\n-----------------------------------------------------------------------------------------------------------\n\n")
    print("Outputs: ")

    pipeline.displayOutputs()

---

## Demo

In [None]:
# Taking an audiobook in mp3 format as the input
# The upload coroutine is scheduled rather than awaited, so this cell returns
# immediately. Until a file has been uploaded, `audiobook_path` holds the
# pending task, not a path string.
# NOTE(review): this relies on an event loop already running (presumably the
# Jupyter kernel's) - confirm when running outside a notebook.
print("\nUpload an audiobook in mp3 format.\n")
audiobook_path = asyncio.ensure_future(uploadAudiobook())

In [None]:
# Run the pipeline on the uploaded file. `result()` does not wait for the
# upload: it raises asyncio.InvalidStateError while the task is still pending,
# so run this cell only after a file has been uploaded in the previous cell.
main(audiobook_path.result())