# Moosic Feature Engineering

* MOOSIC - mood based music recommendation system


    * data extraction and data management
    * data preprocessing 
    * load combined spotify data + main genres engineered
    * remove null entries from main genres
    * create core genre : one genre associated with a track
    * mood engineering: research, quadrants and subquadrants
    * drop nulls and duplicates
    * encode genre to be used as features during training
    * make data balanced with respect to the moods






## Importing required libraries




In [None]:


# IMPORT LIBRARIES


try:

    import numpy as np
    import pandas as pd
    import random as rnd
    #from tqdm.notebook import tqdm as tqdm
    from tqdm import tqdm 
    #from .autonotebook import tqdm as notebook_tqdm
    import time

    # databases - sql
    #from dotenv import dotenv_values
    #import sqlalchemy

    # visualisation
    import seaborn as sns
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap

    # split data - avoid data leakage
    from sklearn.model_selection import train_test_split



except ImportError as error:
    print(f"Installation of the required dependencies necessary! {error}")

    %pip install numpy
    %pip install pandas
    #%pip install dotenv
    #%pip install sqlalchemy
    %pip install seaborn
    %pip install matplotlib
    %pip install tqdm
    %pip install ipywidgets
    %pip install yellowbrick  

    print(f"Successful installation of the required dependencies necessary")


import warnings
warnings.filterwarnings('ignore')





## Loading the data

In [None]:
# load the data files for moosic

# Read the processed CSV whose name indicates it already carries the
# engineered main-genre information (the `main_genres` column is used by the
# cells that follow).
df_musgenre = pd.read_csv('../data/processed/df_with_main_genres_v1.csv', low_memory=False)

# report the shape of the loaded frame: rows are observations, columns are feature variables

print(f"Music data: There are {df_musgenre.shape[0]} observations and {df_musgenre.shape[1]} feature variables ")
print('----------'*10)

# trailing expression: the notebook renders the first two rows as the cell output
df_musgenre.head(2)

In [None]:


# Build the working frame in one chain: take an independent deep copy of the
# loaded data, keep only rows whose `main_genres` text is not the empty-list
# literal '[]', renumber the rows from zero and convert the columns with
# pandas' convert_dtypes().
moosic_data = (
    df_musgenre.copy(deep=True)
    .query("main_genres != '[]' ")
    .reset_index(drop=True)
    .convert_dtypes()
)

# Index labels of every row that holds at least one missing value, then
# remove exactly those rows.
null_rows = moosic_data.loc[moosic_data.isna().any(axis=1)].index
moosic_data = moosic_data.drop(index=null_rows)

# Sanity check: the per-row "has a null" flags and the remaining shape.
print(moosic_data.isna().any(axis=1))
print(moosic_data.shape)

# trailing expression: rendered by the notebook as the cell output
moosic_data.head(2)



In [None]:
# Show the distinct raw values of the `main_genres` column; each value is the
# string form of a genre list, which the next cell parses into a real list.
moosic_data['main_genres'].unique()



In [None]:
# from one_genre_each_song.ipynb

# Imported in this cell because nothing else in the notebook needs it.
import ast

# Collect one output row per (track, genre) pair.
rows_list = []

# Iterate through each row in the original DataFrame
for _, row in moosic_data.iterrows():
    # `main_genres` holds the *string* form of a list (e.g. "['pop', 'rock']").
    # ast.literal_eval only accepts Python literals, so unlike eval() it cannot
    # execute arbitrary code that might be embedded in the CSV text.
    genres_list = ast.literal_eval(row['main_genres'])

    # Emit one copy of the track per genre, tagging it with that single genre.
    for genre in genres_list:
        new_row = row.copy()
        # stored as `core_genres` because the `main_genres` column is kept as-is
        new_row['core_genres'] = genre
        rows_list.append(new_row)

# Create a new DataFrame from the list of rows.  Every copy of a track would
# otherwise keep the index label of its source row; renumbering gives each row
# a unique label so later label-based operations act on a single row.
moosic_data = pd.DataFrame(rows_list).reset_index(drop=True)



In [None]:
# Tally how many rows (songs) carry each core genre.
genre_counts = moosic_data['core_genres'].value_counts()

# One entry per distinct genre, so the length of the tally is the genre count.
num_unique_genres = genre_counts.shape[0]

# Report the number of distinct genres followed by the per-genre song counts.
print("Number of Unique Genres:", num_unique_genres)
print("\nNumber of Songs for Each Genre:", genre_counts, sep="\n")

In [None]:
# Check for empty values in the entire DataFrame:
# isna() flags every missing cell and sum() adds the flags up per column.
empty_values = moosic_data.isna().sum()

# Print the number of empty values for each column
print("Empty Values in Each Column:")
print(empty_values)

In [None]:
# drop null/empty rows

# Index labels of the rows that contain at least one missing value (kept under
# the same name in case a later cell inspects it).
null_rows1 = moosic_data.index[moosic_data.isnull().any(axis=1).to_numpy()]

# Drop row-wise rather than by index label: the genre expansion can leave
# several rows sharing one label, and a label-based drop would then remove
# every row with that label, including rows that have no missing values.
moosic_data = moosic_data.dropna()

# Confirm that no column has missing values left.
empty_values = moosic_data.isna().sum()
print(empty_values)

In [None]:
# current features for moosic data:
# collect the column names as a plain list; the trailing expression makes the
# notebook display it.

features = moosic_data.columns.tolist()
features



In [None]:
# save current moosic (mood-music) dataset : removed nulls and empty lists

#moosic_data.to_csv('../data/processed/moosic_data.csv', chunksize=len(moosic_data)//5, index=False)





![1](https://www.researchgate.net/profile/Angelo-Ciaramella/publication/263964114/figure/fig1/AS:566286145867776@1512024546717/Two-dimensional-emotion-representation-in-Thayers-model.png)
[ 1](https://www.researchgate.net/figure/Two-dimensional-emotion-representation-in-Thayers-model_fig1_263964114/download?_tp=eyJjb250ZXh0Ijp7ImZpcnN0UGFnZSI6Il9kaXJlY3QiLCJwYWdlIjoiX2RpcmVjdCJ9fQ)


![2](https://www.researchgate.net/profile/Sarfaraz-Masood/publication/309648785/figure/fig1/AS:424403734405121@1478197143294/Thayers-2-D-Emotion-Model.png)
[ 2](https://www.researchgate.net/publication/309648785_MFCC_Spectral_and_Temporal_Feature_based_Emotion_Identification_in_Songs)





### Circumplex model of affect (mood) - Russell 2-D model formula

* valence = radius (length) x cos(angle of affect term)
* energy = radius (length) x sin(angle of affect term)


### Plutchik's wheel of affect/Emotions + Circumplex

* mood_indicators = (valence , energy) = (r.cos(theta) , r.sin(theta)) 
  - the radius = 0.5
  - the origin = (0.5, 0.5)
  - theta = angle of affect for mood
  - proposed affect-mood sub-quadrants thetas : happy (0°), euphoric (45°), tense (90°), angry (135°), depressive (180°), sad (225°), calm (270°), relaxed (315°).
* our data for both variables lie in the range 0 to 1, so the circumplex is centred on (0.5, 0.5) with radius 0.5 instead of on (0, 0)

* given the fact that we have the valence and energy values already, we can calculate the 8 basic moods for the model with:
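 - theta = arctan2(energy - 0.5, valence - 0.5) - the two-argument arctangent, measured from the circumplex centre (0.5, 0.5) so that theta can cover the full 0°–360° range (taken from the raw values, which are all between 0 and 1, it could only fall between 0° and 90°)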

* 8 basic mood quadrants and sub-quadrants : 8 mood co-ordinates (4 main quadrants) for the 8 basic moods 
    - Quadrants : 
        - Q1 (Happy/Exuberant)  : [(0.5, 1.0), (0.5, 1.0)], # high valence, high energy
        - Q2 (Anger/anxious)   : [(0.0, 0.5), (0.5, 1.0)], # low valence, high energy
        - Q3(Sad/depressed)        : [(0.0, 0.5), (0.0, 0.5)], # low valence, low energy
        - Q4 (Relaxed/Content/calm) : [(0.5, 1.0), (0.0, 0.5)], # high valence, low energy 

    <br>

  - Sub-quadrants (using the polar method) : * not in use here, need to obtain the vertices of the triangles in each quadrant, so using the other to save time
    - Q1 (Happy)  : M1 - happy [r.cos(0°), r.sin(0°)],  M2 - euphoric [r.cos(45°), r.sin(45°)] 
    - Q2 (Angry)  : M3 - tense [r.cos(90°), r.sin(90°)],  M4 - angry [r.cos(135°), r.sin(135°)] 
    - Q3 (Sad) : M5 - depressive [r.cos(180°), r.sin(180°)],  M6 - sad [r.cos(225°), r.sin(225°)]
    - Q4 (Relaxed)  : M7 - calm [r.cos(270°), r.sin(270°)],  M8 - relaxed [r.cos(315°), r.sin(315°)] 




  - Sub-quadrants (modified-russell-thayer): 
    - Q1 (Happy)  : M1 - happy [(0.5, 1.0), (0.5, 0.75)],  M2 - euphoric [(0.5, 1.0), (0.75, 1.0)] 
    - Q2 (Angry)  : M3 - tense [(0.0, 0.5), (0.75, 1.0)],  M4 - angry [(0.0, 0.5), (0.5, 0.75)]
    - Q3 (Sad) : M5 - depressive [(0.0, 0.5), (0.25, 0.5)],  M6 - sad [(0.0, 0.5), (0.0, 0.25)]
    - Q4 (Relaxed)  : M7 - calm [(0.5, 1.0), (0.0, 0.25)],  M8 - relaxed [(0.5, 1.0), (0.25, 0.5)]
  

 So, using the valence and energy values we have for the affect terms listed in the mood_labels variable, and the Russell 2-D valence and arousal (energy) formula to calculate the angle at which each term lies on the circumplex 2-D graph, we get

```python 

# quadrants
mood_quadrants = {
                    'Q1 (Happy/Exuberant)'  : [(0.5, 1.0), (0.5, 1.0)], # high valence, high energy
                    'Q2 (Angry/anxious)'   : [(0.0, 0.5), (0.5, 1.0)], # low valence, high energy                  
                    'Q3(Sad/depressed)'        : [(0.0, 0.5), (0.0, 0.5)], # low valence, low energy
                    'Q4 (Relaxed/Content/calm)' : [(0.5, 1.0), (0.0, 0.5)] # high valence, low energy

}



# 1D : subquadrants (valence)

mood_sub_quadrants = {
                        'happy' : (0.875, 1.0],
                        'euphoric' : (0.75, 0.875],
                        'tense' : (0.375, 0.5],
                        'angry' : (0.25, 0.375],
                        'depressed' : (0.0, 0.125],
                        'sad' : (0.125, 0.25],
                        'calm' : (0.5, 0.625],
                        'relaxed' : (0.625, 0.75],

    }

# 2D : subquadrants (valence & energy)

mood_sub_quadrants = {
                        'happy' : [(0.5, 1.0), (0.5, 0.75)],
                        'euphoric' : [(0.5, 1.0), (0.75, 1.0)],                  
                        'tense' : [(0.0, 0.5), (0.75, 1.0)], 
                        'angry' :[(0.0, 0.5), (0.5, 0.75)], 
                        'depressed' : [(0.0, 0.5), (0.25, 0.5)], 
                        'sad' :[(0.0, 0.5), (0.0, 0.25)], 
                        'calm' : [(0.5, 1.0), (0.0, 0.25)], 
                        'relaxed' : [(0.5, 1.0), (0.25, 0.5)]

    }


```







In [None]:


# mood maps the 4 quadrants

def mood_quadrant_map(data):
    """
    Mood quadrant feature engineering (4 quadrants).
        VE 2D circumplex model : valence, energy

    :param data: object indexable by ``'valence'`` and ``'energy'`` - either a
        single record (e.g. one DataFrame row, as passed by
        ``DataFrame.apply(..., axis=1)``) or a whole DataFrame / mapping of
        columns.
    :return: for a single record, the quadrant label as a ``str``; for column
        input, a NumPy array with one label per record.
    :raises ValueError: if any valence or energy value lies outside [0, 1]
        (NaN fails the range check as well).

    Description:

    Maps each (valence, energy) pair to the quadrant of the circumplex it
    falls in.  The value 0.5 belongs to the "high" side of each axis, so every
    valid pair falls in exactly one quadrant and boundary points are labelled
    deterministically.

    Mood quadrant terms and values (valence range, energy range):
        - 'Q1 (Happy)'   : high valence, high energy  [0.5, 1.0] x [0.5, 1.0]
        - 'Q2 (Tense)'   : low valence,  high energy  [0.0, 0.5) x [0.5, 1.0]
        - 'Q3 (Sad)'     : low valence,  low energy   [0.0, 0.5) x [0.0, 0.5)
        - 'Q4 (Relaxed)' : high valence, low energy   [0.5, 1.0] x [0.0, 0.5)

    - Sub-quadrants (our modified-russell-thayer), for reference:
        - Q1 (Happy)  : M1 - happy [(0.5, 1.0), (0.5, 0.75)],  M2 - euphoric [(0.5, 1.0), (0.75, 1.0)]
        - Q2 (Tense)  : M3 - tense [(0.0, 0.5), (0.75, 1.0)],  M4 - angry [(0.0, 0.5), (0.5, 0.75)]
        - Q3 (Sad) : M5 - depressive [(0.0, 0.5), (0.25, 0.5)],  M6 - sad [(0.0, 0.5), (0.0, 0.25)]
        - Q4 (Relaxed)  : M7 - calm [(0.5, 1.0), (0.0, 0.25)],  M8 - relaxed [(0.5, 1.0), (0.25, 0.5)]

    """

    raw_valence = data['valence']
    raw_energy = data['energy']

    # Work on float arrays so the same code handles one record (0-d array)
    # and whole columns (1-d arrays).
    valence = np.asarray(raw_valence, dtype=float)
    energy = np.asarray(raw_energy, dtype=float)

    # Validate before mapping; comparisons with NaN are False, so missing
    # values are rejected here instead of ending up without a label.
    in_range = (valence >= 0.0) & (valence <= 1.0) & (energy >= 0.0) & (energy <= 1.0)
    if not np.all(in_range):
        raise ValueError(f''' Values for valence and energy must be between 0 and 1.
                        Mood term can't be assigned for these values: valence={raw_valence} and energy={raw_energy}''')

    # One split point per axis makes the four quadrants mutually exclusive.
    high_valence = valence >= 0.5
    high_energy = energy >= 0.5

    quadrant_conditions = {
        'Q1 (Happy)': high_valence & high_energy,
        'Q2 (Tense)': ~high_valence & high_energy,
        'Q3 (Sad)': ~high_valence & ~high_energy,
        'Q4 (Relaxed)': high_valence & ~high_energy,
    }

    quadrants = np.select(
        list(quadrant_conditions.values()),
        list(quadrant_conditions.keys()),
        default='unknown',
    )

    # A single record yields a 0-d array; hand back a plain string for it.
    if quadrants.ndim == 0:
        return quadrants.item()
    return quadrants



In [None]:
# mood maps the 8 sub-quadrants (1d) based on valence


def mood_1d_map(data):
    """
    Mood feature engineering (1D).
        V affect terms : valence

    :param data: object indexable by ``'valence'`` - either a single record
        (e.g. one DataFrame row) or a whole DataFrame / mapping of columns.
    :return: NumPy array of mood labels as produced by ``np.select`` (a 0-d
        array for a single record, one label per record for column input).
    :raises ValueError: if any valence value lies outside [0, 1] (NaN fails
        the range check as well).

    Description:

    Maps valence alone to one of the 8 core mood terms.  The unit interval is
    cut into eight equal bins; each bin excludes its lower edge and includes
    its upper edge, except the lowest bin, which also includes 0.0 so that
    every valid valence receives a mood.

    Mood terms and valence bins:
        - 'depressed' : [0.0, 0.125]
        - 'sad'       : (0.125, 0.25]
        - 'angry'     : (0.25, 0.375]
        - 'tense'     : (0.375, 0.5]
        - 'calm'      : (0.5, 0.625]
        - 'relaxed'   : (0.625, 0.75]
        - 'euphoric'  : (0.75, 0.875]
        - 'happy'     : (0.875, 1.0]

    """

    raw_valence = data['valence']

    # A float array lets the same code serve one record and whole columns.
    valence = np.asarray(raw_valence, dtype=float)

    # Validate first; NaN compares False and is therefore rejected here.
    if not np.all((valence >= 0.0) & (valence <= 1.0)):
        raise ValueError(f''' Values for valence must be between 0 and 1.
                        Mood term can't be assigned for these values: valence={raw_valence}''')

    mood_map_1D = {
        'happy': (valence > 0.875) & (valence <= 1.0),
        'euphoric': (valence > 0.75) & (valence <= 0.875),
        'tense': (valence > 0.375) & (valence <= 0.5),
        'angry': (valence > 0.25) & (valence <= 0.375),
        # lower edge included: a valence of exactly 0.0 is valid input
        'depressed': (valence >= 0.0) & (valence <= 0.125),
        'sad': (valence > 0.125) & (valence <= 0.25),
        'calm': (valence > 0.5) & (valence <= 0.625),
        'relaxed': (valence > 0.625) & (valence <= 0.75),
    }

    moods = np.select(list(mood_map_1D.values()), list(mood_map_1D.keys()), default='unknown')

    return moods



In [None]:
# mood maps the 8 sub-quadrants (2d) based on valence and energy


def mood_2d_map(data):
    """
    Mood labels feature engineering (2D).
        VE affect terms : valence, energy

    :param data: object indexable by ``'valence'`` and ``'energy'`` - either a
        single record (e.g. one DataFrame row) or a whole DataFrame / mapping
        of columns.
    :return: NumPy array of mood labels as produced by ``np.select`` (a 0-d
        array for a single record, one label per record for column input).
    :raises ValueError: if any valence or energy value lies outside [0, 1]
        (NaN fails the range check as well).

    Description:

    Maps each (valence, energy) pair to one of the 8 core moods on the 2D
    cartesian plot.  Valence is split at 0.5 and energy at 0.25 / 0.5 / 0.75;
    every split point belongs to the band above it, so the eight cells do not
    overlap and each valid pair receives exactly one mood.

    Mood terms and values (valence band, energy band):
        - 'happy'     : [0.5, 1.0] x [0.5, 0.75)
        - 'euphoric'  : [0.5, 1.0] x [0.75, 1.0]
        - 'tense'     : [0.0, 0.5) x [0.75, 1.0]
        - 'angry'     : [0.0, 0.5) x [0.5, 0.75)
        - 'depressed' : [0.0, 0.5) x [0.25, 0.5)
        - 'sad'       : [0.0, 0.5) x [0.0, 0.25)
        - 'calm'      : [0.5, 1.0] x [0.0, 0.25)
        - 'relaxed'   : [0.5, 1.0] x [0.25, 0.5)

    """

    raw_valence = data['valence']
    raw_energy = data['energy']

    # Float arrays let the same code serve one record and whole columns.
    valence = np.asarray(raw_valence, dtype=float)
    energy = np.asarray(raw_energy, dtype=float)

    # Validate first; NaN compares False and is therefore rejected here.
    in_range = (valence >= 0.0) & (valence <= 1.0) & (energy >= 0.0) & (energy <= 1.0)
    if not np.all(in_range):
        raise ValueError(f''' Values for valence and energy must be between 0 and 1.
                        Mood term can't be assigned for these values: valence={raw_valence} and energy={raw_energy}''')

    # Two valence halves ...
    high_valence = valence >= 0.5
    low_valence = ~high_valence

    # ... crossed with four energy bands (lower edge inclusive).
    energy_0_25 = energy < 0.25
    energy_25_50 = (energy >= 0.25) & (energy < 0.5)
    energy_50_75 = (energy >= 0.5) & (energy < 0.75)
    energy_75_100 = energy >= 0.75

    mood_map_2D = {
        'happy': high_valence & energy_50_75,
        'euphoric': high_valence & energy_75_100,
        'tense': low_valence & energy_75_100,
        'angry': low_valence & energy_50_75,
        'depressed': low_valence & energy_25_50,
        'sad': low_valence & energy_0_25,
        'calm': high_valence & energy_0_25,
        'relaxed': high_valence & energy_25_50,
    }

    moods = np.select(list(mood_map_2D.values()), list(mood_map_2D.keys()), default='unknown')

    return moods






In [None]:
# apply function to data to create engineered mood quadrants and subquadrant features
# Each mapper is called once per row (axis=1) and receives that row, from
# which it reads 'valence' (and, for the 2D mappers, 'energy'):
#   - mood_42d_label : quadrant label from mood_quadrant_map
#   - mood_1d_label  : valence-only mood from mood_1d_map
#   - mood_goal      : valence/energy mood from mood_2d_map (the column the
#                      balancing step later groups on)

moosic_data['mood_42d_label'] = moosic_data.apply(mood_quadrant_map , axis=1)
moosic_data['mood_1d_label'] = moosic_data.apply(mood_1d_map, axis=1)
moosic_data['mood_goal'] = moosic_data.apply(mood_2d_map, axis=1)


# trailing expression: the notebook renders the first three rows
moosic_data.head(3)



In [None]:
# current features for moosic data, now including the three mood label columns
# added by the previous cell; the trailing expression displays the list

features = moosic_data.columns.tolist()
features

In [None]:
# print pandas' summary of the frame (columns, non-null counts, dtypes)
moosic_data.info()


In [None]:
# drop null/empty rows

# Index labels of the rows that contain at least one missing value (kept under
# the same name in case a later cell inspects it).
null_rows11 = moosic_data.index[moosic_data.isnull().any(axis=1).to_numpy()]

# Drop row-wise rather than by index label: when several rows share one label,
# a label-based drop removes all of them, including rows without any missing
# value.
moosic_data = moosic_data.dropna()

# Confirm that no column has missing values left.
empty_values = moosic_data.isna().sum()
print(empty_values)

In [None]:
# current features for moosic data after the null rows were dropped
# (dropping rows does not change the columns; shown again as a checkpoint)

features = moosic_data.columns.tolist()
features 



In [None]:
# print pandas' summary of the frame again, after the null rows were dropped
moosic_data.info()


In [None]:
# save current moosic (mood-music) dataset : removed nulls and empty lists
# compress and send processed file to be for further analysis and modelling

#moosic_data.to_csv('../data/processed/moosic_data_processed.csv', chunksize=len(moosic_data)//5, index=False)




In [None]:
# encode categorical data : genre

# One-hot encode the core genre.  drop_first=True omits the first genre's
# column (that genre is implied when every other indicator is 0), and
# dtype=int asks for 0/1 integers directly instead of converting boolean
# indicators afterwards with replace(), which depends on pandas' deprecated
# silent downcasting.
genre2d_dummies = pd.get_dummies(moosic_data['core_genres'], drop_first=True, dtype=int)

# Keep the original genre next to its indicator columns for inspection.
genre2d_encoded = pd.concat([moosic_data['core_genres'], genre2d_dummies], axis=1)
display(genre2d_encoded.head(2))



In [None]:

# Attach the genre indicator columns to the moosic data: everything in the
# encoded frame except the genre text itself, which moosic_data already has.
moosic_data = pd.concat(
    [moosic_data, genre2d_encoded.drop(columns=['core_genres'])],
    axis='columns',
)
display(moosic_data.head(2))




In [None]:
# get shape of the current moosic data
# (rows = observations, columns = feature variables, now including the genre
# indicator columns)

print(f''' 

    Processed Mood - Music (MOOSIC) data
    Number of observations : {moosic_data.shape[0]} 
    Number of feature variables : {moosic_data.shape[1]} 

    ''')

# Make data balanced for modelling purposes



In [None]:
# dataset is unbalanced from the perspective of the associated mood 
#    get the count of how the mood is distributed wrt the data


def get_balanced_data(processed_dataset, label_column='mood_goal', random_state=42):
    """
    Down-sample the dataset so every mood label has the same number of rows.

    Steps:
        - count the tracks for each mood label
        - find the mood label with the lowest count
        - group the data by mood label
        - from each group draw, without replacement, as many rows as the
          least frequent label has

    :param processed_dataset: DataFrame containing the column named by
        ``label_column``.
    :param label_column: name of the column holding the mood labels
        (default ``'mood_goal'``).
    :param random_state: seed passed to every per-group ``sample`` call so the
        selection is reproducible (default 42).
    :return: new DataFrame with a fresh 0..n-1 index in which each label
        occurs equally often; an empty DataFrame with the same columns when
        the input holds no labelled rows.
    :raises KeyError: if ``label_column`` is not a column of the dataset.
    """

    mood_label_counts = processed_dataset[label_column].value_counts()

    # Nothing to balance: without this guard idxmin()/min() fail on an empty
    # count table.
    if mood_label_counts.empty:
        return processed_dataset.iloc[0:0].reset_index(drop=True)

    least_frequent_mood_label = mood_label_counts.idxmin()
    count_least_frequent_mood_label = mood_label_counts.min()

    print(f"The mood label count {mood_label_counts} ")
    print("______"*10)

    print(f"The least frequent mood label is '{least_frequent_mood_label}' with {count_least_frequent_mood_label} features.")
    print("______"*10)

    # Every group has at least as many rows as the rarest label, so the same
    # sample size is valid for all of them.  Each group is sampled with the
    # same seed, and the pieces are concatenated once at the end instead of
    # growing a frame inside the loop.
    sampled_groups = [
        group.sample(n=count_least_frequent_mood_label, random_state=random_state)
        for _, group in processed_dataset.groupby(label_column)
    ]

    print(f' Finished processing, data has balanced number of samples for all categories. ')

    balanced_mood_data = pd.concat(sampled_groups).reset_index(drop=True)

    current_mood_label_counts = balanced_mood_data[label_column].value_counts()
    print(f"The size of data mood label count {current_mood_label_counts} ")
    print("______"*10)

    return balanced_mood_data




In [None]:

## display balanced moosic : mood-music data
# Replace the working frame with its balanced version; get_balanced_data
# prints the label counts before and after sampling.

moosic_data = get_balanced_data(moosic_data)





In [None]:
# info etc
# Report the size of the balanced frame between two separator lines, then
# let the notebook render its first two rows.

print(f"-----"*10)


print(f''' 

    Processed Mood - Music (MOOSIC) data
    Number of observations : {moosic_data.shape[0]} 
    Number of feature variables : {moosic_data.shape[1]} 

    ''')

print(f"-----"*10)

moosic_data.head(2)

In [None]:
# print pandas' summary of the balanced frame (columns, non-null counts, dtypes)
moosic_data.info()


In [None]:

# saving balanced data  

#moosic_data.to_csv('../data/processed/moodsic_data.csv', chunksize=len(moosic_data)//5, index=False)





In [None]:
#moosic_data[['explicit', 'key', 'mode', 'loudness']]