In [1]:
# https://github.com/kvsingh/music-mood-classification

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
import joblib

In [43]:
df = pd.read_csv('../songs/data_moods.csv')
df = df.drop(columns=[ 'fear', 'negative', 'sadness', 'anger', 'surprise', 'positive', 'disgust', 'joy', 'anticipation', 'happiness', 'lyrics','emotion_percentages','trust' ])
df

Unnamed: 0,name,album,artist,id,release_date,popularity,length,danceability,acousticness,energy,instrumentalness,liveness,valence,loudness,speechiness,tempo,key,time_signature,mood,trust
0,1999,1999,Prince,2H7PHVdQ3mXqEHXcvclTB0,1982-10-27,68,379266,0.866,0.13700,0.7300,0.000000,0.0843,0.6250,-8.201,0.0767,118.523,5,4,Happy,0.057692
1,23,23,Blonde Redhead,4HIwL9ii9CcXpTOTzMq0MP,2007-04-16,43,318800,0.381,0.01890,0.8320,0.196000,0.1530,0.1660,-5.069,0.0492,120.255,8,4,Sad,0.059701
2,9 Crimes,9,Damien Rice,5GZEeowhvSieFDiR8fQ2im,2006-11-06,60,217946,0.346,0.91300,0.1390,0.000077,0.0934,0.1160,-15.326,0.0321,136.168,0,4,Sad,0.030303
3,99 Luftballons,99 Luftballons,Nena,6HA97v4wEGQ5TUClRM0XLc,1984-08-21,2,233000,0.466,0.08900,0.4380,0.000006,0.1130,0.5870,-12.858,0.0608,193.100,4,4,Happy,0.071429
4,A Boy Brushed Red Living In Black And White,They're Only Chasing Safety,Underoath,47IWLfIKOKhFnz1FUEUIkE,2004-01-01,60,268000,0.419,0.00171,0.9320,0.000000,0.1370,0.4450,-3.604,0.1060,169.881,1,4,Energetic,0.101010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,windcatcher,windcatcher,Leo Nocta,59VApBbrS2IADQk4ml5mdo,2020-06-19,36,123066,0.402,0.96100,0.2360,0.919000,0.0921,0.1460,-20.615,0.0603,129.736,0,3,Calm,0.000000
682,yellow is the color of her eyes,yellow is the color of her eyes,Soccer Mommy,4D3nttJPU6L0M2epr7sId6,2019-11-19,5,435080,0.452,0.75700,0.5150,0.120000,0.1400,0.1910,-7.351,0.0255,80.537,11,4,Sad,0.166667
683,you broke me first,you broke me first,Tate McRae,45bE4HXI0AwGZXfZtMp8JR,2020-04-17,87,169265,0.642,0.78600,0.3740,0.000000,0.0906,0.0799,-9.386,0.0545,124.099,4,4,Sad,0.014493
684,you were good to me,brent,Jeremy Zucker,4CxFN5zON70B3VOPBYbd6P,2019-05-03,76,219146,0.561,0.91300,0.0848,0.000026,0.1120,0.2060,-15.099,0.0404,102.128,2,4,Sad,0.000000


In [7]:
import spotify_lyrics_scraper as spotify
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict
import time

# Function to load the NRC Emotion Lexicon
def load_nrc_lexicon(local_path='/Users/vittoriomocchi/Documents/Vitos/Projects/Emotion-Song-Recommender-System/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'):
    emotion_dict = defaultdict(set)
    with open(local_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 3:
                word, emotion, association = parts
                if int(association) > 0:
                    emotion_dict[emotion].add(word)
    return emotion_dict

# Load the emotion lexicon once
emotion_lexicon = load_nrc_lexicon()

# Obtain the SP_DC token
token = spotify.getToken("AQBSz33_ODmldHiskIdlUiuu4yS92-tR4swS5FeqJuHYfBNUdEs3j0mWK4Hmo9wDc3eTK6IRAE0xFoXnH94CkXHKUWvZuFLTdTxyBdOhLCtSyhZLRWM21tgwJDAI1MxFAA_dEPWMtVMxGd9GefWoL4JbtxCXlsfKvk0CF5zpBQ1m_JskXi7hHLj-wLJBlNdTlpdkeA_LZwBCZG5yQowbiE4hO8K5")

# Function to get lyrics and calculate emotion percentages
def get_emotion_percentages(song_name):
    start_time = time.time()
    emotion_counts = defaultdict(int)
    total_emotion_words = 0

    lyrics_data = spotify.getLyrics(token, songName=song_name)
    if lyrics_data['status']:
        lyrics_lines = lyrics_data['message']['lyrics']['lines']
        for line in lyrics_lines:
            words_line = line['words']
            tokens = word_tokenize(words_line.lower())
            for word in tokens:
                for emotion, words_set in emotion_lexicon.items():
                    if word in words_set:
                        emotion_counts[emotion] += 1
                        total_emotion_words += 1
        # Calculate emotion percentages
        emotion_percentages = {emotion: (count / total_emotion_words) if total_emotion_words > 0 else 0 for emotion, count in emotion_counts.items()}
    else:
        emotion_percentages = {}

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Processed song '{song_name}' in {elapsed_time:.2f} seconds.")

    return emotion_percentages

# Apply the function to each song in the dataframe
df['emotion_percentages'] = df['name'].apply(lambda x: get_emotion_percentages(x))

# Save the updated DataFrame back to the CSV file
df.to_csv('../songs/data_moods.csv', index=False)


Processed song '1999' in 1.14 seconds.
Processed song '23' in 0.92 seconds.
Processed song '9 Crimes' in 1.17 seconds.
Processed song '99 Luftballons' in 4.43 seconds.
Processed song 'A Boy Brushed Red Living In Black And White' in 12.58 seconds.
Processed song 'A Burden to Bear' in 4.73 seconds.
Processed song 'A La Plage' in 2.10 seconds.
Processed song 'A Little Less Conversation - JXL Radio Edit Remix' in 1.19 seconds.
Processed song 'A Place for My Head' in 1.40 seconds.
Processed song 'ATTACK' in 1.39 seconds.
Processed song 'Adagio For Strings' in 1.57 seconds.
Processed song 'Adjustments' in 3.87 seconds.
Processed song 'Adrift' in 1.37 seconds.
Processed song 'Afraid of Heights' in 3.20 seconds.
Processed song 'Africa' in 2.09 seconds.
Processed song 'After The Rain' in 4.34 seconds.
Processed song 'Afterlife' in 2.14 seconds.
Processed song 'Algo Rhythm' in 3.90 seconds.
Processed song 'Alison' in 1.36 seconds.
Processed song 'Alive' in 2.07 seconds.
Processed song 'All I Wan

## Lexicon 

In [22]:
import spotify_lyrics_scraper as spotify
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict
import pandas as pd
import time

# Ensure you have the necessary NLTK resources downloaded
nltk.download('punkt')

# Load the NRC Emotion Lexicon
def load_nrc_lexicon(local_path='/Users/vittoriomocchi/Documents/Vitos/Projects/Emotion-Song-Recommender-System/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'):
    emotion_dict = defaultdict(set)
    with open(local_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 3:
                word, emotion, association = parts
                if int(association) > 0:
                    emotion_dict[emotion].add(word)
    return emotion_dict

# Load the emotion lexicon once
emotion_lexicon = load_nrc_lexicon()

# Obtain the SP_DC token
token = spotify.getToken("AQBSz33_ODmldHiskIdlUiuu4yS92-tR4swS5FeqJuHYfBNUdEs3j0mWK4Hmo9wDc3eTK6IRAE0xFoXnH94CkXHKUWvZuFLTdTxyBdOhLCtSyhZLRWM21tgwJDAI1MxFAA_dEPWMtVMxGd9GefWoL4JbtxCXlsfKvk0CF5zpBQ1m_JskXi7hHLj-wLJBlNdTlpdkeA_LZwBCZG5yQowbiE4hO8K5")

# Function to get lyrics and calculate emotion percentages
def get_emotion_percentages(song_name):
    start_time = time.time()
    emotion_counts = defaultdict(int)
    total_emotion_words = 0

    lyrics_data = spotify.getLyrics(token, songName=song_name)
    if lyrics_data['status']:
        lyrics_lines = lyrics_data['message']['lyrics']['lines']
        for line in lyrics_lines:
            words_line = line['words']
            tokens = word_tokenize(words_line.lower())
            for word in tokens:
                for emotion, words_set in emotion_lexicon.items():
                    if word in words_set:
                        emotion_counts[emotion] += 1
                        total_emotion_words += 1

        # Calculate emotion percentages
        emotion_percentages = {emotion: (count / total_emotion_words) if total_emotion_words > 0 else 0 for emotion, count in emotion_counts.items()}
    else:
        # In case lyrics are not found, return 0 for each emotion
        emotion_percentages = {emotion: 0 for emotion in emotion_lexicon.keys()}

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Processed song '{song_name}' in {elapsed_time:.2f} seconds.")

    return emotion_percentages

# Apply the function and unpack the emotion percentages into new columns
emotion_columns = list(emotion_lexicon.keys())  # List of all emotion labels from the lexicon

# Initialize each emotion column with 0
for emotion in emotion_columns:
    df[emotion] = 0

# Apply the emotion percentage calculation to each song and unpack the result into columns
for index, row in df.iterrows():
    emotion_percentages = get_emotion_percentages(row['name'])
    for emotion, percentage in emotion_percentages.items():
        df.at[index, emotion] = percentage

# Save the updated DataFrame back to the CSV file
df.to_csv('../songs/data_moods.csv', index=False)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vittoriomocchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Processed song '1999' in 0.86 seconds.


  df.at[index, emotion] = percentage
  df.at[index, emotion] = percentage
  df.at[index, emotion] = percentage
  df.at[index, emotion] = percentage
  df.at[index, emotion] = percentage
  df.at[index, emotion] = percentage
  df.at[index, emotion] = percentage
  df.at[index, emotion] = percentage
  df.at[index, emotion] = percentage


Processed song '23' in 0.69 seconds.


  df.at[index, emotion] = percentage


Processed song '9 Crimes' in 0.79 seconds.
Processed song '99 Luftballons' in 0.67 seconds.
Processed song 'A Boy Brushed Red Living In Black And White' in 0.84 seconds.
Processed song 'A Burden to Bear' in 0.78 seconds.
Processed song 'A La Plage' in 0.71 seconds.
Processed song 'A Little Less Conversation - JXL Radio Edit Remix' in 0.71 seconds.
Processed song 'A Place for My Head' in 0.77 seconds.
Processed song 'ATTACK' in 0.81 seconds.
Processed song 'Adagio For Strings' in 0.78 seconds.
Processed song 'Adjustments' in 0.85 seconds.
Processed song 'Adrift' in 0.69 seconds.
Processed song 'Afraid of Heights' in 0.82 seconds.
Processed song 'Africa' in 0.77 seconds.
Processed song 'After The Rain' in 0.73 seconds.
Processed song 'Afterlife' in 0.75 seconds.
Processed song 'Algo Rhythm' in 0.81 seconds.
Processed song 'Alison' in 0.87 seconds.
Processed song 'Alive' in 0.73 seconds.
Processed song 'All I Want' in 0.78 seconds.
Processed song 'All Mirrors' in 0.83 seconds.
Processed s

## Roberta

In [14]:
import spotify_lyrics_scraper as spotify
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict
import pandas as pd
import time
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Ensure you have the necessary NLTK resources downloaded
nltk.download('punkt')

# Load the NRC Emotion Lexicon (this part is from your original approach)
def load_nrc_lexicon(local_path='/Users/vittoriomocchi/Documents/Vitos/Projects/Emotion-Song-Recommender-System/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'):
    emotion_dict = defaultdict(set)
    with open(local_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 3:
                word, emotion, association = parts
                if int(association) > 0:
                    emotion_dict[emotion].add(word)
    return emotion_dict

# Load the emotion lexicon once (optional, you may skip this if not using the lexicon-based approach)
emotion_lexicon = load_nrc_lexicon()

# Load the pre-trained Hugging Face transformer model and tokenizer
model_name = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Obtain the SP_DC token (replace with your token retrieval process)
token = spotify.getToken("AQBSz33_ODmldHiskIdlUiuu4yS92-tR4swS5FeqJuHYfBNUdEs3j0mWK4Hmo9wDc3eTK6IRAE0xFoXnH94CkXHKUWvZuFLTdTxyBdOhLCtSyhZLRWM21tgwJDAI1MxFAA_dEPWMtVMxGd9GefWoL4JbtxCXlsfKvk0CF5zpBQ1m_JskXi7hHLj-wLJBlNdTlpdkeA_LZwBCZG5yQowbiE4hO8K5")

# Function to get lyrics and calculate emotion percentages using the Hugging Face model
def get_emotion_probabilities(song_name):
    start_time = time.time()
    
    lyrics_data = spotify.getLyrics(token, songName=song_name)
    emotion_probs = {}

    if lyrics_data['status']:
        lyrics_lines = lyrics_data['message']['lyrics']['lines']
        song_lyrics = "\n".join([line['words'] for line in lyrics_lines])

        # Tokenize the input text
        inputs = tokenizer(song_lyrics, return_tensors="pt", truncation=True, padding=True)
        
        # Get the model outputs
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Extract logits and compute probabilities
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=1)[0]
        
        # Define the emotions corresponding to the model's output
        emotions = ['happiness', 'sadness', 'anger', 'disgust', 'surprise', 'fear']
        
        # Map emotions to probabilities
        emotion_probs = {emotion: float(probabilities[idx]) for idx, emotion in enumerate(emotions)}

    else:
        # If lyrics not found, return 0 for each emotion
        emotion_probs = {emotion: 0 for emotion in ['happiness', 'sadness', 'anger', 'disgust', 'surprise', 'fear']}

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Processed song '{song_name}' in {elapsed_time:.2f} seconds.")

    return emotion_probs

# Apply the function and unpack the emotion probabilities into new columns
emotion_columns = ['happiness', 'sadness', 'anger', 'disgust', 'surprise', 'fear']  # Emotions used by the model

# Initialize each emotion column with 0 in the DataFrame
for emotion in emotion_columns:
    df[emotion] = 0

# Apply the emotion calculation to each song and unpack the result into columns
for index, row in df.iterrows():
    emotion_probabilities = get_emotion_probabilities(row['name'])
    for emotion, probability in emotion_probabilities.items():
        df.at[index, emotion] = probability

# Save the updated DataFrame back to the CSV file
df.to_csv('../songs/data_moods.csv', index=False)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vittoriomocchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Processed song '1999' in 1.53 seconds.


  df.at[index, emotion] = probability
  df.at[index, emotion] = probability
  df.at[index, emotion] = probability
  df.at[index, emotion] = probability
  df.at[index, emotion] = probability
  df.at[index, emotion] = probability


Processed song '23' in 0.98 seconds.
Processed song '9 Crimes' in 0.77 seconds.
Processed song '99 Luftballons' in 0.85 seconds.
Processed song 'A Boy Brushed Red Living In Black And White' in 0.93 seconds.
Processed song 'A Burden to Bear' in 0.78 seconds.
Processed song 'A La Plage' in 0.80 seconds.
Processed song 'A Little Less Conversation - JXL Radio Edit Remix' in 1.03 seconds.
Processed song 'A Place for My Head' in 1.05 seconds.
Processed song 'ATTACK' in 0.97 seconds.
Processed song 'Adagio For Strings' in 0.86 seconds.
Processed song 'Adjustments' in 2.29 seconds.
Processed song 'Adrift' in 1.65 seconds.
Processed song 'Afraid of Heights' in 1.05 seconds.
Processed song 'Africa' in 0.82 seconds.
Processed song 'After The Rain' in 1.00 seconds.
Processed song 'Afterlife' in 0.83 seconds.
Processed song 'Algo Rhythm' in 1.45 seconds.
Processed song 'Alison' in 1.05 seconds.
Processed song 'Alive' in 0.76 seconds.
Processed song 'All I Want' in 1.03 seconds.
Processed song 'All 

## Definitive model selection

In [45]:
import spotify_lyrics_scraper as spotify
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict
import pandas as pd
import time
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Ensure you have the necessary NLTK resources downloaded
nltk.download('punkt')

# Load the NRC Emotion Lexicon
def load_nrc_lexicon(local_path='/Users/vittoriomocchi/Documents/Vitos/Projects/Emotion-Song-Recommender-System/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'):
    emotion_dict = defaultdict(set)
    with open(local_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 3:
                word, emotion, association = parts
                if int(association) > 0:
                    emotion_dict[emotion].add(word)
    return emotion_dict

# Load the emotion lexicon once (for Lexicon-based model)
emotion_lexicon = load_nrc_lexicon()

# Load the pre-trained Hugging Face transformer model and tokenizer (for RoBERTa model)
model_name = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Obtain the SP_DC token
token = spotify.getToken("AQBSz33_ODmldHiskIdlUiuu4yS92-tR4swS5FeqJuHYfBNUdEs3j0mWK4Hmo9wDc3eTK6IRAE0xFoXnH94CkXHKUWvZuFLTdTxyBdOhLCtSyhZLRWM21tgwJDAI1MxFAA_dEPWMtVMxGd9GefWoL4JbtxCXlsfKvk0CF5zpBQ1m_JskXi7hHLj-wLJBlNdTlpdkeA_LZwBCZG5yQowbiE4hO8K5")

# Function to get emotion percentages using the Lexicon-based approach (Model 2)
def get_emotion_percentages_lexicon(song_name):
    start_time = time.time()
    emotion_counts = defaultdict(int)
    total_emotion_words = 0

    lyrics_data = spotify.getLyrics(token, songName=song_name)
    if lyrics_data['status']:
        lyrics_lines = lyrics_data['message']['lyrics']['lines']
        for line in lyrics_lines:
            words_line = line['words']
            tokens = word_tokenize(words_line.lower())
            for word in tokens:
                for emotion, words_set in emotion_lexicon.items():
                    if word in words_set:
                        emotion_counts[emotion] += 1
                        total_emotion_words += 1

        # Calculate emotion percentages
        emotion_percentages = {emotion: (count / total_emotion_words) if total_emotion_words > 0 else 0 for emotion, count in emotion_counts.items()}
    else:
        # In case lyrics are not found, return 0 for each emotion
        emotion_percentages = {emotion: 0 for emotion in emotion_lexicon.keys()}

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Processed song '{song_name}' using Lexicon model in {elapsed_time:.2f} seconds.")

    return emotion_percentages

# Function to get emotion probabilities using the Hugging Face RoBERTa model (Model 1)
def get_emotion_probabilities_roberta(song_name):
    start_time = time.time()
    
    lyrics_data = spotify.getLyrics(token, songName=song_name)
    emotion_probs = {}

    if lyrics_data['status']:
        lyrics_lines = lyrics_data['message']['lyrics']['lines']
        song_lyrics = "\n".join([line['words'] for line in lyrics_lines])

        # Tokenize the input text
        inputs = tokenizer(song_lyrics, return_tensors="pt", truncation=True, padding=True)
        
        # Get the model outputs
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Extract logits and compute probabilities
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=1)[0]
        
        # Define the emotions corresponding to the model's output
        emotions = ['happiness', 'sadness', 'anger', 'disgust', 'surprise', 'fear']
        
        # Map emotions to probabilities
        emotion_probs = {emotion: float(probabilities[idx]) for idx, emotion in enumerate(emotions)}

    else:
        # If lyrics not found, return 0 for each emotion
        emotion_probs = {emotion: 0 for emotion in ['happiness', 'sadness', 'anger', 'disgust', 'surprise', 'fear']}

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Processed song '{song_name}' using RoBERTa model in {elapsed_time:.2f} seconds.")

    return emotion_probs

# Ask the user which model to use
def choose_model():
    print("Select emotion analysis model:")
    print("1. RoBERTa-based Transformer Model")
    print("2. Lexicon-based Model")
    choice = input("Enter the number of the model to use (1 or 2): ")
    return int(choice)

# Main function to apply the chosen model to the dataframe
def apply_emotion_analysis(df):
    # Ask the user which model to use
    model_choice = choose_model()

    # Define emotion columns based on the chosen model
    if model_choice == 1:
        emotion_columns = ['happiness', 'sadness', 'anger', 'disgust', 'surprise', 'fear']
    elif model_choice == 2:
        emotion_columns = list(emotion_lexicon.keys())
    else:
        print("Invalid choice. Exiting.")
        return

    # Initialize each emotion column with 0 in the DataFrame
    for emotion in emotion_columns:
        df[emotion] = 0

    # Apply the chosen model to each song
    for index, row in df.iterrows():
        song_name = row['name']
        if model_choice == 1:
            emotion_values = get_emotion_probabilities_roberta(song_name)
        elif model_choice == 2:
            emotion_values = get_emotion_percentages_lexicon(song_name)

        # Update the DataFrame with the emotion values
        for emotion, value in emotion_values.items():
            df.at[index, emotion] = value

    # Save the updated DataFrame back to the CSV file
    df.to_csv('../songs/data_moods.csv', index=False)
    print("Emotion analysis completed and saved to '../songs/data_moods.csv'")

# Assuming df is your DataFrame with song names
# Apply the emotion analysis to the DataFrame
apply_emotion_analysis(df)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vittoriomocchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Select emotion analysis model:
1. RoBERTa-based Transformer Model
2. Lexicon-based Model
Processed song '1999' using RoBERTa model in 0.78 seconds.


  df.at[index, emotion] = value
  df.at[index, emotion] = value
  df.at[index, emotion] = value
  df.at[index, emotion] = value
  df.at[index, emotion] = value
  df.at[index, emotion] = value


Processed song '23' using RoBERTa model in 0.94 seconds.
Processed song '9 Crimes' using RoBERTa model in 0.93 seconds.
Processed song '99 Luftballons' using RoBERTa model in 0.72 seconds.
Processed song 'A Boy Brushed Red Living In Black And White' using RoBERTa model in 1.01 seconds.
Processed song 'A Burden to Bear' using RoBERTa model in 0.95 seconds.
Processed song 'A La Plage' using RoBERTa model in 0.84 seconds.
Processed song 'A Little Less Conversation - JXL Radio Edit Remix' using RoBERTa model in 1.00 seconds.
Processed song 'A Place for My Head' using RoBERTa model in 1.43 seconds.
Processed song 'ATTACK' using RoBERTa model in 1.00 seconds.
Processed song 'Adagio For Strings' using RoBERTa model in 1.01 seconds.
Processed song 'Adjustments' using RoBERTa model in 1.29 seconds.
Processed song 'Adrift' using RoBERTa model in 0.98 seconds.
Processed song 'Afraid of Heights' using RoBERTa model in 1.06 seconds.
Processed song 'Africa' using RoBERTa model in 1.07 seconds.
Proce

KeyboardInterrupt: 

In [46]:
df

Unnamed: 0,name,album,artist,id,release_date,popularity,length,danceability,acousticness,energy,...,key,time_signature,mood,trust,happiness,sadness,anger,disgust,surprise,fear
0,1999,1999,Prince,2H7PHVdQ3mXqEHXcvclTB0,1982-10-27,68,379266,0.866,0.13700,0.7300,...,5,4,Happy,0.057692,0.217366,0.028740,0.488837,0.035107,0.132523,0.063581
1,23,23,Blonde Redhead,4HIwL9ii9CcXpTOTzMq0MP,2007-04-16,43,318800,0.381,0.01890,0.8320,...,8,4,Sad,0.059701,0.054792,0.042334,0.203747,0.085927,0.470925,0.044508
2,9 Crimes,9,Damien Rice,5GZEeowhvSieFDiR8fQ2im,2006-11-06,60,217946,0.346,0.91300,0.1390,...,0,4,Sad,0.030303,0.302652,0.440267,0.126672,0.004643,0.085946,0.029589
3,99 Luftballons,99 Luftballons,Nena,6HA97v4wEGQ5TUClRM0XLc,1984-08-21,2,233000,0.466,0.08900,0.4380,...,4,4,Happy,0.071429,0.080118,0.111096,0.344535,0.009089,0.300639,0.095294
4,A Boy Brushed Red Living In Black And White,They're Only Chasing Safety,Underoath,47IWLfIKOKhFnz1FUEUIkE,2004-01-01,60,268000,0.419,0.00171,0.9320,...,1,4,Energetic,0.101010,0.002782,0.000988,0.986736,0.000757,0.003369,0.003209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,windcatcher,windcatcher,Leo Nocta,59VApBbrS2IADQk4ml5mdo,2020-06-19,36,123066,0.402,0.96100,0.2360,...,0,3,Calm,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
682,yellow is the color of her eyes,yellow is the color of her eyes,Soccer Mommy,4D3nttJPU6L0M2epr7sId6,2019-11-19,5,435080,0.452,0.75700,0.5150,...,11,4,Sad,0.166667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
683,you broke me first,you broke me first,Tate McRae,45bE4HXI0AwGZXfZtMp8JR,2020-04-17,87,169265,0.642,0.78600,0.3740,...,4,4,Sad,0.014493,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
684,you were good to me,brent,Jeremy Zucker,4CxFN5zON70B3VOPBYbd6P,2019-05-03,76,219146,0.561,0.91300,0.0848,...,2,4,Sad,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [47]:
moods= {'Sad' : 0, 'Calm' : 1, 'Energetic' : 2, 'Happy' : 3}
df['mood'] = df['mood'].apply(lambda x: moods[x])
df['mood'].value_counts()

mood
0    197
1    195
2    154
3    140
Name: count, dtype: int64

In [48]:
df['release_date'] = pd.to_datetime(df['release_date'], format='mixed')
#df['release_year'] = df['release_date'].dt.year
#df['release_month']= df['release_date'].dt.month
#df['release_day']= df['release_date'].dt.day

X = df.drop(['mood','id','name','album','artist','release_date'],axis=1)
y = df['mood']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X

Unnamed: 0,popularity,length,danceability,acousticness,energy,instrumentalness,liveness,valence,loudness,speechiness,tempo,key,time_signature,trust,happiness,sadness,anger,disgust,surprise,fear
0,68,379266,0.866,0.13700,0.7300,0.000000,0.0843,0.6250,-8.201,0.0767,118.523,5,4,0.057692,0.217366,0.028740,0.488837,0.035107,0.132523,0.063581
1,43,318800,0.381,0.01890,0.8320,0.196000,0.1530,0.1660,-5.069,0.0492,120.255,8,4,0.059701,0.054792,0.042334,0.203747,0.085927,0.470925,0.044508
2,60,217946,0.346,0.91300,0.1390,0.000077,0.0934,0.1160,-15.326,0.0321,136.168,0,4,0.030303,0.302652,0.440267,0.126672,0.004643,0.085946,0.029589
3,2,233000,0.466,0.08900,0.4380,0.000006,0.1130,0.5870,-12.858,0.0608,193.100,4,4,0.071429,0.080118,0.111096,0.344535,0.009089,0.300639,0.095294
4,60,268000,0.419,0.00171,0.9320,0.000000,0.1370,0.4450,-3.604,0.1060,169.881,1,4,0.101010,0.002782,0.000988,0.986736,0.000757,0.003369,0.003209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,36,123066,0.402,0.96100,0.2360,0.919000,0.0921,0.1460,-20.615,0.0603,129.736,0,3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
682,5,435080,0.452,0.75700,0.5150,0.120000,0.1400,0.1910,-7.351,0.0255,80.537,11,4,0.166667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
683,87,169265,0.642,0.78600,0.3740,0.000000,0.0906,0.0799,-9.386,0.0545,124.099,4,4,0.014493,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
684,76,219146,0.561,0.91300,0.0848,0.000026,0.1120,0.2060,-15.099,0.0404,102.128,2,4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [49]:
print(X_train.shape)
print(X_test.shape)

(548, 20)
(138, 20)


In [50]:
scaler = StandardScaler()
train_scaled = scaler.fit_transform(X_train)

nn = MLPClassifier(max_iter = 150000, alpha=1.0, hidden_layer_sizes=8)
scores = cross_val_score(nn, train_scaled, y_train, cv=5)
print ("cv score: " + str(scores.mean()))

hyper_opt = False
if hyper_opt:
    params = {"alpha": np.logspace(-4, 2, 7), 'hidden_layer_sizes': [1, 2, 5, 10, 20, 50, 100]}
    clf = GridSearchCV(nn, params)
    clf.fit(train_scaled, y_train)
    print("hyperparam optimized score : " + str(clf.best_score_))

cv score: 0.8211342785654713


In [51]:
# Ensure clf is defined by running the cell where GridSearchCV is used
hyper_opt = True
if hyper_opt:
    params = {"alpha": np.logspace(-4, 2, 7), 'hidden_layer_sizes': [1, 2, 5, 10, 20, 50, 100]}
    clf = GridSearchCV(nn, params)
    clf.fit(train_scaled, y_train)
    print("hyperparam optimized score : " + str(clf.best_score_))

# Now you can access clf.best_estimator_
clf.best_estimator_

hyperparam optimized score : 0.8248373644703921


In [52]:
clf.best_params_

{'alpha': 1.0, 'hidden_layer_sizes': 50}

In [53]:
results = cross_validate(nn, train_scaled, y_train, cv=10, return_train_score=True, return_estimator=True)
results


{'fit_time': array([0.23274016, 0.20808339, 0.18424702, 0.20559311, 0.18853593,
        0.23582006, 0.17263103, 0.23323703, 0.20696592, 0.20093107]),
 'score_time': array([0.00054407, 0.00032473, 0.00036979, 0.00035572, 0.00042605,
        0.00038505, 0.00031114, 0.00034404, 0.00034118, 0.00033092]),
 'estimator': [MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=150000),
  MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=150000),
  MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=150000),
  MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=150000),
  MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=150000),
  MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=150000),
  MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=150000),
  MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=150000),
  MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=150000),
  MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=150000)],
 'test_score':

In [54]:
nn = MLPClassifier(hidden_layer_sizes=8, max_iter=150000, alpha=1.0)
nn.fit(train_scaled, y_train)
test_preds = nn.predict(scaler.transform(X_test))
accuracy_score(test_preds, y_test)

0.8333333333333334

In [55]:
# safe  model
joblib.dump(nn, "./neural_network.joblib")

['./neural_network.joblib']

In [56]:
nn.predict(scaler.transform(X_test))

array([1, 2, 2, 2, 3, 0, 1, 2, 1, 0, 3, 0, 1, 2, 0, 0, 1, 3, 3, 1, 0, 0,
       2, 2, 3, 0, 0, 1, 0, 2, 0, 0, 2, 3, 3, 3, 1, 1, 0, 1, 0, 2, 1, 0,
       2, 1, 2, 2, 2, 0, 0, 2, 3, 2, 0, 1, 1, 3, 1, 2, 0, 1, 0, 1, 1, 3,
       2, 0, 0, 3, 2, 0, 2, 3, 0, 1, 0, 2, 1, 0, 0, 1, 0, 0, 0, 3, 0, 2,
       0, 1, 3, 3, 0, 2, 0, 2, 2, 3, 0, 1, 0, 3, 1, 0, 3, 1, 2, 2, 1, 1,
       1, 1, 1, 1, 2, 1, 2, 2, 0, 3, 0, 2, 2, 0, 1, 1, 1, 3, 3, 2, 0, 1,
       3, 1, 1, 0, 1, 2])

## Lets see if we can leverage the accuracy with more data

### Ger more data

In [57]:
keys = df.keys()
keys =keys.to_list()
#keys.remove('mood')

sad_songs = pd.read_csv('../songs/very_sadSongs.csv',index_col=None)
calm_songs = pd.read_csv('../songs/very_calmSongs.csv',index_col=None)
energetic_songs = pd.read_csv('../songs/very_energeticSongs.csv',index_col=None)
happy_songs = pd.read_csv('../songs/very_happySongs.csv',index_col=None)

df_existing = df.copy()

new_songs = [sad_songs, calm_songs, energetic_songs, happy_songs]
moods = [0,1,2,3]

for i in range(len(new_songs)):
    new_df = new_songs[i]
    new_df = new_df.reset_index(drop=True)
    new_df['mood'] = moods[i]
    df_existing = pd.concat([df_existing,new_df],ignore_index=True)
    

df_existing = df_existing.drop(['id','name','album','artist','release_date','Unnamed: 0'],axis=1)



In [37]:
X = df_existing.drop(['mood'],axis=1)
y = df_existing['mood']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
scaler = StandardScaler()
train_scaled = scaler.fit_transform(X_train)

nn = MLPClassifier(max_iter = 150000, alpha=1.0, hidden_layer_sizes=8)
scores = cross_val_score(nn, train_scaled, y_train, cv=5)
print ("cv score: " + str(scores.mean()))

hyper_opt = True
if hyper_opt:
    params = {"alpha": np.logspace(-4, 2, 7), 'hidden_layer_sizes': [1, 2, 5, 10, 20, 50, 100]}
    clf = GridSearchCV(nn, params)
    clf.fit(train_scaled, y_train)
    print("hyperparam optimized score : " + str(clf.best_score_))

NameError: name 'SimpleImputer' is not defined

In [73]:
clf.best_estimator_

MLPClassifier(alpha=1.0, hidden_layer_sizes=100, max_iter=15000)

In [74]:
clf.best_params_

{'alpha': 1.0, 'hidden_layer_sizes': 100}

In [75]:
results = cross_validate(nn, train_scaled, y_train, cv=10, return_train_score=True, return_estimator=True)
results

{'fit_time': array([1.01678324, 0.796242  , 0.98170805, 1.02307606, 0.89058304,
        1.00307989, 0.88434219, 0.93762207, 1.02510095, 1.03690481]),
 'score_time': array([0.00037789, 0.00033998, 0.0003109 , 0.00045419, 0.00036883,
        0.00032592, 0.00034094, 0.00030589, 0.00031996, 0.00033903]),
 'estimator': [MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=15000),
  MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=15000),
  MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=15000),
  MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=15000),
  MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=15000),
  MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=15000),
  MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=15000),
  MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=15000),
  MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=15000),
  MLPClassifier(alpha=1.0, hidden_layer_sizes=8, max_iter=15000)],
 'test_score': array([0.

In [89]:
nn = MLPClassifier(hidden_layer_sizes=100, max_iter=15000, alpha=1.0)
nn.fit(train_scaled, y_train)
test_preds = nn.predict(scaler.transform(X_test))
accuracy_score(test_preds, y_test)

0.8564814814814815

In [91]:
nn2 = MLPClassifier(hidden_layer_sizes=100, max_iter=15000, alpha=1.0)
nn2.fit(train_scaled, y_train)
test_preds = nn.predict(scaler.transform(X_test))
accuracy_score(test_preds, y_test)

0.8564814814814815

In [90]:
# safe  model
joblib.dump(nn, "./neural_network_86.joblib")

['./neural_network_86.joblib']