In [5]:
import sys

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
import pandas as pd
import contractions
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.ensemble import VotingClassifier
from sklearn.impute import SimpleImputer

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\melih\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\melih\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\melih\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\melih\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
data=pd.read_csv('D:/Python_Projects/spotivibe_exp/data.csv')
audio_features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
print(data.shape)

(14643, 27)


In [5]:
# Initialize the scaler
scaler = StandardScaler()

# Scale the audio features
data[audio_features] = scaler.fit_transform(data[audio_features])

# Group the data by label (assuming 'mood' is the label column) and calculate the mean of each feature
mean_values_per_label = data.groupby('mood_cats')[audio_features].mean()

# Display the mean values for each label
print(mean_values_per_label)

           acousticness  danceability    energy  instrumentalness  liveness  \
mood_cats                                                                     
0              0.008902      0.114597  0.032946         -0.107782  0.009064   
1              0.071381     -0.075932 -0.126077          0.052655 -0.012502   
2              0.160174      0.039954 -0.201839          0.056050 -0.067977   
3             -0.267407     -0.091364  0.326379          0.012385  0.074780   

           loudness  speechiness     tempo   valence  
mood_cats                                             
0          0.062197    -0.041869 -0.000305  0.203921  
1         -0.089532    -0.031800 -0.014809 -0.148878  
2         -0.148095    -0.045095 -0.052108 -0.060003  
3          0.187450     0.139999  0.073110 -0.009197  


In [7]:
data[audio_features].describe().round(2)

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
count,14613.0,14613.0,14613.0,14613.0,14613.0,14613.0,14613.0,14613.0,14613.0
mean,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-0.87,-3.19,-2.3,-0.6,-1.19,-6.95,-0.9,-4.03,-1.79
25%,-0.84,-0.66,-0.69,-0.6,-0.59,-0.33,-0.6,-0.82,-0.83
50%,-0.49,0.09,0.1,-0.6,-0.41,0.24,-0.47,-0.04,-0.11
75%,0.73,0.73,0.81,0.28,0.31,0.65,0.09,0.65,0.76
max,2.15,2.19,1.61,2.29,5.45,1.98,6.88,3.21,2.19


In [19]:
# Step 1: Convert lyrics into TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_features = tfidf_vectorizer.fit_transform(data['cleaned_lyrics']).toarray()

# Step 2: Scale your audio features to 0-1 range using MinMaxScaler
scaler = MinMaxScaler()
audio_features = data[['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 
                     'loudness', 'speechiness', 'tempo', 'valence']]

# Handle missing or constant values (fill missing with 0, and check variance)
audio_features = audio_features.fillna(0)

# Scale the audio features
scaled_audio_features = scaler.fit_transform(audio_features)

# Step 3: Correlation Analysis
# Compute correlation between each audio feature and the mean of TF-IDF vectors
tfidf_mean = np.mean(tfidf_features, axis=1)

# Check if TF-IDF mean has constant values (if so, correlation is not meaningful)
if np.std(tfidf_mean) == 0:
    print("TF-IDF mean vector has constant values. Correlation may not be meaningful.")
else:
    # Calculate correlation, handle constant columns
    correlations = pd.DataFrame(scaled_audio_features, columns=audio_features.columns).apply(
        lambda col: np.corrcoef(col, tfidf_mean)[0, 1] if np.std(col) > 0 else np.nan
    )

# Display correlations
print(correlations)

acousticness       -0.005992
danceability        0.007543
energy             -0.001556
instrumentalness   -0.055199
liveness            0.006302
loudness            0.023503
speechiness         0.039845
tempo              -0.024381
valence            -0.012659
dtype: float64


In [34]:
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_lyrics'], data['mood_cats'], test_size=0.2, random_state=42)

# Step 1: Process text-based features (TF-IDF for lyrics)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=200)  # Example using 1-grams and 2-grams
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Step 2: Process audio features (scaling them between 0 and 1)
audio_features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
scaler = MinMaxScaler()

# Scale the audio features
X_train_audio = scaler.fit_transform(data.loc[X_train.index, audio_features])
X_test_audio = scaler.transform(data.loc[X_test.index, audio_features])


audio_weight = 10  # Increase this factor to give more weight to audio features
X_train_combined = np.hstack([X_train_tfidf.toarray(), X_train_audio * audio_weight])
X_test_combined = np.hstack([X_test_tfidf.toarray(), X_test_audio * audio_weight])

# Step 1: Find the rows with NaN values in training and test sets
nan_rows_train = np.isnan(X_train_combined).any(axis=1)
nan_rows_test = np.isnan(X_test_combined).any(axis=1)

# Step 2: Drop the rows with NaN values in the audio features
X_train = X_train_combined[~nan_rows_train]
X_test= X_test_combined[~nan_rows_test]

# Also drop corresponding rows in y_train and y_test
y_train = y_train[~nan_rows_train]
y_test = y_test[~nan_rows_test]

# Display the shapes after dropping NaN rows
print(f"Shape of X_train before removing NaNs: {X_train_combined.shape}")
print(f"Shape of X_train after removing NaNs: {X_train.shape}")


# Step 4: Train the SVM model on the combined feature set

svc_model = SVC(kernel='rbf', C=10, gamma=1)  # Adjust parameters as needed
svc_model.fit(X_train, y_train)

# Step 5: Make predictions and evaluate the model
y_pred = svc_model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Shape of X_train before removing NaNs: (11714, 209)
Shape of X_train after removing NaNs: (11688, 209)
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.44      0.49       840
           1       0.48      0.68      0.57       779
           2       0.98      0.82      0.89       630
           3       0.99      0.94      0.96       676

    accuracy                           0.70      2925
   macro avg       0.75      0.72      0.73      2925
weighted avg       0.73      0.70      0.71      2925



In [6]:
# Step 1: Split the dataset for text (TF-IDF) features
X_train_text, X_test_text, y_train, y_test = train_test_split(data['cleaned_lyrics'], data['mood_cats'], test_size=0.2, random_state=42)

# Step 2: Vectorize text using TF-IDF
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)

# Step 3: Prepare and split audio features
audio_features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 
                  'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
audio_features_data = data[audio_features]

# Impute missing values with the mean of each column
imputer = SimpleImputer(strategy='mean')
audio_features_data_imputed = imputer.fit_transform(audio_features_data)

# Split the imputed audio features and the labels
X_train_audio, X_test_audio, y_train_audio, y_test_audio = train_test_split(
    audio_features_data_imputed, data['mood_cats'], test_size=0.2, random_state=42
)

# Step 4: Train separate models
# Model 1: SVM on TF-IDF features
svc_text = SVC(kernel='rbf', C=10, gamma=1, probability=True)  # Use probability=True for voting
svc_text.fit(X_train_tfidf, y_train)

# Model 2: SVM on audio features
svc_audio = SVC(kernel='rbf', C=10, gamma=1, probability=True)  # Use probability=True for voting
svc_audio.fit(X_train_audio, y_train)

# Step 5: Voting Classifier (Soft Voting)
voting_clf = VotingClassifier(estimators=[
    ('svc_text', svc_text),
    ('svc_audio', svc_audio)
], voting='soft')

# Combine text (TF-IDF) and audio features
X_train_combined = np.hstack([X_train_tfidf.toarray(), X_train_audio])
X_test_combined = np.hstack([X_test_tfidf.toarray(), X_test_audio])

# Train the ensemble model
voting_clf.fit(X_train_combined, y_train)

# Step 6: Make predictions and evaluate
y_pred = voting_clf.predict(X_test_combined)

print("Ensemble Model Classification Report:")
print(classification_report(y_test, y_pred))

Ensemble Model Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.47      0.49       842
           1       0.46      0.60      0.52       779
           2       0.99      0.83      0.90       632
           3       1.00      0.94      0.97       676

    accuracy                           0.69      2929
   macro avg       0.74      0.71      0.72      2929
weighted avg       0.72      0.69      0.70      2929

