In [2]:
import pandas as pd
import statsmodels.formula.api as smf

# Load the data
data = pd.read_csv('ratings.csv')

# Display the first few rows of the dataframe to understand its structure
data.head()


Unnamed: 0,track_id,danceability,energy,key_,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,artist,album,duration_ms.1,popularity,explicit,url,track_id.1,user_id,rating,date
0,01KurZH1ejoarO8tp3wXGo,0.495,0.731,5,-8.32,1,0.0276,0.00512,0.0471,0.0979,...,Franz Ferdinand,Franz Ferdinand,259773.0,44,0,https://open.spotify.com/track/01KurZH1ejoarO8...,01KurZH1ejoarO8tp3wXGo,31uqhjwtpiiaay5ripzqwrdjw7ga,2.0,2024-03-12 20:27:08
1,01tykAEjuZsAX5I1y0FLf8,0.382,0.855,9,-8.068,1,0.038,0.000147,0.745,0.106,...,Avatar,Construction of Souls,292735.0,40,0,https://open.spotify.com/track/01tykAEjuZsAX5I...,01tykAEjuZsAX5I1y0FLf8,31sk54n54vp7yoro5vkuuhi66gx4,5.0,2024-03-12 20:27:08
2,02tKPhHyLttuqmBEUTlmw2,0.237,0.783,0,-6.806,1,0.0284,0.00217,0.569,0.489,...,Youth Lagoon,Wondrous Bughouse,232266.0,22,0,https://open.spotify.com/track/02tKPhHyLttuqmB...,02tKPhHyLttuqmBEUTlmw2,31uqhjwtpiiaay5ripzqwrdjw7ga,6.0,2024-03-12 20:27:08
3,03GBiorLGTk7T4IScIl4jj,0.703,0.443,0,-9.601,0,0.0862,0.0499,0.0,0.338,...,SIX,Six: The Musical (Studio Cast Recording),350218.0,67,0,https://open.spotify.com/track/03GBiorLGTk7T4I...,03GBiorLGTk7T4IScIl4jj,yanniquevanmegen,10.0,2024-03-12 20:27:08
4,03HZaJepBONLxbgWTsV7AK,0.479,0.187,8,-15.742,1,0.0343,0.907,0.871,0.138,...,The Bob Davis Group,Easy Living,203500.0,45,0,https://open.spotify.com/track/03HZaJepBONLxbg...,03HZaJepBONLxbgWTsV7AK,1166187914,6.0,2024-03-12 20:27:08


In [3]:
# Since we have two columns with duration in ms and one is empty for the full csv we can safely drop this column from the dataset because the second column with duration is usable.

data2 = data.drop(columns=['duration_ms'])
data2.head()


Unnamed: 0,track_id,danceability,energy,key_,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,artist,album,duration_ms.1,popularity,explicit,url,track_id.1,user_id,rating,date
0,01KurZH1ejoarO8tp3wXGo,0.495,0.731,5,-8.32,1,0.0276,0.00512,0.0471,0.0979,...,Franz Ferdinand,Franz Ferdinand,259773.0,44,0,https://open.spotify.com/track/01KurZH1ejoarO8...,01KurZH1ejoarO8tp3wXGo,31uqhjwtpiiaay5ripzqwrdjw7ga,2.0,2024-03-12 20:27:08
1,01tykAEjuZsAX5I1y0FLf8,0.382,0.855,9,-8.068,1,0.038,0.000147,0.745,0.106,...,Avatar,Construction of Souls,292735.0,40,0,https://open.spotify.com/track/01tykAEjuZsAX5I...,01tykAEjuZsAX5I1y0FLf8,31sk54n54vp7yoro5vkuuhi66gx4,5.0,2024-03-12 20:27:08
2,02tKPhHyLttuqmBEUTlmw2,0.237,0.783,0,-6.806,1,0.0284,0.00217,0.569,0.489,...,Youth Lagoon,Wondrous Bughouse,232266.0,22,0,https://open.spotify.com/track/02tKPhHyLttuqmB...,02tKPhHyLttuqmBEUTlmw2,31uqhjwtpiiaay5ripzqwrdjw7ga,6.0,2024-03-12 20:27:08
3,03GBiorLGTk7T4IScIl4jj,0.703,0.443,0,-9.601,0,0.0862,0.0499,0.0,0.338,...,SIX,Six: The Musical (Studio Cast Recording),350218.0,67,0,https://open.spotify.com/track/03GBiorLGTk7T4I...,03GBiorLGTk7T4IScIl4jj,yanniquevanmegen,10.0,2024-03-12 20:27:08
4,03HZaJepBONLxbgWTsV7AK,0.479,0.187,8,-15.742,1,0.0343,0.907,0.871,0.138,...,The Bob Davis Group,Easy Living,203500.0,45,0,https://open.spotify.com/track/03HZaJepBONLxbg...,03HZaJepBONLxbgWTsV7AK,1166187914,6.0,2024-03-12 20:27:08


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Dropping unnecessary columns which won't effect the rating of a song
columns_to_drop = ['track_id', 'artist', 'album', 'url', 'track_id.1', 'date', 'user_id', 'id', 'name']
data3 = data2.drop(columns=columns_to_drop)

# Checking for missing values in the columns after cleaning part of the data
missing_values = data3.isnull().sum()




In [5]:
# Cleaning up the missing values in the rating column. Those aren't usable for the model.
data4 = data3.dropna(subset=['rating'])
data4.head()


Unnamed: 0,danceability,energy,key_,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration_ms.1,popularity,explicit,rating
0,0.495,0.731,5,-8.32,1,0.0276,0.00512,0.0471,0.0979,0.384,125.568,4,259773.0,44,0,2.0
1,0.382,0.855,9,-8.068,1,0.038,0.000147,0.745,0.106,0.55,136.138,4,292735.0,40,0,5.0
2,0.237,0.783,0,-6.806,1,0.0284,0.00217,0.569,0.489,0.565,170.768,3,232266.0,22,0,6.0
3,0.703,0.443,0,-9.601,0,0.0862,0.0499,0.0,0.338,0.367,136.647,4,350218.0,67,0,10.0
4,0.479,0.187,8,-15.742,1,0.0343,0.907,0.871,0.138,0.156,119.415,4,203500.0,45,0,6.0


In [6]:
# Double check on missing values
data4.isnull().sum()

danceability         0
energy               0
key_                 0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
time_signature       0
duration_ms.1       11
popularity           0
explicit             0
rating               0
dtype: int64

In [7]:
# Simple regression model to check if the duration of the song has a significant enough impact on the rating. 
model1 = smf.ols('rating ~ Q("duration_ms.1")', data=data4).fit()
print(model1.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.033
Model:                            OLS   Adj. R-squared:                  0.032
Method:                 Least Squares   F-statistic:                     22.38
Date:                Fri, 29 Mar 2024   Prob (F-statistic):           2.74e-06
Time:                        08:58:05   Log-Likelihood:                -1325.2
No. Observations:                 650   AIC:                             2654.
Df Residuals:                     648   BIC:                             2663.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept              5.8279      0

In [8]:
# Because we give little snippets of songs to rate, the entire duration is not of any value to the rating. Thus we drop it from the data.

data_cleaned = data4.drop(columns=['duration_ms.1'])
data_cleaned.head()

Unnamed: 0,danceability,energy,key_,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,popularity,explicit,rating
0,0.495,0.731,5,-8.32,1,0.0276,0.00512,0.0471,0.0979,0.384,125.568,4,44,0,2.0
1,0.382,0.855,9,-8.068,1,0.038,0.000147,0.745,0.106,0.55,136.138,4,40,0,5.0
2,0.237,0.783,0,-6.806,1,0.0284,0.00217,0.569,0.489,0.565,170.768,3,22,0,6.0
3,0.703,0.443,0,-9.601,0,0.0862,0.0499,0.0,0.338,0.367,136.647,4,67,0,10.0
4,0.479,0.187,8,-15.742,1,0.0343,0.907,0.871,0.138,0.156,119.415,4,45,0,6.0


In [9]:
# Splitting the dataset into features (X) and target (y)
X = data_cleaned.drop('rating', axis=1)
y = data_cleaned['rating']

# Splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape

((528, 14), (133, 14))

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 1: Training the KNN Model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Step 2: Predicting and Evaluating the Model
# Predict the responses for the test dataset
y_pred = knn.predict(X_test)

# Evaluating the Model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.18796992481203006

Confusion Matrix:
 [[ 0  0  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0]
 [ 0  0  1  0  0  4  0  1  1  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  1  2  3  2  1  0]
 [ 0  2  1  3  2  8 13  7  4  0]
 [ 0  0  1  3  2  6  9  4  2  2]
 [ 3  1  0  0  2  7  7  5  3  0]
 [ 0  1  0  0  0  1  5  2  1  0]
 [ 0  0  1  0  1  2  1  2  1  0]]

Classification Report:
               precision    recall  f1-score   support

         1.0       0.00      0.00      0.00         1
         2.0       0.00      0.00      0.00         1
         3.0       0.25      0.14      0.18         7
         4.0       0.00      0.00      0.00         0
         5.0       0.11      0.11      0.11         9
         6.0       0.27      0.20      0.23        40
         7.0       0.23      0.31      0.26        29
         8.0       0.22      0.18      0.20        28
         9.0       0.08      0.10      0.09        10
        10.0       0.00      0.00      0.00         8

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


We created an app in which we generate song recommendations for the users of the app. This way the users can come across new songs to add to their playlists on Spotify.
The recommendations are based upon scores they gave to their own songs from their own playlist so our model can look for songs with the same features so the model makes valid recommendations.

We have made a KNN model to try and predict the score that people with give to certain songs. We have dont his with all scores and not on a personal basis.
This model had a accuracy of 18.8%. Our analysis of this is that every person has a different taste in music so predicting people's ratings based on the ratings of people with other tastes
is pretty much impossible. It might be more doable and more accurate if we made a KNN model for every single user of the app.
