In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [9]:
# Load the Spotify tracks dataset; the CSV's first column is used as the row index.
spotify_data = pd.read_csv("../datasets/spotify-track.csv", index_col=0)

In [10]:
# Preview the first rows to sanity-check the columns that were loaded.
spotify_data.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [11]:
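# Optional (disabled): export the first 2,000 rows as a small sample file for quick experiments.
# NOTE(review): `df` is not defined anywhere in this notebook; presumably `spotify_data` was intended - confirm before enabling.
# df.head(2000).to_csv("../datasets/spotify-track-sample.csv")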

In [12]:
# Popularity score at or above which a track is labelled a "hit"
HIT_POPULARITY_THRESHOLD = 60

# Create a binary 'hit' column (1 = hit, 0 = non-hit) based on the popularity threshold.
# A vectorised comparison replaces the per-row Python lambda; the result is cast to
# int64 so the column keeps the same integer dtype the lambda version produced.
spotify_data['hit'] = (spotify_data['popularity'] >= HIT_POPULARITY_THRESHOLD).astype('int64')

# Check the distribution of "hit" and "non-hit" tracks
hit_distribution = spotify_data['hit'].value_counts()

hit_distribution

0    99178
1    14822
Name: hit, dtype: int64

In [13]:
# Split the tracks by label once so the filters are not repeated below
hit_tracks = spotify_data[spotify_data['hit'] == 1]
non_hit_tracks = spotify_data[spotify_data['hit'] == 0]

# Randomly downsample the "non-hit" tracks to balance the dataset.
# The sample size is taken from the actual number of hits rather than a hard-coded
# count, so the classes stay balanced if the threshold or the data changes.
non_hit_sample = non_hit_tracks.sample(n=len(hit_tracks), random_state=42)

# Concatenate the "hit" tracks with the sampled "non-hit" tracks
balanced_data = pd.concat([hit_tracks, non_hit_sample])

# Shuffle the rows to randomize the data (fixed seed keeps the order reproducible)
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the distribution of "hit" and "non-hit" tracks in the balanced dataset
balanced_hit_distribution = balanced_data['hit'].value_counts()

balanced_hit_distribution

1    14822
0    14822
Name: hit, dtype: int64

In [14]:
# Model inputs: the numeric/boolean track attributes.
# Identifier and text columns ('track_id', 'artists', 'album_name', 'track_name',
# 'track_genre') are left out, and 'popularity' is left out because the 'hit'
# label is derived from it.
features = [
    'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness',
    'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo', 'time_signature',
]

X = balanced_data[features]
y = balanced_data['hit']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features: the scaler is fitted on the training split only and
# then applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled[:5]  # First 5 rows of the scaled training data, as a quick check



array([[-0.59909916, -0.33958458,  1.68841074,  0.46805729,  0.75221855,
         0.28008585, -1.3048234 , -0.29612749, -0.31801468, -0.43691582,
        -0.38362004,  0.30983475,  0.01266548,  0.2160241 ],
       [-0.17114762, -0.33958458,  0.74897965, -1.12578158,  0.47128715,
        -0.22949383,  0.76638724, -0.44836393,  1.33876538,  2.47741955,
         0.53038247,  0.29781387, -0.05402852,  0.2160241 ],
       [ 1.1196264 , -0.33958458, -1.04716534, -0.74392435,  1.31408135,
        -0.109022  , -1.3048234 , -0.44836393,  0.86897897, -0.44029198,
        -0.65957848,  0.10948682, -0.01608252,  0.2160241 ],
       [-1.22453372, -0.33958458,  0.18768434, -1.60310311, -1.49523265,
        -0.97577235,  0.76638724, -0.50400897,  1.83987088, -0.38806746,
        -0.55938975, -1.25287907,  0.18471226,  0.2160241 ],
       [-0.21440479, -0.33958458, -2.95143352, -2.20494331,  1.31408135,
        -2.39455052,  0.76638724, -0.45046374,  1.95575153,  2.63591286,
        -0.48322287, -1.75

In [18]:
class CustomLogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float, default=0.01
        Step size applied to every gradient update.
    num_iterations : int, default=1000
        Number of gradient-descent updates performed by ``fit``.

    Attributes
    ----------
    weights : numpy.ndarray or None
        One coefficient per feature; ``None`` until ``fit`` has been called.
    bias : float or None
        Intercept term; ``None`` until ``fit`` has been called.
    """

    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` of ``z``.

        The value is computed from ``exp(-|z|)``, which always lies in (0, 1],
        so large-magnitude inputs cannot overflow the exponential.

        Parameters
        ----------
        z : float or array-like
            Linear scores.

        Returns
        -------
        float or numpy.ndarray
            Values in [0, 1], with the same shape as ``z`` (a scalar for scalar input).
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))
        # For z >= 0: 1 / (1 + exp(-z)); for z < 0 the algebraically equal
        # exp(z) / (1 + exp(z)). Both branches only ever see `decay` <= 1.
        result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
        # Indexing with () unwraps a 0-d result to a scalar and leaves n-d arrays as they are.
        return result[()]

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` from the training data.

        Parameters
        ----------
        X : array-like of shape (num_samples, num_features)
            Training features.
        y : array-like of shape (num_samples,) or (num_samples, 1)
            Binary targets (0 or 1).

        Returns
        -------
        CustomLogisticRegression
            The fitted instance, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``len(y)``.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so a column-vector target cannot broadcast against the predictions
        y = np.asarray(y, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (num_samples, num_features)")
        num_samples, num_features = X.shape
        if num_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != num_samples:
            raise ValueError("X and y must contain the same number of samples")

        # 1. Initialize weights and bias
        self.weights = np.zeros(num_features)
        self.bias = 0.0

        # Loop-invariant scaling factor for the averaged gradients
        inv_num_samples = 1 / num_samples

        # 2. Gradient descent
        for _ in range(self.num_iterations):
            # Model prediction
            predictions = self.sigmoid(np.dot(X, self.weights) + self.bias)
            residual = predictions - y

            # Gradients of the mean log-loss with respect to weights and bias
            dw = inv_num_samples * np.dot(X.T, residual)
            db = inv_num_samples * np.sum(residual)

            # Update weights and bias
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

        return self

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for each row of ``X``.

        Parameters
        ----------
        X : array-like of shape (num_samples, num_features)
            Samples to score.

        Returns
        -------
        numpy.ndarray
            Probabilities in [0, 1], one per sample.
        """
        X = np.asarray(X, dtype=float)
        linear_model = np.dot(X, self.weights) + self.bias
        return self.sigmoid(linear_model)

    def predict(self, X, threshold=0.5):
        """Return hard 0/1 class labels for each row of ``X``.

        Parameters
        ----------
        X : array-like of shape (num_samples, num_features)
            Samples to classify.
        threshold : float, default=0.5
            A sample is labelled 1 when its probability is strictly greater
            than this value, otherwise 0.

        Returns
        -------
        list of int
            Predicted labels, one per sample.
        """
        probabilities = np.atleast_1d(self.predict_proba(X))
        # Vectorised thresholding; tolist() keeps the plain-list return type
        return np.where(probabilities > threshold, 1, 0).tolist()

In [19]:
# Fit the from-scratch logistic regression on the standardised training features
custom_logreg = CustomLogisticRegression(num_iterations=1000, learning_rate=0.01)
custom_logreg.fit(X_train_scaled, y_train)

# Label the held-out test rows
custom_y_pred = custom_logreg.predict(X_test_scaled)

# Score the predictions with sklearn's metrics: overall accuracy first ...
custom_accuracy = accuracy_score(y_test, custom_y_pred)
print(custom_accuracy)

# ... then the per-class precision / recall / F1 report
custom_classification_rep = classification_report(y_test, custom_y_pred)
print(custom_classification_rep)

0.6068476977567887
              precision    recall  f1-score   support

           0       0.64      0.51      0.57      2982
           1       0.59      0.71      0.64      2947

    accuracy                           0.61      5929
   macro avg       0.61      0.61      0.60      5929
weighted avg       0.61      0.61      0.60      5929



In [16]:
# Fit scikit-learn's logistic regression on the same standardised training
# features, as a reference for the from-scratch model
logreg = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Label the held-out test rows
y_pred = logreg.predict(X_test_scaled)

# Overall accuracy first ...
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

# ... then the per-class precision / recall / F1 report
classification_rep = classification_report(y_test, y_pred)
print(classification_rep)

0.6070163602631136
              precision    recall  f1-score   support

           0       0.63      0.52      0.57      2982
           1       0.59      0.69      0.64      2947

    accuracy                           0.61      5929
   macro avg       0.61      0.61      0.60      5929
weighted avg       0.61      0.61      0.60      5929

