In [25]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.metrics import classification_report,accuracy_score


Read csv and explore contents

In [2]:
df = pd.read_csv('datasets\spotify_songs.csv')

In [3]:
df.isnull().sum()

danceability         3662
energy              11261
loudness             7620
speechiness          2361
acousticness          894
instrumentalness     6470
liveness             2986
valence              7489
tempo                6557
genre                   0
dtype: int64

In [4]:
df.sample(15)

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
1468,0.77,,-6.724,0.131,0.169,0.0,0.0676,0.492,105.93,pop
4165,0.688,0.177,-14.061,0.0507,0.454,0.626,0.376,0.316,,pop
4527,0.693,,-8.376,0.192,0.0398,0.0,0.186,0.663,85.026,pop
6178,0.807,0.693,-5.056,0.0481,0.0504,8e-06,0.0762,0.142,,rap
8527,0.688,,-5.311,0.162,0.018,3e-06,,0.578,,rap
1694,0.35,0.956,-3.737,0.0807,0.00707,0.0,0.239,0.668,145.431,pop
11885,0.755,0.869,-9.271,0.0572,0.383,0.00162,,0.782,138.896,rock
1376,0.791,,,0.0439,0.0623,0.0,0.156,0.181,,pop
29693,0.746,,-6.722,,0.103,0.0036,0.138,0.324,111.961,edm
615,0.741,0.565,-6.621,0.0426,0.497,0.0,0.067,0.566,105.116,pop


Impute missing values 

In [13]:
def impute_numbers(data:pd.DataFrame, columns: list):
    data_copy = data.copy()
    imputer = SimpleImputer(strategy='mean')
    for column in columns:
        if type((data_copy[column].to_list()[0])) != type("lol"):
            data_copy[column] = imputer.fit_transform(data_copy[[column]])
    
    return data_copy

In [8]:
def encode_columns(data:pd.DataFrame, columns: list):
    data_copy = data.copy()
    encoder = LabelEncoder()
    for column in columns:
        if type(data[column].to_list()[0]) == str:
            data_copy[column] = encoder.fit_transform(data_copy[column])
    return data_copy

In [14]:
df = impute_numbers(df, df.columns.to_list())
df = encode_columns(df,df.columns.to_list())

In [15]:
df.sample(10)

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
13250,0.543,0.944,-7.495,0.0543,0.402,0.000564,0.190442,0.735,101.174,5
28838,0.584,0.643,-6.415,0.107035,0.288,0.0,0.186,0.51076,90.036,0
27241,0.576,0.977,-3.106,0.0392,0.000452,0.00045,0.163,0.51076,128.054,0
10102,0.743,0.695,-5.793,0.101,0.38,0.0,0.105,0.51076,133.95,4
7399,0.768,0.697964,-7.439,0.403,0.211,0.0,0.356,0.461,89.916,4
21267,0.554,0.596,-8.214,0.13,0.175157,0.00482,0.0805,0.51076,120.905999,1
22305,0.806,0.442,-10.75,0.17,0.0374,0.0,0.0788,0.283,90.006,3
23322,0.758,0.697964,-6.700534,0.112,0.128,0.0,0.190442,0.51076,155.955,3
2193,0.679,0.697964,-6.383,0.0407,0.0755,0.0,0.271,0.571,127.435,2
26543,0.749,0.566,-7.637,0.0862,0.533,1e-06,0.189,0.771,81.141,3


In [16]:
df.isnull().sum()

danceability        0
energy              0
loudness            0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
genre               0
dtype: int64

Split the dataset, test using different models and determine best model

In [17]:
from sklearn.model_selection import train_test_split

X = df.drop(['genre'],axis=1)
Y = df.genre
X_train,X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)

In [20]:
#Иницијализација

#GaussainNB
gaus_model = GaussianNB()

#Logistic regression
log_model = LogisticRegression()

#Decision tree Classifier
dt_classifier = DecisionTreeClassifier()

#К nearest neighbour classifier
knn_classifier = KNeighborsClassifier(n_neighbors=3)

#XGBoost Classifier 
xgb_model = XGBClassifier()

In [21]:
decision_param_grip = {
    'max_depth' : [5,7],
    'min_samples_leaf': [3,5,9],
    'min_samples_split': [5,10],
    'max_leaf_nodes' : [5,10]
}

xgb_param_grid = {
    'max_depth': [5,10],
    'min_child_weight' : [1,1.1],
    'n_estimators': [50,100],
}

In [22]:
dt_grid_search = GridSearchCV(estimator=dt_classifier,param_grid=decision_param_grip,cv = 5, scoring='accuracy')
xgb_grid_search = GridSearchCV(estimator = xgb_model,param_grid=xgb_param_grid, scoring='accuracy',cv=5)

dt_grid_search.fit(X_train,y_train)
xgb_grid_search.fit(X_train, y_train)

0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,param_grid,"{'max_depth': [5, 10], 'min_child_weight': [1, 1.1], 'n_estimators': [50, 100]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [23]:
best_dt_classifier = dt_grid_search.best_estimator_
best_xgb_classifier = xgb_grid_search.best_estimator_

In [24]:
models = {
    'Gaussian Classifier' : gaus_model,
    'Logistic Regression' : log_model,
    'Decision Tree Classifier' : best_dt_classifier,
    'XGB Classifier': best_xgb_classifier,
    'Knearest neighbours Classifier': knn_classifier
}

In [26]:
def evaluate_models(models, X_train,X_test, y_train, y_test):
    results ={}

    for name, model in models.items():
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test,y_pred)
        cr = classification_report(y_test,y_pred)

        results[name] ={
            'Accuracy': acc,
            'classification_report': cr
        }
        
    return results

In [27]:
metrics = evaluate_models(models,X_train,X_test, y_train, y_test)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
for model_name, metric in metrics.items():
    print(f"Model: {model_name}")
    for metric_name, value in metric.items():
        print(f"{metric_name} : {value}")
    print("\n")

Model: Gaussian Classifier
Accuracy : 0.41754225673823664
classification_report :               precision    recall  f1-score   support

           0       0.48      0.64      0.55      1160
           1       0.33      0.39      0.36      1003
           2       0.29      0.25      0.27      1131
           3       0.35      0.34      0.34      1104
           4       0.57      0.38      0.46      1172
           5       0.51      0.50      0.50       997

    accuracy                           0.42      6567
   macro avg       0.42      0.42      0.41      6567
weighted avg       0.42      0.42      0.41      6567



Model: Logistic Regression
Accuracy : 0.3849550784224151
classification_report :               precision    recall  f1-score   support

           0       0.45      0.62      0.52      1160
           1       0.29      0.44      0.35      1003
           2       0.29      0.12      0.17      1131
           3       0.35      0.30      0.32      1104
           4       0.

In [29]:
def evaluate_models2(models, X_train,X_test, y_train, y_test):
    results ={}

    for name, model in models.items():
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test,y_pred)
        #cr = classification_report(y_test,y_pred)

        results[name] ={
            'Accuracy': acc,
            # 'classification_report': cr
        }
        
    return results

In [30]:
metrics2 = evaluate_models2(models,X_train,X_test, y_train, y_test)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
for model_name, metric in metrics2.items():
    print(f"Model: {model_name}")
    for metric_name, value in metric.items():
        print(f"{metric_name} : {value}")
    print("\n")

Model: Gaussian Classifier
Accuracy : 0.41754225673823664


Model: Logistic Regression
Accuracy : 0.3849550784224151


Model: Decision Tree Classifier
Accuracy : 0.3666818943200853


Model: XGB Classifier
Accuracy : 0.4990102025277905


Model: Knearest neighbours Classifier
Accuracy : 0.3318105679914725




In [32]:
def best_model(metrics):
    best_model_name = None
    best_score = -float('inf')

    for model_name,metric in metrics.items():
        accuracy = metric['Accuracy']

        if accuracy>best_score:
            best_score = accuracy
            best_model_name = model_name

    print(f"Best Model: {best_model_name}")
    print(f"Accuracy: {best_score}")

In [33]:
best_model(metrics)

Best Model: XGB Classifier
Accuracy: 0.4990102025277905
