In [268]:
import pandas as pd
import numpy as np

In [269]:
data = pd.read_csv('./spotify.csv', na_values=['-', ' ', ''])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   track_name            953 non-null    object 
 1   released_year         953 non-null    int64  
 2   in_spotify_playlists  953 non-null    int64  
 3   in_spotify_charts     951 non-null    float64
 4   streams               953 non-null    int64  
 5   in_apple_playlists    953 non-null    int64  
 6   in_apple_charts       951 non-null    float64
 7   bpm                   953 non-null    int64  
 8   mode                  953 non-null    object 
 9   energy                953 non-null    int64  
 10  instrumentalness      953 non-null    int64  
 11  liveness              953 non-null    int64  
 12  speechiness           953 non-null    int64  
dtypes: float64(2), int64(9), object(2)
memory usage: 96.9+ KB


In [270]:
data.head()

Unnamed: 0,track_name,released_year,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,in_apple_charts,bpm,mode,energy,instrumentalness,liveness,speechiness
0,Seven (feat. Latto) (Explicit Ver.),2023,553,147.0,141381703,43,263.0,125,Major,83,0,8,4
1,LALA,2023,1474,48.0,133716286,48,126.0,92,Major,74,0,10,4
2,vampire,2023,1397,113.0,140003974,94,207.0,138,Major,53,0,31,6
3,Cruel Summer,2019,7858,,800840817,116,207.0,170,Major,72,0,11,15
4,WHERE SHE GOES,2023,3133,50.0,303236322,84,133.0,144,Minor,80,63,11,6


In [271]:
data['HighStreams'] = np.where(data['streams'] > data['streams'].quantile(0.75), 1, 0)

In [272]:
data.head()

Unnamed: 0,track_name,released_year,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,in_apple_charts,bpm,mode,energy,instrumentalness,liveness,speechiness,HighStreams
0,Seven (feat. Latto) (Explicit Ver.),2023,553,147.0,141381703,43,263.0,125,Major,83,0,8,4,0
1,LALA,2023,1474,48.0,133716286,48,126.0,92,Major,74,0,10,4,0
2,vampire,2023,1397,113.0,140003974,94,207.0,138,Major,53,0,31,6,0
3,Cruel Summer,2019,7858,,800840817,116,207.0,170,Major,72,0,11,15,1
4,WHERE SHE GOES,2023,3133,50.0,303236322,84,133.0,144,Minor,80,63,11,6,0


In [273]:
# Shapiro
from scipy.stats import shapiro

# Proveravamo da li varijable imaju normalnu raspodelu
p_spotify = shapiro(data['in_spotify_charts'].dropna())[1]
p_apple = shapiro(data['in_apple_charts'].dropna())[1]

print("p-vrednost za in_spotify_charts:", p_spotify)
print("p-vrednost za in_apple_charts:", p_apple)
# Obe varijable imaju vrednost p < 0.05 što znači da nemaju normalnu raspodelu i nedostajuće vrednosti zamenićemo medijanom

median_spotify = data['in_spotify_charts'].median()
print(median_spotify)
median_apple = data['in_apple_charts'].median()
print(median_apple)
# Dobili smo da su medijane 3 i 38 za spotify i apple respektivno

# Zamena nedostajućih vrednosti medijanama
data['in_spotify_charts'] = data['in_spotify_charts'].fillna(median_spotify)
data['in_apple_charts'] = data['in_apple_charts'].fillna(median_apple)

print(data.isna().sum())
# Nema više nedostajućih vrednosti

p-vrednost za in_spotify_charts: 1.4519175687516224e-39
p-vrednost za in_apple_charts: 4.468788251553773e-26
3.0
38.0
track_name              0
released_year           0
in_spotify_playlists    0
in_spotify_charts       0
streams                 0
in_apple_playlists      0
in_apple_charts         0
bpm                     0
mode                    0
energy                  0
instrumentalness        0
liveness                0
speechiness             0
HighStreams             0
dtype: int64


In [274]:
# kdeplot-ovi i countplot-ovi
data = data[['released_year', 'in_spotify_playlists', 'in_spotify_charts', 'in_apple_playlists', 'in_apple_charts', 'mode', 'HighStreams']]

In [275]:
data['mode'] = data['mode'].map({
    'Major': 1,
    'Minor': 0,
})

In [276]:
# Ako varijable imaju outliere => standardizacija
# Ako varijable nemaju outliere => normalizacija
for col in data.columns:
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    print(f'{col}: {((data[col] < lower) | (data[col] > upper)).sum()}')


released_year: 151
in_spotify_playlists: 109
in_spotify_charts: 76
in_apple_playlists: 78
in_apple_charts: 12
mode: 0
HighStreams: 238


In [277]:
# Odredjivanje da li cemo da skejlujemo podatke pomocu standard scaler-a ili robust scaler-a
numeric_vars = ['released_year', 'in_spotify_playlists', 'in_spotify_charts', 'in_apple_playlists', 'in_apple_charts']
# Proveravamo da li varijable imaju normalnu raspodelu kako bismo odlučili na koji način da izvršimo standardizaciju
shapiro_results = data[numeric_vars].apply(lambda x: shapiro(x)[1])
print("Shapiro-Wilk p-vrednosti:\n", shapiro_results)


Shapiro-Wilk p-vrednosti:
 released_year           0.000000e+00
in_spotify_playlists    2.585256e-41
in_spotify_charts       1.284532e-39
in_apple_playlists      4.262230e-37
in_apple_charts         4.332554e-26
dtype: float64


In [278]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# Posto smo u prethodnom koraku dobili da sve varijable nemaju normalnu raspodelu => Robust scaler za sve
data[numeric_vars] = RobustScaler().fit_transform(data[numeric_vars])

In [279]:
data.head()

Unnamed: 0,released_year,in_spotify_playlists,in_spotify_charts,in_apple_playlists,in_apple_charts,mode,HighStreams
0,0.5,-0.358046,9.0,0.12,2.848101,1,0
1,0.5,-0.160703,2.8125,0.186667,1.113924,1,0
2,0.5,-0.177202,6.875,0.8,2.139241,1,0
3,-1.5,1.207199,0.0,1.093333,2.139241,1,1
4,0.5,0.194772,2.9375,0.666667,1.202532,0,0


In [280]:
from sklearn.model_selection import train_test_split

X = data.drop(columns='HighStreams')
y = data['HighStreams']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [281]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)
param_grid = {'n_neighbors': np.arange(3, 26, 2)}
model = GridSearchCV(
    KNeighborsClassifier(),
    cv=cv,
    param_grid=param_grid
).fit(X_train, y_train)

best_k = model.best_params_['n_neighbors']
print(best_k)
# pronadji odgovarajuce k
# pozovi k nearest neighbor

5


In [282]:
model = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)

In [283]:
y_pred = model.predict(X_test)
result = pd.DataFrame({
    'True': y_test, 
    'Pred': y_pred
})
result.head()

Unnamed: 0,True,Pred
162,1,1
260,0,0
531,1,1
138,1,1
597,0,0


In [285]:
TP = ((result['True'] == 1) & (result['Pred'] == 1)).sum()
TN = np.sum((result['True'] == 0) & (result['Pred'] == 0))
FP = np.sum((result['True'] == 1) & (result['Pred'] == 0))
FN = np.sum((result['True'] == 0) & (result['Pred'] == 1))

In [286]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

pd.DataFrame({
    'False': [TN, FN],
    'True': [FP, TP]
}, index=['False', 'True'])

Unnamed: 0,False,True
False,143,3
True,5,40


In [287]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

a = accuracy_score(y_test, y_pred)
print(a)
p = precision_score(y_test, y_pred)
print(p)
r = recall_score(y_test, y_pred)
print(r)
f1 = f1_score(y_test, y_pred)
print(f1)

0.9581151832460733
0.8888888888888888
0.9302325581395349
0.9090909090909091
