In [1]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV, cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler 
from sklearn.inspection import permutation_importance
import pickle

In [11]:
conn = sqlite3.connect('../data/spotify_kpop.db')
df = pd.read_sql('select * from spotify_kpop_train', conn)

In [19]:
features = ['popularity', 'duration_ms', 'valence',  'energy', 'key', 
            'lyric_word_yeah', 'lyric_sentiment_score']

In [20]:
le = LabelEncoder()
df["Vibe_Group_Num"] = le.fit_transform(df['Vibe_Group'])

In [21]:
outcome = 'Vibe_Group_Num'

In [22]:
features_outcome = features.copy()
features_outcome.append(outcome)

In [23]:
df = df.rename(columns = {'Popularity': 'popularity'})

In [24]:
df_no_na = df[features_outcome].dropna()

In [25]:
X = df_no_na[features]
y = df_no_na[[outcome]]

In [26]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X)
scaled_features_df = pd.DataFrame(scaled_features, index=X.index, columns=X.columns)

pickle.dump(scaler, open(f"../src/standardscaler.pkl", 'wb'))

In [27]:
svm = SVC(probability=True)

In [28]:
hyperparameters = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': np.arange(1, 1000, 100),
    'gamma': ['scale', 'auto']
}

In [29]:
rs_cv = RandomizedSearchCV(svm, param_distributions = hyperparameters, n_iter = 3, scoring='neg_log_loss', n_jobs = -1, cv = 10, verbose = 3, random_state = 24)

In [16]:
rs_cv.fit(scaled_features_df, y)

Fitting 10 folds for each of 3 candidates, totalling 30 fits
[CV 1/10] END C=401, gamma=auto, kernel=sigmoid;, score=-0.884 total time=   0.0s
[CV 2/10] END C=401, gamma=auto, kernel=sigmoid;, score=-1.070 total time=   0.0s
[CV 3/10] END C=401, gamma=auto, kernel=sigmoid;, score=-0.789 total time=   0.0s
[CV 4/10] END C=401, gamma=auto, kernel=sigmoid;, score=-1.014 total time=   0.0s
[CV 5/10] END C=401, gamma=auto, kernel=sigmoid;, score=-0.867 total time=   0.0s
[CV 6/10] END C=401, gamma=auto, kernel=sigmoid;, score=-0.962 total time=   0.0s
[CV 7/10] END C=401, gamma=auto, kernel=sigmoid;, score=-1.097 total time=   0.0s
[CV 8/10] END C=401, gamma=auto, kernel=sigmoid;, score=-0.983 total time=   0.0s
[CV 9/10] END C=401, gamma=auto, kernel=sigmoid;, score=-0.962 total time=   0.0s
[CV 10/10] END C=401, gamma=auto, kernel=sigmoid;, score=-0.862 total time=   0.0s
[CV 1/10] END ..C=901, gamma=auto, kernel=poly;, score=-0.871 total time=   0.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 2/10] END ..C=901, gamma=auto, kernel=poly;, score=-1.084 total time=   0.0s
[CV 3/10] END ..C=901, gamma=auto, kernel=poly;, score=-0.844 total time=   0.1s
[CV 4/10] END ..C=901, gamma=auto, kernel=poly;, score=-0.982 total time=   0.0s
[CV 5/10] END ..C=901, gamma=auto, kernel=poly;, score=-0.915 total time=   0.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 6/10] END ..C=901, gamma=auto, kernel=poly;, score=-1.137 total time=   0.0s
[CV 7/10] END ..C=901, gamma=auto, kernel=poly;, score=-0.915 total time=   0.0s
[CV 8/10] END ..C=901, gamma=auto, kernel=poly;, score=-0.868 total time=   0.1s
[CV 9/10] END ..C=901, gamma=auto, kernel=poly;, score=-0.929 total time=   0.0s
[CV 10/10] END .C=901, gamma=auto, kernel=poly;, score=-1.020 total time=   0.0s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 1/10] END ...C=101, gamma=auto, kernel=rbf;, score=-0.865 total time=   0.0s
[CV 2/10] END ...C=101, gamma=auto, kernel=rbf;, score=-1.035 total time=   0.0s
[CV 3/10] END ...C=101, gamma=auto, kernel=rbf;, score=-0.812 total time=   0.0s
[CV 4/10] END ...C=101, gamma=auto, kernel=rbf;, score=-0.804 total time=   0.0s
[CV 5/10] END ...C=101, gamma=auto, kernel=rbf;, score=-0.876 total time=   0.0s
[CV 6/10] END ...C=101, gamma=auto, kernel=rbf;, score=-0.994 total time=   0.0s
[CV 7/10] END ...C=101, gamma=auto, kernel=rbf;, score=-0.957 total time=   0.0s
[CV 8/10] END ...C=101, gamma=auto, kernel=rbf;, score=-0.812 total time=   0.0s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 9/10] END ...C=101, gamma=auto, kernel=rbf;, score=-0.954 total time=   0.0s
[CV 10/10] END ..C=101, gamma=auto, kernel=rbf;, score=-0.941 total time=   0.0s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [17]:
rs_cv.best_estimator_

In [18]:
hyperparameters = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': np.arange(1, 200, 1),
    'gamma': ['scale', 'auto']
}

rs_cv = RandomizedSearchCV(svm, param_distributions = hyperparameters, n_iter = 3, scoring='neg_log_loss', n_jobs = -1, cv = 10, verbose = 3, random_state = 24)
rs_cv.fit(scaled_features_df, y)
rs_cv.best_estimator_

Fitting 10 folds for each of 3 candidates, totalling 30 fits


  y = column_or_1d(y, warn=True)


[CV 3/10] END C=113, gamma=scale, kernel=sigmoid;, score=-0.793 total time=   0.0s
[CV 9/10] END C=113, gamma=scale, kernel=sigmoid;, score=-0.956 total time=   0.0s
[CV 5/10] END ...C=53, gamma=scale, kernel=rbf;, score=-0.846 total time=   0.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 10/10] END ..C=53, gamma=scale, kernel=rbf;, score=-0.946 total time=   0.1s
[CV 6/10] END ...C=53, gamma=scale, kernel=rbf;, score=-0.957 total time=   0.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 3/10] END ...C=53, gamma=scale, kernel=rbf;, score=-0.835 total time=   0.1s
[CV 4/10] END ...C=53, gamma=scale, kernel=rbf;, score=-0.785 total time=   0.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 4/10] END C=113, gamma=scale, kernel=sigmoid;, score=-1.044 total time=   0.0s
[CV 7/10] END C=113, gamma=scale, kernel=sigmoid;, score=-1.075 total time=   0.0s
[CV 3/10] END C=25, gamma=scale, kernel=linear;, score=-0.671 total time=   0.1s
[CV 6/10] END C=113, gamma=scale, kernel=sigmoid;, score=-0.939 total time=   0.0s
[CV 1/10] END C=25, gamma=scale, kernel=linear;, score=-0.818 total time=   0.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 7/10] END ...C=53, gamma=scale, kernel=rbf;, score=-0.942 total time=   0.1s
[CV 8/10] END C=25, gamma=scale, kernel=linear;, score=-0.640 total time=   0.1s
[CV 5/10] END C=113, gamma=scale, kernel=sigmoid;, score=-0.882 total time=   0.0s
[CV 2/10] END C=25, gamma=scale, kernel=linear;, score=-1.184 total time=   0.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 2/10] END C=113, gamma=scale, kernel=sigmoid;, score=-1.149 total time=   0.0s
[CV 8/10] END C=113, gamma=scale, kernel=sigmoid;, score=-0.982 total time=   0.0s
[CV 7/10] END C=25, gamma=scale, kernel=linear;, score=-0.889 total time=   0.1s
[CV 2/10] END ...C=53, gamma=scale, kernel=rbf;, score=-1.053 total time=   0.1s
[CV 6/10] END C=25, gamma=scale, kernel=linear;, score=-0.697 total time=   0.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 1/10] END C=113, gamma=scale, kernel=sigmoid;, score=-0.946 total time=   0.0s
[CV 10/10] END C=113, gamma=scale, kernel=sigmoid;, score=-0.926 total time=   0.0s
[CV 4/10] END C=25, gamma=scale, kernel=linear;, score=-0.899 total time=   0.1s
[CV 9/10] END ...C=53, gamma=scale, kernel=rbf;, score=-0.876 total time=   0.1s
[CV 10/10] END C=25, gamma=scale, kernel=linear;, score=-0.833 total time=   0.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 1/10] END ...C=53, gamma=scale, kernel=rbf;, score=-0.850 total time=   0.1s
[CV 5/10] END C=25, gamma=scale, kernel=linear;, score=-0.557 total time=   0.1s
[CV 8/10] END ...C=53, gamma=scale, kernel=rbf;, score=-0.807 total time=   0.1s
[CV 9/10] END C=25, gamma=scale, kernel=linear;, score=-0.667 total time=   0.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [19]:
hyperparameters = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': np.arange(1, 100, 1),
    'gamma': ['scale', 'auto']
}

rs_cv = RandomizedSearchCV(svm, param_distributions = hyperparameters, n_iter = 3, scoring='neg_log_loss', n_jobs = -1, cv = 10, verbose = 3, random_state = 24)
rs_cv.fit(scaled_features_df, y)
rs_cv.best_estimator_

Fitting 10 folds for each of 3 candidates, totalling 30 fits


  y = column_or_1d(y, warn=True)


In [20]:
hyperparameters = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': np.arange(1, 100, 1),
    'gamma': ['scale', 'auto']
}

rs_cv = RandomizedSearchCV(svm, param_distributions = hyperparameters, n_iter = 3, scoring='neg_log_loss', n_jobs = -1, cv = 10, verbose = 3, random_state = 24)
rs_cv.fit(scaled_features_df, y)
rs_cv.best_estimator_

Fitting 10 folds for each of 3 candidates, totalling 30 fits


  y = column_or_1d(y, warn=True)


In [30]:
svm = SVC(C=25, kernel='linear', probability=True)
svm.fit(scaled_features_df, y)
pickle.dump(svm, open(f"../src/svm.pkl", 'wb'))

  y = column_or_1d(y, warn=True)
