In [None]:
import pandas as pd
# Load the track table, divide `popularity` by 100 (presumably mapping a
# 0-100 score onto 0-1 -- confirm against the dataset), and keep only the
# first row for each (track_name, artist_name) pair.
df = (
    pd.read_csv('SpotifyFeatures.csv')
    .assign(popularity=lambda tracks: tracks['popularity'] / 100)
    .drop_duplicates(subset=['track_name', 'artist_name'])
)

In [None]:
import seaborn as sns

# Columns whose pairwise correlations are plotted: the popularity target
# followed by the numeric audio features.
feature_columns = [
    'popularity',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]

# Correlation matrix rounded to two decimals so the cell annotations stay short.
corr_df = df[feature_columns].corr().round(2)

# Colour scale is pinned to [-1, 1], the full range a correlation can take.
heatmap_ax = sns.heatmap(
    corr_df,
    xticklabels=corr_df.columns,
    yticklabels=corr_df.columns,
    vmin=-1.0,
    vmax=1.0,
    annot=True,
    cmap='Blues',
)
heatmap_ax.set_title('Correlation matrix for music features')


In [None]:
# One-hot encode the categorical columns and append the indicator columns to
# the right of the existing ones (mode first, then key, then time_signature).
# The indicator columns are named after the raw category values (no prefix).
#
# `axis` is passed by keyword: pandas deprecated positional arguments other
# than `objs` for `concat` and pandas >= 2.0 rejects them with a TypeError.
categorical_columns = ('mode', 'key', 'time_signature')
df = pd.concat(
    [df] + [pd.get_dummies(df[column]) for column in categorical_columns],
    axis=1,
)

In [None]:
# Preview the first five rows (5 is the default row count for head()).
df.head()

In [None]:
# Columns removed before modelling: the text/identifier columns and the three
# categorical columns that were passed to get_dummies in the encoding cell.
# A boolean mask is used (rather than drop) so a name that is absent from the
# frame is silently ignored, exactly like the per-column filters it replaces.
columns_to_remove = [
    'genre',
    'artist_name',
    'track_name',
    'track_id',
    'mode',
    'key',
    'time_signature',
]
df = df.loc[:, ~df.columns.isin(columns_to_remove)]

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
import numpy as np

# Shuffle the rows once. The shuffle is seeded so that the row order -- and
# therefore any later splitting that relies on row order, such as unshuffled
# k-fold cross-validation -- is the same on every fresh run of the notebook.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)

# Features: every column except the target.
x_data = df.loc[:, df.columns != 'popularity']

# Target: popularity.
y_data = df['popularity']

# Hold out 10% of the rows as a test set; the split is seeded as well.
X_train, X_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.1, random_state=0)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours baseline averaging over the 100 closest training rows.
# NOTE(review): no feature scaling is applied anywhere before this point, so
# neighbour distances are presumably dominated by large-magnitude columns
# (e.g. duration_ms) -- consider standardising the features; confirm.
knn = KNeighborsRegressor(n_neighbors=100)
knn.fit(X_train, y_train)

# score() reports R^2 on the held-out set.
print(knn.score(X_test, y_test))

# Arguments follow the documented (y_true, y_pred) order. MSE is symmetric,
# so the printed value is unchanged, but the order matters for other metrics.
print(mean_squared_error(y_test, knn.predict(X_test)))

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline fitted on the training split.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# score() reports R^2 on the held-out set.
print(lin_reg.score(X_test, y_test))

# Arguments follow the documented (y_true, y_pred) order. MSE is symmetric,
# so the printed value is unchanged, but the order matters for other metrics.
print(mean_squared_error(y_test, lin_reg.predict(X_test)))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 50 trees, each limited to depth 10. The forest is seeded so
# the fitted model (and the feature importances read from it later) is
# reproducible between runs.
random_forest = RandomForestRegressor(
    n_estimators=50,
    max_depth=10,
    random_state=0,
)
random_forest.fit(X_train, y_train)

# score() reports R^2 on the held-out set.
print(random_forest.score(X_test, y_test))

# Arguments follow the documented (y_true, y_pred) order. MSE is symmetric,
# so the printed value is unchanged, but the order matters for other metrics.
print(mean_squared_error(y_test, random_forest.predict(X_test)))

In [None]:
import os

import matplotlib.pyplot as plt

# Rank the features once: indices of the ten largest importances, in
# descending order of importance.
importances = random_forest.feature_importances_
top_indices = np.argsort(importances)[::-1][:10]

plt.bar(x_data.columns[top_indices], importances[top_indices])
plt.xlabel("Feature")
plt.xticks(rotation=60)
plt.title("Feature importances for random forest model")
plt.ylabel("Feature importance")

# savefig does not create missing directories, so make sure the output
# folder exists before writing the image.
os.makedirs("images", exist_ok=True)
plt.savefig("images/feature_importances_random_forest.png")
plt.show()

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Single regression tree limited to depth 10. Seeded so that the fitted tree
# is the same on every run.
decision_tree = DecisionTreeRegressor(max_depth=10, random_state=0)
decision_tree.fit(X_train, y_train)

# score() reports R^2 on the held-out set.
print(decision_tree.score(X_test, y_test))

# Arguments follow the documented (y_true, y_pred) order. MSE is symmetric,
# so the printed value is unchanged, but the order matters for other metrics.
print(mean_squared_error(y_test, decision_tree.predict(X_test)))

In [None]:
# from sklearn.svm import LinearSVR
# svm = LinearSVR(C=1, max_iter=10000)
# svm.fit(X_train, y_train)
# print(svm.score(X_test, y_test))

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer

# Metrics collected for every fold. These scorers are only used for
# reporting here, not for model selection.
scoring = {'R2': make_scorer(r2_score), 'MSE': make_scorer(mean_squared_error)}

# Models compared under 10-fold cross-validation. The tree-based models are
# seeded so their scores are reproducible between runs.
models = {
    "Linear_Regression": LinearRegression(),
    "KNN": KNeighborsRegressor(n_neighbors=100),
    "Random_Forest": RandomForestRegressor(
        n_estimators=50,
        max_depth=10,
        random_state=0,
    ),
    "Decision_Tree": DecisionTreeRegressor(max_depth=10, random_state=0),
}

# Per-model summary values, appended in the iteration order of `models`.
R2_scores = []
RMSE_scores = []
for name, model in models.items():
    cv_scores = cross_validate(
        model,
        x_data,
        y_data,
        cv=10,
        n_jobs=2,
        scoring=scoring,
        return_train_score=False,
    )
    # Mean R^2 over the folds.
    mean_r2 = cv_scores['test_R2'].mean()
    # RMSE is reported as the square root of the mean fold MSE.
    rmse = cv_scores['test_MSE'].mean() ** (1/2)

    print(f"{name} R2 score: {mean_r2}")
    print(f"{name} RMSE score: {rmse}")
    R2_scores.append(mean_r2)
    RMSE_scores.append(rmse)

In [None]:
import os

import matplotlib.pyplot as plt


def _save_bar_plot(labels, values, ylabel, title, path):
    """Draw one bar per model, save the figure to `path`, then display it.

    labels: model names used as x tick labels.
    values: one bar height per label, in the same order.
    ylabel: y-axis label.
    title: figure title.
    path: image file the figure is written to.
    """
    plt.bar(labels, values)
    plt.xlabel("Model")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.savefig(path)
    plt.show()


# savefig does not create missing directories, so make sure the output
# folder exists before writing the images.
os.makedirs("images", exist_ok=True)

# Materialise the dict keys so the same plain list of labels is used for
# both figures.
model_names = list(models.keys())

_save_bar_plot(model_names, R2_scores,
               "R2 Score", "R2 Score for each model",
               "images/r2_bar_plot.png")
_save_bar_plot(model_names, RMSE_scores,
               "RMSE", "RMSE for each model",
               "images/rmse_bar_plot.png")