## NEW

In [1]:
!pip install semopy



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statistics as statistics
import semopy as sem

np.random.seed(1)

KeyboardInterrupt: 

In [None]:
# Load the dataset from a CSV in the working directory, using the first
# column of the file as the DataFrame index.
data = pd.read_csv('complete_data_term2run.csv', index_col=0)
# Dump the frame as plain text to check that the load worked.
print(data)

In [None]:
# split data
# Drop the 'title' and 'artist/s' columns. errors='ignore' keeps this cell
# re-runnable: on a second execution the columns are already gone and a plain
# drop() would raise KeyError.
data = data.drop(columns=['title', 'artist/s'], errors='ignore')
# Every column except the last is used as a feature; the last column is the target.
X_data = data.values[:, :-1]
y_data = data.values[:, -1]

print(X_data.shape)
print(y_data.shape)

In [None]:
# Take real copies rather than extra names for the same object: with plain
# assignment (`data_copy = data`) any later in-place change to `data`, such as
# the column standardization further down, also shows up in `data2` and
# `data_copy`.
# NOTE(review): cells that read `data_copy` later will now see the values as
# they are at this point (before standardization) — confirm that is the
# intended input for each of them.
data2 = data.copy()
data_copy = data.copy()

# SEM part

### intrinsic, extrinsic, crowdsourced

In [None]:
# standardize non-categorical features
from sklearn.preprocessing import StandardScaler
# Columns that are rescaled by StandardScaler; columns not listed here are left untouched.
cols_to_norm = ['danceability', 'loudness', 'speechiness', 'liveness', 'valence', 'tempo', 'duration_ms', 'chart_history', 'mean_trends', 'song_familiarity', 
                'artist_familiarity', 'bad_good', 'distasteful_tasty', 'dull_exciting', 'tasteless_tasteful', 'unimaginative_creative', 'untalented_talented', 
                'unpleasant_pleasant', 'forgettable_memorable', 'boring_interesting', 'listen_similar', 'share_friends', 'add_playlist']
# The scaler is fitted on the full frame and the listed columns of `data` are
# overwritten. Because this assigns into the existing DataFrame object, every
# other name bound to that same object sees the standardized values too.
data[cols_to_norm] = StandardScaler().fit_transform(data[cols_to_norm])
print(data)

In [None]:
print(sem.efa.explore_cfa_model(data))

In [None]:
# semopy model description: four latent factors defined by their indicators
# (`=~`), one structural regression (`~`), and the variables declared ordinal
# via DEFINE(ordinal).
# `stream6` used to be listed twice among the chart_success indicators; each
# indicator now appears exactly once.
mod = """
chart_success =~ stream1 + stream2 + stream3 + stream4 + stream5 + stream6 + stream7 + stream8 + stream9 + stream_category
intrinsic =~ danceability + key + loudness + speechiness + liveness + valence + tempo + duration_ms
extrinsic =~ music_label + artist_collab + chart_history + mean_trends
crowdsource =~ song_familiarity + artist_familiarity + bad_good + distasteful_tasty + dull_exciting + tasteless_tasteful + unimaginative_creative + untalented_talented + unpleasant_pleasant + forgettable_memorable + boring_interesting + listen_similar + share_friends + add_playlist

chart_success ~ intrinsic + extrinsic + crowdsource

DEFINE(ordinal) key music_label artist_collab stream1 stream2 stream3 stream4 stream5 stream6 stream7 stream8 stream9 stream_category
      """

In [None]:
# Build the semopy model from the `mod` description and fit it to `data`
# with the FIML objective and the SLSQP solver, then print what fit() returns.
model = sem.Model(mod)
result = model.fit(data, obj="FIML", solver="SLSQP") #can change obj and solver
print(result)

In [None]:
sem_all_pd = model.inspect()
sem_all_pd

In [None]:
sem_all_pd.to_csv(r'SEM_three_results_term3.csv')

In [None]:
stats = sem.calc_stats(model)
print(stats.T)

In [None]:
g = sem.semplot(model, "pd1_term3.png")
g

### intrinsic, extrinsic

In [None]:
# semopy model description without the crowdsource factor: three latent
# factors defined by their indicators (`=~`), one structural regression (`~`),
# and the variables declared ordinal via DEFINE(ordinal).
# `stream6` used to be listed twice among the chart_success indicators; each
# indicator now appears exactly once.
mod2 = """
chart_success =~ stream1 + stream2 + stream3 + stream4 + stream5 + stream6 + stream7 + stream8 + stream9 + stream_category
intrinsic =~ danceability + key + loudness + speechiness + liveness + valence + tempo + duration_ms
extrinsic =~ music_label + artist_collab + chart_history + mean_trends

chart_success ~ intrinsic + extrinsic

DEFINE(ordinal) key music_label artist_collab stream1 stream2 stream3 stream4 stream5 stream6 stream7 stream8 stream9 stream_category
      """

In [None]:
model2 = sem.Model(mod2)
result2 = model2.fit(data, obj="FIML", solver="SLSQP") #can change obj and solver
print(result2)

In [None]:
sem_ixe_pd = model2.inspect()
sem_ixe_pd

In [None]:
sem_ixe_pd.to_csv(r'SEM_two_term3_results.csv')

In [None]:
stats2 = sem.calc_stats(model2)
print(stats2.T)

In [None]:
g2 = sem.semplot(model2, "pd2_term3.png")
g2

### relationship of features to danceability

In [None]:
# Single-equation semopy description with `key` declared ordinal.
# NOTE(review): `=~` is the measurement operator, so `danceability` on its
# left-hand side is declared as a latent factor here, even though
# 'danceability' is also listed in cols_to_norm as a data column — confirm
# that a regression (`danceability ~ ...`) was not what was intended.
mod3 = """
danceability =~ valence + duration_ms + loudness + speechiness + tempo + key + liveness

DEFINE(ordinal) key
      """

In [None]:
model3 = sem.Model(mod3)
result3 = model3.fit(data) #can change obj and solver
print(result3)

In [None]:
model3.inspect()

In [None]:
stats3 = sem.calc_stats(model3)
print(stats3.T)

In [None]:
g3 = sem.semplot(model3, "pd3_term3.png")
g3

### relationship of features to valence

In [None]:
# Single-equation semopy description with `key` declared ordinal.
# NOTE(review): `=~` is the measurement operator, so `valence` on its
# left-hand side is declared as a latent factor here, even though 'valence'
# is also listed in cols_to_norm as a data column — confirm that a regression
# (`valence ~ ...`) was not what was intended.
mod4 = """
valence =~ danceability + key + loudness + speechiness + liveness + tempo + duration_ms

DEFINE(ordinal) key
      """

In [None]:
model4 = sem.Model(mod4)
result4 = model4.fit(data) #can change obj and solver
print(result4)

In [None]:
model4.inspect()

In [None]:
stats4 = sem.calc_stats(model4)
print(stats4.T)

In [None]:
g4 = sem.semplot(model4, "pd4_term3.png")
g4

## SEM predict

### intrinsic, extrinsic, and crowdsourced

In [None]:
from sklearn.model_selection import train_test_split
import random

# The split seed is drawn at random, so print it: without a record of `num`
# this particular train/test partition cannot be reproduced later.
num = random.randint(0,100)
print("random_state: ", num)
# The whole frame is passed as both X and y, so X_* and y_* hold the same rows
# and columns.
X_train, X_test, y_train, y_test = train_test_split(data, data, random_state=num, test_size=0.20)

print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

In [None]:
from semopy import Model
model5 = Model(mod)
model5.fit(X_train)
predictions_test_sem = model5.predict(X_test)

In [None]:
def compute_RMSE(predictions, actual):
    """Return the root-mean-squared error between predictions and actual values.

    Both inputs are converted to float NumPy arrays so values are paired by
    position. pandas Series passed straight to a NumPy ufunc are aligned on
    their index labels instead, which pairs the wrong rows (or yields NaN)
    whenever the two indexes differ.
    """
    errors = np.asarray(actual, dtype=float) - np.asarray(predictions, dtype=float)
    return np.sqrt(np.square(errors).mean())


def compute_mae(predictions, actual):
    """Return the mean absolute error between predictions and actual values.

    Values are paired by position, not by pandas index label.
    """
    errors = np.asarray(actual, dtype=float) - np.asarray(predictions, dtype=float)
    return np.absolute(errors).mean()


def compute_accuracy(predictions, actual):
    """Return the percentage (0-100) of predictions exactly equal to actual.

    Values are compared by position. Comparing two pandas Series with `==`
    raises when their indexes are not identical, so both inputs are converted
    to NumPy arrays first.
    """
    predictions = np.asarray(predictions)
    actual = np.asarray(actual)
    num_correct = np.sum(predictions == actual)
    return num_correct / len(actual) * 100

In [None]:
y_data_sem = X_test['stream_category']
test_rmse_sem = compute_RMSE(predictions_test_sem['stream_category'], y_data_sem)
print('RMSE: {}'.format(test_rmse_sem))

test_mae_sem = compute_mae(predictions_test_sem['stream_category'], y_data_sem)
print('MAE: {}'.format(test_mae_sem))

test_acc_sem = compute_accuracy(predictions_test_sem['stream_category'], y_data_sem)
print('Accuracy: {}'.format(test_acc_sem))

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, data, random_state=0, test_size=0.20)

print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

In [None]:
from semopy import Model
model5 = Model(mod)
model5.fit(X_train)
predictions_test_sem = model5.predict(X_test)

In [None]:
def compute_RMSE(predictions, actual):
    """Return the root-mean-squared error between predictions and actual values.

    Both inputs are converted to float NumPy arrays so values are paired by
    position. pandas Series passed straight to a NumPy ufunc are aligned on
    their index labels instead, which pairs the wrong rows (or yields NaN)
    whenever the two indexes differ.
    """
    errors = np.asarray(actual, dtype=float) - np.asarray(predictions, dtype=float)
    return np.sqrt(np.square(errors).mean())


def compute_mae(predictions, actual):
    """Return the mean absolute error between predictions and actual values.

    Values are paired by position, not by pandas index label.
    """
    errors = np.asarray(actual, dtype=float) - np.asarray(predictions, dtype=float)
    return np.absolute(errors).mean()


def compute_accuracy(predictions, actual):
    """Return the percentage (0-100) of predictions exactly equal to actual.

    Values are compared by position. Comparing two pandas Series with `==`
    raises when their indexes are not identical, so both inputs are converted
    to NumPy arrays first.
    """
    predictions = np.asarray(predictions)
    actual = np.asarray(actual)
    num_correct = np.sum(predictions == actual)
    return num_correct / len(actual) * 100

In [None]:
y_data_sem = X_test['stream_category']
test_rmse_sem = compute_RMSE(predictions_test_sem['stream_category'], y_data_sem)
print('RMSE: {}'.format(test_rmse_sem))

test_mae_sem = compute_mae(predictions_test_sem['stream_category'], y_data_sem)
print('MAE: {}'.format(test_mae_sem))

test_acc_sem = compute_accuracy(predictions_test_sem['stream_category'], y_data_sem)
print('Accuracy: {}'.format(test_acc_sem))

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, data, random_state=1, test_size=0.20)

print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

In [None]:
from semopy import Model
model5 = Model(mod)
model5.fit(X_train)
predictions_test_sem = model5.predict(X_test)

In [None]:
def compute_RMSE(predictions, actual):
    """Return the root-mean-squared error between predictions and actual values.

    Both inputs are converted to float NumPy arrays so values are paired by
    position. pandas Series passed straight to a NumPy ufunc are aligned on
    their index labels instead, which pairs the wrong rows (or yields NaN)
    whenever the two indexes differ.
    """
    errors = np.asarray(actual, dtype=float) - np.asarray(predictions, dtype=float)
    return np.sqrt(np.square(errors).mean())


def compute_mae(predictions, actual):
    """Return the mean absolute error between predictions and actual values.

    Values are paired by position, not by pandas index label.
    """
    errors = np.asarray(actual, dtype=float) - np.asarray(predictions, dtype=float)
    return np.absolute(errors).mean()


def compute_accuracy(predictions, actual):
    """Return the percentage (0-100) of predictions exactly equal to actual.

    Values are compared by position. Comparing two pandas Series with `==`
    raises when their indexes are not identical, so both inputs are converted
    to NumPy arrays first.
    """
    predictions = np.asarray(predictions)
    actual = np.asarray(actual)
    num_correct = np.sum(predictions == actual)
    return num_correct / len(actual) * 100

In [None]:
y_data_sem = X_test['stream_category']
test_rmse_sem = compute_RMSE(predictions_test_sem['stream_category'], y_data_sem)
print('RMSE: {}'.format(test_rmse_sem))

test_mae_sem = compute_mae(predictions_test_sem['stream_category'], y_data_sem)
print('MAE: {}'.format(test_mae_sem))

test_acc_sem = compute_accuracy(predictions_test_sem['stream_category'], y_data_sem)
print('Accuracy: {}'.format(test_acc_sem))

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, data, random_state=42, test_size=0.20)

print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

In [None]:
from semopy import Model
model5 = Model(mod)
model5.fit(X_train)
predictions_test_sem = model5.predict(X_test)

In [None]:
def compute_RMSE(predictions, actual):
    """Return the root-mean-squared error between predictions and actual values.

    Both inputs are converted to float NumPy arrays so values are paired by
    position. pandas Series passed straight to a NumPy ufunc are aligned on
    their index labels instead, which pairs the wrong rows (or yields NaN)
    whenever the two indexes differ.
    """
    errors = np.asarray(actual, dtype=float) - np.asarray(predictions, dtype=float)
    return np.sqrt(np.square(errors).mean())


def compute_mae(predictions, actual):
    """Return the mean absolute error between predictions and actual values.

    Values are paired by position, not by pandas index label.
    """
    errors = np.asarray(actual, dtype=float) - np.asarray(predictions, dtype=float)
    return np.absolute(errors).mean()


def compute_accuracy(predictions, actual):
    """Return the percentage (0-100) of predictions exactly equal to actual.

    Values are compared by position. Comparing two pandas Series with `==`
    raises when their indexes are not identical, so both inputs are converted
    to NumPy arrays first.
    """
    predictions = np.asarray(predictions)
    actual = np.asarray(actual)
    num_correct = np.sum(predictions == actual)
    return num_correct / len(actual) * 100

In [None]:
y_data_sem = X_test['stream_category']
test_rmse_sem = compute_RMSE(predictions_test_sem['stream_category'], y_data_sem)
print('RMSE: {}'.format(test_rmse_sem))

test_mae_sem = compute_mae(predictions_test_sem['stream_category'], y_data_sem)
print('MAE: {}'.format(test_mae_sem))

test_acc_sem = compute_accuracy(predictions_test_sem['stream_category'], y_data_sem)
print('Accuracy: {}'.format(test_acc_sem))

In [None]:
predictions_test_sem

In [None]:
y_data_sem

In [None]:
# from semopy.examples import political_democracy
# from semopy import ModelMeans
# import numpy as np

# desc = political_democracy.get_model()
# data = political_democracy.get_data()

# i, v = 0, 'x1'
# x = data[v].values[i]

# data[v].values[i] = float('nan')
# model = ModelMeans(desc)
# model.fit(data, )
# preds = model.predict(data)
# diff = np.abs((x - preds[v].values[i])/x)
# print('{:.2f}%'.format(diff * 100))

### intrinsic and extrinsic

In [None]:
data_temp = data_copy
data2 = data_temp.drop(columns=['song_familiarity', 'artist_familiarity', 'bad_good', 'distasteful_tasty', 'dull_exciting', 'tasteless_tasteful', 'unimaginative_creative', 'untalented_talented', 'unpleasant_pleasant', 'forgettable_memorable', 'boring_interesting', 'listen_similar', 'share_friends', 'add_playlist'])

data2

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data2, data2, random_state=42, test_size=0.20)

print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

In [None]:
from semopy import Model
model6 = Model(mod2)
model6.fit(X_train)
predictions_test_sem = model6.predict(X_test)

In [None]:
def compute_RMSE(predictions, actual):
    """Return the root-mean-squared error between predictions and actual values.

    Both inputs are converted to float NumPy arrays so values are paired by
    position. pandas Series passed straight to a NumPy ufunc are aligned on
    their index labels instead, which pairs the wrong rows (or yields NaN)
    whenever the two indexes differ.
    """
    errors = np.asarray(actual, dtype=float) - np.asarray(predictions, dtype=float)
    return np.sqrt(np.square(errors).mean())


def compute_mae(predictions, actual):
    """Return the mean absolute error between predictions and actual values.

    Values are paired by position, not by pandas index label.
    """
    errors = np.asarray(actual, dtype=float) - np.asarray(predictions, dtype=float)
    return np.absolute(errors).mean()


def compute_accuracy(predictions, actual):
    """Return the percentage (0-100) of predictions exactly equal to actual.

    Values are compared by position. Comparing two pandas Series with `==`
    raises when their indexes are not identical, so both inputs are converted
    to NumPy arrays first.
    """
    predictions = np.asarray(predictions)
    actual = np.asarray(actual)
    num_correct = np.sum(predictions == actual)
    return num_correct / len(actual) * 100

In [None]:
y_data_sem = X_test['stream_category']
test_rmse_sem = compute_RMSE(predictions_test_sem['stream_category'], y_data_sem)
print('RMSE: {}'.format(test_rmse_sem))

test_mae_sem = compute_mae(predictions_test_sem['stream_category'], y_data_sem)
print('MAE: {}'.format(test_mae_sem))

test_acc_sem = compute_accuracy(predictions_test_sem['stream_category'], y_data_sem)
print('Accuracy: {}'.format(test_acc_sem))

In [None]:
predictions_test_sem

In [None]:
y_data_sem

## Prediction models (intrinsic, extrinsic, crowdsourced)

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split on the target so both parts keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=42, test_size=0.20, stratify=y_data)

# Standardize the features, fitting the scaler on the training split only.
# The other feature-set experiments in this notebook run exactly this step
# right after their split; without it the models in this section were the only
# ones trained on features that had not been passed through a scaler, which
# matters for the SVM in particular.
# NOTE(review): this changes the numbers reported for this section — confirm
# the omission was not deliberate.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rfc = RandomForestClassifier()
rfc.get_params()

In [None]:
# Candidate values for the random-forest randomized search.
hyperparameters = [
    dict(
        n_estimators=[50, 100, 200, 400, 600, 800, 1000, 1200, 1600, 1800],
        max_depth=[10, 30, 50, 100, 150, 200, 250, 300, 350, None],
        min_samples_split=[2, 4, 6, 10, 15, 20],
        max_leaf_nodes=[100, 300, 500, 700, 900, 1100, 1300, None],
    )
]

In [None]:
from sklearn.model_selection import RandomizedSearchCV
rsrfc = RandomizedSearchCV(rfc, hyperparameters, random_state=42, cv=10, n_iter=50)
rsrfc.fit(X_train, y_train)

In [None]:
best_params_rfc = rsrfc.best_params_
best_params_rfc

In [None]:
# Best estimator found by the random-forest search.
best_model_rfc = rsrfc.best_estimator_

# Predict both splits up front, then score each of them.
predictions_train_rfc = best_model_rfc.predict(X_train)
predictions_test_rfc = best_model_rfc.predict(X_test)

train_rmse_rfc = compute_RMSE(predictions_train_rfc, y_train)
train_mae_rfc = compute_mae(predictions_train_rfc, y_train)
train_acc = compute_accuracy(predictions_train_rfc, y_train)

test_rmse_rfc = compute_RMSE(predictions_test_rfc, y_test)
test_mae_rfc = compute_mae(predictions_test_rfc, y_test)
test_acc = compute_accuracy(predictions_test_rfc, y_test)

# One line per metric: training metrics first, then test metrics.
print(
    'Training RMSE: {}'.format(train_rmse_rfc),
    'Training MAE: {}'.format(train_mae_rfc),
    'Training Accuracy: {}'.format(train_acc),
    'Test RMSE: {}'.format(test_rmse_rfc),
    'Test MAE: {}'.format(test_mae_rfc),
    'Test Accuracy: {}'.format(test_acc),
    sep='\n',
)

In [None]:
best_model_rfc_3 = best_model_rfc

### SVM

In [None]:
from sklearn.svm import SVC

In [None]:
svm = SVC()
svm.get_params()

In [None]:
# Candidate values for the SVM randomized search.
hyperparameters = [
    dict(
        C=[0.1, 1, 100, 1000],
        kernel=['rbf', 'poly', 'sigmoid', 'linear'],
        degree=[1, 2, 3, 4, 5, 6],
    )
]

In [None]:
rssvm = RandomizedSearchCV(svm, hyperparameters, random_state=42, cv=10, n_iter=50)
rssvm.fit(X_train, y_train)

In [None]:
best_params_svm = rssvm.best_params_
best_params_svm

In [None]:
# Best estimator found by the SVM search.
best_model_svm = rssvm.best_estimator_

# Predict both splits up front, then score each of them.
predictions_train_svm = best_model_svm.predict(X_train)
predictions_test_svm = best_model_svm.predict(X_test)

train_rmse_svm = compute_RMSE(predictions_train_svm, y_train)
train_mae_svm = compute_mae(predictions_train_svm, y_train)
train_acc = compute_accuracy(predictions_train_svm, y_train)

test_rmse_svm = compute_RMSE(predictions_test_svm, y_test)
test_mae_svm = compute_mae(predictions_test_svm, y_test)
test_acc = compute_accuracy(predictions_test_svm, y_test)

# One line per metric: training metrics first, then test metrics.
print(
    'Training RMSE: {}'.format(train_rmse_svm),
    'Training MAE: {}'.format(train_mae_svm),
    'Training Accuracy: {}'.format(train_acc),
    'Test RMSE: {}'.format(test_rmse_svm),
    'Test MAE: {}'.format(test_mae_svm),
    'Test Accuracy: {}'.format(test_acc),
    sep='\n',
)

In [None]:
best_model_svm_3 = best_model_svm

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dt = DecisionTreeClassifier()
dt.get_params()

In [None]:
# Candidate values for the decision-tree randomized search.
hyperparameters = [
    dict(
        min_impurity_decrease=[0.001, 0.01, 0.05, 0.1, 0.3, 0.5],
        max_depth=[5, 10, 20, 30],
        min_samples_split=[2, 4, 6, 10, 15, 20],
        max_leaf_nodes=[3, 5, 10, 20, 50, 100],
    )
]

In [None]:
rsdt = RandomizedSearchCV(dt, hyperparameters, random_state=42, cv=10, n_iter=50)
rsdt.fit(X_train, y_train)

In [None]:
best_params_dt = rsdt.best_params_
best_params_dt

In [None]:
# Best estimator found by the decision-tree search.
best_model_dt = rsdt.best_estimator_

# Predict both splits up front, then score each of them.
predictions_train_dt = best_model_dt.predict(X_train)
predictions_test_dt = best_model_dt.predict(X_test)

train_rmse_dt = compute_RMSE(predictions_train_dt, y_train)
train_mae_dt = compute_mae(predictions_train_dt, y_train)
train_acc_dt = compute_accuracy(predictions_train_dt, y_train)

test_rmse_dt = compute_RMSE(predictions_test_dt, y_test)
test_mae_dt = compute_mae(predictions_test_dt, y_test)
test_acc_dt = compute_accuracy(predictions_test_dt, y_test)

# One line per metric: training metrics first, then test metrics.
print(
    'Training RMSE: {}'.format(train_rmse_dt),
    'Training MAE: {}'.format(train_mae_dt),
    'Training Accuracy: {}'.format(train_acc_dt),
    'Test RMSE: {}'.format(test_rmse_dt),
    'Test MAE: {}'.format(test_mae_dt),
    'Test Accuracy: {}'.format(test_acc_dt),
    sep='\n',
)

In [None]:
best_model_dt_3 = best_model_dt

## Prediction models (intrinsic and extrinsic)

In [None]:
data_temp = data_copy
data2 = data_temp.drop(columns=['song_familiarity', 'artist_familiarity', 'bad_good', 'distasteful_tasty', 'dull_exciting', 'tasteless_tasteful', 'unimaginative_creative', 'untalented_talented', 'unpleasant_pleasant', 'forgettable_memorable', 'boring_interesting', 'listen_similar', 'share_friends', 'add_playlist'])
X_data =  data2.values[:,:-1]
y_data = data2.values[:,-1]

print(X_data.shape)
print(y_data.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=42, test_size=0.20, stratify=y_data)

print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Random Forest 

In [None]:
rfc = RandomForestClassifier()
rfc.get_params()

In [None]:
hyperparameters = [
        {
        'n_estimators' : [50, 100, 200, 400, 600, 800, 1000, 1200, 1600, 1800],
        'max_depth' : [10, 30, 50, 100, 150, 200, 250, 300, 350, None],
        'min_samples_split' : [2, 4, 6, 10, 15, 20],
        'max_leaf_nodes' : [100, 300, 500, 700, 900, 1100, 1300, None]
    }
]

In [None]:
rsrfc = RandomizedSearchCV(rfc, hyperparameters, random_state=42, cv=10, n_iter=50)
rsrfc.fit(X_train, y_train)

In [None]:
best_params_rfc = rsrfc.best_params_
best_params_rfc

In [None]:
best_model_rfc = rsrfc.best_estimator_

predictions_train_rfc = best_model_rfc.predict(X_train)

train_rmse_rfc = compute_RMSE(predictions_train_rfc, y_train)
print('Training RMSE: {}'.format(train_rmse_rfc))

train_mae_rfc = compute_mae(predictions_train_rfc, y_train)
print('Training MAE: {}'.format(train_mae_rfc))

train_acc = compute_accuracy(predictions_train_rfc, y_train)
print('Training Accuracy: {}'.format(train_acc))

predictions_test_rfc = best_model_rfc.predict(X_test)

test_rmse_rfc = compute_RMSE(predictions_test_rfc, y_test)
print('Test RMSE: {}'.format(test_rmse_rfc))

test_mae_rfc = compute_mae(predictions_test_rfc, y_test)
print('Test MAE: {}'.format(test_mae_rfc))

test_acc = compute_accuracy(predictions_test_rfc, y_test)
print('Test Accuracy: {}'.format(test_acc))

In [None]:
best_model_rfc_2 = best_model_rfc

### SVM

In [None]:
svm = SVC()
svm.get_params()

In [None]:
hyperparameters = [
        {
        'C' : [0.1, 1, 100, 1000],
        'kernel' : ['rbf', 'poly', 'sigmoid', 'linear'],
        'degree' : [1, 2, 3, 4, 5, 6]
    }
]

In [None]:
rssvm = RandomizedSearchCV(svm, hyperparameters, random_state=42, cv=10, n_iter=50)
rssvm.fit(X_train, y_train)

In [None]:
best_params_svm = rssvm.best_params_
best_params_svm

In [None]:
best_model_svm = rssvm.best_estimator_

predictions_train_svm = best_model_svm.predict(X_train)

train_rmse_svm = compute_RMSE(predictions_train_svm, y_train)
print('Training RMSE: {}'.format(train_rmse_svm))

train_mae_svm = compute_mae(predictions_train_svm, y_train)
print('Training MAE: {}'.format(train_mae_svm))

train_acc = compute_accuracy(predictions_train_svm, y_train)
print('Training Accuracy: {}'.format(train_acc))

predictions_test_svm = best_model_svm.predict(X_test)

test_rmse_svm = compute_RMSE(predictions_test_svm, y_test)
print('Test RMSE: {}'.format(test_rmse_svm))

test_mae_svm = compute_mae(predictions_test_svm, y_test)
print('Test MAE: {}'.format(test_mae_svm))

test_acc = compute_accuracy(predictions_test_svm, y_test)
print('Test Accuracy: {}'.format(test_acc))

In [None]:
best_model_svm_2 = best_model_svm

### Decision Tree

In [None]:
dt = DecisionTreeClassifier()
dt.get_params()

In [None]:
hyperparameters = [
    {
        'min_impurity_decrease': [0.001, 0.01, 0.05, 0.1, 0.3, 0.5],
        'max_depth': [5, 10, 20, 30],
        'min_samples_split': [2, 4, 6, 10, 15, 20],
        'max_leaf_nodes': [3, 5, 10, 20, 50, 100]
    }
]

In [None]:
rsdt = RandomizedSearchCV(dt, hyperparameters, random_state=42, cv=10, n_iter=50)
rsdt.fit(X_train, y_train)

In [None]:
best_params_dt = rsdt.best_params_
best_params_dt

In [None]:
best_model_dt = rsdt.best_estimator_

predictions_train_dt = best_model_dt.predict(X_train)

train_rmse_dt = compute_RMSE(predictions_train_dt, y_train)
print('Training RMSE: {}'.format(train_rmse_dt))

train_mae_dt = compute_mae(predictions_train_dt, y_train)
print('Training MAE: {}'.format(train_mae_dt))

train_acc_dt = compute_accuracy(predictions_train_dt, y_train)
print('Training Accuracy: {}'.format(train_acc_dt))

predictions_test_dt = best_model_dt.predict(X_test)

test_rmse_dt = compute_RMSE(predictions_test_dt, y_test)
print('Test RMSE: {}'.format(test_rmse_dt))

test_mae_dt = compute_mae(predictions_test_dt, y_test)
print('Test MAE: {}'.format(test_mae_dt))

test_acc_dt = compute_accuracy(predictions_test_dt, y_test)
print('Test Accuracy: {}'.format(test_acc_dt))

In [None]:
best_model_dt_2 = best_model_dt

## Prediction models (extrinsic only)

In [None]:
# data = data2
# data = data.drop(columns=['song_familiarity', 'artist_familiarity', 'bad_good', 'distasteful_tasty', 'dull_exciting', 'tasteless_tasteful', 'unimaginative_creative', 'untalented_talented', 'unpleasant_pleasant', 'forgettable_memorable', 'boring_interesting', 'listen_similar', 'share_friends', 'add_playlist'])
# data = data.drop(columns=['danceability', 'key', 'loudness', 'speechiness', 'liveness', 'valence', 'tempo', 'duration_ms'])
# X_data =  data.values[:,:-1]
# y_data = data.values[:,-1]

# print(X_data.shape)
# print(y_data.shape)

In [None]:
data_temp = data_copy
data2 = data_temp.drop(columns=['danceability', 'key', 'loudness', 'speechiness', 'liveness', 'valence', 'tempo', 'duration_ms', 'song_familiarity', 'artist_familiarity', 'bad_good', 'distasteful_tasty', 'dull_exciting', 'tasteless_tasteful', 'unimaginative_creative', 'untalented_talented', 'unpleasant_pleasant', 'forgettable_memorable', 'boring_interesting', 'listen_similar', 'share_friends', 'add_playlist'])
X_data =  data2.values[:,:-1]
y_data = data2.values[:,-1]

print(X_data.shape)
print(y_data.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=42, test_size=0.20, stratify=y_data)

print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Random Forest 

In [None]:
rfc = RandomForestClassifier()
rfc.get_params()

In [None]:
hyperparameters = [
        {
        'n_estimators' : [50, 100, 200, 400, 600, 800, 1000, 1200, 1600, 1800],
        'max_depth' : [10, 30, 50, 100, 150, 200, 250, 300, 350, None],
        'min_samples_split' : [2, 4, 6, 10, 15, 20],
        'max_leaf_nodes' : [100, 300, 500, 700, 900, 1100, 1300, None]
    }
]

In [None]:
rsrfc = RandomizedSearchCV(rfc, hyperparameters, random_state=42, cv=10, n_iter=50)
rsrfc.fit(X_train, y_train)

In [None]:
best_params_rfc = rsrfc.best_params_
best_params_rfc

In [None]:
best_model_rfc = rsrfc.best_estimator_

predictions_train_rfc = best_model_rfc.predict(X_train)

train_rmse_rfc = compute_RMSE(predictions_train_rfc, y_train)
print('Training RMSE: {}'.format(train_rmse_rfc))

train_mae_rfc = compute_mae(predictions_train_rfc, y_train)
print('Training MAE: {}'.format(train_mae_rfc))

train_acc = compute_accuracy(predictions_train_rfc, y_train)
print('Training Accuracy: {}'.format(train_acc))

predictions_test_rfc = best_model_rfc.predict(X_test)

test_rmse_rfc = compute_RMSE(predictions_test_rfc, y_test)
print('Test RMSE: {}'.format(test_rmse_rfc))

test_mae_rfc = compute_mae(predictions_test_rfc, y_test)
print('Test MAE: {}'.format(test_mae_rfc))

test_acc = compute_accuracy(predictions_test_rfc, y_test)
print('Test Accuracy: {}'.format(test_acc))

### SVM

In [None]:
svm = SVC()
svm.get_params()

In [None]:
hyperparameters = [
        {
        'C' : [0.1, 1, 100, 1000],
        'kernel' : ['rbf', 'poly', 'sigmoid', 'linear'],
        'degree' : [1, 2, 3, 4, 5, 6]
    }
]

In [None]:
rssvm = RandomizedSearchCV(svm, hyperparameters, random_state=42, cv=10, n_iter=50)
rssvm.fit(X_train, y_train)

In [None]:
best_params_svm = rssvm.best_params_
best_params_svm

In [None]:
best_model_svm = rssvm.best_estimator_

predictions_train_svm = best_model_svm.predict(X_train)

train_rmse_svm = compute_RMSE(predictions_train_svm, y_train)
print('Training RMSE: {}'.format(train_rmse_svm))

train_mae_svm = compute_mae(predictions_train_svm, y_train)
print('Training MAE: {}'.format(train_mae_svm))

train_acc = compute_accuracy(predictions_train_svm, y_train)
print('Training Accuracy: {}'.format(train_acc))

predictions_test_svm = best_model_svm.predict(X_test)

test_rmse_svm = compute_RMSE(predictions_test_svm, y_test)
print('Test RMSE: {}'.format(test_rmse_svm))

test_mae_svm = compute_mae(predictions_test_svm, y_test)
print('Test MAE: {}'.format(test_mae_svm))

test_acc = compute_accuracy(predictions_test_svm, y_test)
print('Test Accuracy: {}'.format(test_acc))

### Decision Tree

In [None]:
dt = DecisionTreeClassifier()
dt.get_params()

In [None]:
hyperparameters = [
    {
        'min_impurity_decrease': [0.001, 0.01, 0.05, 0.1, 0.3, 0.5],
        'max_depth': [5, 10, 20, 30],
        'min_samples_split': [2, 4, 6, 10, 15, 20],
        'max_leaf_nodes': [3, 5, 10, 20, 50, 100]
    }
]

In [None]:
rsdt = RandomizedSearchCV(dt, hyperparameters, random_state=42, cv=10, n_iter=50)
rsdt.fit(X_train, y_train)

In [None]:
best_params_dt = rsdt.best_params_
best_params_dt

In [None]:
best_model_dt = rsdt.best_estimator_

predictions_train_dt = best_model_dt.predict(X_train)

train_rmse_dt = compute_RMSE(predictions_train_dt, y_train)
print('Training RMSE: {}'.format(train_rmse_dt))

train_mae_dt = compute_mae(predictions_train_dt, y_train)
print('Training MAE: {}'.format(train_mae_dt))

train_acc_dt = compute_accuracy(predictions_train_dt, y_train)
print('Training Accuracy: {}'.format(train_acc_dt))

predictions_test_dt = best_model_dt.predict(X_test)

test_rmse_dt = compute_RMSE(predictions_test_dt, y_test)
print('Test RMSE: {}'.format(test_rmse_dt))

test_mae_dt = compute_mae(predictions_test_dt, y_test)
print('Test MAE: {}'.format(test_mae_dt))

test_acc_dt = compute_accuracy(predictions_test_dt, y_test)
print('Test Accuracy: {}'.format(test_acc_dt))

## Prediction models (intrinsic only)

In [None]:
# data = data2
# data = data.drop(columns=['song_familiarity', 'artist_familiarity', 'bad_good', 'distasteful_tasty', 'dull_exciting', 'tasteless_tasteful', 'unimaginative_creative', 'untalented_talented', 'unpleasant_pleasant', 'forgettable_memorable', 'boring_interesting', 'listen_similar', 'share_friends', 'add_playlist'])
# data = data.drop(columns=['music_label', 'artist_collab', 'chart_history', 'mean_trends'])
# X_data =  data.values[:,:-1]
# y_data = data.values[:,-1]

# print(X_data.shape)
# print(y_data.shape)

In [None]:
data_temp = data_copy
data2 = data_temp.drop(columns=['music_label', 'artist_collab', 'chart_history', 'mean_trends', 'song_familiarity', 'artist_familiarity', 'bad_good', 'distasteful_tasty', 'dull_exciting', 'tasteless_tasteful', 'unimaginative_creative', 'untalented_talented', 'unpleasant_pleasant', 'forgettable_memorable', 'boring_interesting', 'listen_similar', 'share_friends', 'add_playlist'])
X_data =  data2.values[:,:-1]
y_data = data2.values[:,-1]

print(X_data.shape)
print(y_data.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=42, test_size=0.20, stratify=y_data)

print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Random Forest 

In [None]:
rfc = RandomForestClassifier()
rfc.get_params()

In [None]:
hyperparameters = [
        {
        'n_estimators' : [50, 100, 200, 400, 600, 800, 1000, 1200, 1600, 1800],
        'max_depth' : [10, 30, 50, 100, 150, 200, 250, 300, 350, None],
        'min_samples_split' : [2, 4, 6, 10, 15, 20],
        'max_leaf_nodes' : [100, 300, 500, 700, 900, 1100, 1300, None]
    }
]

In [None]:
rsrfc = RandomizedSearchCV(rfc, hyperparameters, random_state=42, cv=10, n_iter=50)
rsrfc.fit(X_train, y_train)

In [None]:
best_params_rfc = rsrfc.best_params_
best_params_rfc

In [None]:
best_model_rfc = rsrfc.best_estimator_

predictions_train_rfc = best_model_rfc.predict(X_train)

train_rmse_rfc = compute_RMSE(predictions_train_rfc, y_train)
print('Training RMSE: {}'.format(train_rmse_rfc))

train_mae_rfc = compute_mae(predictions_train_rfc, y_train)
print('Training MAE: {}'.format(train_mae_rfc))

train_acc = compute_accuracy(predictions_train_rfc, y_train)
print('Training Accuracy: {}'.format(train_acc))

predictions_test_rfc = best_model_rfc.predict(X_test)

test_rmse_rfc = compute_RMSE(predictions_test_rfc, y_test)
print('Test RMSE: {}'.format(test_rmse_rfc))

test_mae_rfc = compute_mae(predictions_test_rfc, y_test)
print('Test MAE: {}'.format(test_mae_rfc))

test_acc = compute_accuracy(predictions_test_rfc, y_test)
print('Test Accuracy: {}'.format(test_acc))

### SVM

In [None]:
svm = SVC()
svm.get_params()

In [None]:
hyperparameters = [
        {
        'C' : [0.1, 1, 100, 1000],
        'kernel' : ['rbf', 'poly', 'sigmoid', 'linear'],
        'degree' : [1, 2, 3, 4, 5, 6]
    }
]

In [None]:
rssvm = RandomizedSearchCV(svm, hyperparameters, random_state=42, cv=10, n_iter=50)
rssvm.fit(X_train, y_train)

In [None]:
best_params_svm = rssvm.best_params_
best_params_svm

In [None]:
best_model_svm = rssvm.best_estimator_

predictions_train_svm = best_model_svm.predict(X_train)

train_rmse_svm = compute_RMSE(predictions_train_svm, y_train)
print('Training RMSE: {}'.format(train_rmse_svm))

train_mae_svm = compute_mae(predictions_train_svm, y_train)
print('Training MAE: {}'.format(train_mae_svm))

train_acc = compute_accuracy(predictions_train_svm, y_train)
print('Training Accuracy: {}'.format(train_acc))

predictions_test_svm = best_model_svm.predict(X_test)

test_rmse_svm = compute_RMSE(predictions_test_svm, y_test)
print('Test RMSE: {}'.format(test_rmse_svm))

test_mae_svm = compute_mae(predictions_test_svm, y_test)
print('Test MAE: {}'.format(test_mae_svm))

test_acc = compute_accuracy(predictions_test_svm, y_test)
print('Test Accuracy: {}'.format(test_acc))

### Decision Tree

In [None]:
dt = DecisionTreeClassifier()
dt.get_params()

In [None]:
hyperparameters = [
    {
        'min_impurity_decrease': [0.001, 0.01, 0.05, 0.1, 0.3, 0.5],
        'max_depth': [5, 10, 20, 30],
        'min_samples_split': [2, 4, 6, 10, 15, 20],
        'max_leaf_nodes': [3, 5, 10, 20, 50, 100]
    }
]

In [None]:
rsdt = RandomizedSearchCV(dt, hyperparameters, random_state=42, cv=10, n_iter=50)
rsdt.fit(X_train, y_train)

In [None]:
best_params_dt = rsdt.best_params_
best_params_dt

In [None]:
best_model_dt = rsdt.best_estimator_

predictions_train_dt = best_model_dt.predict(X_train)

train_rmse_dt = compute_RMSE(predictions_train_dt, y_train)
print('Training RMSE: {}'.format(train_rmse_dt))

train_mae_dt = compute_mae(predictions_train_dt, y_train)
print('Training MAE: {}'.format(train_mae_dt))

train_acc_dt = compute_accuracy(predictions_train_dt, y_train)
print('Training Accuracy: {}'.format(train_acc_dt))

predictions_test_dt = best_model_dt.predict(X_test)

test_rmse_dt = compute_RMSE(predictions_test_dt, y_test)
print('Test RMSE: {}'.format(test_rmse_dt))

test_mae_dt = compute_mae(predictions_test_dt, y_test)
print('Test MAE: {}'.format(test_mae_dt))

test_acc_dt = compute_accuracy(predictions_test_dt, y_test)
print('Test Accuracy: {}'.format(test_acc_dt))

## Prediction models (crowdsource only)

In [None]:
# data = data2
# data = data.drop(columns=['song_familiarity', 'artist_familiarity', 'bad_good', 'distasteful_tasty', 'dull_exciting', 'tasteless_tasteful', 'unimaginative_creative', 'untalented_talented', 'unpleasant_pleasant', 'forgettable_memorable', 'boring_interesting', 'listen_similar', 'share_friends', 'add_playlist'])
# data = data.drop(columns=['music_label', 'artist_collab', 'chart_history', 'mean_trends'])
# X_data =  data.values[:,:-1]
# y_data = data.values[:,-1]

# print(X_data.shape)
# print(y_data.shape)

In [None]:
# Crowdsource-only feature set: remove every extrinsic and intrinsic column
# from the shared frame; the last remaining column is taken as the target.
data_temp = data_copy
data2 = data_temp.drop(
    columns=[
        # extrinsic indicators
        'music_label', 'artist_collab', 'chart_history', 'mean_trends',
        # intrinsic (audio) indicators
        'danceability', 'key', 'loudness', 'speechiness', 'liveness', 'valence', 'tempo', 'duration_ms',
    ]
)
X_data, y_data = data2.values[:, :-1], data2.values[:, -1]

print(X_data.shape, y_data.shape, sep='\n')

In [None]:
# 80/20 split, stratified on the target so both parts keep the class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.20,
    stratify=y_data,
    random_state=42,
)

# Report the shape of each part.
print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Random Forest 

In [None]:
# Random forest built with no constructor arguments; the bare last expression
# displays the estimator's parameters so the tunable names are visible.
rfc = RandomForestClassifier()
rfc.get_params()

In [None]:
# Candidate values for the random-forest search, kept as a one-element list of
# parameter dictionaries.
hyperparameters = [
    dict(
        n_estimators=[50, 100, 200, 400, 600, 800, 1000, 1200, 1600, 1800],
        max_depth=[10, 30, 50, 100, 150, 200, 250, 300, 350, None],
        min_samples_split=[2, 4, 6, 10, 15, 20],
        max_leaf_nodes=[100, 300, 500, 700, 900, 1100, 1300, None],
    ),
]

In [None]:
# Randomized hyperparameter search for the random forest: 50 sampled
# candidates, each scored with 10-fold cross-validation on the training split.
rsrfc = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=hyperparameters,
    n_iter=50,
    cv=10,
    random_state=42,
)
rsrfc.fit(X_train, y_train)

In [None]:
# Keep the winning parameter combination from the random-forest search and
# display it as the cell output.
best_params_rfc = rsrfc.best_params_
best_params_rfc

In [None]:
# Best random forest found by the randomized search.
best_model_rfc = rsrfc.best_estimator_

# Training-split scores from the notebook's metric helpers.
predictions_train_rfc = best_model_rfc.predict(X_train)
train_rmse_rfc, train_mae_rfc, train_acc = (
    metric(predictions_train_rfc, y_train)
    for metric in (compute_RMSE, compute_mae, compute_accuracy)
)
print('Training RMSE: {}'.format(train_rmse_rfc))
print('Training MAE: {}'.format(train_mae_rfc))
print('Training Accuracy: {}'.format(train_acc))

# Same three scores on the held-out test split.
predictions_test_rfc = best_model_rfc.predict(X_test)
test_rmse_rfc, test_mae_rfc, test_acc = (
    metric(predictions_test_rfc, y_test)
    for metric in (compute_RMSE, compute_mae, compute_accuracy)
)
print('Test RMSE: {}'.format(test_rmse_rfc))
print('Test MAE: {}'.format(test_mae_rfc))
print('Test Accuracy: {}'.format(test_acc))

### SVM

In [None]:
# Support-vector classifier built with no constructor arguments; the bare last
# expression displays the estimator's parameters so the tunable names are visible.
svm = SVC()
svm.get_params()

In [None]:
# Candidate values for the SVM search, kept as a one-element list of parameter
# dictionaries.
hyperparameters = [
    dict(
        C=[0.1, 1, 100, 1000],
        kernel=['rbf', 'poly', 'sigmoid', 'linear'],
        degree=[1, 2, 3, 4, 5, 6],
    ),
]

In [None]:
# Randomized hyperparameter search for the SVM: 50 sampled candidates, each
# scored with 10-fold cross-validation on the training split.
rssvm = RandomizedSearchCV(
    estimator=svm,
    param_distributions=hyperparameters,
    n_iter=50,
    cv=10,
    random_state=42,
)
rssvm.fit(X_train, y_train)

In [None]:
# Keep the winning parameter combination from the SVM search and display it as
# the cell output.
best_params_svm = rssvm.best_params_
best_params_svm

In [None]:
# Best SVM found by the randomized search.
best_model_svm = rssvm.best_estimator_

# Training-split scores from the notebook's metric helpers.
predictions_train_svm = best_model_svm.predict(X_train)
train_rmse_svm, train_mae_svm, train_acc = (
    metric(predictions_train_svm, y_train)
    for metric in (compute_RMSE, compute_mae, compute_accuracy)
)
print('Training RMSE: {}'.format(train_rmse_svm))
print('Training MAE: {}'.format(train_mae_svm))
print('Training Accuracy: {}'.format(train_acc))

# Same three scores on the held-out test split.
predictions_test_svm = best_model_svm.predict(X_test)
test_rmse_svm, test_mae_svm, test_acc = (
    metric(predictions_test_svm, y_test)
    for metric in (compute_RMSE, compute_mae, compute_accuracy)
)
print('Test RMSE: {}'.format(test_rmse_svm))
print('Test MAE: {}'.format(test_mae_svm))
print('Test Accuracy: {}'.format(test_acc))

### Decision Tree

In [None]:
# Fresh decision tree for the crowdsource-only feature set, built with no
# constructor arguments; the last expression displays its parameters.
dt = DecisionTreeClassifier()
dt.get_params()

In [None]:
# Candidate values for the decision-tree search, kept as a one-element list of
# parameter dictionaries.
hyperparameters = [
    dict(
        min_impurity_decrease=[0.001, 0.01, 0.05, 0.1, 0.3, 0.5],
        max_depth=[5, 10, 20, 30],
        min_samples_split=[2, 4, 6, 10, 15, 20],
        max_leaf_nodes=[3, 5, 10, 20, 50, 100],
    ),
]

In [None]:
# Randomized hyperparameter search for the decision tree: 50 sampled
# candidates, each scored with 10-fold cross-validation on the training split.
rsdt = RandomizedSearchCV(
    estimator=dt,
    param_distributions=hyperparameters,
    n_iter=50,
    cv=10,
    random_state=42,
)
rsdt.fit(X_train, y_train)

In [None]:
# Keep the winning parameter combination from the decision-tree search and
# display it as the cell output.
best_params_dt = rsdt.best_params_
best_params_dt

In [None]:
# Best decision tree found by the randomized search.
best_model_dt = rsdt.best_estimator_

# Training-split scores from the notebook's metric helpers.
predictions_train_dt = best_model_dt.predict(X_train)
train_rmse_dt, train_mae_dt, train_acc_dt = (
    metric(predictions_train_dt, y_train)
    for metric in (compute_RMSE, compute_mae, compute_accuracy)
)
print('Training RMSE: {}'.format(train_rmse_dt))
print('Training MAE: {}'.format(train_mae_dt))
print('Training Accuracy: {}'.format(train_acc_dt))

# Same three scores on the held-out test split.
predictions_test_dt = best_model_dt.predict(X_test)
test_rmse_dt, test_mae_dt, test_acc_dt = (
    metric(predictions_test_dt, y_test)
    for metric in (compute_RMSE, compute_mae, compute_accuracy)
)
print('Test RMSE: {}'.format(test_rmse_dt))
print('Test MAE: {}'.format(test_mae_dt))
print('Test Accuracy: {}'.format(test_acc_dt))

# Out of Time testing

Evaluate the out-of-time data (`complete_data_term3run.csv`) using the best models from the **crowdsource+extrinsic+intrinsic** and **intrinsic+extrinsic** feature sets:

best_model_rfc_3, best_model_svm_3, best_model_dt_3

best_model_rfc_2, best_model_svm_2, best_model_dt_2

In [None]:
# Load the out-of-time data set, using the first CSV column as the index.
# This rebinds `data`, replacing the frame used earlier in the notebook.
data = pd.read_csv('complete_data_term3run.csv', index_col=0)
print(data)

In [None]:
# Remove the identifying text columns, then treat every column but the last as
# features and the last column as the target.
# NOTE(review): X_data is used unscaled below, while the model sections visible
# in this notebook fit a StandardScaler before training - confirm the *_2
# models expect raw features.
data = data.drop(columns=['title', 'artist/s'])
X_data, y_data = data.values[:, :-1], data.values[:, -1]

print(X_data.shape, y_data.shape, sep='\n')

In [None]:
# predictions_rfc_3 = best_model_rfc_3.predict(X_data)

# test_rmse_rfc_3 = compute_RMSE(predictions_rfc_3, y_data)
# print('Test RMSE: {}'.format(test_rmse_rfc_3))

# test_mae_rfc_3 = compute_mae(predictions_rfc_3, y_data)
# print('Test MAE: {}'.format(test_mae_rfc_3))

# test_acc_rfc_3 = compute_accuracy(predictions_rfc_3, y_data)
# print('Test Accuracy: {}'.format(test_acc_rfc_3))

In [None]:
# predictions_svm_3 = best_model_svm_3.predict(X_data)

# test_rmse_svm_3 = compute_RMSE(predictions_svm_3, y_data)
# print('Test RMSE: {}'.format(test_rmse_svm_3))

# test_mae_svm_3 = compute_mae(predictions_svm_3, y_data)
# print('Test MAE: {}'.format(test_mae_svm_3))

# test_acc_svm_3 = compute_accuracy(predictions_svm_3, y_data)
# print('Test Accuracy: {}'.format(test_acc_svm_3))

In [None]:
# predictions_dt_3 = best_model_dt_3.predict(X_data)

# test_rmse_dt_3 = compute_RMSE(predictions_dt_3, y_data)
# print('Test RMSE: {}'.format(test_rmse_dt_3))

# test_mae_dt_3 = compute_mae(predictions_dt_3, y_data)
# print('Test MAE: {}'.format(test_mae_dt_3))

# test_acc_dt_3 = compute_accuracy(predictions_dt_3, y_data)
# print('Test Accuracy: {}'.format(test_acc_dt_3))

In [None]:
# Out-of-time scores for best_model_rfc_2 from the notebook's metric helpers.
predictions_rfc_2 = best_model_rfc_2.predict(X_data)
test_rmse_rfc_2, test_mae_rfc_2, test_acc_rfc_2 = (
    metric(predictions_rfc_2, y_data)
    for metric in (compute_RMSE, compute_mae, compute_accuracy)
)
print('Test RMSE: {}'.format(test_rmse_rfc_2))
print('Test MAE: {}'.format(test_mae_rfc_2))
print('Test Accuracy: {}'.format(test_acc_rfc_2))

In [None]:
# Out-of-time scores for best_model_svm_2 from the notebook's metric helpers.
predictions_svm_2 = best_model_svm_2.predict(X_data)
test_rmse_svm_2, test_mae_svm_2, test_acc_svm_2 = (
    metric(predictions_svm_2, y_data)
    for metric in (compute_RMSE, compute_mae, compute_accuracy)
)
print('Test RMSE: {}'.format(test_rmse_svm_2))
print('Test MAE: {}'.format(test_mae_svm_2))
print('Test Accuracy: {}'.format(test_acc_svm_2))

In [None]:
# Out-of-time scores for best_model_dt_2 from the notebook's metric helpers.
predictions_dt_2 = best_model_dt_2.predict(X_data)
test_rmse_dt_2, test_mae_dt_2, test_acc_dt_2 = (
    metric(predictions_dt_2, y_data)
    for metric in (compute_RMSE, compute_mae, compute_accuracy)
)
print('Test RMSE: {}'.format(test_rmse_dt_2))
print('Test MAE: {}'.format(test_mae_dt_2))
print('Test Accuracy: {}'.format(test_acc_dt_2))

In [None]:
# Per-feature importances reported by best_model_rfc_2, shown as a table.
feature_importance = best_model_rfc_2.feature_importances_
# X_data is a NumPy array (data.values[:, :-1]) and has no `.columns`; take the
# matching feature names from the DataFrame it was sliced from.
df_rfc_importance = pd.DataFrame(data=feature_importance, index=data.columns[:-1], columns=["importance"])
df_rfc_importance

In [None]:
# SVC has no `feature_importances_` attribute, so the original lookup raised
# AttributeError. Permutation importance works for any fitted estimator: it is
# the mean drop in score when one feature column is shuffled.
from sklearn.inspection import permutation_importance

feature_importance = permutation_importance(
    best_model_svm_2, X_data, y_data, n_repeats=10, random_state=42
).importances_mean
# X_data is a NumPy array (data.values[:, :-1]) and has no `.columns`; take the
# matching feature names from the DataFrame it was sliced from.
df_svm_importance = pd.DataFrame(data=feature_importance, index=data.columns[:-1], columns=["importance"])
df_svm_importance

In [None]:
# Per-feature importances reported by best_model_dt_2, shown as a table.
feature_importance = best_model_dt_2.feature_importances_
# X_data is a NumPy array (data.values[:, :-1]) and has no `.columns`; take the
# matching feature names from the DataFrame it was sliced from.
df_dt_importance = pd.DataFrame(data=feature_importance, index=data.columns[:-1], columns=["importance"])
df_dt_importance

## SEM prediction

In [None]:
# semopy's Model.predict takes a DataFrame and matches inputs to model
# variables by column name, so the bare NumPy array X_data cannot be used.
# Pass the same feature columns (everything but the last one) as a DataFrame.
# NOTE(review): assumes model6 is a fitted semopy model whose observed
# variables are named like the columns of `data` - confirm.
predictions_test_sem = model6.predict(data.iloc[:, :-1])

In [None]:
# Score the SEM's predicted 'stream_category' column against the out-of-time
# targets with the notebook's metric helpers.
y_data_sem = y_data
test_rmse_sem, test_mae_sem, test_acc_sem = (
    metric(predictions_test_sem['stream_category'], y_data_sem)
    for metric in (compute_RMSE, compute_mae, compute_accuracy)
)
print('RMSE: {}'.format(test_rmse_sem))
print('MAE: {}'.format(test_mae_sem))
print('Accuracy: {}'.format(test_acc_sem))

In [None]:
# Display the SEM prediction output for inspection.
predictions_test_sem

In [None]:
# Display the target values the SEM predictions were scored against.
y_data_sem