In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

In [2]:
# This cell contains "things" that are common to both
# versions. 

# Function for Grouping
# I didn't really think about how it should be grouped,
# I was just trying to create as many classes as possible.
# So, treat these groups as a mere example.
def group(streams):
    # Each group has a lower and upper bound.
    groups = [(0, 1000000, "Group A"), (1000000, 5000000, "Group B"), (5000000, 10000000, "Group C"), (10000000, 100000000, "Group D"),]
    for l, r, groupName in groups:
        if l <= streams <= r:
            return groupName
    return "Group E"

# Non-Numerical Features; do we need track??
dropFeatures = ['Unnamed: 0', 'Artist', 'Url_spotify', 'Track', 'Album', 'Album_type', 'Uri', 'Title', 'Channel', 'Views', 'Likes', 'Comments', 'Description', 'Licensed', 'official_video', 'Url_youtube']

# Data
data = pd.read_csv("data/Spotify_Youtube.csv")

# for column in categorical_columns:
#     data[column] = LabelEncoder().fit_transform(data[column])

data.drop(dropFeatures, axis=1, inplace=True)

data.dropna(inplace=True)

In [3]:
# Classification (Grouping Done Before)
# I think Jaden proposed that we partition the streams beforehand and use classification.
# So, here is something like that.

# x = data.drop("Stream", axis=1) 
# x_scaler = StandardScaler()
# x_scaled = x_scaler.fit_transform(x)

# y = data["Stream"].values.reshape(-1, 1)
# y_scaler = StandardScaler()
# y_scaled = y_scaler.fit_transform(y)

x = data.drop(columns=["Stream"])
y = data["Stream"]
x_scaled = StandardScaler().fit_transform(x)
xTrain, xTest, yTrain, yTest = train_test_split(x_scaled, y, test_size=0.2)

# # Model
# # Using the default final estimator (LogisticRegression)
# st = StackingClassifier(estimators=[
#     ('LogisticRegression', LogisticRegression()), 
#     ('KNeighbors', KNeighborsClassifier()), 
#     ('DecisionTree', DecisionTreeClassifier())
# ])

# # Fit and Predict
# st.fit(xTrain, yTrain)
# yPred = st.predict(xTest)

# # This has a score of 1.0,
# # I'm not sure if that means anything
# st.score(xTest, yTest)

In [4]:
# Regression (Grouping Done After)
# Ana proposed that we group the streams after the prediction;
# so we'd use regression to predict the number of streams, and then we'd
# return the group said stream belongs to (i.e. streams < 100,000,000 belong to "flop").

# Extracting
x = StandardScaler().fit_transform(data)
y = data["Stream"]
y_scaled = StandardScaler().fit_transform(y.values.reshape(-1, 1))
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.2)

In [5]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_model(y_true, predictions, model_name):
    mse = mean_squared_error(yTest, predictions)
    mae = mean_absolute_error(yTest, predictions)
    r2 = r2_score(yTest, predictions)

    print(f"{model_name} Evaluation Metrics:")
    print(f"Mean Squared Error: {mse}")
    print(f"Mean Absolute Error: {mae}")
    print(f"R-squared: {r2}")

In [6]:
model = LinearRegression()

model.fit(xTrain, yTrain)

linear_regr_pred = model.predict(xTest)
evaluate_model(yTest,linear_regr_pred,"Linear Regression Kernel")

# bg_pred = bg.predict(xTest)
# evaluate_model(yTest,bg_pred,"BG Model")

# print("Max Stream Value:", y.max())
# print("Min Stream Value:", y.min())
# print("Mean Stream Value:", y.mean())

# st_pred = st.predict(xTest)
# evaluate_model(yTest,st_pred,"St Kernel")

Linear Regression Kernel Evaluation Metrics:
Mean Squared Error: 1.178916833955166e-13
Mean Absolute Error: 2.724974015736888e-07
R-squared: 1.0


In [7]:
# Model
# I'm just using the default estimator provided for the final estimator (RidgeCV).
st = StackingRegressor(estimators=[
    ('KNeighbors', KNeighborsRegressor()), 
    ('LinearRegression', LinearRegression()), 
    ('DecisionTree', DecisionTreeRegressor())
])

# Fitting and Scoring
st.fit(xTrain, yTrain)

# This has a dire score,
# I think it's caused by the models being used?
# Maybe stacking is not the best way to go here.
st.score(xTest, yTest)

-3900539972.5257874

In [8]:
# Regression (Continued)
# When bagging is used, the R^2 score is closer to 1.
# As a side note, it had a score of 0.9999371139478829.
# So, if we do regression, maybe we use bagging.
# Also, I'm using the default model for BaggingRegressor,
# which I think is a Decision Tree?
bg = BaggingRegressor(estimator=LinearRegression())
bg.fit(xTrain, yTrain)

bg_pred = bg.predict(xTest)
evaluate_model(yTest,bg_pred,"BG Model")

# dt = DecisionTreeRegressor(max_depth=10, random_state=42)
# dt.fit(xTrain, yTrain)
# dt_predictions = dt.predict(xTest)
# evaluate_model(yTest,dt_predictions,"Decision Tree Regressor")

BG Model Evaluation Metrics:
Mean Squared Error: 1.9263984209087927e-14
Mean Absolute Error: 9.366606943575894e-08
R-squared: 1.0


In [9]:
print("x_scaled shape:", x_scaled.shape)
print("xTest shape:", xTest.shape)

print("Training columns:", data.drop("Stream", axis=1).columns)
print("Test columns:", xTest.columns if hasattr(xTest, 'columns') else "No column names")


x_scaled shape: (20140, 11)
xTest shape: (4028, 12)
Training columns: Index(['Danceability', 'Energy', 'Key', 'Loudness', 'Speechiness',
       'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo',
       'Duration_ms'],
      dtype='object')
Test columns: No column names


In [10]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42)
rf.fit(x_scaled, y)

rf_pred = rf.predict(xTest)
evaluate_model(yTest,rf_pred,"RandomForestRegressor")

ValueError: X has 12 features, but RandomForestRegressor is expecting 11 features as input.