# We can use GradientBoostingRegressor which allows for the optimization of arbitrary differentiable loss functions

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.subplots as sp
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.express as px
import plotly.offline as pyo


from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
df = pd.read_csv('Spotify_Youtube .csv')
df = df.drop(df[df.Duration_ms > 600000].index) # drop rows that hold durations  
df = df.dropna()
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
rule = ((df < (Q1 - 1.5 * (Q3 - Q1))) | (df > (Q3 + 1.5 *(Q3 -Q1))))
df.skew()



Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version. Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right`


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



Unnamed: 0          -0.024627
Danceability        -0.554416
Energy              -0.727934
Key                 -0.002342
Loudness            -2.797320
Speechiness          3.077427
Acousticness         0.915316
Instrumentalness     3.772123
Liveness             2.335986
Valence             -0.092880
Tempo                0.387777
Duration_ms          1.195004
Views                9.070239
Likes                8.573440
Comments            42.729724
Licensed            -0.950991
official_video      -1.454562
Stream               4.054457
dtype: float64

In [6]:
# Select the relevant columns for the model
X = df[['Danceability', 'Energy', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence','Tempo','Likes','Views','Comments','Stream']]
y = df['Views']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the gradient boosting model
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=10, random_state=42)
gb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = gb_model.predict(X_test)

# Calculate the mean squared error and R2 score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Gradient Boosting Regression:')
print('Mean Squared Error:', mse)
print('R2 Score:', r2)


Gradient Boosting Regression:
Mean Squared Error: 112653504715864.88
R2 Score: 0.9987051296409681


### The results suggest that the model has a high degree of explanatory power, but may not be the best fit for the data due to the large MSE value. Therefore using this set of data for prediction purposes might not be accurate enough