# Data Visualization after cleaning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.subplots as sp
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.express as px
import plotly.offline as pyo


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Read the cleaned dataset (per its file name) into the notebook-wide frame `df`
DATA_FILE = 'cleaneddf.csv'
df = pd.read_csv(DATA_FILE)


## Identify relevant columns for the model

In [None]:
# Columns considered for the model, in the order they appear in the correlation table
model_columns = [
    'Danceability', 'Energy', 'Speechiness', 'Acousticness',
    'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Loudness',
    'Likes', 'Duration_ms', 'Views', 'Comments', 'Stream',
]
X = df[model_columns]

# Keep the correlation matrix as the cell's final expression so the notebook displays it
X.corr()

## There are higher correlations between (Likes and Views), (Comments and Likes), (Loudness and Energy), (Stream and Likes), and (Stream and Views)

In [None]:
# Column list reused later as the heatmap's set of columns
cols = [
    'Danceability', 'Energy', 'Speechiness', 'Acousticness',
    'Instrumentalness', 'Liveness', 'Valence', 'Tempo',
    'Likes', 'Views', 'Comments', 'Stream',
]

# Columns screened for outliers and shown in the pairplots
features = [
    'Danceability', 'Energy', 'Speechiness', 'Acousticness',
    'Instrumentalness', 'Liveness', 'Valence', 'Tempo',
]

feature_values = df[features]

# Per-feature quartiles and interquartile range
Q1 = feature_values.quantile(0.25)
Q3 = feature_values.quantile(0.75)
IQR = Q3 - Q1

# Tukey fences: a value is an outlier when it lies more than 1.5 * IQR
# below the first quartile or above the third quartile
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
below_fence = feature_values < lower_fence
above_fence = feature_values > upper_fence

# Drop every row that has an outlier in at least one feature
has_outlier = (below_fence | above_fence).any(axis=1)
df_clean = df[~has_outlier]

# Pairwise scatter matrix of the features, then each feature against Views
sns.pairplot(df_clean[features])
sns.pairplot(df_clean, x_vars=features, y_vars=["Views"])

In [None]:
# Summary statistics and pairwise correlations for the analysis columns.
# `descriptive_stats` is computed here but not displayed in this cell.
numeric_columns = cols
descriptive_stats = df[numeric_columns].describe()
correlations = df[numeric_columns].corr()

# Build an annotated heatmap: every cell is labelled with its correlation
# coefficient rounded to two decimals, and hovering shows the raw value.
fig = ff.create_annotated_heatmap(
    z=correlations.values,
    x=numeric_columns,
    y=numeric_columns,
    annotation_text=correlations.round(2).values,
    colorscale='Blackbody',
    showscale=True,
    hoverinfo='z'
)

# Axis titles; the x-axis labels go at the bottom and the y-axis labels on the left
fig.update_xaxes(title_text='Features', side='bottom')
fig.update_yaxes(title_text='Features', side='left')

# Centered title, fixed canvas size, no grid or zero lines, and margins
# sized to leave room for the axis labels
fig.update_layout(
    title='Correlation Heatmap',
    title_x=0.5,
    width=1000,
    height=800,
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    xaxis_zeroline=False,
    yaxis_zeroline=False,
    margin=dict(l=150, r=100, t=100, b=100)
)

# Display the heatmap through the figure's own show(), the same call the
# other Plotly figures in this notebook use, rather than the legacy
# plotly.offline.iplot entry point.
fig.show()

In [None]:
# One histogram per numeric column, drawn on a shared 20x15 figure with 50 bins each
numeric_cols = df.select_dtypes(include=['number'])
numeric_cols.hist(figsize=(20, 15), bins=50)

## Relationship between performance metrics
### Showing relation between Loudness and Views

In [None]:
# Loudness on the x-axis against Views on the y-axis
x_values = df['Loudness']
y_values = df['Views']

# Large, semi-transparent red markers without outlines
marker_style = {'s': 100, 'c': 'red', 'alpha': 0.7, 'edgecolors': 'none'}

plt.scatter(x_values, y_values, **marker_style)
plt.xlabel('Loudness', fontsize=14)
plt.ylabel('Views', fontsize=14)
plt.title('Loudness vs. Views', fontsize=18)
plt.grid(True)
plt.tick_params(axis='both', which='major', labelsize=12)
plt.show()

## Showing relation between Loudness and Energy

In [None]:
# Loudness on the x-axis against Energy on the y-axis
x_values = df['Loudness']
y_values = df['Energy']

# Large, semi-transparent red markers without outlines
marker_style = {'s': 100, 'c': 'red', 'alpha': 0.7, 'edgecolors': 'none'}

plt.scatter(x_values, y_values, **marker_style)
plt.xlabel('Loudness', fontsize=14)
plt.ylabel('Energy', fontsize=14)
plt.title('Loudness vs. Energy', fontsize=18)
plt.grid(True)
plt.tick_params(axis='both', which='major', labelsize=12)
plt.show()

## Showing relation between Stream and Views

In [None]:
# Stream on the x-axis against Views on the y-axis.
# The title and axis labels are built from the column names so they
# cannot drift from the data actually plotted.
x_col, y_col = 'Stream', 'Views'

plt.scatter(df[x_col], df[y_col], s=100, c='red', alpha=0.7, edgecolors='none')
plt.title(f'{x_col} vs. {y_col}', fontsize=18)
plt.xlabel(x_col, fontsize=14)
plt.ylabel(y_col, fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=12)
plt.grid(True)
plt.show()

## Showing relation between Stream and Likes

In [None]:
# Stream on the x-axis against Likes on the y-axis
x_values = df['Stream']
y_values = df['Likes']

# Large, semi-transparent red markers without outlines
marker_style = {'s': 100, 'c': 'red', 'alpha': 0.7, 'edgecolors': 'none'}

plt.scatter(x_values, y_values, **marker_style)
plt.xlabel('Stream', fontsize=14)
plt.ylabel('Likes', fontsize=14)
plt.title('Stream vs. Likes', fontsize=18)
plt.grid(True)
plt.tick_params(axis='both', which='major', labelsize=12)
plt.show()

## Showing relation between Views, Comments, Likes and Stream in relation to artists

In [None]:

def plot_relationship(df, metric_1, metric_2):
    """Show an interactive scatter plot of ``metric_1`` against ``metric_2``.

    Points are colored by the ``Artist`` column; hovering shows the
    ``Track`` name together with ``Title`` and ``Channel``. Both axes use a
    logarithmic scale.

    Args:
        df: Frame holding both metric columns plus the ``Artist``,
            ``Track``, ``Title`` and ``Channel`` columns.
        metric_1: Name of the column plotted on the x-axis.
        metric_2: Name of the column plotted on the y-axis.

    Returns:
        None. The figure is displayed as a side effect.
    """
    scatter_options = {
        'x': metric_1,
        'y': metric_2,
        'title': f"{metric_1} vs {metric_2}",
        'color': "Artist",
        'hover_name': 'Track',
        'hover_data': ['Title', 'Channel'],
        'opacity': 0.7,
    }
    fig = px.scatter(df, **scatter_options)

    # Label each axis with its metric name and switch both to log scale
    axis_layout = {
        'xaxis_title': metric_1,
        'yaxis_title': metric_2,
        'xaxis_type': 'log',
        'yaxis_type': 'log',
    }
    fig.update_layout(**axis_layout)

    # Uniform marker size for every trace
    fig.update_traces(marker={'size': 8})
    fig.show()

# Metric pairs to compare, plotted in this order (x-axis metric first)
metric_pairs = [
    ("Views", "Likes"),
    ("Views", "Comments"),
    ("Comments", "Likes"),
    ("Stream", "Likes"),
    ("Comments", "Stream"),
    ("Views", "Stream"),
]

for x_metric, y_metric in metric_pairs:
    plot_relationship(df, x_metric, y_metric)

## 1. There doesn't seem to be much correlation between Stream and Views
## 2. There is a decent correlation between Loudness and Energy
## 3. There doesn't seem to be much correlation between Stream and Likes
## 4. We can see that Comments, Likes and Views are much more closely correlated when plotted against the popular artists.