# 1. Description


1. Artist Analysis
    1. Top Artist - Solo songs
    2. Top Artist - Collaboration songs
    3. Top Artist - Prolific
    4. Top Artist - Popularity
    5. Top Partnership (Total of Streams X Quantity of Songs)
    6. Influence on the Playlists (TO DO)
    7. Temporal Trends (TO DO)
    8. Musical Characteristics (TO DO)
    9. Danceability (TO DO)
    10. Liveness (TO DO)
    11. Speechiness (TO DO)
     
    
2. Song Analysis
    1. Top Song - Popularity
    2. Temporal Trends (TO DO)
    3. Playlists (TO DO)
    4. Platforms (TO DO)
    5. Danceability (TO DO)
    6. Energy (TO DO)
    7. Acousticness (TO DO)
    8. Instrumentalness (TO DO)
    9. BPM (TO DO)

# 2. Initial Settings

### 2.1 Library

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm
import networkx as nx
from textblob import TextBlob

### 2.2 Loading Data

In [None]:
import pandas as pd

# Read the Kaggle CSV export; the file encoding is passed explicitly as ISO-8859-1
csv_path = '/kaggle/input/top-spotify-songs-2023/spotify-2023.csv'
df_spotify = pd.read_csv(csv_path, encoding='ISO-8859-1')

# Preview the first rows of the DataFrame
df_spotify.head()

### 2.3 Definitions

In [None]:
# Shared colour palette used by the charts in this notebook
colors = [
    'skyblue',
    'lightcoral',
    'lightgreen',
    'gold',
    'lightsalmon',
    'lightseagreen',
    'lightsteelblue',
    'palevioletred',
    'lightcyan',
    'lightpink',
]

# Number of entries shown in each "Top N" ranking (change to widen or narrow the rankings)
qt_artist = 25

# 3. Exploratory Analysis

In [None]:
# Descriptive statistics restricted to the numeric columns (include='number')
numeric_summary = df_spotify.describe(include='number')

# Bare expression so the notebook renders the summary table as the cell output
numeric_summary

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts and dtypes)
df_spotify.info()

In [None]:
# True when at least one row is an exact repeat of an earlier row (all columns compared)
are_there_duplicates = df_spotify.duplicated().any()

# Report the outcome in a single sentence
duplicate_message = (
    "There are duplicated rows in the DataFrame."
    if are_there_duplicates
    else "There are no duplicated rows in the DataFrame."
)
print(duplicate_message)

# 4. Artists

### 4.1 Artists who have released solo music

Identify and visualize top artists who have released music as solo artists

In [None]:
# Keep only the tracks credited to exactly one artist
solo_tracks = df_spotify[df_spotify['artist_count'] == 1]

# Split the credit string on ', ', give each name its own row, then count songs per name
solo_artist_names = solo_tracks['artist(s)_name'].str.split(', ').explode()
solo_artist_counts = solo_artist_names.value_counts()

# First qt_artist entries of the ranking, ordered by song count (highest first)
top_solo_artists = solo_artist_counts.head(qt_artist).sort_values(ascending=False)

# Bar chart of the top solo artists, coloured with the shared palette
fig, ax = plt.subplots(figsize=(12, 6))
top_solo_artists.plot(kind='bar', color=colors, ax=ax)
ax.set_xlabel('Artists')
ax.set_ylabel('Number of Songs')
ax.set_title(f"Top {qt_artist} Artists (Solo)")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Render the ranking itself as the cell output
top_solo_artists

### 4.2 Artist Collaboration

Identify which artists have collaborated the most with other artists

In [None]:
# Rank artists by the number of collaborative tracks they are credited on.

# Tracks with more than one credited artist; copy so the helper column added
# below does not write into a slice of df_spotify (avoids SettingWithCopyWarning)
collaborative_songs = df_spotify[df_spotify['artist_count'] > 1].copy()

# Split the 'artist(s)_name' credit string on ', ' into a list of artist names
collaborative_songs['artists'] = collaborative_songs['artist(s)_name'].str.split(', ')

# One row per (track, artist) pair
all_collaborating_artists = collaborative_songs['artists'].explode()

# Number of collaborative tracks each artist appears on, most frequent first
collaboration_counts = all_collaborating_artists.value_counts()

# Keep the same number of entries the chart title announces ("Top {qt_artist}");
# a hard-coded 50 here made the chart show more artists than its title claimed
top_collaborators = collaboration_counts.head(qt_artist)

# Bar chart of the top collaborating artists
plt.figure(figsize=(12, 6))
top_collaborators.plot(kind='bar', color=colors)
plt.xlabel('Artists')
plt.ylabel('Number of Collaborations')
plt.title(f"Top {qt_artist} Artists (Collaboration)")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Render the ranking itself as the cell output
top_collaborators

### 4.3 Most Prolific Artists
Identify the artists who have the most songs in the dataset, considering both their solo songs and their featured appearances in other songs.

In [None]:
# Songs per artist across both rankings; an artist missing from one side counts as 0 there
total_counts = solo_artist_counts.add(collaboration_counts, fill_value=0)

# Order by total number of songs (highest first) and keep the first qt_artist artists
ranked_totals = total_counts.sort_values(ascending=False)
top_influential = ranked_totals.head(qt_artist)

# Bar chart of the artists with the most songs overall
fig, ax = plt.subplots(figsize=(12, 6))
top_influential.plot(kind='bar', color=colors, ax=ax)
ax.set_xlabel('Artists')
ax.set_ylabel('Total Influence (Solo + Collaborations)')
ax.set_title(f"Top {qt_artist} Artists (Prolific)")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Render the ranking itself as the cell output
top_influential

### 4.4 Artist Popularity

Identify the artist's popularity (measured by the total number of streams on their songs).

In [None]:
# Parse 'streams' as numbers; entries that cannot be parsed become NaN (errors='coerce')
df_spotify['streams'] = pd.to_numeric(df_spotify['streams'], errors='coerce')

# Show the column dtypes to confirm the conversion
df_spotify.dtypes

In [None]:
# Average number of digits in the stream counts (around 10 digits means values
# in the billions).
# Cast to integer before converting to text: a float value would be rendered
# like '141381703.0', and the trailing '.0' would inflate every length by two.
stream_digit_counts = df_spotify['streams'].dropna().astype('int64').astype(str).str.len()
average_stream_length = stream_digit_counts.mean()

# Print the result
print(f"Average number of characters in 'streams' column: {average_stream_length:.2f}")

In [None]:
# Rank songs and artist credits by total streams, and chart the top artist credits.
# 'streams' is already numeric at this point (converted earlier in section 4.4),
# so no further conversion is needed here.

# Top songs by streams; rows with missing streams sort last. Copy so the new
# column is written to an independent frame rather than a slice of the sorted one.
top_songs = df_spotify.sort_values(by='streams', ascending=False).head(qt_artist).copy()

# Streams expressed in billions, for readable axis ticks
top_songs['streams_billion'] = top_songs['streams'] / 1_000_000_000

# Total streams per artist credit, most streamed first. Grouping uses the full
# 'artist(s)_name' string, so every distinct combination of credited artists
# forms its own group.
artist_popularity = (
    df_spotify.groupby('artist(s)_name')['streams']
    .sum()
    .reset_index()
    .sort_values(by='streams', ascending=False)
)

# Top artist credits; copy before adding a column to avoid SettingWithCopyWarning
top_popularity_artists = artist_popularity.head(qt_artist).copy()

# Plot in billions so the values match the 'Streams (Billion)' axis label
# (the raw stream counts were previously plotted under that label)
top_popularity_artists['streams_billion'] = top_popularity_artists['streams'] / 1_000_000_000

plt.figure(figsize=(12, 6))
plt.barh(top_popularity_artists['artist(s)_name'], top_popularity_artists['streams_billion'], color=colors)
plt.xlabel('Streams (Billion)')
plt.ylabel('Artist Name')
plt.title(f'Top {qt_artist} Artists by Total Streams')
plt.gca().invert_yaxis()  # most-streamed artist credit at the top
plt.tight_layout()
plt.show()

In [None]:
# Artist to analyse; change this to any artist of interest
artist_name = "The Weeknd"

# Songs whose credit string mentions the artist (solo or collaborative).
# regex=False makes the name match literally, so names containing regex
# metacharacters (e.g. '$', '+', '(' or '.') are matched as written instead of
# being interpreted as a pattern. Matching stays case-insensitive, and rows
# with a missing credit are treated as non-matches (na=False).
artist_songs = df_spotify[
    df_spotify['artist(s)_name'].str.contains(artist_name, case=False, na=False, regex=False)
]

# Total streams per track name for the selected songs
artist_song_streams = artist_songs.groupby('track_name')['streams'].sum().reset_index()

# Most-streamed tracks first
artist_song_streams = artist_song_streams.sort_values(by='streams', ascending=False)

# Render the table as the cell output
artist_song_streams

### 4.5 Artist Partnerships

Identify the number of collaborative songs, which artists are the most frequent partners, and whether these collaborations have any significant impact on the artist's popularity.

In [None]:
# Collaborations only (more than one credited artist); copy to detach from df_spotify
collaborative = df_spotify[df_spotify['artist_count'] > 1].copy()
partnership_names = collaborative['artist(s)_name']

# Songs per partnership, where a partnership is one exact 'artist(s)_name' string
collaborative_artist_counts = partnership_names.value_counts()

# Total streams per partnership
collaborative_artist_streams = collaborative.groupby('artist(s)_name')['streams'].sum()

# Partnerships with the most collaborative songs
top_collaborative_artists = collaborative_artist_counts.head(qt_artist)

# Partnerships with the most total streams
streams_ranked = collaborative_artist_streams.sort_values(ascending=False)
top_collaborative_artists_streams = streams_ranked.head(qt_artist)

In [None]:

# Dual-axis chart: total streams (bars) and number of songs (line) per partnership.

# Partnerships shown on the x axis: those with the most collaborative songs
partnership = top_collaborative_artists.index
num_songs = top_collaborative_artists.values

# Look up the total streams of those same partnerships, in the same order, so
# each bar, line point and x label describes one partnership. The separate
# top-by-streams ranking is ordered differently (and may hold other
# partnerships), so taking its values positionally mismatched bars and labels.
total_streams = collaborative_artist_streams.loc[partnership].values

# Numeric x positions, one per partnership
x = np.arange(len(partnership))

# Figure whose first y axis carries the total streams
fig, ax1 = plt.subplots(figsize=(12, 6))

# Bars on the first y axis (total streams)
ax1.bar(x, total_streams, color=colors, label='Total Streams')

# Labels and ticks for the first y axis and the shared x axis
ax1.set_xlabel('Partnership')
ax1.set_ylabel('Total Streams')
ax1.tick_params(axis='y')
ax1.set_xticks(x)
ax1.set_xticklabels(partnership, rotation=45, ha='right')

# Second y axis sharing the same x axis
ax2 = ax1.twinx()

# Line on the second y axis (number of songs)
ax2.plot(x, num_songs, color='red', marker='o', linestyle='-', label='Number of Songs')

# Single legend combining the bar and line entries from both axes
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc='right', bbox_to_anchor=(1.15, 1))

# Overall chart title
plt.title('Partnership Statistics')

# Show the chart
plt.tight_layout()
plt.show()

# 5. Song

### 5.1 Song Popularity

Identify each song's popularity (measured by its total number of streams).

In [None]:
# top_songs and its 'streams_billion' column are prepared in section 4.4 (Artist Popularity)

# Horizontal bar chart: one bar per top song, bar length = streams in billions
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(top_songs['track_name'], top_songs['streams_billion'], color=colors)
ax.set_xlabel('Streams (Billion)')
ax.set_ylabel('Track Name')
ax.set_title(f"Top {qt_artist} Songs by Streams")
ax.invert_yaxis()  # most-streamed song at the top
fig.tight_layout()
plt.show()