# **Context:**

I've been studying data visualization lately and decided to use this dataset to practice. I really like the dataset: it's about Spotify, the best-known music streaming service in the world. The columns in the dataset represent:
- Artist: Name of artist;
- Streams: Number of times the artist's songs have been streamed on Spotify;
- Daily: The average number of streams an artist's music receives on a daily basis;
- As lead: Number of streams attributed to the artist when they are the lead or primary artist on a track;
- Solo: Number of streams garnered by the artist for their solo projects;
- As feature: Number of streams the artist has accumulated when featured as a guest or collaborator on tracks by other artists.

# **Importing libraries and reading the dataset:**

In [None]:
import pandas as pd
from string import ascii_letters
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

In [None]:
# Load the artists table from the Kaggle input directory and preview its first rows.
csv_path = '/kaggle/input/spotify-most-streamed-artists-of-all-time/artists.csv'
df = pd.read_csv(csv_path)
df.head()

# **Peek at the data:**

In [None]:
# Count the distinct artist names in the table.
df['Artist'].nunique()

In [None]:
# Show each column's dtype and non-null count.
df.info()

In [None]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Columns stored as text with ',' separators that must become floats.
cols_to_replace = ['Streams', 'As lead', 'Solo', 'As feature']
for col in cols_to_replace:
    # Skip columns that are already numeric so the cell can be re-run safely:
    # the `.str` accessor raises AttributeError on a float column.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    # Strip the separators (literal match, not a regex) and cast to float.
    df[col] = df[col].str.replace(',', '', regex=False).astype('float')

# Confirm the dtypes after the conversion.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the table.
df.describe()

# **Distributions:**

## **Conditional Kernel Density Estimate:**

In [None]:

# Grid of bivariate KDE plots drawn from synthetic normal samples,
# each panel using a cubehelix colormap with a different starting hue.
sns.set_theme(style="white")
rs = np.random.RandomState(50)

# 3x3 grid of panels that share both axes
f, axes = plt.subplots(nrows=3, ncols=3, figsize=(9, 9), sharex=True, sharey=True)

# Candidate starting hues; zip() below consumes only as many as there are panels
hue_starts = np.linspace(0, 3, 10)

for ax, s in zip(axes.flat, hue_starts):
    # Random bivariate sample for this panel
    x, y = rs.normal(size=(2, 50))

    # Cubehelix colormap beginning at this panel's hue
    cmap = sns.cubehelix_palette(start=s, light=1, as_cmap=True)

    sns.kdeplot(
        x=x,
        y=y,
        ax=ax,
        fill=True,
        cmap=cmap,
        levels=15,
        thresh=0,
        clip=(-5, 5),
        cut=10,
    )
    ax.set_axis_off()

# Limits are applied to the last panel visited; the panels share both axes
ax.set(xlim=(-3.5, 3.5), ylim=(-3.5, 3.5))
f.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=.08, hspace=.08)

## **Pie Chart:**

In [None]:
# Pie chart of total "Solo" streams per artist.
# The grouping column is 'Artist', the name every other cell in this notebook uses.
# Only the ten largest totals are drawn: one slice per artist in the whole
# table would make the chart unreadable.
sums = df.groupby('Artist')['Solo'].sum().nlargest(10)

fig, ax = plt.subplots(figsize=(8, 8))
# `labels` (plural) is the pie() keyword for the slice labels.
ax.pie(sums, labels=sums.index, autopct='%1.1f%%')
ax.axis('equal')  # equal aspect ratio draws the pie as a circle
ax.set_title('Top 10 artists by "Solo" streams')
plt.show()

## **Diagonal Correlation Matrix:**

In [None]:
# Diagonal correlation matrix, demonstrated on synthetic random data.

# 100 rows of normal noise in 26 columns named 'A'..'Z'
rs = np.random.RandomState(33)
d = pd.DataFrame(data=rs.normal(size=(100, 26)),
                 columns=list(ascii_letters[26:]))

# The correlation matrix must exist before the mask, which is shaped after it
corr = d.corr()

# Boolean mask that hides the upper triangle (diagonal included)
mask = np.triu(np.ones_like(corr, dtype=bool))

# Diverging colormap centred between the two hues
cmap = sns.diverging_palette(230, 20, as_cmap=True)

fig, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
plt.show()

# **BarPlot:**

In [None]:
# Ten artists with the highest mean "Streams".
# nlargest ranks every artist and returns the result in descending order;
# slicing the grouped result with [:10] before sorting would keep only the
# first ten groups in key order, not the top ten values.
mean_streams = df.groupby('Artist')['Streams'].mean().nlargest(10)

fig, ax = plt.subplots(figsize=(9,6))

mean_streams.plot(kind='bar', color='red', ax=ax)

# Write each bar's value just above the bar
for p in ax.patches:
    height = p.get_height()
    ax.annotate(text=f'{height:.2f}',
               xy = (p.get_x() + p.get_width()/2, height),
               xytext=(0, 4),
               textcoords= 'offset points',
               ha = 'center',
               va = 'bottom',
               size=6,
               weight = 'bold'
               )
ax.set_ylabel('Mean "Streams"')
plt.title('Top 10 mean of "Streams" by Artist', fontdict={'fontsize':15})
plt.show()

In [None]:
# Ten artists with the highest mean "As lead" streams.
# nlargest ranks every artist and returns the result in descending order;
# slicing the grouped result with [:10] before sorting would keep only the
# first ten groups in key order, not the top ten values.
mean_lead = df.groupby('Artist')['As lead'].mean().nlargest(10)

fig, ax = plt.subplots(figsize=(9,6))

mean_lead.plot(kind='bar', color='darkred', ax=ax)

# Write each bar's value just above the bar
for p in ax.patches:
    height = p.get_height()
    ax.annotate(text=f'{height:.2f}',
               xy = (p.get_x() + p.get_width()/2, height),
               xytext=(0, 4),
               textcoords= 'offset points',
               ha = 'center',
               va = 'bottom',
               size=6,
               weight = 'bold'
               )
ax.set_ylabel('Mean "As lead"')
plt.title('Top 10 mean of "As lead" by Artist', fontdict={'fontsize':15})
plt.show()

In [None]:
# Ten artists with the highest mean "Solo" streams.
# nlargest ranks every artist and returns the result in descending order;
# slicing the grouped result with [:10] before sorting would keep only the
# first ten groups in key order, not the top ten values.
mean_solo = df.groupby('Artist')['Solo'].mean().nlargest(10)

fig, ax = plt.subplots(figsize=(9,6))

mean_solo.plot(kind='bar', color='#BD3A00', ax=ax)

# Write each bar's value just above the bar
for p in ax.patches:
    height = p.get_height()
    ax.annotate(text=f'{height:.2f}',
               xy = (p.get_x() + p.get_width()/2, height),
               xytext=(0, 4),
               textcoords= 'offset points',
               ha = 'center',
               va = 'bottom',
               size=6,
               weight = 'bold'
               )
ax.set_ylabel('Mean "Solo"')
plt.title('Top 10 mean of "Solo" by Artist', fontdict={'fontsize':15})
plt.show()

In [None]:
# Fifty artists with the highest mean "As feature" streams.
# nlargest ranks every artist and returns the result in descending order;
# slicing the grouped result with [:50] before sorting would keep only the
# first fifty groups in key order, not the top fifty values.
mean_as_feature = df.groupby('Artist')['As feature'].mean().nlargest(50)

fig, ax = plt.subplots(figsize=(16,10))

# set_palette() changes the global default palette and returns None, so it is
# called as its own statement instead of being passed as `palette=`.
sns.set_palette('pastel')

sns.barplot(x=mean_as_feature.index,
            y=mean_as_feature.values,
            ax=ax)

plt.xticks(rotation=90)
# Write each bar's value just above the bar
for p in ax.patches:
    height = p.get_height()
    ax.annotate(text=f'{height:.2f}',
               xy = (p.get_x() + p.get_width()/2, height),
               xytext=(0, 4),
               textcoords= 'offset points',
               ha = 'center',
               va = 'bottom',
               size=6,
               weight = 'bold'
               )
plt.title('Top 50 mean of "As feature" by Artist', fontdict={'fontsize':15})
plt.show()

In [None]:
# Fifty rows with the largest "Daily" value, drawn as one bar per artist.
mean_daily = df[['Daily', 'Artist']].sort_values(by='Daily', ascending=False)[:50]
fig, ax = plt.subplots(figsize=(16,8))

# set_palette() changes the global default palette and returns None, so it is
# called as its own statement instead of being passed as `palette=`.
sns.set_palette('pastel')

sns.barplot(data=mean_daily, x='Artist',
            y='Daily',
            ax=ax)
plt.xticks(rotation=90)
# Write each bar's value just above the bar
for p in ax.patches:
    height = p.get_height()
    ax.annotate(text=f'{height:.2f}',
               xy = (p.get_x() + p.get_width()/2, height),
               xytext=(0, 4),
               textcoords= 'offset points',
               ha = 'center',
               va = 'bottom',
               size=6,
               weight = 'bold'
               )

plt.title('Top 50 mean of "Daily" by Artist', fontdict={'fontsize':15})
plt.show()

# **PairPlot:**

In [None]:
# Pie chart of seven hand-entered artists; slices are ordered and plotted counter-clockwise.
# NOTE(review): labels and sizes are hard-coded rather than read from `df`.
# Confirm them against the dataset; 53655 and 53665 are nearly identical and
# may be a transcription slip.
labels = 'Drake', 'Bad Bunny', 'Taylor Swift', 'The Weeknd', 'Ed Sheeran', 'Justin Bieber', 'Eminem'
sizes = [86041, 67533, 57859, 53655, 53665, 47907, 47525]
explode = (0, 0, 0, 0, 0, 0, 0.1)  # pull out only the last slice ('Eminem')

fig1, ax1 = plt.subplots(figsize=(15,10))
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=False, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

In [None]:
# Pie chart of the five artists with the most all-time streams.
# pie() needs `explode` to be the same length as the data, so the table is
# reduced to its five largest "Streams" rows before plotting.
top_five = df.nlargest(5, 'Streams')
artist_data = top_five["Artist"]
stream_data = top_five["Streams"]
colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#8c564b"]
explode = (0.1, 0, 0, 0, 0)  # pull out the first (largest) slice
plt.pie(stream_data, labels=artist_data, explode=explode, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)
plt.title("All time streams pie chart top 5")
plt.show()

In [None]:
# Pairwise plots of the table's columns, with KDE curves on the diagonal.
pair_plot = sns.pairplot(data=df, diag_kind='kde')
plt.show()

# **HeatMap:**

In [None]:
# 3D surface demo: a unit sphere built from a spherical-coordinate grid.
# NOTE: these rcParams assignments are global and also apply to figures
# created by later cells.
plt.rcParams["figure.figsize"] = [7.00, 3.50]
plt.rcParams["figure.autolayout"] = True
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
# u spans [0, 2*pi] in 30 steps, v spans [0, pi] in 20 steps
u, v = np.mgrid[0:2 * np.pi:30j, 0:np.pi:20j]
# Spherical to Cartesian coordinates (radius 1)
x = np.cos(u) * np.sin(v)
y = np.sin(u) * np.sin(v)
z = np.cos(v)
ax.plot_surface(x, y, z, cmap=plt.cm.YlGnBu_r)
plt.show()


In [None]:
# Correlation heat map of the dataset's numeric columns.
# `numeric_cols` is derived here so the cell runs on a fresh kernel.
numeric_cols = df.select_dtypes(include='number').columns
corr = df[numeric_cols].corr()
# Boolean mask that hides the upper triangle (diagonal included); building it
# from ones instead of the correlation values keeps zero-valued cells masked.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, annot=True, cmap='YlOrRd')
plt.title('Correlation HeatMap', fontdict={'fontsize':20})
plt.show()