# Case Study 3

In [None]:
# 1. Display details of the songs that have popularity above 80
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configure how many rows/columns and how much line width pandas uses when
# rendering a frame.
for option_name, option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Load the dataset; the file is decoded as ISO-8859-1.
df = pd.read_csv("top50.csv", encoding='ISO-8859-1')

# Show every song whose popularity is strictly above 80.
is_popular = df['Popularity'] > 80
df.loc[is_popular]

In [None]:
# 2. Which songs have the maximum beats per minute and the maximum length?
# idxmax() returns one index *label* per column, so the matching rows are
# fetched with label-based .loc (positional .iloc only happens to agree when
# the index is the default 0..n-1 range).
df.loc[df[['Beats.Per.Minute', 'Length.']].idxmax()]

In [None]:
# 3. Which songs have the minimum beats per minute and the minimum length?
# idxmin() returns one index *label* per column, so the matching rows are
# fetched with label-based .loc (positional .iloc only happens to agree when
# the index is the default 0..n-1 range).
df.loc[df[['Beats.Per.Minute', 'Length.']].idxmin()]

In [None]:
# 4. Create a new column holding the combined string length
# The value is the number of characters in the track name, plus the genre,
# plus the artist name.
track_chars = df['Track.Name'].str.len()
genre_chars = df['Genre'].str.len()
artist_chars = df['Artist.Name'].str.len()
df['str_len'] = track_chars.add(genre_chars).add(artist_chars)
df

In [None]:
# 5. Create a new column holding the first name of each artist
# The first name is taken to be everything before the first space.
artist_name_parts = df['Artist.Name'].str.split(' ')
df['First_Name'] = artist_name_parts.str.get(0)
df

In [None]:
# 6. How many songs belong to the "dance pop" genre?
is_dance_pop = df['Genre'] == 'dance pop'
df[is_dance_pop].shape[0]

In [None]:
# 7. What is the average length for each genre?
# One row per genre, with the mean of 'Length.' as the only column.
df.groupby('Genre').agg({'Length.': 'mean'})

In [None]:
# 8. What is the average length for each artist?
# One row per artist, with the mean of 'Length.' as the only column.
df.groupby('Artist.Name').agg({'Length.': 'mean'})

In [None]:
# 9. What is the maximum length for each genre?
# One row per genre, with the largest 'Length.' as the only column.
df.groupby('Genre').agg({'Length.': 'max'})

In [None]:
# 11. How many songs share the same popularity?
# Counts the non-null track names within each distinct popularity value.
df.groupby('Popularity').agg({'Track.Name': 'count'})

In [None]:
# 12. Rank each song based on popularity.
# Start again from the raw file so columns added by earlier cells are dropped.
df = pd.read_csv("top50.csv", encoding='ISO-8859-1')

# Ascending rank (lowest popularity gets rank 1); tied songs share the
# average of the ranks they span. These are pandas' defaults, spelled out.
df['rank_popularity'] = df['Popularity'].rank(method='average', ascending=True)
df

In [None]:
# 13. Rank each song based on danceability.
# Ascending rank (lowest danceability gets rank 1); tied songs share the
# average of the ranks they span. These are pandas' defaults, spelled out.
df['rank_danceability'] = df['Danceability'].rank(method='average', ascending=True)
df

In [None]:
# 14. Average, maximum, minimum and standard deviation of each column
# describe() reports these (together with count and quartiles) per column.
column_summary = df.describe()
column_summary

In [None]:
# 15. Average, maximum, minimum and standard deviation of each row
# A row mixes text fields (track, artist, genre) with numbers, so describing
# whole rows does not produce numeric statistics. Restrict to the numeric
# columns and compute the requested statistics across each row instead.
numeric_values = df.select_dtypes(include='number')
pd.DataFrame({
    'mean': numeric_values.mean(axis=1),
    'max': numeric_values.max(axis=1),
    'min': numeric_values.min(axis=1),
    'std': numeric_values.std(axis=1),
})

In [None]:
# 16. Add a new column where each row holds the difference between the
# row's value and the mean of its genre group (computed here on Liveness).
# Start again from the raw file so columns added by earlier cells are dropped.
df = pd.read_csv("top50.csv", encoding='ISO-8859-1')


def _offset_from_group_mean(group_values):
    # Centre one genre's values on that genre's own mean.
    return group_values - group_values.mean()


liveness_by_genre = df.groupby('Genre')['Liveness']
df['new_col'] = liveness_by_genre.transform(_offset_from_group_mean)

In [None]:
# 17. Take the log transform of Loudness (dB) and add it as a column
# The result is stored on the frame (previously it was computed and thrown
# away, so no column was ever added).
# NOTE(review): np.log returns NaN for negative inputs and -inf for zero; if
# loudness is stored as negative dB values, decide on a shift or abs() first.
df['log_loudness'] = np.log(df['Loudness..dB..'])
df['log_loudness']

In [None]:
# 18. Visualize whether there is a linear correlation between
# beats per minute and popularity, danceability and popularity,
# energy and popularity.
features = ['Beats.Per.Minute', 'Danceability', 'Energy']

# Correlation matrix of all three features against popularity. Energy is
# included so the heatmap covers every pair the question asks about, and the
# coefficients are annotated on the cells.
sns.heatmap(df[features + ['Popularity']].corr(), annot=True)

# One scatter plot per feature, each against popularity, side by side.
fig, ax = plt.subplots(1, len(features), figsize=(20, 4))
for axis, feature in zip(ax, features):
    axis.scatter(df[feature], df['Popularity'])
    axis.set_xlabel(feature)
    axis.set_ylabel('Popularity')
plt.show()


In [None]:
# 19. What is the distribution of popularity with respect to energy?
# Scatter of energy (x) against popularity (y) on a single default-sized axes.
fig, energy_axis = plt.subplots()
energy_axis.scatter(df['Energy'], df['Popularity'])
energy_axis.set_xlabel("Energy")
energy_axis.set_ylabel("Popularity")
plt.show()

In [None]:
# 20. What is the distribution of popularity with respect to beats per minute?
# First figure: raw scatter of beats per minute against popularity.
plt.scatter(df['Beats.Per.Minute'], df.Popularity)
plt.xlabel("Beats.Per.Minute")
plt.ylabel("Popularity")

# Second figure: bar plot of popularity for each beats-per-minute value.
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
fig = plt.figure(figsize=(20,3))
sns.barplot(x='Beats.Per.Minute', y='Popularity', data=df)
plt.show()

In [None]:
# 21. Are there any outliers in popularity, danceability, beats per minute?
# One horizontal box plot per column, stacked vertically; each plot is titled
# with the column it shows.
outlier_columns = ['Popularity', 'Danceability', 'Beats.Per.Minute']
fig, ax = plt.subplots(3, 1, figsize=(20, 10))
for axis, column in zip(ax, outlier_columns):
    axis.boxplot(df[column], vert=False)
    axis.set_title(column)

# Original author's reading of the plots: Popularity and Danceability show
# outliers, Beats.Per.Minute shows none.
plt.show()

In [None]:
# 22. Visualize the distribution of artists
# Start again from the raw file so columns added by earlier cells are dropped.
df = pd.read_csv("top50.csv", encoding='ISO-8859-1')
plt.figure(figsize=(40,10))
# The column is named through x= with data=df: recent seaborn releases treat
# the first positional argument as `data`, not as the x variable.
sns.countplot(x='Artist.Name', data=df)
plt.show()

In [None]:
# 23. Visualize the distribution of track names
plt.figure(figsize=(40,10))
# The column is named through x= with data=df: recent seaborn releases treat
# the first positional argument as `data`, not as the x variable.
sns.countplot(x='Track.Name', data=df)
plt.show()

In [None]:
# 24. Visualize the distribution of genres
plt.figure(figsize=(40,10))
# The column is named through x= with data=df: recent seaborn releases treat
# the first positional argument as `data`, not as the x variable.
sns.countplot(x='Genre', data=df)
plt.show()

In [None]:
# 25. Visualize the artist, track and genre distributions using subplots
# Three stacked count plots, top to bottom: genre, track name, artist name.
fig, ax = plt.subplots(3, 1, figsize=(50, 40))
for axis, column in zip(ax, ('Genre', 'Track.Name', 'Artist.Name')):
    sns.countplot(x=column, ax=axis, data=df)

plt.show()