# **Spotify Music Recommendation System**

# Imports and Reading in Data

In [None]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [None]:
# Load the Spotify track dataset from the relative input directory
df = pd.read_csv('../input/spotify-data-from-160k-dataset/data.csv')
# Print column names, dtypes and non-null counts for a first look at the table
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns
df.describe()

In [None]:
# Preview the first rows of the raw data
df.head()

## 1.1 Data Cleaning

Before we do any analysis, we have to clean up our data and check for missing values.

In [None]:
# Plot the boolean missing-value mask: any null cell shows up as a contrasting mark
sns.heatmap(df.isnull())

In [None]:
import ast


def _parse_artists(raw):
    """Turn one row's stringified artist list into a list of clean names.

    The column presumably stores a Python list literal such as
    "['Artist A', 'Artist B']" (verify against the CSV). Parsing it as a
    literal removes the quotes and surrounding whitespace and keeps names
    that themselves contain commas or apostrophes intact.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Not a valid literal: fall back to a plain split, but still strip the
        # whitespace and quote characters so names group consistently.
        return [part.strip().strip("'\"") for part in raw[1:-1].split(',')]
    if isinstance(parsed, (list, tuple)):
        return [str(artist) for artist in parsed]
    return [str(parsed)]


# One row per (song, artist) pair so artists can be grouped individually
df['artists'] = df['artists'].apply(_parse_artists)
df = df.explode('artists')
df

In [None]:
# List the available columns before deciding which ones to drop
df.columns

In [None]:
# Columns that are not needed for the exploration below
unused_columns = ['release_date', 'id', 'explicit', 'mode', 'key', 'duration_ms']
df.drop(columns=unused_columns, inplace=True)
# Renumber the rows; the previous index is kept as a new 'index' column
df.reset_index(inplace=True)
df

## 1.2 Data Exploration

In [None]:
# Numeric columns whose distributions we want to inspect
features = [
    'acousticness', 'danceability', 'energy',
    'instrumentalness', 'liveness', 'loudness',
    'popularity', 'speechiness', 'tempo',
    'valence', 'year',
]
# One histogram per feature, each on its own figure
# NOTE(review): distplot is deprecated in recent seaborn releases - confirm the
# installed version before relying on it.
for feat in features:
    _, ax = plt.subplots()
    sns.distplot(df[feat], kde=False, ax=ax)

In [None]:
plt.figure(figsize=(12, 6))
# Correlation is only defined for numeric data. Selecting the numeric columns
# explicitly keeps this working on pandas versions where corr() no longer drops
# string columns (such as 'artists' and 'name') silently.
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True, cmap='rocket_r')

In [None]:
# jointplot is a figure-level function that builds its own figure, so calling
# plt.figure() beforehand would only leave an empty figure in the output.
sns.jointplot(x='loudness', y='energy', data=df)
sns.jointplot(x='year', y='popularity', data=df)

In [None]:
plt.figure(figsize=(12, 6))
# Highest popularity score recorded under each song name, top ten only
mostPopularSong = (
    df.groupby(by='name')['popularity']
    .max()
    .sort_values(ascending=False)
    .head(10)
)
mostPopularSongPlot = sns.barplot(
    x=mostPopularSong.index,
    y=mostPopularSong,
    palette='Spectral',
)
mostPopularSongPlot.set(
    xlabel='Song Names',
    ylabel='Popularity Rating',
    title='Top 10 Most Popular Songs on Spotify (By Highest Popularity Rating)',
)
# Song names are long, so print them vertically
plt.xticks(rotation=90)
sns.despine(left=True, bottom=True)

In [None]:
# Highest popularity score reached by any track in each release year
popByYear = df.groupby('year')['popularity'].max()
# Year on the x axis, that year's maximum popularity on the y axis
sns.lineplot(x = popByYear.index, y = popByYear)

In [None]:
plt.figure(figsize=(12, 6))
# Popularity summed over every row credited to an artist, top ten only
mostPopularArtist = (
    df.groupby(by='artists')['popularity']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
mostPopularArtistPlot = sns.barplot(
    x=mostPopularArtist.index,
    y=mostPopularArtist,
    palette='Spectral',
)
mostPopularArtistPlot.set(
    xlabel='Artists Names',
    ylabel='Popularity Rating',
    title='Top 10 Most Popular Artists on Spotify (By Total Popularity)',
)
# Print the artist names vertically so they do not overlap
plt.xticks(rotation=90)
sns.despine(left=True, bottom=True)

In [None]:
plt.figure(figsize=(12, 6))
# value_counts() is already sorted by frequency, so the first ten entries are
# the ten artists with the most rows in the dataset
MostSongs = df['artists'].value_counts().iloc[:10]
# Horizontal bars: row count on the x axis, artist name on the y axis
sns.barplot(x=MostSongs, y=MostSongs.index, palette='Spectral')
sns.despine(right=True)

In [None]:
def decadeify(year):
    """Return the decade label (e.g. '1980s') that *year* falls into.

    Anything earlier than 1930 is labelled '1920s' and anything from 2020
    onwards (or any value that compares below no cutoff) is labelled '2020s'.
    """
    # (exclusive upper bound, label) pairs, checked in chronological order
    cutoffs = (
        (1930, '1920s'),
        (1940, '1930s'),
        (1950, '1940s'),
        (1960, '1950s'),
        (1970, '1960s'),
        (1980, '1970s'),
        (1990, '1980s'),
        (2000, '1990s'),
        (2010, '2000s'),
        (2020, '2010s'),
    )
    for upper_bound, label in cutoffs:
        if year < upper_bound:
            return label
    return '2020s'

In [None]:
# Label every row with the decade of its release year
df['decade'] = df['year'].map(decadeify)
df.head()

In [None]:
feats = ['decade', 'artists', 'name', 'popularity']
# Sort by popularity first so that the first row of each decade group is its
# most popular entry
ranked = df[feats].sort_values(by='popularity', ascending=False)
ranked.groupby('decade').first()

In [None]:
features = [
    'acousticness', 'danceability', 'energy', 'instrumentalness',
    'liveness', 'speechiness', 'valence',
]
plt.figure(figsize=(12, 6))
sns.set(style='whitegrid')
# Yearly mean of every feature, computed in a single groupby pass
yearly_means = df.groupby('year')[features].mean()
# Draw one line per feature on the shared axes
for feat in features:
    sns.lineplot(x=yearly_means.index, y=yearly_means[feat], label=feat)
plt.legend(loc='upper right')
plt.ylabel('Values')

In [None]:
# Average tempo per year; plotted separately because tempo is on a much larger
# scale than the other features
tempo_by_year = df.groupby('year')['tempo'].mean()
sns.lineplot(x=tempo_by_year.index, y=tempo_by_year)

# 2. Model Building

## 2.1 Correlation Model

In [None]:
import ast


def _artist_label(raw):
    """Format one row's stringified artist list as 'Artist A, Artist B'.

    Parsing the value as a Python literal keeps apostrophes that belong to a
    name and removes whichever quote style the list used; the previous
    approach deleted every apostrophe and left double quotes in place.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Not a valid literal: strip the brackets and single quotes as before
        return raw[1:-1].replace("'", "")
    if isinstance(parsed, (list, tuple)):
        return ', '.join(str(artist) for artist in parsed)
    return str(parsed)


# Start again from the raw file: the exploration above exploded and modified df
df = pd.read_csv('../input/spotify-data-from-160k-dataset/data.csv')
# Audio features the recommenders compare songs on
feature_cols = ['acousticness', 'danceability', 'energy', 'instrumentalness',
                'speechiness', 'tempo', 'valence']
# Explicit copy so the assignments below modify ft itself and do not trigger
# chained-assignment warnings
ft = df[['name', 'artists'] + feature_cols].copy()
# Identify each song as "<song name> - <artist names>" and use that as the index
ft['name'] = ft['name'] + ' - ' + ft['artists'].apply(_artist_label)
ft = ft.drop(columns='artists').set_index('name')
# Scale tempo data as other features are [0,1] while tempo is much higher than that
scaler = MinMaxScaler()
# The reshape here just makes the 1-D data into 2-D so that fit_transform can use it
scaled = scaler.fit_transform(ft['tempo'].values.reshape(-1, 1))
# Replace old tempo data with scaled
ft['tempo'] = scaled
# Keep only the first occurrence of every label. Doing this before the
# transpose gives the same result as filtering duplicated columns afterwards,
# without first building the wide frame that still contains the duplicates.
ft = ft[~ft.index.duplicated()]
# Switch columns and indices so that features are indices and names are columns
ft = ft.transpose()
ft

In [None]:
def findRecommendationsCorr(title, df, n=5):
    """Recommend the songs whose feature profile correlates best with *title*.

    Parameters
    ----------
    title : label of the query song; must be a column of *df*.
    df : DataFrame with one column per song and one row per audio feature.
    n : number of recommendations to return (default 5).

    Returns
    -------
    DataFrame indexed by song label with a single 'Correlation Similarity'
    column, sorted from most to least similar. The query song is never part
    of the result.

    Raises
    ------
    KeyError if *title* is not a column of *df*.
    """
    song = df[title]
    # Correlation of every song's feature vector with the query's
    corr = df.corrwith(song).to_frame('Correlation Similarity')
    # Songs with constant features have an undefined (NaN) correlation
    corr = corr.dropna()
    # Remove the query by label. Slicing off the first sorted row is not
    # reliable, because another song can tie with it at a correlation of 1.0.
    corr = corr.drop(index=title, errors='ignore')
    corr = corr.sort_values('Correlation Similarity', ascending=False)
    return corr.head(n)

### 2.2 Testing and results

In [None]:
# The title must match the "<song name> - <artist names>" label used as a column of ft
findRecommendationsCorr('I Love It (& Lil Pump) - Kanye West, Lil Pump', ft)

In [None]:
# Same lookup for a classical piano recording, to compare against the pop examples
findRecommendationsCorr('Polonaise-Fantaisie in A-Flat Major, Op. 61 - Frédéric Chopin, Vladimir Horowitz', ft)

In [None]:
# Correlation-based recommendations for a single-artist track
findRecommendationsCorr('Sweater Weather - The Neighbourhood', ft)

In [None]:
def findRecommendationsCosine(title, df, n=5):
    """Recommend the songs whose feature vector points closest to *title*'s.

    Parameters
    ----------
    title : label of the query song; must be a column of *df*.
    df : DataFrame with one column per song and one row per audio feature.
        It is not modified.
    n : number of recommendations to return (default 5).

    Returns
    -------
    DataFrame indexed by song label with a single 'Cosine Similarity' column,
    sorted from most to least similar. The query song is never part of the
    result.

    Raises
    ------
    KeyError if *title* is not a column of *df*.
    """
    # One row per song, one column per feature
    songs = df.transpose()
    matrix = songs.to_numpy(dtype=float)
    query = songs.loc[title].to_numpy(dtype=float)

    # Cosine similarity of every song against the query in a single vectorized
    # pass, for any number of features and at full float precision
    dots = matrix @ query
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    # An all-zero feature vector has no direction; report its similarity as 0
    similarity = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    ranked = pd.Series(similarity, index=songs.index, name='Cosine Similarity')
    # Remove the query by label rather than assuming it sorts first
    ranked = ranked.drop(index=title, errors='ignore').sort_values(ascending=False)
    return ranked.head(n).to_frame()

### 2.3 Testing the Cosine Similarity Recommendations

In [None]:
# Same query as in the correlation test, so the two recommenders can be compared
findRecommendationsCosine('I Love It (& Lil Pump) - Kanye West, Lil Pump', ft)

In [None]:
# Cosine-based recommendations for the second query used in the correlation test
findRecommendationsCosine('Sweater Weather - The Neighbourhood', ft)