In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from collections import defaultdict
from scipy.spatial.distance import cdist
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import warnings
warnings.filterwarnings("ignore")
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [3]:
data = pd.read_csv("/content/data.csv")
genre_data = pd.read_csv('/content/data_by_genres.csv')
year_data = pd.read_csv('/content/data_by_year.csv')
artist_data = pd.read_csv('/content/data_by_artist.csv')


In [4]:
genre_data.sample(5)

Unnamed: 0,mode,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
130,1,athens indie,0.173428,0.50968,243839.709821,0.654059,0.238168,0.157445,-9.104683,0.041957,126.176431,0.533934,40.995924,11
2689,1,tampa indie,0.284649,0.501803,212979.969697,0.765394,0.000265,0.290409,-8.512379,0.173103,142.892318,0.41107,38.772727,3
703,1,country pop,0.201168,0.581101,210417.631308,0.688557,0.036919,0.169314,-6.054354,0.045198,123.045079,0.555233,57.427657,7
1049,1,focus trance,0.010704,0.710667,399675.0,0.813,0.904667,0.285533,-9.201,0.054667,125.672667,0.362167,0.0,1
704,1,country rap,0.152007,0.705536,241822.411316,0.691926,0.00284,0.186232,-6.512374,0.131273,117.952953,0.512625,53.681864,11


In [5]:
year_data.sample(5)

Unnamed: 0,mode,year,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
19,1,1940,0.847644,0.521892,182227.9445,0.310893,0.316849,0.264335,-13.684048,0.242958,108.449334,0.616709,0.93,7
45,1,1966,0.524809,0.50702,216095.113,0.482208,0.115126,0.218093,-11.35692,0.055964,116.335424,0.580145,28.2815,2
15,1,1936,0.772312,0.558006,220809.186364,0.308389,0.25711,0.221438,-14.612999,0.279029,109.888755,0.564064,5.080909,10
86,1,2007,0.254081,0.563414,241049.962564,0.668305,0.072957,0.196127,-7.044536,0.084347,124.087516,0.516794,51.075897,7
80,1,2001,0.286842,0.583318,240307.79601,0.626986,0.107214,0.187026,-8.305095,0.089182,117.765399,0.541479,48.750125,7


In [6]:
artist_data.sample(5)

Unnamed: 0,mode,count,acousticness,artists,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
13642,1,2,0.934,Klarino Soukris,0.381,193480.0,0.487,1e-06,0.209,-8.301,0.0414,112.974,0.526,0.0,2
3183,1,14,0.648857,Booker Ervin,0.457143,318885.571429,0.268471,0.274235,0.120729,-17.155429,0.036986,120.881286,0.465429,23.714286,8
12253,1,16,0.961125,John Gary,0.25925,178524.875,0.248375,0.02749,0.21025,-11.939875,0.034475,113.368875,0.215875,22.75,10
13753,0,13,0.093362,Krayzie Bone,0.785538,256860.538462,0.700538,0.0,0.162962,-5.821077,0.159992,125.481846,0.709385,48.230769,6
5852,0,2,0.00596,David Draiman,0.566,219400.0,0.849,2e-06,0.115,-6.575,0.0447,98.996,0.402,46.0,2


In [7]:
datasets = [("data", data), ("genre_data", genre_data), ("year_data", year_data), ("artist_data", artist_data)]

In [8]:
data['year'] = pd.to_datetime(data['year'], format='%Y')
data['release_date'] = pd.to_datetime(data['release_date'])
year_data['year'] = pd.to_datetime(year_data['year'], format='%Y')

In [9]:
for name, df in datasets:
    # print some info about the datasets
    print(f"Info about the dataset: {name}")
    print("-"*30)
    print(df.info())
    print()

Info about the dataset: data
------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126627 entries, 0 to 126626
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   valence           126627 non-null  float64       
 1   year              126627 non-null  datetime64[ns]
 2   acousticness      126627 non-null  float64       
 3   artists           126627 non-null  object        
 4   danceability      126627 non-null  float64       
 5   duration_ms       126627 non-null  int64         
 6   energy            126627 non-null  float64       
 7   explicit          126627 non-null  int64         
 8   id                126627 non-null  object        
 9   instrumentalness  126627 non-null  float64       
 10  key               126627 non-null  int64         
 11  liveness          126627 non-null  float64       
 12  loudness          126627 non-null  float64       
 13 

In [10]:
for name, df in datasets:
    # Check for missing values in the datasets
    print(f"Missing Values in: {name}")
    print("-"*30)
    print(df.isnull().sum())
    print()

Missing Values in: data
------------------------------
valence             0
year                0
acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
explicit            0
id                  0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
name                0
popularity          0
release_date        0
speechiness         0
tempo               1
dtype: int64

Missing Values in: genre_data
------------------------------
mode                0
genres              0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
liveness            0
loudness            0
speechiness         0
tempo               0
valence             0
popularity          0
key                 0
dtype: int64

Missing Values in: year_data
------------------------------
mode                0
year                0
acousticness        0
danc

In [11]:
for name, df in datasets:
    # check for duplicates in the datasets
    print(f"Duplicates in the dataset: {name}")
    print("-"*30)
    print(df.duplicated(keep=False).sum())
    print()

Duplicates in the dataset: data
------------------------------
0

Duplicates in the dataset: genre_data
------------------------------
0

Duplicates in the dataset: year_data
------------------------------
0

Duplicates in the dataset: artist_data
------------------------------
0



In [12]:
for name, df in datasets:
    # Check the unique values in the dataset
    print(f"Unique Values in: {name}")
    print("-"*30)
    print(df.nunique())
    print()

Unique Values in: data
------------------------------
valence               1697
year                   100
acousticness          4388
artists              26703
danceability          1183
duration_ms          43413
energy                2237
explicit                 2
id                  126627
instrumentalness      5396
key                     12
liveness              1727
loudness             24005
mode                     2
name                101559
popularity             100
release_date          9552
speechiness           1595
tempo                72628
dtype: int64

Unique Values in: genre_data
------------------------------
mode                   2
genres              2973
acousticness        2798
danceability        2725
duration_ms         2872
energy              2778
instrumentalness    2731
liveness            2709
loudness            2873
speechiness         2707
tempo               2872
valence             2745
popularity          2188
key                   12
dtype: in

In [13]:
# Popularity Trends Over Years
fig = px.line(year_data, x='year', y='popularity', title='Popularity Trends Over Years')
fig.show()

In [14]:
# Convert release_date to datetime and extract decade
data['release_decade'] = (data['release_date'].dt.year // 10) * 10

# Count the number of songs per decade
decade_counts = data['release_decade'].value_counts().sort_index()

# Create a bar chart for songs per decade
fig = px.bar(x=decade_counts.index, y=decade_counts.values, labels={'x': 'Decade', 'y': 'Number of Songs'},
             title='Number of Songs per Decade')
fig.update_layout(xaxis_type='category')
fig.show()

In [15]:
# Tempo Changes Over Years
fig = px.scatter(year_data, x='year', y='tempo', color='tempo', size='popularity',
                 title='Tempo Changes Over Years', labels={'tempo': 'Tempo'})
fig.show()

In [16]:
# Average Danceability Over Years
fig = px.line(year_data, x='year', y='danceability', title='Average Danceability Over Years')
fig.show()

In [17]:
# Danceability and Energy Over Years
fig = go.Figure()

fig.add_trace(go.Scatter(x=year_data['year'], y=year_data['danceability'], mode='lines', name='Danceability'))
fig.add_trace(go.Scatter(x=year_data['year'], y=year_data['energy'], mode='lines', name='Energy'))

fig.update_layout(title='Danceability and Energy Over Years', xaxis_title='Year', yaxis_title='Value')
fig.show()

In [18]:
# Energy and Acousticness Over Years
fig = go.Figure()

fig.add_trace(go.Scatter(x=year_data['year'], y=year_data['energy'], mode='lines', name='Energy'))
fig.add_trace(go.Scatter(x=year_data['year'], y=year_data['acousticness'], mode='lines', name='Acousticness'))

fig.update_layout(title='Energy and Acousticness Over Years', xaxis_title='Year', yaxis_title='Value')
fig.show()

In [19]:
# Convert year column back
data['year'] = data['year'].dt.year

In [20]:
# List of numerical columns to consider for similarity calculations
number_cols = ['valence', 'year', 'acousticness', 'danceability', 'duration_ms', 'energy', 'explicit', 'year',
               'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity', 'speechiness', 'tempo']

In [21]:
# Function to retrieve song data for a given song name
def get_song_data(name, data):
    try:
        return data[data['name'].str.lower() == name].iloc[0]
        return song_data
    except IndexError:
        return None

In [22]:
# Function to calculate the mean vector of a list of songs
def get_mean_vector(song_list, data):
    song_vectors = []
    for song in song_list:
        song_data = get_song_data(song['name'], data)
        if song_data is None:
            print('Warning: {} does not exist in the dataset'.format(song['name']))
            return None
        song_vector = song_data[number_cols].values
        song_vectors.append(song_vector)
    song_matrix = np.array(list(song_vectors))
    return np.mean(song_matrix, axis=0)

In [23]:
# Function to flatten a list of dictionaries into a single dictionary
def flatten_dict_list(dict_list):
    flattened_dict = defaultdict()
    for key in dict_list[0].keys():
        flattened_dict[key] = []
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
    return flattened_dict

In [24]:
# Normalize the song data using Min-Max Scaler
min_max_scaler = MinMaxScaler()
normalized_data = min_max_scaler.fit_transform(data[number_cols])

# Standardize the normalized data using Standard Scaler
standard_scaler = StandardScaler()
scaled_normalized_data = standard_scaler.fit_transform(normalized_data)

In [25]:
# Function to recommend songs based on a list of seed songs
def recommend_songs(seed_songs, data, n_recommendations=10):
    metadata_cols = ['name', 'artists', 'year']
    song_center = get_mean_vector(seed_songs, data)

    # Return an empty list if song_center is missing
    if song_center is None:
        return []

    # Normalize the song center
    normalized_song_center = min_max_scaler.transform([song_center])

    # Standardize the normalized song center
    scaled_normalized_song_center = standard_scaler.transform(normalized_song_center)

    # Calculate Euclidean distances and get recommendations
    distances = cdist(scaled_normalized_song_center, scaled_normalized_data, 'euclidean')
    index = np.argsort(distances)[0]

    # Filter out seed songs and duplicates, then get the top n_recommendations
    rec_songs = []
    for i in index:
        song_name = data.iloc[i]['name']
        if song_name not in [song['name'] for song in seed_songs] and song_name not in [song['name'] for song in rec_songs]:
            rec_songs.append(data.iloc[i])
            if len(rec_songs) == n_recommendations:
                break

    return pd.DataFrame(rec_songs)[metadata_cols].to_dict(orient='records')

In [26]:
# List of seed songs (replace with your own seed songs)
seed_songs = [
    {'name': 'Paranoid'},
    {'name': 'Blinding Lights'},
    # Add more seed songs as needed
]
seed_songs = [{'name': name['name'].lower()} for name in seed_songs]

# Number of recommended songs
n_recommendations = 15

# Call the recommend_songs function
recommended_songs = recommend_songs(seed_songs, data, n_recommendations)

# Convert the recommended songs to a DataFrame
recommended_df = pd.DataFrame(recommended_songs)

# Print the recommended songs
for idx, song in enumerate(recommended_songs, start=1):
    print(f"{idx}. {song['name']} by {song['artists']} ({song['year']})")

1. Infinity by ['One Direction'] (2015)
2. Secrets by ['OneRepublic'] (2009)
3. In My Blood by ['Shawn Mendes'] (2018)
4. Head Above Water by ['Avril Lavigne'] (2019)
5. Green Light by ['Lorde'] (2017)
6. My Wish by ['Rascal Flatts'] (2006)
7. Good Things Fall Apart (with Jon Bellion) by ['ILLENIUM', 'Jon Bellion'] (2019)
8. Magic Shop by ['BTS'] (2018)
9. Inside Out (feat. Griff) by ['Zedd', 'Griff'] (2020)
10. A.M. by ['One Direction'] (2015)
11. Love You Goodbye by ['One Direction'] (2015)
12. Story of My Life by ['One Direction'] (2013)
13. Perfect by ['Simple Plan'] (2018)
14. arms by ['Christina Perri'] (2011)
15. The Cave by ['Mumford & Sons'] (2009)


In [27]:
# Create a bar plot of recommended songs by name
recommended_df['text'] = recommended_df.apply(lambda row: f"{row.name + 1}. {row['name']} by {row['artists']} ({row['year']})", axis=1)
fig = px.bar(recommended_df, y='name', x=range(n_recommendations, 0, -1), title='Recommended Songs', orientation='h', color='name', text='text')
fig.update_layout(xaxis_title='Recommendation Rank', yaxis_title='Songs', showlegend=False, uniformtext_minsize=20, uniformtext_mode='show', yaxis_showticklabels=False, height=1000)
fig.update_traces(width=1)
fig.show()