# Input your data here

In [None]:
input = "Happy"

In [None]:
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install seaborn
# %pip install sklearn
# %pip install -U scikit-learn
# %pip install spotipy

In [None]:
import pandas as pd

# Importing the dataset
dataset = pd.read_csv("data/tracks.csv")

# Creating the dataframe
df = pd.DataFrame(dataset)

df.tail()

In [None]:
# Spotify API Authentication Information
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
#weilin's api key
client_id = '169bbab461424df7a16d00fbdb3201ec'
client_secret = '19fb464d5e34444aa5faa4613d35f6b6'
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
# Creating a new feature that comprises of Song Name and Artist.
df["song_name_artist"] = df["name"] + df["artists"]
df.head()

In [None]:
df['year'] = df['release_date'].str.extract(r'(\d{4})').astype(int)
df.head()

In [None]:
# Removing all duplicate songs-artist pairs. Meaning that there won't be 2 of the same song sung by the same artist.
print(df.shape)
df.drop_duplicates(subset=["song_name_artist"], keep='first', inplace=True)
print(df.shape)

In [None]:
# Filter data to keep songs published 2000* and later.
df = df[df["year"] >= 2000]
print(df.shape)

In [None]:
df_fill_null = df.copy()

df_fill_null['name'] = df.apply(
    lambda row: sp.track(f"spotify:track:{row['id']}")['name'] if pd.isnull(row['name']) else row['name'], axis=1
)

df_fill_null.head()

In [None]:
df_fill_null['first_artist'] = df_fill_null.apply(
    lambda row: row['artists'][1:-1].split(',')[0][1:-1], axis=1
)

df_fill_null.head()

In [None]:
df_fill_null['first_id_artists'] = df_fill_null.apply(
    lambda row: row['id_artists'][1:-1].split(',')[0][1:-1], axis=1
)

df_fill_null.head()

In [None]:
# Where the genre part will go.

# # The effective solution:
# def get_artist_genres(artist_name):
#     # Get track information
#     artist_info = sp.search(q=artist_name, type='artist')
#     # Extract song name
#     genres = artist_info['artists']['items'][0]['genres']

#     return genres # As an array

# # Sample size:
# test = df_fill_null.iloc[1900:1910]
# print(get_artist_genres(test['first_artist']))


# test['genres'] = test['first_artist'].apply(
#     lambda x: get_artist_genres(x)) # Appending each row with its respective genre.
# test.iloc[1000]

# Here's the issue: Each row takes 0.7 seconds to process. Shorter duration if processed before, and is cached locally.
# So far, we have est. 188,000 rows of data.
# Thus, it will take a total of 131,600 seconds to fully populate the dataset with genres.
# That's 2,193.34 minutes, or 36 and a half hours.

# The to-do: reduce time dimensionaltiy, or find an alternative source.

# Index 1901 cannot be found on spotify for some reason and is creating issues with obtaining data.

In [None]:
import threading

# Improved workaround Multi Threading
# API Pull function
def get_artist_genres(artist_name):
    # Get track information
    artist_info = sp.search(q=artist_name, type='artist') # API Search
    items = artist_info.get('artists', {}).get('items', []) # Specific information pull
    # Error checking to ensure that the data structure is as expected before trying to access its elements
    # Used to bypass the error from above
    if items:
        genres = items[0].get('genres', [])
        return genres
    else:
        print(f"No artist found for {artist_name}")
        return []

# Threaded function/service that calls the API Pull function for each row in DataFrame
def process_data(data):
    # Function to process data
    genres_list = []
    for artist_name in data['first_artist']:
        genres_list.append(get_artist_genres(artist_name))
    return genres_list

# DataFrame splitting for Parallel Processing (Multi Threading)
def split_data(data, num_threads=4):
    # Define worker function
    def worker(chunk, result, start_index):
        for i, artist_name in enumerate(chunk['first_artist']):
            result[start_index + i] = get_artist_genres(artist_name)

    # Split the data into chunks
    chunk_size = len(data) // num_threads
    chunks = [data.iloc[i:i+chunk_size] for i in range(0, len(data), chunk_size)]

    # Create threads to process chunks
    threads = []
    results = [[] for _ in range(len(data))]
    for i in range(num_threads):
        start_index = i * chunk_size
        thread = threading.Thread(target=worker, args=(chunks[i], results, start_index))
        threads.append(thread)
        thread.start()

    # Wait for all threads to complete
    for thread in threads:
        thread.join()

    return results

# Example usage with pandas DataFrame
data = df_fill_null.iloc[0:10000]

combined_result = split_data(data)

In [93]:
# Appends the genre list generated as a 'genre' column in the DataFrame
data['genres'] = combined_result
data.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['genres'] = combined_result


Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,instrumentalness,liveness,valence,tempo,time_signature,song_name_artist,year,first_artist,first_id_artists,genres
84067,6XAymNI5EF0ZGPYKIyBWZD,Wtf (Ac . Demo 2012),2,92307,0,['The Jerkwadz'],['2RfzsBVkw2Xw433trjNdcY'],2012-10-22,0.543,0.155,...,1.2e-05,0.122,0.139,63.781,4,Wtf (Ac . Demo 2012)['The Jerkwadz'],2012,The Jerkwadz,2RfzsBVkw2Xw433trjNdcY,[idaho indie]
84068,5Ve4qBYAThGLTOva0hhoTa,So Bad,59,325347,1,['Eminem'],['7dGJo4pcD2V6oG8kP0tJRR'],2010-06-18,0.773,0.811,...,0.0,0.174,0.774,81.037,4,So Bad['Eminem'],2010,Eminem,7dGJo4pcD2V6oG8kP0tJRR,"[detroit hip hop, hip hop, rap]"
84069,1pajT8BMXMlABtfZ22fdfO,Your Rules,0,457024,0,['Allan Shee'],['1j5Lb42MRG0K4vmugWuSRs'],2010-12-15,0.796,0.821,...,0.92,0.0801,0.76,126.0,4,Your Rules['Allan Shee'],2010,Allan Shee,1j5Lb42MRG0K4vmugWuSRs,[]
84070,2GJpeySaB1z6mJDEASkIJ4,Ella,59,227867,0,['Tan Bionica'],['37MCoi4pcUf9EKsPXeuCqU'],2010-01-01,0.632,0.738,...,0.000174,0.12,0.8,124.979,4,Ella['Tan Bionica'],2010,Tan Bionica,37MCoi4pcUf9EKsPXeuCqU,"[argentine rock, argentine telepop]"
84071,268PcYu5i5f1egp4mQcr4K,The Hitchhiker,0,455158,0,['Allan Shee'],['1j5Lb42MRG0K4vmugWuSRs'],2010-12-15,0.789,0.928,...,0.911,0.128,0.39,125.007,4,The Hitchhiker['Allan Shee'],2010,Allan Shee,1j5Lb42MRG0K4vmugWuSRs,[]


In [None]:
# Removing rows without song name.
df_removed = df_fill_null.dropna()
df_removed.isnull().sum()
df_removed.count()
# df_removed.head()

In [None]:
# Converting release_date feature to datetime format, and extracting the year.
# df_removed['release_date_datetime'] = pd.to_datetime(df_removed['release_date'], errors='coerce')
# df_removed["year"] = df_removed["release_date_datetime"].dt.year
# df_removed.head()

In [None]:
# Using Spotify API to search for a song's information based on input and adding the necessary information in a DataFrame.
def search_track(track_name):
    # Search for the track
    results = sp.search(q=track_name, type='track')

    # Check if the track exists
    if results['tracks']['total'] > 0:
        # Get the first track from the results
        track = results['tracks']['items'][0]
        explicit = int(track["explicit"] == True)
        print(f'Found track: {track["name"]} by {track["artists"][0]["name"]} from the album {track["album"]["name"]}.')
        track_dict = {"id": track["id"], "name": track["name"], "popularity": track["popularity"], 
                      "duration_ms": track["duration_ms"], "explicit": explicit, "artists": track["artists"][0]["name"],
                      "id_artists": track["artists"][0]["id"], "release_date": track["album"]["release_date"]}
        return track_dict
    else:
        print('Track not found')
        return None

# Creating a Single Row DataFrame for the input song.
track_result = search_track(input)
td = pd.DataFrame(track_result, index=[0])
td

In [None]:
# Obtaining Feature Data from song based on its song_id from previous function and adding them to a DataFrame.
def get_audio_features(track_result):
    song_id = track_result["id"]
    results = sp.audio_features(song_id)

    if results:
        return results[0]
    else:
        print(f'No audio features found for song ID: {song_id}')
        return None

audio_features = get_audio_features(track_result)
af = pd.DataFrame(audio_features, index=[0])
# Taking out the irrevelant features.
af_formatted = af.drop(["type", "id", "uri", "track_href", "analysis_url", "duration_ms"], axis=1)
# Merging both DataFrames to sync up with the dataset's layout.
td = pd.concat([td, af_formatted], axis=1)
td['year'] = td['release_date'].str.extract(r'(\d{4})').astype(int)

In [None]:
td

In [None]:
# Adding the Input Song to the Dataset DataFrame. Added to the very front.
new_df = pd.concat([td, df_removed], ignore_index=True)
new_df.head()

In [None]:
# Recalibrating the Index.
# new_df = new_df.reset_index(drop=True)
# new_df.head()

# Checking loudness values.
new_df["loudness"].describe()

In [None]:
import numpy as np
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Feature selection, removing non-useful rows.
def processing(new_df):
    df2 = new_df.drop(['id', 'name', 'id_artists', 'release_date', 'popularity', 'mode', "song_name_artist"], axis=1)
    
    # Normalize numerical features
    numerical_features = ['duration_ms', 'loudness', 'tempo']
    scaler = MinMaxScaler()
    df2[numerical_features] = scaler.fit_transform(df2[numerical_features])
    
    # Standardise Year
    df2['standardized_year'] = scaler.fit_transform(df2[['year']])
    df2 = df2.drop(['year'], axis=1)

    # Create a feature matrix
    feature_matrix = df2.drop(['artists'], axis=1)
    # This is the input song. We are isolating it from the dataframe first.
    input = feature_matrix.iloc[0].T
    # This is the rest of the songs.
    feature_matrix = feature_matrix.iloc[1:]

    # Apply cosine similarity
    start = 0
    end = 1000
    arr = []
    
    # Returns comparison value of first song in dataset to all others in an array.
    # DataFrame can be customised before Feature Selection to change which song the subject of comparison should be.
    while end < feature_matrix['explicit'].count():
        # Compile the nth 1,000 songs in the dataframe.
        set = feature_matrix.iloc[start:end].T
        # Add the input song to the top of the dataframe.
        compare_df = pd.concat([input, set], axis=1).T
        # Perform cosine similarity.
        cosine_sim = cosine_similarity(compare_df) # Comparing bit by bit to prevent too many dimensions.
        cs_list = cosine_sim[0].tolist() # Taking only the first row a.k.a the comparison between the first song and all others.
        arr += cs_list
        # Prepare for the next 1,000 songs.
        start = end
        end += 1000
    return arr
arr = processing(new_df)
arr

In [None]:
max(arr[1:])

In [None]:
def compile_suggestions(arr):
    suggestions = []
    
    # Compiling all song ids that are above the pre-determined threshold for 'like the first song'.
    for i in range(len(arr)):
        if arr[i] > 0.995:
            suggestions.append(i)
    
    print(suggestions)
    return suggestions
suggestions = compile_suggestions(arr)

In [None]:
# Extracting out the end result information.
def format_suggestions(suggestions):
    # first = suggestions[1]

    # Creating the new dataframe with the first matched song.
#     main = df_removed.iloc[suggestions].copy()
#     df_main = pd.DataFrame(main)
#     df_main = df_main.T
#    print(df_removed.tail())
    df_main = new_df.loc[suggestions]

    # Adding the rest of the matched songs to the dataframe.
#     for n in range (2, len(suggestions)):
#         temp = df_removed.iloc[suggestions[n],:6].copy()
#         df_temp = pd.DataFrame(temp)
#         df_temp = df_temp.T
#         df_main = pd.concat([df_main, df_temp])

#     df_main = df_main.drop(['popularity', 'duration_ms', 'explicit'], axis=1)
    return df_main

df_main = format_suggestions(suggestions).iloc[1:]
df_main.head()

In [None]:
# Standardising and Weighting Years.
# from sklearn.preprocessing import StandardScaler
# def weighting(df_main): 
#     df_main['year'] = pd.to_numeric(df_main['year'])
#     scaler = StandardScaler()
#     df_main['standardized_year'] = scaler.fit_transform(df_main[['year']])
#     year_compare = df_main['standardized_year'].loc[0]
#     df_main = df_main.drop
#     return df_main
# df_final = weighting(df_main)
# df_final.head()

In [None]:
df_main.count() # There is one less for the last 3 features as the input song did not go through the extracted year's steps.
# df_final['standardized_year'].describe()

In [None]:
import random

# Generating 10 of the matched songs at random.
def recommend(df_main):
    print('Here are your recommendations!')
    for c in range(10):
        recc = df_main.sample().to_dict()
        name = list(recc['name'].values())[0]
        artists = (list(recc['artists'].values())[0])[1:len(list(recc['artists'].values())[0])-1]
        year = str(list(recc['year'].values())[0])
        print(str(c + 1) + '. ' + name + ' by ' + artists + 'published in ' + year)
        df_main = df_main.drop(list(recc['id'])[0])
    
# May want to try a filter for repreventing duplicate songs in the list.
recommend(df_main) # End Product

In [None]:
import matplotlib.pyplot as plt

plt.hist(new_df["year"])

# To do:
# Artist Genre and Langauge of Artist to be added.

In [None]:
#!/usr/bin/python3

# import spotipy
# from spotipy.oauth2 import SpotifyOAuth
# from flask import Flask, url_for, session, request, redirect
# import json
# import time
# import pandas as pd
# from .downloadvideos import DownloadVideosFromTitles

# # App config
# app = Flask(__name__)

# app.secret_key = 'SOMETHING-RANDOM'
# app.config['SESSION_COOKIE_NAME'] = 'spotify-login-session'

# @app.route('/')
# def login():
#     sp_oauth = create_spotify_oauth()
#     auth_url = sp_oauth.get_authorize_url()
#     print(auth_url)
#     return redirect(auth_url)

# @app.route('/authorize')
# def authorize():
#     sp_oauth = create_spotify_oauth()
#     session.clear()
#     code = request.args.get('code')
#     token_info = sp_oauth.get_access_token(code)
#     session["token_info"] = token_info
#     return redirect("/getTracks")

# @app.route('/logout')
# def logout():
#     for key in list(session.keys()):
#         session.pop(key)
#     return redirect('/')

# @app.route('/getTracks')
# def get_all_tracks():
#     session['token_info'], authorized = get_token()
#     session.modified = True
#     if not authorized:
#         return redirect('/')
#     sp = spotipy.Spotify(auth=session.get('token_info').get('access_token'))
#     results = []
#     iter = 0
#     while True:
#         offset = iter * 50
#         iter += 1
#         curGroup = sp.current_user_saved_tracks(limit=50, offset=offset)['items']
#         for idx, item in enumerate(curGroup):
#             track = item['track']
#             val = track['name'] + " - " + track['artists'][0]['name']
#             results += [val]
#         if (len(curGroup) < 50):
#             break
    
#     df = pd.DataFrame(results, columns=["song names"]) 
#     df.to_csv('songs.csv', index=False)
#     return "done"


# # Checks to see if token is valid and gets a new token if not
# def get_token():
#     token_valid = False
#     token_info = session.get("token_info", {})

#     # Checking if the session already has a token stored
#     if not (session.get('token_info', False)):
#         token_valid = False
#         return token_info, token_valid

#     # Checking if token has expired
#     now = int(time.time())
#     is_token_expired = session.get('token_info').get('expires_at') - now < 60

#     # Refreshing token if it has expired
#     if (is_token_expired):
#         sp_oauth = create_spotify_oauth()
#         token_info = sp_oauth.refresh_access_token(session.get('token_info').get('refresh_token'))

#     token_valid = True
#     return token_info, token_valid


# def create_spotify_oauth():
#     return SpotifyOAuth(
#             client_id="id",
#             client_secret="secret",
#             redirect_uri=url_for('authorize', _external=True),
#             scope="user-library-read")

##### client ID and secret for an yu's spotify project 
client id = 4a5c41a987b64342884ca4d5c090ed84
<br>
client secret = dbc5f7a16b6b44e990be11cd09dd7374