In [1]:
#import requests
#import bs4
#from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import config
from tqdm import tqdm_notebook
import tqdm
#from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
#from selenium import webdriver
import time
import pickle
import re
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy import oauth2
import spotipy.util as util

## Cleaning Data

In [None]:
# Load the pickled mixes into memory.
# NOTE: pickle.load can execute arbitrary code, so only load a file you produced yourself.
with open('17375_mixes.pickle', mode='rb') as pickle_file:
    set_list_with_titles = pickle.load(pickle_file)

In [None]:
# Flatten one level: every element of every entry in set_list_with_titles
# becomes one item of a single flat list.
expanded_set_lists = [
    tup
    for val in set_list_with_titles
    for tup in val
]

In [None]:
# One row per mix: the mix title and its track list.
set_lists_df = pd.DataFrame(expanded_set_lists)
set_lists_df.columns = ['set_title', 'set_list']
# Drop mixes whose title appears more than once (keeps the first occurrence).
# Calling drop_duplicates(inplace=True) on the `set_title` column only touched a
# temporary Series and left the DataFrame unchanged, so de-duplicate the frame itself.
set_lists_df = set_lists_df.drop_duplicates(subset='set_title').reset_index(drop=True)

In [None]:
# One row per (set_title, track entry); rows without a track entry are dropped.
set_lists_expanded_df = (
    set_lists_df.set_index('set_title')
    .explode('set_list')
    .reset_index()
    .dropna()
)

# Strip bracketed timestamps such as "[12:34]" wherever they occur in the entry.
# A single pass covers both the leading and the embedded case.
_TIMESTAMP_RE = re.compile(r'\[[0-9]*:*[0-9]*\]')
set_lists_expanded_df['set_list'] = set_lists_expanded_df['set_list'].map(
    lambda entry: _TIMESTAMP_RE.sub('', entry)
)

# Entries that start with a question mark (optionally after "[" or a space) are
# unidentified tracks: replace the whole entry with NaN.
# Assign the result back instead of using inplace=True on a column accessor, and use
# np.nan because the np.NaN alias no longer exists in NumPy 2.0.
for _unknown_pattern in (r'^[\[ ].?\?', r'^\?'):
    set_lists_expanded_df['set_list'] = set_lists_expanded_df['set_list'].replace(
        _unknown_pattern, np.nan, regex=True
    )

In [None]:
# Remove date-like digit groups (e.g. "2016-01-30") from the set title.
# Assign the result back: inplace=True on a column accessor is chained assignment.
set_lists_expanded_df['set_title'] = set_lists_expanded_df['set_title'].replace(
    r'([0-9]+[-][0-9]+[-]?[0-9]+)', '', regex=True
)


def _dj_from_title(title):
    """Return the second hyphen-separated field of a set title, cut at the first '@'.

    Prefers the spaced separator ' - ' and falls back to a bare '-'.
    A title without any hyphen is returned unchanged (cut at '@') instead of
    raising IndexError.
    """
    spaced_parts = title.split(' - ')
    if len(spaced_parts) > 1:
        name = spaced_parts[1]
    else:
        tight_parts = title.split('-')
        name = tight_parts[1] if len(tight_parts) > 1 else title
    return name.split('@')[0]


set_lists_expanded_df['set_title_split'] = set_lists_expanded_df['set_title'].map(_dj_from_title)

In [None]:
# Number of missing values per column
set_lists_expanded_df.isnull().sum()

The biggest question in cleaning the data is whether we should treat a set with multiple artists as a single entity. Otherwise, we would have to attribute every song to every DJ on the mix.

In [None]:
# Preview the first five rows
set_lists_expanded_df.head(5)

In [None]:
# Drop rows whose DJ field is exactly 'VA' (presumably "various artists" - confirm)
is_va = set_lists_expanded_df['set_title_split'] == 'VA'
set_lists_expanded_df = set_lists_expanded_df[~is_va]

In [None]:
# For every column in turn, keep only the rows whose value makes up more than
# `threshold` of that column's non-null entries. value_counts skips NaN, so rows
# with a missing value in any column are dropped as well.
# NOTE(review): the stated goal is to remove DJ sets with only 1 or 2 songs, but the
# filter runs on every column and compares relative frequencies - confirm the
# threshold does what is intended for this dataset size.
threshold = 0.000001
for col in set_lists_expanded_df.columns:
    counts = set_lists_expanded_df[col].value_counts(normalize=True)
    frequent_values = counts[counts > threshold].index
    keep_rows = set_lists_expanded_df[col].isin(frequent_values)
    set_lists_expanded_df = set_lists_expanded_df.loc[keep_rows, :]

In [None]:
# Drop rows whose DJ field is exactly 'VA' (same filter as applied before the frequency filter)
is_va = set_lists_expanded_df['set_title_split'] == 'VA'
set_lists_expanded_df = set_lists_expanded_df[~is_va]

In [None]:
# Remove every row that still has a missing value in any column
set_lists_expanded_df = set_lists_expanded_df.dropna()

In [None]:
def split_song(x):
    """Extract the song title from a track-list entry.

    Parameters
    ----------
    x : str
        A track-list entry, normally of the form "Artist - Song".

    Returns
    -------
    str or float
        The text between the first and second ' - ' when that separator is present.
        Otherwise np.nan for placeholder entries ('...', 'X', '?', 'Intro'), the
        second field when the entry is separated by '•' or a bare '-', and the
        entry itself when no separator is found. Non-string input yields np.nan.
    """
    # Missing values (e.g. float NaN) carry no title
    if not isinstance(x, str):
        return np.nan

    parts = x.split(' - ')
    if len(parts) > 1:
        return parts[1]

    # No ' - ' separator: tracks and missing tracks are recorded in many different ways
    if '...' in x or x == 'X' or '?' in x or 'Intro' in x:
        return np.nan
    if '•' in x:
        return x.split('•')[1]
    # hyphen without surrounding spaces
    if '-' in x:
        return x.split('-')[1]

    return x

In [None]:
# artist: everything before the first ' - ' (the whole entry when there is none)
set_lists_expanded_df['artist'] = set_lists_expanded_df['set_list'].map(
    lambda entry: entry.partition(' - ')[0]
)
# song: extracted by split_song
set_lists_expanded_df['song'] = set_lists_expanded_df['set_list'].map(split_song)

In [None]:
# More placeholder titles that do not identify a real song
filter_names_list = ['ID', '?', 'Unknown', 'Unreleased', 'Intro', 'Untitled [Unreleased]', 'Untitled', 'Promo', '? [Unreleased]', 'Outro']
# Exact matches become NaN so they are removed by the later dropna.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
set_lists_expanded_df['song'] = set_lists_expanded_df['song'].map(
    lambda title: np.nan if title in filter_names_list else title
)

In [None]:
# Treat the literal artist name 'N/A' as missing (np.nan: the np.NaN alias is gone in NumPy 2.0)
set_lists_expanded_df['artist'] = set_lists_expanded_df['artist'].replace('N/A', np.nan)

In [None]:
# Drop rows where the artist or song ended up missing
set_lists_expanded_df = set_lists_expanded_df.dropna()

In [None]:
# The split trims part of some song titles, which may actually help: Spotify titles
# are often shorter than the mixesdb ones, so a shortened name may be preferable.
set_lists_expanded_df.tail(5)


In [None]:
# Persist the cleaned track list (row index not written)
set_lists_expanded_df.to_csv(path_or_buf='songs_with_set_title_cleaned.csv', index=False)

In [None]:
# Reload the cleaned track list from disk
set_lists_expanded_df = pd.read_csv(filepath_or_buffer='songs_with_set_title_cleaned.csv')

## Syncing up songs with Spotify

In [None]:
def refresh():
    '''Replace the global Spotify client with a freshly authorised one if the token has expired.

    Reads the globals `sp_oauth` and `token_info`; when the token is reported as
    expired, rebinds `token_info` and `sp`. Does nothing otherwise.
    '''
    global token_info, sp

    if not sp_oauth.is_token_expired(token_info):
        return

    token_info = sp_oauth.refresh_access_token(token_info['refresh_token'])
    sp = spotipy.Spotify(auth=token_info['access_token'])

In [None]:
# Spotify OAuth settings; the client id and secret come from the local `config` module
redirect_uri = 'https://www.google.com/'
scopes = 'user-follow-modify'
client_id = config.client_id
client_secret = config.client_secret
username = '1143043561'

sp_oauth = oauth2.SpotifyOAuth(
    client_id=client_id,
    client_secret=client_secret,
    redirect_uri=redirect_uri,
    scope=scopes,
    username=username,
)

# Reuse a cached token when there is one; otherwise run the interactive authorisation flow
token_info = sp_oauth.get_cached_token()
if not token_info:
    auth_url = sp_oauth.get_authorize_url()
    print(auth_url)
    response = input('Paste the above link into your browser, then paste the redirect url here: ')

    code = sp_oauth.parse_response_code(response)
    token_info = sp_oauth.get_access_token(code)

token = token_info['access_token']
sp = spotipy.Spotify(auth=token)

In [None]:
# Parallel lists: artists[i] and tracks[i] come from the same row
artists = set_lists_expanded_df['artist'].to_list()
tracks = set_lists_expanded_df['song'].to_list()

In [None]:
print(len(artists),len(tracks))


def _first_track(result):
    """Return the first track object of a search response, or None when there is none."""
    try:
        items = result['tracks']['items']
    except (KeyError, TypeError):
        return None
    return items[0] if items else None


def _search_track(artist, track):
    """Search Spotify for a track, relaxing the query step by step.

    1. artist + full track title
    2. artist + the title cut at the first '[' (only when the title contains '[')
    3. free-text search on the artist (only when artist and track are identical)

    Returns the first matching track object or None. The search response is a dict
    that is truthy even with zero hits, so each fallback is decided by checking the
    returned items rather than the response itself.
    """
    hit = _first_track(sp.search(q='artist:' + artist + ' track:' + track, type='track'))
    if hit is None and '[' in track:
        song_short = track.split('[')[0].strip()
        hit = _first_track(sp.search(q='artist:' + artist + ' track:' + song_short, type='track'))
    if hit is None and artist == track:
        # some entries carry the same text as both artist and song
        hit = _first_track(sp.search(q=artist, type='track'))
    return hit


spotify_song_names=[]
spotify_ids=[]
spotify_previews=[]
not_found_count =0
for ind, (artist, track) in tqdm_notebook(enumerate(zip(artists, tracks)), total=len(artists)):

    if ind%1500==0:
        print(ind)

    first_hit = None
    # Non-string values (e.g. NaN after a CSV round trip) cannot be searched; count them as not found
    if isinstance(artist, str) and isinstance(track, str):
        try:
            first_hit = _search_track(artist, track)
        except Exception:
            # Usually an expired token: refresh and retry once. A second failure is
            # reported and counted as "not found" rather than aborting the whole run.
            try:
                refresh()
                first_hit = _search_track(artist, track)
            except Exception as err:
                print('search failed for', repr(artist), '-', repr(track), ':', err)

    if first_hit is None:
        # some tracks cannot be found with the search
        spotify_song_names.append(np.nan)
        spotify_ids.append(np.nan)
        spotify_previews.append(np.nan)
        not_found_count +=1
        continue

    # Empty or missing fields are stored as NaN; the three lists always grow together
    spotify_ids.append(first_hit.get('id') or np.nan)
    spotify_previews.append(first_hit.get('preview_url') or np.nan)
    spotify_song_names.append(first_hit.get('name') or np.nan)
print('num songs not found :',not_found_count)

In [None]:
# Build a frame with one row per searched song and one column per Spotify field
spotify_rows = [spotify_song_names, spotify_ids, spotify_previews]
limited_df = pd.DataFrame(spotify_rows).transpose()

In [None]:
# Name the Spotify columns, then attach the artist/track that produced each row
limited_df = limited_df.set_axis(['spotify_song_name', 'spotify_id', 'preview'], axis=1)
limited_df = limited_df.assign(artist=artists, songs=tracks)

In [None]:
# Keep only songs that were matched on Spotify, one row per distinct result
limited_df = limited_df.dropna(subset=['spotify_id']).drop_duplicates()

In [None]:
# Persist the Spotify matches (row index not written)
limited_df.to_csv(path_or_buf='mixes_db_tracks_with_spotify.csv', index=False)

## Preparing dataset for recommendations

In [2]:
# Reload both intermediate files written by the earlier sections
set_lists_expanded_df = pd.read_csv(filepath_or_buffer='songs_with_set_title_cleaned.csv')
limited_df = pd.read_csv(filepath_or_buffer='mixes_db_tracks_with_spotify.csv')

In [3]:
# Preview the first five rows of the reloaded track list
set_lists_expanded_df.head(5)

Unnamed: 0,set_title,set_list,set_title_split,artist,song
0,2016 - Andrew Weatherall - Rainbow Disco Club,The Visitor - Walk With Me,Andrew Weatherall,The Visitor,Walk With Me
1,2016 - Andrew Weatherall - Rainbow Disco Club,DJ Kaos - Hard To Earn,Andrew Weatherall,DJ Kaos,Hard To Earn
2,2016 - Andrew Weatherall - Rainbow Disco Club,The Revenge - MDMF,Andrew Weatherall,The Revenge,MDMF
3,2016 - Andrew Weatherall - Rainbow Disco Club,Rhythm Odyssey & Dr Dunks - Fox,Andrew Weatherall,Rhythm Odyssey & Dr Dunks,Fox
4,2016 - Andrew Weatherall - Rainbow Disco Club,Echo Echo - Synths Of Jupiter,Andrew Weatherall,Echo Echo,Synths Of Jupiter


In [29]:
# Attach Spotify data to every track-list row; the outer join keeps unmatched rows from both sides
full_df = set_lists_expanded_df.merge(
    limited_df,
    how='outer',
    left_on=['artist', 'song'],
    right_on=['artist', 'songs'],
)

I need to separate out DJs who played back-to-back sets. I think the fairest way of doing this is to assign all the songs in the set to each DJ, because in a back-to-back set there is no way to tell who played which track. I wanted to do this after finding the Spotify features, as there are then fewer duplicates. The only downside is that I am somewhat artificially increasing the play counts of songs.

In [30]:
# Preview sets whose DJ field lists several comma-separated names
has_multiple_djs = full_df['set_title_split'].str.contains(',')
full_df[has_multiple_djs].head(5)

Unnamed: 0,set_title,set_list,set_title_split,artist,song,spotify_song_name,spotify_id,preview,songs
30,"- Mark Fanciulli, Carlo Lio - Between 2 Points",Marco Faraone - Replace [Inmotion],"Mark Fanciulli, Carlo Lio",Marco Faraone,Replace [Inmotion],,,,
42,"- Mark Knight, Christian Smith - Toolroom Rad...",Ron Costa - Hourncase [Tronic],"Mark Knight, Christian Smith",Ron Costa,Hourncase [Tronic],,,,
48,"- B.Traits, La Fleur, Tom Trago & Alan Fitzpa...",Jack Master - Bang The Box (Slam Remix) [Soma],"B.Traits, La Fleur, Tom Trago & Alan Fitzpatri...",Jack Master,Bang The Box (Slam Remix) [Soma],Bang the Box - Slam Remix,1DHVgTCO1lO4oh07rdaVgB,https://p.scdn.co/mp3-preview/0a8448ab06aa821b...,Bang The Box (Slam Remix) [Soma]
83,"- John Digweed, Wehbba - Transitions 630, Pro...",Wehbba - Psyche (Mark Reeve Remix) [Tronic],"John Digweed, Wehbba",Wehbba,Psyche (Mark Reeve Remix) [Tronic],,,,
128,"- Darius Syrossian, Doorly, Harry Romero - Do...",Joey Beltram - Energy Flash [R&S],"Darius Syrossian, Doorly, Harry Romero",Joey Beltram,Energy Flash [R&S],Energy Flash,2nBjTtvY6eN4GekYk4cdiw,https://p.scdn.co/mp3-preview/c54670d24d127f0f...,Energy Flash [R&S]


In [31]:
# Turn every DJ field into a list of names so the column can be exploded.
# Each name is stripped: "Mark Fanciulli, Carlo Lio" would otherwise yield " Carlo Lio",
# which a later groupby treats as a different DJ from "Carlo Lio".
# Non-string values (e.g. NaN) are wrapped unchanged instead of raising TypeError.
full_df['set_title_split'] = full_df['set_title_split'].map(
    lambda names: [name.strip() for name in names.split(',')] if isinstance(names, str) else [names]
)

In [32]:
# One row per (DJ, track). Explode the DJ column directly instead of first moving
# every other column into a MultiIndex; the column order of the original approach
# (DJ column last) and a fresh 0..n-1 index are kept.
cols = full_df.columns.tolist()
cols.remove('set_title_split')
full_df = full_df.explode('set_title_split').reset_index(drop=True)[cols + ['set_title_split']]

In [35]:
# Songs are looked up by Spotify id, so rows without one cannot be used
full_df = full_df.dropna(subset=['spotify_id'])

In [55]:
# Integer ids: one per DJ (set_title_split) and one per track entry (set_list)
dj_groups = full_df.groupby('set_title_split')
song_groups = full_df.groupby('set_list')
full_df['user_nums'] = dj_groups.ngroup().values
full_df['song_nums'] = song_groups.ngroup().values

# Number of rows per (DJ, track entry) pair, attached to every row as `size`
pair_keys = ['set_title_split', 'set_list']
count_series = full_df.groupby(pair_keys).size()
plays_df = count_series.to_frame(name='size').reset_index()
full_df = full_df.merge(plays_df, on=pair_keys, how='left')

In [63]:
# Persist the final dataset; the row index is written as the first (unnamed) column
full_df.to_csv(path_or_buf='mixesdb_df_for_recs.csv', index=True)