In [1]:
import pandas as pd
import requests
import json
from os import environ

### Last FM functions

In [2]:
API_KEY = environ["LAST_FM_API_KEY"]

In [3]:
def get_album(artist: str, song: str) -> str:
    headers = {
        'user-agent': "MyOwnPlaylistRecommender"
    }

    payload = {
        'api_key': API_KEY,
        'method': 'track.getInfo',
        'format': 'json',
        'artist': artist,
        'track': song
    }

    r = requests.get('https://ws.audioscrobbler.com/2.0/', headers=headers, params=payload)
    r.status_code
    try:
        album_name = r.json()['track']['album']['title']
    except (KeyError, json.JSONDecodeError):
        album_name = "No album information"
    return album_name

In [4]:
def get_album_info(artist: str, album: str) -> dict:
    headers = {
        'user-agent': "MyOwnPlaylistRecommender"
    }

    payload = {
        'api_key': API_KEY,
        'method': 'album.getinfo',
        'format': 'json',
        'artist': artist,
        'album': album
    }

    r = requests.get('https://ws.audioscrobbler.com/2.0/', headers=headers, params=payload)
    try:
        album_details = r.json()  #['track']['album']['title']
    except KeyError:
        album_details = None
    return album_details

In [5]:
def get_artist_info(artist: str) -> dict:
    headers = {
        'user-agent': "MyOwnPlaylistRecommender"
    }

    payload = {
        'api_key': API_KEY,
        'method': 'artist.gettopalbums',
        'format': 'json',
        'artist': artist
    }

    r = requests.get('https://ws.audioscrobbler.com/2.0/', headers=headers, params=payload)
    try:
        artist_details = r.json()  #['track']['album']['title']
    except KeyError:
        artist_details = None
    return artist_details

### What to do
* Clean the Play Activity.csv
* Load in Apple Music Library Tracks.json
* Merge on song and album names

### Load Play Activity.csv

In [6]:
df = pd.read_csv("./data/apple/Apple Music Play Activity.csv", low_memory=False, encoding='utf-8')

#### Clean the data

In [7]:
df.dropna(subset=['Album Name'], inplace=True)
df.dropna(subset=['Song Name'], inplace=True)
df = df[df['Media Type'] != 'VIDEO']
df = df[df['Play Duration Milliseconds'] >= 0]
df = df[df['Event Type'] != 'LYRIC_DISPLAY']
df.replace({"Event Start Timestamp": ""}, pd.NA, inplace=True)
df.dropna(subset=["Event Start Timestamp"], inplace=True)

In [8]:
df.describe()

Unnamed: 0,Apple ID Number,Bundle Version,Camera Option,Carrier Name,Container Artist Name,Container iTunes Playlist ID,Container Personalized ID,Container Season ID,Contingency,Display Count,...,Source Radio Name,Start Position In Milliseconds,Subscribed State,Subscription Bundle ID,Subscription Discovery Mode,Subscription Offer ID,Subscription User ID,UTC Offset In Seconds,Vocal Attenuation Duration,Vocal Attenuation Model ID
count,12244.0,9173.0,0.0,0.0,0.0,589.0,0.0,0.0,0.0,0.0,...,0.0,12244.0,0.0,0.0,0.0,0.0,8332.0,12244.0,7291.0,0.0
mean,277175890.0,3.1,,,,256002000.0,,,,,...,,40224.12,,,,,1608092000.0,1952.009147,0.867919,
std,0.0,4.441134e-16,,,,930.2435,,,,,...,,201785.9,,,,,0.0,1794.233291,74.109336,
min,277175890.0,3.1,,,,256000300.0,,,,,...,,0.0,,,,,1608092000.0,0.0,0.0,
25%,277175890.0,3.1,,,,256001000.0,,,,,...,,0.0,,,,,1608092000.0,0.0,0.0,
50%,277175890.0,3.1,,,,256002600.0,,,,,...,,0.0,,,,,1608092000.0,3600.0,0.0,
75%,277175890.0,3.1,,,,256002800.0,,,,,...,,0.0,,,,,1608092000.0,3600.0,0.0,
max,277175890.0,3.1,,,,256003300.0,,,,,...,,3820895.0,,,,,1608092000.0,7200.0,6328.0,


#### Get Album - Song unique dataframe

In [9]:
tmp_album_song_df = df[["Album Name", "Song Name"]]

In [10]:
album_song_df = tmp_album_song_df.drop_duplicates()

In [11]:
album_song_df

Unnamed: 0,Album Name,Song Name
0,Are a Drag,Science Fiction Double Feature
1,Escape (2022 Remaster),Don't Stop Believin' (2022 Remaster)
2,Straight Ahead,Watch Me as I Fall
4,"Acoustic, Vol. 2",Know It All
5,"Acoustic, Vol. 2",Alison's Disease
...,...,...
21383,Pump up the Valuum,Theme from a Nofx Album
21398,Schubert: Piano Trios,"Sonatensatz in B-Flat Major, D. 28"
21405,One Thing At A Time,Cowgirls (feat. ERNEST)
21412,Ella & Duke At the Cote d'Azur,Just Squeeze Me (But Don't Tease Me)


In [12]:
album_song_df.describe()

Unnamed: 0,Album Name,Song Name
count,5491,5491
unique,2868,5076
top,"Mozart: Don Giovanni, K. 527",Last Christmas
freq,57,5


### Load Apple Music Library Tracks.json

In [13]:
library_tracks = pd.read_json('./data/apple/Apple Music Library Tracks.json', encoding='utf-8')

In [14]:
library_tracks['Artist'] = library_tracks['Artist'].str.lower()

In [15]:
# get the genre of each artist
artist_groups = library_tracks.groupby('Artist')

In [16]:
list(artist_groups.get_group("Taylor Swift".lower())["Genre"].unique())  # raises a KeyError

['Pop', 'Country', "'10s Pop"]

In [17]:
def get_genre(artist: str) -> list:
    try:
        genres = list(artist_groups.get_group(artist.lower())["Genre"].unique())
    except (AttributeError, KeyError):
        genres = [None]
    return genres

#### Get Artist - Album - Song unique dataframe

In [18]:
tmp_artist_album_song_df = library_tracks[["Artist", "Album", "Title"]]

In [19]:
artist_album_song_df = tmp_artist_album_song_df.drop_duplicates()

In [20]:
artist_album_song_df.describe()

Unnamed: 0,Artist,Album,Title
count,7988,7821,8022
unique,1453,976,7525
top,bbc radio 1,<Unknown>,Intro
freq,411,267,18


In [21]:
artist_album_song_df.dropna(subset=['Artist'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artist_album_song_df.dropna(subset=['Artist'], inplace=True)


In [22]:
artist_album_song_df.describe()

Unnamed: 0,Artist,Album,Title
count,7988,7820,7988
unique,1453,976,7492
top,bbc radio 1,<Unknown>,Intro
freq,411,267,18


In [23]:
nan_rows_df = artist_album_song_df[artist_album_song_df.isna().any(axis=1)]

In [24]:
nan_rows_df

Unnamed: 0,Artist,Album,Title
203,mc hammer,,Adams Family
221,aphex twin,,Ageispolis
223,uniting nations ft laura more,,Ai No Corrida
239,aphex twin,,Alberto Balsalm
288,the mighty mighty bosstones,,All Things Considered
...,...,...,...
7608,nelly,,12 Tho Dem Wrappas
7635,nelly/st. lunatics,,13 Wrap Sumden
7640,ali/nelly/murphy lee,,14 Batter Up
7671,nelly/the teamsters,,15 Never Let 'Em C U Sweat


In [25]:
nan_rows_df.isna().sum()

Artist      0
Album     168
Title       0
dtype: int64

In [26]:
nan_rows_df['Album'] = nan_rows_df.apply(
    lambda x: get_album(x['Artist'], x['Title']) if pd.isna(x["Album"]) else x["Album"], 
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_rows_df['Album'] = nan_rows_df.apply(


In [27]:
nan_rows_df.describe()

Unnamed: 0,Artist,Album,Title
count,168,168,168
unique,67,62,168
top,the mighty mighty bosstones,No album information,Adams Family
freq,53,58,1


In [28]:
artist_album_song_df = pd.concat([artist_album_song_df, nan_rows_df], ignore_index=True)

In [29]:
artist_album_song_df

Unnamed: 0,Artist,Album,Title
0,shy fx & t-power,Set It Off,Shake Ur Body (feat. Di)
1,the cranberries,No Need To Argue (Remastered 2020),Zombie
2,spooks,"S.I.O.S.O.S., Vol. One (Remastered)",Karma Hotel
3,major lazer,Cold Water (feat. Justin Bieber & MØ) - Single,Cold Water (feat. Justin Bieber & MØ)
4,taylor swift,reputation,...Ready For It?
...,...,...,...
8151,nelly,No album information,12 Tho Dem Wrappas
8152,nelly/st. lunatics,No album information,13 Wrap Sumden
8153,ali/nelly/murphy lee,No album information,14 Batter Up
8154,nelly/the teamsters,No album information,15 Never Let 'Em C U Sweat


### Load Apple Music - Play History Daily Tracks.csv

In [30]:
tracks_df = pd.read_csv("./data/apple/Apple Music - Play History Daily Tracks.csv", encoding='utf-8')
tracks_df.head(2)

Unnamed: 0,Country,Track Identifier,Media type,Date Played,Hours,Play Duration Milliseconds,End Reason Type,Source Type,Play Count,Skip Count,Ignore For Recommendations,Track Reference,Track Description
0,United Kingdom,1116868327,AUDIO,20160828,21,231000,NOT_APPLICABLE,IPHONE,1,0,,,Glass Animals - Youth
1,United Kingdom,1129287609,AUDIO,20160828,21,221000,NOT_APPLICABLE,IPHONE,1,0,,,AlunaGeorge - Mean What I Mean (feat. Leikeli4...


In [31]:
tracks_df.dropna(subset=['Track Description'], inplace=True)

In [32]:
# Split 'Track Description' into 'Artist' and 'Song Name'
split_columns = tracks_df['Track Description'].str.split(' - ', expand=True)
tracks_df['Artist'] = split_columns[0]
tracks_df['Song Name'] = split_columns[1]

In [33]:
tracks_df.head(2)

Unnamed: 0,Country,Track Identifier,Media type,Date Played,Hours,Play Duration Milliseconds,End Reason Type,Source Type,Play Count,Skip Count,Ignore For Recommendations,Track Reference,Track Description,Artist,Song Name
0,United Kingdom,1116868327,AUDIO,20160828,21,231000,NOT_APPLICABLE,IPHONE,1,0,,,Glass Animals - Youth,Glass Animals,Youth
1,United Kingdom,1129287609,AUDIO,20160828,21,221000,NOT_APPLICABLE,IPHONE,1,0,,,AlunaGeorge - Mean What I Mean (feat. Leikeli4...,AlunaGeorge,Mean What I Mean (feat. Leikeli47 & Dreezy)


#### Get Artist - Song unique dataframe

In [34]:
artist_song_df = tracks_df.filter(['Artist', 'Song Name'])
artist_song_df = artist_song_df.drop_duplicates()
artist_song_df = artist_song_df.dropna(how='any')

In [35]:
artist_song_df.describe()

Unnamed: 0,Artist,Song Name
count,5223,5223
unique,2182,5070
top,NOFX,Butterfly
freq,153,4


In [36]:
artist_song_df.head()

Unnamed: 0,Artist,Song Name
0,Glass Animals,Youth
1,AlunaGeorge,Mean What I Mean (feat. Leikeli47 & Dreezy)
4,Christine and the Queens,Tilted
6,Whitney Houston,I Wanna Dance With Somebody (2000 Remaster)
7,Backstreet Boys,Everybody (Backstreet's Back) [Extended Version]


### Fuzzy match setup

In [37]:
from thefuzz import fuzz
from thefuzz import process

In [38]:
def normalize(text):
    if isinstance(text, str):
        return text.lower().strip()
    else:
        return str(text).lower().strip()

In [39]:
def get_fuzzy_artist(name, choices, scorer, cutoff):
    match = process.extractOne(name, choices, scorer=scorer, score_cutoff=cutoff)
    if match:
        return match[0]
    else:
        return None

In [40]:
def get_fuzzy_album(name, choices, scorer, cutoff):
    match = process.extractOne(name, choices, scorer=scorer, score_cutoff=cutoff)
    if match:
        return match[0]
    else:
        return None

#### Match Artist-Song to Artist-Album-Song

In [41]:
df1 = artist_song_df
df2 = artist_album_song_df

In [42]:
df1['key'] = df1['Artist'] + ' ' + df1['Song Name']
df2['key'] = df2['Artist'] + ' ' + df2['Title']

In [43]:
df1['key'] = df1['key'].apply(normalize)
df2['key'] = df2['key'].apply(normalize)

In [44]:
df1.head(10)

Unnamed: 0,Artist,Song Name,key
0,Glass Animals,Youth,glass animals youth
1,AlunaGeorge,Mean What I Mean (feat. Leikeli47 & Dreezy),alunageorge mean what i mean (feat. leikeli47 ...
4,Christine and the Queens,Tilted,christine and the queens tilted
6,Whitney Houston,I Wanna Dance With Somebody (2000 Remaster),whitney houston i wanna dance with somebody (2...
7,Backstreet Boys,Everybody (Backstreet's Back) [Extended Version],backstreet boys everybody (backstreet's back) ...
8,Aqua,Barbie Girl,aqua barbie girl
9,"Christina Aguilera, Lil' Kim, Mýa & P!nk",Lady Marmalade,"christina aguilera, lil' kim, mýa & p!nk lady ..."
10,S Club,Reach,s club reach
11,Shania Twain,Man! I Feel Like a Woman!,shania twain man! i feel like a woman!
12,Shania Twain,Man! I Feel Like a Woman! (International Mix),shania twain man! i feel like a woman! (intern...


In [45]:
df2.head(10)

Unnamed: 0,Artist,Album,Title,key
0,shy fx & t-power,Set It Off,Shake Ur Body (feat. Di),shy fx & t-power shake ur body (feat. di)
1,the cranberries,No Need To Argue (Remastered 2020),Zombie,the cranberries zombie
2,spooks,"S.I.O.S.O.S., Vol. One (Remastered)",Karma Hotel,spooks karma hotel
3,major lazer,Cold Water (feat. Justin Bieber & MØ) - Single,Cold Water (feat. Justin Bieber & MØ),major lazer cold water (feat. justin bieber & mø)
4,taylor swift,reputation,...Ready For It?,taylor swift ...ready for it?
5,krept & konan & wizkid,Revenge Is Sweet,G Love,krept & konan & wizkid g love
6,drapht,Brothers Grimm,Jimmy Recard,drapht jimmy recard
7,pras,Ghetto Supastar,Ghetto Supastar (That Is What You Are),pras ghetto supastar (that is what you are)
8,rihanna,A Girl Like Me,Unfaithful,rihanna unfaithful
9,mad caddies,Punk Rocksteady,She,mad caddies she


In [78]:
df2[df2["Artist"] == "alicia keys"][0:10]

Unnamed: 0,Artist,Album,Title,key
423,alicia keys,As I Am,As I Am,alicia keys as i am
589,alicia keys,Here,The Beginning (Interlude),alicia keys the beginning (interlude)
705,alicia keys,Here,Blended Family (What You Do for Love),alicia keys blended family (what you do for love)
803,alicia keys,Girl On Fire [+digital booklet],Brand New Me,alicia keys brand new me
905,alicia keys,Songs in A Minor,Butterflyz,alicia keys butterflyz
918,alicia keys,Songs in A Minor,Caged Bird [Outro],alicia keys caged bird [outro]
1153,alicia keys,Here,Cocoa Butter (Cross & Pic Interlude),alicia keys cocoa butter (cross & pic interlude)
1366,alicia keys,Girl On Fire [+digital booklet],De Novo Adagio (Intro),alicia keys de novo adagio (intro)
1434,alicia keys,The Diary of Alicia Keys,Diary ft Tony Toni Tone,alicia keys diary ft tony toni tone
1478,alicia keys,The Element of Freedom,Distance and Time,alicia keys distance and time


In [47]:
df1['matched_key_partial_ratio'] = df1['key'].apply(
    lambda x: get_fuzzy_artist(x, df2['key'], scorer=fuzz.partial_ratio, cutoff=95)
)

In [69]:
df1['matched_key_token_sort_ratio'] = df1['key'].apply(
    lambda x: get_fuzzy_artist(x, df2['key'], scorer=fuzz.token_sort_ratio, cutoff=90)
)

In [70]:
print(f"Is Nan: {df1['matched_key_partial_ratio'].isna().sum()}")
print(f"Not NaN: {df1['matched_key_partial_ratio'].notna().sum()}")

Is Nan: 4278
Not NaN: 945


In [71]:
not_nan_df = df1[df1['matched_key_partial_ratio'].notna()]
not_nan_df.head(10)

Unnamed: 0,Artist,Song Name,key,matched_key_partial_ratio,matched_key_token_sort_ratio
7,Backstreet Boys,Everybody (Backstreet's Back) [Extended Version],backstreet boys everybody (backstreet's back) ...,backstreet boys everybody (backstreets back),
20,Five,Keep On Movin',five keep on movin',five keep on movin',five keep on movin'
26,Toploader,Dancing In the Moonlight,toploader dancing in the moonlight,toploader dancing in the moonlight,toploader dancing in the moonlight
27,Westlife,Uptown Girl (Radio Edit),westlife uptown girl (radio edit),westlife uptown girl,
33,Duran Duran,A View to a Kill,duran duran a view to a kill,duran duran a view to a kill,duran duran a view to a kill
34,Carly Simon,Nobody Does It Better,carly simon nobody does it better,carly simon nobody does it better,carly simon nobody does it better
35,Paul McCartney & Wings,Live and Let Die,paul mccartney & wings live and let die,wings live and let die,
38,Sheena Easton,For Your Eyes Only,sheena easton for your eyes only,sheena easton for your eyes only,sheena easton for your eyes only
63,Taylor Swift,All Too Well (Taylor's Version),taylor swift all too well (taylor's version),taylor swift all too well,
64,Taylor Swift,Red,taylor swift red,taylor swift red,taylor swift red


In [72]:
is_nan_df = df1[df1['matched_key_partial_ratio'].isna()]
is_nan_df.head(10)

Unnamed: 0,Artist,Song Name,key,matched_key_partial_ratio,matched_key_token_sort_ratio
0,Glass Animals,Youth,glass animals youth,,
1,AlunaGeorge,Mean What I Mean (feat. Leikeli47 & Dreezy),alunageorge mean what i mean (feat. leikeli47 ...,,
4,Christine and the Queens,Tilted,christine and the queens tilted,,
6,Whitney Houston,I Wanna Dance With Somebody (2000 Remaster),whitney houston i wanna dance with somebody (2...,,
8,Aqua,Barbie Girl,aqua barbie girl,,
9,"Christina Aguilera, Lil' Kim, Mýa & P!nk",Lady Marmalade,"christina aguilera, lil' kim, mýa & p!nk lady ...",,
10,S Club,Reach,s club reach,,s club 7 reach
11,Shania Twain,Man! I Feel Like a Woman!,shania twain man! i feel like a woman!,,
12,Shania Twain,Man! I Feel Like a Woman! (International Mix),shania twain man! i feel like a woman! (intern...,,
13,Lou Bega,Mambo No. 5 (a Little Bit of...),lou bega mambo no. 5 (a little bit of...),,


In [73]:
print(f"Is Nan: {df1['matched_key_token_sort_ratio'].isna().sum()}")
print(f"Not NaN: {df1['matched_key_token_sort_ratio'].notna().sum()}")

Is Nan: 4389
Not NaN: 834


In [79]:
not_nan_token_ratio_df = df1[df1['matched_key_token_sort_ratio'].isna()]
not_nan_token_ratio_df.head(10)

Unnamed: 0,Artist,Song Name,key,matched_key_partial_ratio,matched_key_token_sort_ratio
0,Glass Animals,Youth,glass animals youth,,
1,AlunaGeorge,Mean What I Mean (feat. Leikeli47 & Dreezy),alunageorge mean what i mean (feat. leikeli47 ...,,
4,Christine and the Queens,Tilted,christine and the queens tilted,,
6,Whitney Houston,I Wanna Dance With Somebody (2000 Remaster),whitney houston i wanna dance with somebody (2...,,
7,Backstreet Boys,Everybody (Backstreet's Back) [Extended Version],backstreet boys everybody (backstreet's back) ...,backstreet boys everybody (backstreets back),
8,Aqua,Barbie Girl,aqua barbie girl,,
9,"Christina Aguilera, Lil' Kim, Mýa & P!nk",Lady Marmalade,"christina aguilera, lil' kim, mýa & p!nk lady ...",,
11,Shania Twain,Man! I Feel Like a Woman!,shania twain man! i feel like a woman!,,
12,Shania Twain,Man! I Feel Like a Woman! (International Mix),shania twain man! i feel like a woman! (intern...,,
13,Lou Bega,Mambo No. 5 (a Little Bit of...),lou bega mambo no. 5 (a little bit of...),,


In [80]:
df1["matched_key"] = df1["matched_key_partial_ratio"].combine_first(df1["matched_key_token_sort_ratio"])

In [84]:
df1.drop(columns=['matched_key_partial_ratio', 'matched_key_token_sort_ratio'], inplace=True)

In [86]:
print(f"Is Nan: {df1['matched_key'].isna().sum()}")
print(f"Not NaN: {df1['matched_key'].notna().sum()}")

Is Nan: 4214
Not NaN: 1009


In [87]:
df1.head(10)

Unnamed: 0,Artist,Song Name,key,matched_key
0,Glass Animals,Youth,glass animals youth,
1,AlunaGeorge,Mean What I Mean (feat. Leikeli47 & Dreezy),alunageorge mean what i mean (feat. leikeli47 ...,
4,Christine and the Queens,Tilted,christine and the queens tilted,
6,Whitney Houston,I Wanna Dance With Somebody (2000 Remaster),whitney houston i wanna dance with somebody (2...,
7,Backstreet Boys,Everybody (Backstreet's Back) [Extended Version],backstreet boys everybody (backstreet's back) ...,backstreet boys everybody (backstreets back)
8,Aqua,Barbie Girl,aqua barbie girl,
9,"Christina Aguilera, Lil' Kim, Mýa & P!nk",Lady Marmalade,"christina aguilera, lil' kim, mýa & p!nk lady ...",
10,S Club,Reach,s club reach,s club 7 reach
11,Shania Twain,Man! I Feel Like a Woman!,shania twain man! i feel like a woman!,
12,Shania Twain,Man! I Feel Like a Woman! (International Mix),shania twain man! i feel like a woman! (intern...,


### Some Last FM calls

In [None]:
artist_info = get_artist_info("Alicia Keys")

In [None]:
# artist_info["topalbums"]
[album["name"] for album in artist_info["topalbums"]["album"]]

In [None]:
the_album = get_album("Alicia Keys", "Plentiful (Originals) [feat. Pusha T]")
print(the_album)

In [None]:
proper_album = get_album_info("Alicia Keys", "Keys II")

In [None]:
# proper_album["album"]["tracks"]
[track["name"] for track in proper_album["album"]["tracks"]["track"]]

In [None]:
the_album

In [None]:
filtered_tracks_df['Album'] = filtered_tracks_df.apply(lambda x: get_album(x['Artist'], x['Song Name']), axis=1)

In [None]:
filtered_tracks_df.to_csv("filtered_tracks.csv")

In [None]:
filtered_library_df = library_tracks.filter(['Artist', 'Title', 'Album'])
filtered_library_df = filtered_library_df.drop_duplicates()
filtered_library_df.head()

In [None]:
filtered_library_df.describe()

### Fuzzy matching to combine 3 Artist - Album - Song dataframes

In [None]:
from thefuzz import fuzz
from thefuzz import process

In [None]:
# name = "backstreet boys everybody (backstreets back)"
# full_name = "backstreet boys everybody (backstreet's back) [extended version]"

# name = "whitney houston i wanna dance with somebody (2000 remaster)"
# full_name = "whitney houston on my own"

name = "s club reach"
full_name = "s club 7 reach"

print(f"Ratio Similarity score: {fuzz.ratio(name, full_name)}")
print(f"Partial Ratio Similarity score: {fuzz.partial_ratio(name, full_name)}")
print(f"Token Sort Ratio Similarity score: {fuzz.token_sort_ratio(name, full_name)}")
print(f"Token Set Ratio Similarity score: {fuzz.token_set_ratio(name, full_name)}")

In [None]:
def normalize(text):
    if isinstance(text, str):
        return text.lower().strip()
    else:
        return str(text).lower().strip()

In [None]:
# data3 = {'album': ['Album A', 'Album B', 'Album C'],
#          'song': ['Song 1', 'Song 2', 'Song 3']}
# df1 = pd.DataFrame(data3)
df1 = album_song_df

# data4 = {'album': ['Album A (Remastered)', 'Album B (Deluxe)', 'Album C'],
#          'song': ['Song 1 (Radio Edit)', 'Song 2', 'Song 3 (Live)'],
#          'artist': ['Artist X', 'Artist Y', 'Artist Z']}
# df2 = pd.DataFrame(data4)
df2 = artist_album_song_df

In [None]:
df1['key'] = df1['Album Name'] + ' ' + df1['Song Name']
df2['key'] = df2['Album'] + ' ' + df2['Title']

In [None]:
df1['key'] = df1['key'].apply(normalize)
df2['key'] = df2['key'].apply(normalize)

In [None]:
def get_fuzzy_artist(name, choices, scorer, cutoff):
    match = process.extractOne(name, choices, scorer=scorer, score_cutoff=cutoff)
    if match:
        return match[0]
    else:
        return None

In [None]:
df1['matched_key'] = df1['key'].apply(
    lambda x: get_fuzzy_artist(x, df2['key'], scorer=fuzz.token_set_ratio, cutoff=90)
)

In [None]:
df1

In [None]:
df1 = df1.merge(df2[['key', 'artist']], left_on='matched_key', right_on='key', how='left')
df1 = df1.drop(columns=['matched_key', 'key_y']).rename(columns={'key_x': 'key'})

In [None]:
df1

# Function to find info

In [None]:
from collections import defaultdict

In [None]:
artist_dict = defaultdict(list)

def find_artist_in_library(album: str, song: str):
    artist = None
    artists_from_track = list(filtered_tracks_df[filtered_tracks_df['Song Name'] == song]["Artist"].unique())
    artists_from_album = list(filtered_library_df[filtered_library_df['Album'] == album]["Artist"].unique())
    artists_from_non_exact_song_match = list(filtered_tracks_df[filtered_tracks_df.map(lambda x: song.lower() in x.lower() if isinstance(x, str) else False)["Song Name"]]["Artist"].unique())
    potential_artist = list(set(artists_from_track).intersection(artists_from_album))
    if len(artists_from_track) == 1:
        artist_dict[artists_from_track[0]].append(album)
        artist = artists_from_track[0]
    elif len(potential_artist) == 1:
        artist = potential_artist[0]
    elif not artist:
        for a in artists_from_track:
            try:
                tmp_album_list = artist_dict[a]
                if album in tmp_album_list:
                    artist = a
                    break
            except KeyError:
                pass
    elif len(artists_from_non_exact_song_match) == 1:
            artist = artists_from_non_exact_song_match[0]
    else:
        artist = "I CANNOT FIND THE ARTIST!!!"
        # headers = {"user-agent": "MyOwnPlaylistRecommender"}
        # payload = {
        #     "api_key": API_KEY,
        #     "method": "artist.getInfo",
        #     "format": "json",
        # }
        # response = requests.get(f'http://ws.audioscrobbler.com/2.0/?method=track.getInfo&track={song}&artist={artist}&api_key={API_KEY}&format=json')
        # data = response.json()
    return artist

def find_artist_from_track_only(song: str):
    return filtered_tracks_df[filtered_tracks_df['Song Name'] == song]["Artist"].to_list()

def find_artist_from_album_only(album: str):
    return list(filtered_library_df[filtered_library_df['Album'] == album]["Artist"].unique())

In [None]:
def non_exact_track_names(song: str):
    artist = pd.NA
    list_of_artists = filtered_tracks_df[filtered_tracks_df.map(lambda x: song.lower() in x.lower() if isinstance(x, str) else False)["Song Name"]]["Artist"].to_list()
    if list_of_artists:
        artist = list_of_artists[0]
    return artist

In [None]:
df['Artist from track'] = df.apply(lambda x: find_artist_from_track_only(x["Song Name"]), axis=1)
df['Artist from non exact matches'] = df.apply(lambda x: non_exact_track_names(x["Song Name"]), axis=1)
df['Artist'] = df.apply(lambda x: find_artist_in_library(x["Album Name"], x["Song Name"]), axis=1)

In [None]:
small_df = df[['Song Name', 'Album Name', 'Artist from track', 'Artist from non exact matches', 'Artist']]

In [None]:
small_df.describe()

In [None]:
small_df.head(5)

In [None]:
small_df["Final Artist"] = small_df["Artist"].fillna(small_df["Artist from non exact matches"])

In [None]:
no_artists = small_df["Final Artist"] == "I CANNOT FIND THE ARTIST!!!"

In [None]:
small_df[no_artists]

In [None]:
small_df['Genre'] = small_df["Final Artist"].apply(get_genre)

In [None]:
small_df.head(50)

### Merge dataframes

In [None]:
library_rename = {"Title": "Song Name", "Album": "Album Name"}
library_tracks.rename(columns=library_rename, inplace=True)
library_tracks.head()

In [None]:
library_tracks.describe()

In [None]:
# merge play activity df and library tracks directly
# new_merged_df = df.merge(library_tracks, left_on=['Song Name', 'Album Name'], right_on=['Song Name', 'Album Name'])
new_merged_df = pd.merge(df, library_tracks, on=['Song Name', 'Album Name'], how='left')

In [None]:
small_df = new_merged_df[['Song Name', 'Album Name', 'Artist']]

In [None]:
small_df.describe()

In [None]:
small_df.head(50)

In [None]:
small_df[small_df['Song Name'] == "Science Fiction Double Feature"]

In [None]:
another_merged_df = pd.merge(small_df, library_tracks, on=['Song Name', 'Album Name'], how='left')

In [None]:
library_tracks[library_tracks['Song Name'] == "Watch Me as I Fall"][["Artist", "Song Name","Track Play Count", "Track Identifier", "Audio Matched Track Identifier", "Apple Music Track Identifier", "Tag Matched Track Identifier"]]

#### Convert genre to list

In [None]:
new_merged_df.describe()

In [None]:
new_merged_df["Genre"]

In [None]:
new_merged_df["Genre"] = new_merged_df["Genre"].apply(lambda x: [x])

In [None]:
new_merged_df["Genre"]

In [None]:
# Display the merged DataFrame
new_merged_df.head()

In [None]:
new_merged_df.columns

In [None]:
end_reason_dict = {
    "EXITED_APPLICATION": "logout",
    "FAILED_TO_LOAD": "track_error",
    "MANUALLY_SELECTED_PLAYBACK_OF_A_DIFF_ITEM": "selected_diff_item",
    "NATURAL_END_OF_TRACK": "track_done",
    "NOT_APPLICABLE": "unknown",
    "OTHER": "uknown",
    "PLAYBACK_MANUALLY_PAUSED": "pause",
    "PLAYBACK_SUSPENDED": "suspended",
    "SCRUB_BEGIN": "scrub_begin",
    "SCRUB_END": "scrub_end",
    "TRACK_SKIPPED_BACKWARDS": "back_button",
    "TRACK_SKIPPED_FORWARDS": "forward_button",
    pd.NA: "unknown"
}

In [None]:
shuffle_dict = {
    "SHUFFLE_ON": "On",
    "SHUFFLE_OFF": "Off",
    "SHUFFLE_UNKNOWN": "Unknown"
}

In [None]:
from collections import defaultdict

def constant_factory(value):
    return lambda: value

country_dict = defaultdict(constant_factory("Unknown"))
country_list = [
    ("GB", "United Kingdom"),
    ("AL", "Albania"),
    ("ES", "Spain"),
    ("IE", "Ireland"),
    ("US", "United States")
]
for k, v in country_list:
    country_dict[k] = v

In [None]:
country_dict

In [None]:
new_merged_df.replace({"IP Country Code": pd.NA}, "unknown", inplace=True)

In [None]:
new_merged_df["IP Country Code"]

In [None]:
new_merged_df.iloc[2][["Album Name", "Song Name", "IP Country Code"]]

In [None]:
new_merged_df["Datetime"] = pd.to_datetime(new_merged_df["Event Start Timestamp"], format='mixed')
new_merged_df["Day name"] = new_merged_df["Datetime"].dt.day_name()
new_merged_df["Day number"] = new_merged_df["Datetime"].dt.day
new_merged_df["Month number"] = new_merged_df["Datetime"].dt.month
new_merged_df["Year"] = new_merged_df["Datetime"].dt.year
new_merged_df["Hour"] = new_merged_df["Datetime"].dt.hour
new_merged_df["Song and Artist name"] = new_merged_df["Song Name"] + " | " + new_merged_df["Artist"]
new_merged_df["Platform"] = new_merged_df["Device OS Name"] + " | " + new_merged_df["Device Type"]+ " | " + new_merged_df["Device OS Version"]
new_merged_df["Milliseconds played"] = new_merged_df["Play Duration Milliseconds"]
new_merged_df.replace({"End Reason Type": end_reason_dict}, inplace=True)
new_merged_df.replace({"Shuffle Play": shuffle_dict}, inplace=True)
new_merged_df.replace({"IP Country Code": country_dict}, inplace=True)
new_merged_df["Latitude"] = new_merged_df["IP Latitude"]
new_merged_df["Longitude"] = new_merged_df["IP Longitude"]

In [None]:
new_merged_df.columns

In [None]:
new_merged_df["End Reason Type"].head(10)

In [None]:
rename_columns = {
    "Album Name": "Album name",
    "Song Name": "Song name",
    "End Reason Type": "End reason",
    "Shuffle Play": "Shuffle",
    "IP Country Code": "Country"
}

In [None]:
columns_to_keep = [
    "Datetime",
    "Day name",
    "Day number",
    "Month number",
    "Year",
    "Hour",
    "Artist",
    "Album name",
    "Song name",
    "Song and Artist name",
    "Genre",
    "Platform",
    "Milliseconds played",
    "End reason",
    "Shuffle",
    "Country",
    "Latitude",
    "Longitude"
]

In [None]:
apple_df = new_merged_df.rename(columns=rename_columns)[columns_to_keep]
apple_df.head()

In [None]:
apple_df["Country"]