**Data Collection**: 

This notebook is the first of five notebooks containing the central work for the project.

This notebook covers (1) gathering the data needed for the project and (2) filtering that data so that it can be used in the following notebooks.

In [None]:
# Mount Google Drive into the Colab runtime so that the project files
# under "/content/drive" can be read and written by the cells below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Data Gathering

In [None]:
# Requirements
!pip install spotipy # https://github.com/plamere/spotipy

In [None]:
# Imports
import os
from getpass import getpass

import pandas as pd
import spotipy
from tqdm.notebook import trange, tqdm

# Connect to Spotify
# Credentials must never be committed with the notebook. They are read from the
# environment variables SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET; if a variable
# is not set, the value is requested interactively (input is not echoed).
# NOTE(review): the client secret that was previously hardcoded here has been
# published with the notebook and should be revoked in the Spotify dashboard.
_spotify_client_id = os.environ.get("SPOTIPY_CLIENT_ID") or getpass("Spotify client ID: ")
_spotify_client_secret = os.environ.get("SPOTIPY_CLIENT_SECRET") or getpass("Spotify client secret: ")
auth_manager = spotipy.oauth2.SpotifyClientCredentials(client_id=_spotify_client_id, client_secret=_spotify_client_secret)
spotify = spotipy.Spotify(auth_manager=auth_manager)

# Get Dataset
# One column per genre; each cell holds a playlist link (see the extraction cell).
playlist_collection = pd.read_csv("/content/drive/My Drive/Spotify Song Classification/data/raw/SongPlaylists.csv")
#playlist_collection.head(20)


In [None]:
# RETURNED BY THE AUDIO_FEATURES ENDPOINT
# "id": "4JpKVNYnVcJ8tuMKjAj50A",   #       str   - Spotify ID of the song - can be omitted
# "duration_ms": 535223,            #[0,inf]int   - duration of the song in ms
# "tempo": 123.99,                  #[0,inf]float - The overall estimated tempo of a track in beats per minute (BPM)
# "time_signature": 4               #[3,7]  int   - Estimated overall beats per bar (meter) of a track (from 3/4 to 7/4)
# "loudness": -12.733,              #[-60,0]float - averaged loudness of a song in decibels (dB) (typically between -60 and 0)
# "key": 7,                         #[0,11] int   - the key the track is in (0=C, 1=C#, 2=D, ...)
# "mode": 1,                        #[0,1]  int   - modality (major/minor) of the track (0=minor, 1=major)
# ---
# "danceability": 0.808,  	        #[0,1]  float - how suitable a track is for dancing
# "energy": 0.626,                  #[0,1]  float - a perceptual measure of intensity and activity
# "speechiness": 0.168,             #[0,1]  float - detects spoken words in a track (<0.33=music/songs | 0.33-0.66=music with spoken words like rap | >0.66=only spoken words like talkshow)
# "valence": 0.369,                 #[0,1]  float - a perceptual measure of positiveness
# ---
# "acousticness": 0.00187,          #[0,1]  float - confidence measure whether the track is acoustic
# "instrumentalness": 0.159,        #[0,1]  float - confidence measure whether the track has no vocals
# "liveness": 0.376,                #[0,1]  float - confidence measure whether the track was performed live


# RETURNED BY THE AUDIO-ANALYSIS ENDPOINT (we can use it to enrich our data)
# PER TRACK
# "tempo_confidence"                #[0,1]  float - confidence measure for tempo (frequent tempo changes and sounds without tempo (e.g. speech) lead to low confidence)
# "time_signature_confidence"       #[0,1]  float - confidence measure for time_signature (frequent changes in time_signature lead to low confidence)
# "key_confidence"                  #[0,1]  float - confidence measure for key (frequent changes in key lead to low confidence)
# "mode_confidence"                 #[0,1]  float - confidence measure for mode
# ---
# PER SEGMENT (need a way to generalize from Segments to Track without omitting the meaning)
# "pitch"                           #12x[0,1]     - for each tone (C=0,C#=1,..) how clear it is. The vector is normalized by the strongest dimension, so lot of high values means noisy sounds while single high values means clear sounds
# "timbre"                          #12x[-inf,inf]- complex notion also referred to as sound color, texture, or tone quality
# FEATURE ENGINEERING
# number of segments per minute     #[1,inf]      - segments divide a song into time frames "containing a roughly consistent sound". Many segments per minute may(!) indicate that the sound changes a lot


In [None]:

# Build one row per (playlist, track, genre) for every playlist listed in the CSV.
columns = ['playlist_id', 'track_id', 'genre']

playlist_counter = 0
track_error_counter = 0
playlist_error_counter = 0  # kept for compatibility; currently never incremented

print()

# Rows are collected as plain dicts and turned into a DataFrame once at the end.
# DataFrame.append() was removed in pandas 2.0 and growing a frame inside the
# loop is quadratic. As a side effect the resulting index is a unique 0..N-1
# range instead of restarting at 0 for every playlist.
track_records = []

# Retrieve full list of tracks for all playlists found in CSV-File
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
for genre, genre_playlists in tqdm(playlist_collection.items(), desc="Extracting tracks from playlists", total=len(playlist_collection.columns), ncols="560px"):
  for _, playlist_link in tqdm(genre_playlists.items(), desc=genre, total=genre_playlists.count(), leave=False, ncols="560px"):
    # Only string cells are playlist links; anything else (presumably NaN
    # padding of shorter genre columns - TODO confirm) is skipped.
    if not isinstance(playlist_link, str):
      continue

    playlist_counter += 1
    # The playlist id is the last ":"-separated part of the link.
    playlist_id = playlist_link.split(":")[-1]

    # Now let's retrieve the full list of tracks for this playlist
    # (the API is paginated, so follow "next" until it is empty).
    response = spotify.playlist_items(playlist_id)
    tracks = response["items"]

    while response["next"]:
      response = spotify.next(response)
      tracks.extend(response["items"])

    for track in tracks:
      try:
        track_records.append({
            "track_id": track["track"]["id"],
            "playlist_id": playlist_id,
            "genre": genre
        })
      except TypeError:
        # Raised when the item or its "track" entry is not subscriptable
        # (e.g. None); such entries are only counted.
        track_error_counter += 1

df = pd.DataFrame(track_records, columns=columns)

# Keep an untouched copy so later cells can restart from the raw extraction.
original_df = df.copy()
print("EXTRACTION DONE")
print("From {} playlists, we correctly extracted {} tracks ({} tracks were empty and couldn't be extracted).".format(playlist_counter,len(df), track_error_counter))

In [None]:
# Work on a fresh copy of the raw extraction result.
df = original_df.copy()

##### Understand Data
print("=== UNDERSTAND DATA ===")
original_length = len(df)

# Distinct (playlist, genre) pairs; a playlist id that shows up more than once
# here is listed under more than one genre.
playlistXgenre = df.drop(columns="track_id").drop_duplicates(["playlist_id", "genre"])
double_playlists = int(playlistXgenre.duplicated("playlist_id").sum())

# Row counts after progressively coarser de-duplication. The difference between
# two consecutive counts is the number of surplus rows of that category.
rows_fully_unique = len(df.drop_duplicates())
unique_per_track_genre = df.drop_duplicates(["track_id", "genre"])
rows_per_track_genre = len(unique_per_track_genre)
rows_per_track_playlist = len(unique_per_track_genre.drop_duplicates(["track_id", "playlist_id"]))
rows_per_track = len(df.drop_duplicates("track_id"))

non_duplicate_tracks = len(df.drop_duplicates("track_id", keep=False))
# Counts surplus rows (total rows minus distinct track ids).
duplicate_overall = original_length - rows_per_track
duplicate_in_playlist_genre = original_length - rows_fully_unique
duplicate_in_genre = rows_fully_unique - rows_per_track_genre
duplicate_in_playlist = rows_per_track_genre - rows_per_track_playlist
duplicate_diff = rows_per_track_playlist - rows_per_track

print("Total Number of Tracks: \t\t\t", original_length)
print("Number of Tracks that appear exactly once: \t", non_duplicate_tracks)
print("Number of Tracks that appear more than once: \t", duplicate_overall)

indent = "\t"
print(indent, "A) Tracks that are in the same playlist and genre multiple times: ", duplicate_in_playlist_genre)
print(indent, "B) Tracks that are in the same genre multiple times: ", duplicate_in_genre)
print(indent, "C) Tracks that are in the same playlist multiple times (due to {} playlists in more than one genre): {}".format(double_playlists, duplicate_in_playlist))
print(indent, "D) Tracks that are in different genre and playlists multiple times: ", duplicate_diff)

# Data Filtering

In [None]:
##### Filter Data
# Removes duplicate track rows in four stages (A-D) and reports how many rows
# each stage dropped. Expects `df` and `original_length` from the previous cell.
print("=== FILTERING ===")
# inf means "filter not applied", so the min() at the end ignores a disabled filter.
after_filterA, after_filterB, after_filterC, after_filterD = [float("inf")] * 4
print("Original Number of Tracks: \t", original_length)

#Filter A
# Exact duplicate rows (same track, playlist and genre): keep the first one.
df = df.drop_duplicates()
after_filterA = len(df)
print("Filter tracks of category A: \t {} \t\tdelete all occurences except one, as it's genre is correct".format(after_filterA-original_length))

#Filter B
# Same track within the same genre (different playlists): keep the first one.
df = df.drop_duplicates(["track_id", "genre"])
after_filterB = len(df)
print("Filter tracks of category B: \t {} \t\tdelete all occurences except one, as it's genre is correct".format(after_filterB-after_filterA))

# Toggle: changing the two `#"""` marker lines to `"""` turns filters C and D
# into a string literal and thereby disables them.
#"""
#Filter C
# Same track in the same playlist (after filter B the rows differ in genre):
# keep=False removes every occurrence.
df = df.drop_duplicates(["track_id","playlist_id"], keep=False)
after_filterC = len(df)
print("Filter tracks of category C: \t {} \t\tdelete all occurences as the genre is ambigous".format(after_filterC-after_filterB))

#Filter D
# Track id still present more than once: remove every occurrence.
df = df.drop_duplicates("track_id", keep=False)
after_filterD = len(df)
print("Filter tracks of category D: \t {} \t\tdelete all occurences as the genre is ambigous".format(after_filterD-after_filterC))
#"""

# The smallest recorded count belongs to the last filter that actually ran.
last_filter = min(after_filterA, after_filterB, after_filterC, after_filterD)
print("Tracks left: \t\t\t {} (with {} duplicates)".format(last_filter, len(df)-len(df.drop_duplicates("track_id"))))

In [None]:
# Download the Spotify audio features for every distinct track id in `df` and
# store them in `audio_features` (indexed by track_id, one column per field).
audio_features_error_counter = 0
fields = ['energy', 'liveness', 'tempo', 'speechiness', 'acousticness', 'instrumentalness', 'time_signature', 'danceability', 'key', 'duration_ms', 'loudness', 'valence', 'mode']

single_tracks = df.drop_duplicates("track_id")
feature_records = []

# Ids are requested in batches of 100 (see the Spotify audio-features API
# reference for the per-request limit).
batch_size = 100
# Ceiling division: the previous int(len/100)+1 issued one extra request with
# no ids whenever the number of tracks was a multiple of 100 (including 0).
batch_count = -(-len(single_tracks) // batch_size)

for i in trange(batch_count, desc="Get audio features of all tracks", unit="Batches", ncols="560px"):
  start = i * batch_size
  end = start + batch_size
  segment_ids = single_tracks["track_id"].iloc[start:end].tolist()
  track_features = spotify.audio_features(segment_ids)

  for entry in track_features:
    # Falsy entries (e.g. None) carry no features; they are only counted.
    if not entry:
      audio_features_error_counter += 1
      continue

    record = {field: entry[field] for field in fields}
    record["track_id"] = entry["id"]
    feature_records.append(record)

print("Concatinating Dataframes...\n")
# Build the frame once from all records; passing `columns` keeps the expected
# layout even when no features were returned (pd.concat([]) would raise).
audio_features = pd.DataFrame(feature_records, columns=fields + ["track_id"])
# verify_integrity makes a repeated track id fail loudly instead of silently
# producing a non-unique index.
audio_features = audio_features.set_index("track_id", verify_integrity=True)

print("From {} non-duplicate tracks, we correctly extracted {} audio_features ({} audio_features were empty and couldn't be extracted).".format(len(single_tracks), len(audio_features), audio_features_error_counter))

In [None]:
# Rows whose complete feature vector repeats an earlier row: same feature
# values stored under a different track id.
identical_feature_rows = int(audio_features.duplicated().sum())
print("{} identical tracks with different IDs identified in audio features dataframe.".format(identical_feature_rows))

In [None]:
# Attach the audio features to each remaining track and drop every row that
# contains a missing value afterwards.
df = df.join(audio_features, on="track_id").dropna()

# Filter identical tracks in audio features
original_length = len(df)
identical_tracks = int(df.duplicated(fields).sum())
keep_one = 0
delete_all = 0

# Toggle: changing the two `#"""` marker lines to `"""` disables this filter.
#"""
# Same feature vector and same genre: keep the first occurrence.
df = df.drop_duplicates(fields + ["genre"])
after_keep_one = len(df)
keep_one = original_length - after_keep_one

# Same feature vector still present more than once (now with differing genres):
# remove every occurrence.
df = df.drop_duplicates(fields, keep=False)
delete_all = after_keep_one - len(df)
#"""

for message, count in (
    ("{} audio features are found more than once indicating the identifcal track with different IDs.", identical_tracks),
    ("{} tracks were filtered due to same audio features but also the same genre (keep one occurence)", keep_one),
    ("{} tracks were filtered due to same audio features but different genres (keep no occurences).", delete_all),
):
  print(message.format(count))


In [None]:
# Report the final number of rows and write the enriched table to Drive.
track_count = len(df)
print("We will work with {} tracks enriched with audio features.".format(track_count))

output_path = "/content/drive/My Drive/Spotify Song Classification/data/raw/TrackAnalysis.csv"
df.to_csv(output_path)

In [None]:
# Values change over time as Spotify changes the playlists

# General | Run: 12.11.2020 - 21:58
#=== UNDERSTAND DATA ===
# Total Number of Tracks: 			                32766 (8 errors)
# Number of Tracks that appear exactly once: 	  23722
# Number of Tracks that appear more than once: 	 5447
#	   A) Tracks that are in the same playlist and genre multiple times:  3
#	   B) Tracks that are in the same genre multiple times:  4475
#	   C) Tracks that are in the same playlist multiple times (due to 5 playlists in more than one genre): 319
#	   D) Tracks that are in different genre and playlists multiple times:  650


#### TrackAnalysis.csv (Filter A, B, C, D | audio_features) - Run: 12.11.2020 - 21:58
#=== FILTERING ===
# Original Number of Tracks: 	                    32766
# Filter tracks of category A: 	                     -3 		delete all occurences except one, as it's genre is correct
# Filter tracks of category B: 	                  -4475 		delete all occurences except one, as it's genre is correct
# Filter tracks of category C: 	                   -638 		delete all occurences as the genre is ambigous
# Filter tracks of category D: 	                  -1288 		delete all occurences as the genre is ambigous
# Tracks left: 			                              26362 (with 0 duplicates)
#--
# Number of Audio Features gathered:              26359 (3 errors)
# Audiofeatures more than once in audiofeatures:    274 
# Audiofeatures more than once in df:               244
# Tracks filtered due to same audiofeatures:       -269 (same genre: -219 | different genre: -50)
#--
# Result of Data Collection                       26090 tracks



#### TrackAnalysis_unrestricted.csv (Filter A, B) - Run: 12.11.2020 - 21:04
#=== FILTERING ===
# Original Number of Tracks: 	                    32766
# Filter tracks of category A: 	                     -3 		delete all occurences except one, as it's genre is correct
# Filter tracks of category B: 	                  -4475 		delete all occurences except one, as it's genre is correct
# Tracks left: 			                              28288 (with 969 duplicates)
#--
# Number of Audio Features gathered:              26359 (3 errors)
# Audiofeatures more than once in audiofeatures:    244
# Audiofeatures more than once in df:              1243 
# Tracks filtered due to same audiofeatures:          0 (same genre: 0 | different genre: 0)
#--
# Result of Data Collection                       28285 tracks
