In [1]:
import os
import pyspark
import math
from pyspark.sql import SparkSession
from pyspark.sql.functions import substring

In [5]:
import pandas as pd
import numpy as np
import datetime

In [6]:
def read_parquet_folder_as_pandas(path, verbosity=1):
  """Read every parquet file in a folder into one pandas DataFrame.

  Args:
    path: Directory holding the parquet part files. Only direct children
      whose file name ends in "parquet" are read; subfolders are not searched.
    verbosity: When greater than 0, print the number of files found and the
      elapsed wall-clock time.

  Returns:
    A single DataFrame with the rows of all files concatenated in sorted
    file-name order and a fresh 0..n-1 index.

  Raises:
    ValueError: If the folder contains no matching file.
  """
  # os.listdir returns entries in arbitrary order; sorting makes the row order
  # (and therefore the regenerated index) identical on every run.
  files = sorted(f for f in os.listdir(path) if f.endswith("parquet"))

  if not files:
    # Without this guard pd.concat fails with an opaque
    # "No objects to concatenate" message that does not mention the folder.
    raise ValueError("No parquet files found in {!r}".format(path))

  if verbosity > 0:
    print("{} parquet files found. Beginning reading...".format(len(files)), end="")
    start = datetime.datetime.now()

  df_list = [pd.read_parquet(os.path.join(path, f)) for f in files]
  df = pd.concat(df_list, ignore_index=True)

  if verbosity > 0:
    end = datetime.datetime.now()
    print(" Finished. Took {}".format(end-start))
  return df

In [8]:
# Load every parquet part file under "spotifymillion.parquet" into one frame.
# This is the slow step: the recorded run read 334 files in about 2m42s.
df = read_parquet_folder_as_pandas("spotifymillion.parquet")

334 parquet files found. Beginning reading... Finished. Took 0:02:42.228440


In [9]:
# Load the second dataset from the "1.2MtracksFeaturesParquet" folder
# (3 part files, about 3 seconds in the recorded run).
df2 = read_parquet_folder_as_pandas("1.2MtracksFeaturesParquet")

3 parquet files found. Beginning reading... Finished. Took 0:00:03.136206


In [14]:
# Discard the playlist-level and artist-URI columns, then collapse the rows
# that became identical once those columns were removed.
df1 = (
    df
    .drop(columns=["playlist_name", "playlist_id", "artist_uri"])
    .drop_duplicates()
)

In [16]:
# The artist name is not needed for the comparison below; drop it as well.
df1 = df1.drop(columns="artist_name")

Unnamed: 0,track_name,track_uri
0,Home For A Rest,
1,Political,
2,(Putting Up With) The Joneses,
3,The Old Sod,
4,Homelands,
...,...,...
66345868,Chaotic Punishment (Bonus Track),spotify:track:0Ega3SDrGwR4FBxWO3PCRc
66345874,Bombs of Death,spotify:track:0QMzHVx4E8mTX2ptrkKyXF
66346380,Konoha,spotify:track:7kgFXzdLiKnDV4GytHe1sb
66346410,Up,spotify:track:1t9A2GMNIzrADt8QMgmLQI


In [26]:
# Drop rows with any missing value, then reduce each Spotify URI
# ("spotify:track:<id>") to its bare id.
# The prefix is removed with an anchored pattern instead of slicing off the
# first 14 characters, so re-running this cell leaves already-stripped ids
# untouched rather than chopping them a second time.
df1 = df1.dropna()
df1["track_uri"] = df1["track_uri"].str.replace(r"^spotify:track:", "", regex=True)

In [20]:
# Keep only the two columns needed for the comparison: track id and title.
df2 = df2.loc[:, ["id", "name"]]

Unnamed: 0,id,name
0,5EjHf83FVd53WFr8yfErUi,Sittin' On Top of The World
1,4QQlmPMDpOSB9DNFYzsDzU,Am I Born To Die?
2,3DUFMqtynem6XMXTLkR3wf,I'm Going Home
3,4kRGPeHDulsh4hg3barZH3,Never Far Away
4,0jVepxXjNtS5eldyPcmFK3,Christmas Time Will Soon Be Over
...,...,...
972473,4RQmq0NSmQsnkmruH7gUbT,Jump Jump
972474,7i5fOTOt0VQpXPoaG71ILH,Goodnight And I Wish
972475,4dp5Wv6ihYCW7PlTyZpEep,Wayfaring Stranger
972476,3wxPT46lvtnmggDJS03DIl,Like A Songbird That Has Fallen


In [21]:
# drop_duplicates returns a new frame and leaves the original untouched, so
# the result must be assigned back for the deduplication to take effect.
df2 = df2.drop_duplicates()
df2

Unnamed: 0,id,name
0,5EjHf83FVd53WFr8yfErUi,Sittin' On Top of The World
1,4QQlmPMDpOSB9DNFYzsDzU,Am I Born To Die?
2,3DUFMqtynem6XMXTLkR3wf,I'm Going Home
3,4kRGPeHDulsh4hg3barZH3,Never Far Away
4,0jVepxXjNtS5eldyPcmFK3,Christmas Time Will Soon Be Over
...,...,...
972473,4RQmq0NSmQsnkmruH7gUbT,Jump Jump
972474,7i5fOTOt0VQpXPoaG71ILH,Goodnight And I Wish
972475,4dp5Wv6ihYCW7PlTyZpEep,Wayfaring Stranger
972476,3wxPT46lvtnmggDJS03DIl,Like A Songbird That Has Fallen


In [28]:
# Relabel the two remaining columns positionally so they use the same
# "name" / "id" labels as df2.
df1 = df1.set_axis(["name", "id"], axis="columns")
df1

Unnamed: 0,name,id
15,Close To Me,4xiyq1iRdsxuU1BPUJ490Z
16,Friday I'm In Love,4QlzkaRHtU8gAdwqjWmO8n
17,Sometime To Return,4UTEaqefy5lDXf9ZACe4lU
18,Never Really Been,0iZGBAHycWtEQPESPJT95n
19,Obscurity Knocks,6Pbbz7hVhl3F8abGVDt06H
...,...,...
66345868,Chaotic Punishment (Bonus Track),0Ega3SDrGwR4FBxWO3PCRc
66345874,Bombs of Death,0QMzHVx4E8mTX2ptrkKyXF
66346380,Konoha,7kgFXzdLiKnDV4GytHe1sb
66346410,Up,1t9A2GMNIzrADt8QMgmLQI


In [34]:
# Inner-join the two track tables on id. Both carry a "name" column, so the
# suffixes keep them apart: name_1 comes from df1, name_2 from df2.
joined = df1.set_index("id").join(
    df2.set_index("id"),
    how="inner",
    lsuffix="_1",
    rsuffix="_2",
)

In [36]:
# Display the joined frame: one row per matching id, with the title from each
# source side by side (name_1, name_2).
joined

Unnamed: 0_level_0,name_1,name_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2EEinN4Zk8MUv4OQuLsTBj,Age Of Consent - 2015 Remastered Version,Age of Consent - 2015 Remaster
4tPHC7YsU3LInUYcIe2UIi,Crash,Crash
3uVCJvMFKmsj9P44vNHfx1,Grey Cell Green,Grey Cell Green
20dP2DaMHIAmwWAbp7peSr,pick up the phone,pick up the phone
0ADG9OgdVTL7fgREP75BrZ,Ain't My Fault,Ain't My Fault
...,...,...
3dVeByUPeuucOUPf4u8waf,Bar Room Beauty,Bar Room Beauty
4XqShMPgP8mqtMCN0U4cL6,Can't Say I'll Change,Can't Say I'll Change
06DhQuICVfYahs8dqvDDqu,I'll Be Your Rock (At Rock Bottom),I'll Be Your Rock (At Rock Bottom)
7J0dxYRp4k7KUOt85MmcDs,Podunk,Podunk


In [35]:
# Show only the ids whose title is not identical in the two sources.
joined.loc[joined["name_1"].ne(joined["name_2"])]

Unnamed: 0_level_0,name_1,name_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2EEinN4Zk8MUv4OQuLsTBj,Age Of Consent - 2015 Remastered Version,Age of Consent - 2015 Remaster
5uDASfU19gDxSjW8cnCaBp,Chill Bill,Chill Bill (feat. J. Davi$ & Spooks)
2rzBvHM9h36Tpdj7Jdajka,Wild for the Night,Wild for the Night (feat. Skrillex & Birdy Nam...
000xQL6tZNLJzIrtIgxqSl,Still Got Time,Still Got Time (feat. PARTYNEXTDOOR)
6pV8yBEHeu7A7lkttIjM2g,Waste Away (feat. Devon Baldwin),waste away (feat. Devon Baldwin)
...,...,...
2YmyyLojlTIhiF6NT6rgcA,"Deux Arabesques, L. 66: I. Andantino con moto ...","2 Arabesques, L. 66: No. 1 in E Major (Orchest..."
2mxIgrE8m3BvrtuBfGAABj,All I Need - Main Mix,All I Need (feat. Wale & Tawiah) - Main Mix
5scx9EiTkPtIpdsUzxxWvK,"Song of India (""Countless diamonds are hidden ...","""Song of India (""""Countless diamonds are hidde..."
6UildWDUbdBBHLgYJ24WPt,The Right Place To Fade,The Right Place to Fade


In [None]:
# NOTE(review): this cell has no execution count and repeats the earlier bare
# `joined` display; it looks like leftover scratch — confirm and remove.
joined