In [1]:
import os
import pyspark
import math
from pyspark.sql import SparkSession
from pyspark.sql.functions import substring

In [2]:
import pandas as pd
import numpy as np
import datetime

In [3]:
def read_parquet_folder_as_pandas(path, verbosity=1):
  files = [f for f in os.listdir(path) if f.endswith("parquet")]

  if verbosity > 0:
    print("{} parquet files found. Beginning reading...".format(len(files)), end="")
    start = datetime.datetime.now()

  df_list = [pd.read_parquet(os.path.join(path, f)) for f in files]
  df = pd.concat(df_list, ignore_index=True)

  if verbosity > 0:
    end = datetime.datetime.now()
    print(" Finished. Took {}".format(end-start))
  return df

In [4]:
df1 = read_parquet_folder_as_pandas("songDF.parquet")

96 parquet files found. Beginning reading... Finished. Took 0:00:01.843259


In [5]:
df2 = read_parquet_folder_as_pandas("spotifymillion.parquet")

334 parquet files found. Beginning reading... Finished. Took 0:01:15.005264


In [6]:
df1 = df1[["id","title"]]
df1.columns = [["id","name"]]

In [18]:
df1 = df1.drop_duplicates()

In [19]:
df1

Unnamed: 0,id,name
0,SOJOUCR12A58A7591A,we don't need ya
1,SOLVQGV12AC3DFA577,angels from the realms of glory
2,SOEFANT12AC9075290,exorcise this wasteland (remix)
3,SOEOFOU12A8C135526,quintet no. 5 in e major for two violins_ viol...
4,SOIHCRO12D02198F34,tip the scales
...,...,...
999995,SODMQEF12A8C139C86,h-o-p-p- why?
999996,SOASORC12AB0187499,con las alas rotas
999997,SOLPQYU12A6D4F7215,don't get around much anymore
999998,SOLJCPK12AB018899B,intro


In [9]:
df2 = df2[["track_uri","track_name"]]
df2["track_uri"] = df2["track_uri"].map(lambda x:x[14:])
df2["track_name"] = df2["track_name"].map(lambda x:x.lower())
df2.columns = ["id","name"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["track_uri"] = df2["track_uri"].map(lambda x:x[14:])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["track_name"] = df2["track_name"].map(lambda x:x.lower())


In [11]:
df2 = df2.drop_duplicates()

In [12]:
df2

Unnamed: 0,id,name
0,41hzhXfYJYVFWF21JrmkA9,home for a rest
1,4UD60np1TaWZMNczJ33aze,political
2,0pcRJgftExwW9JzDSBSrpp,(putting up with) the joneses
3,4AYdMGcnkfcrXqMoY20IVD,the old sod
4,7v8QabmUUr7yxfCN0U6xXV,homelands
...,...,...
66345868,0Ega3SDrGwR4FBxWO3PCRc,chaotic punishment (bonus track)
66345874,0QMzHVx4E8mTX2ptrkKyXF,bombs of death
66346380,7kgFXzdLiKnDV4GytHe1sb,konoha
66346410,1t9A2GMNIzrADt8QMgmLQI,up


In [20]:
joined = df1.set_index("name").join(df2.set_index("name"), how="inner", lsuffix='_1', rsuffix='_2')

In [21]:
joined

Unnamed: 0_level_0,"(id,)",id
name,Unnamed: 1_level_1,Unnamed: 2_level_1


In [25]:
df1[df1.name == "up"]

Unnamed: 0,id,name
0,,
1,,
2,,
3,,
4,,
...,...,...
999995,,
999996,,
999997,,
999998,,
