In [1]:
import os
import pyspark
import math
from pyspark.sql import SparkSession
from pyspark.sql.functions import substring

In [2]:
import pandas as pd
import numpy as np
import datetime

In [3]:
def read_parquet_folder_as_pandas(path, verbosity=1):
  files = [f for f in os.listdir(path) if f.endswith("parquet")]

  if verbosity > 0:
    print("{} parquet files found. Beginning reading...".format(len(files)), end="")
    start = datetime.datetime.now()

  df_list = [pd.read_parquet(os.path.join(path, f)) for f in files]
  df = pd.concat(df_list, ignore_index=True)

  if verbosity > 0:
    end = datetime.datetime.now()
    print(" Finished. Took {}".format(end-start))
  return df

In [4]:
df2 = read_parquet_folder_as_pandas("1.2MtracksFeaturesParquet")

3 parquet files found. Beginning reading... Finished. Took 0:00:02.533867


In [5]:
df1 = read_parquet_folder_as_pandas("songDF.parquet")

96 parquet files found. Beginning reading... Finished. Took 0:00:05.034657


In [6]:
df1 = df1[["id","title"]]
df1.columns = [["id","name"]]

In [7]:
df1

Unnamed: 0,id,name
0,SOJOUCR12A58A7591A,we don't need ya
1,SOLVQGV12AC3DFA577,angels from the realms of glory
2,SOEFANT12AC9075290,exorcise this wasteland (remix)
3,SOEOFOU12A8C135526,quintet no. 5 in e major for two violins_ viol...
4,SOIHCRO12D02198F34,tip the scales
...,...,...
999995,SODMQEF12A8C139C86,h-o-p-p- why?
999996,SOASORC12AB0187499,con las alas rotas
999997,SOLPQYU12A6D4F7215,don't get around much anymore
999998,SOLJCPK12AB018899B,intro


In [8]:
df2 = df2[["id","name"]]

In [9]:
df2["name"] = df2["name"].map(lambda x: x.lower())

In [10]:
df2

Unnamed: 0,id,name
0,5EjHf83FVd53WFr8yfErUi,sittin' on top of the world
1,4QQlmPMDpOSB9DNFYzsDzU,am i born to die?
2,3DUFMqtynem6XMXTLkR3wf,i'm going home
3,4kRGPeHDulsh4hg3barZH3,never far away
4,0jVepxXjNtS5eldyPcmFK3,christmas time will soon be over
...,...,...
972473,4RQmq0NSmQsnkmruH7gUbT,jump jump
972474,7i5fOTOt0VQpXPoaG71ILH,goodnight and i wish
972475,4dp5Wv6ihYCW7PlTyZpEep,wayfaring stranger
972476,3wxPT46lvtnmggDJS03DIl,like a songbird that has fallen


In [11]:
joined = df1.set_index("name").join(df2.set_index("name"), how="inner", lsuffix='_1', rsuffix='_2')

In [12]:
joined

Unnamed: 0_level_0,"(id,)",id
name,Unnamed: 1_level_1,Unnamed: 2_level_1


In [18]:
(df1['name'] == "konoha").sum(axis=0)

name    0
dtype: int64