In [0]:
# OAuth (service-principal) settings passed to the ABFS driver when mounting.
# The client secret is read from a Databricks secret scope instead of being
# written into the notebook. SECRET_SCOPE / CLIENT_SECRET are placeholders for
# the scope and key names in your workspace, just like CLIENT_ID and TENANT-ID.
configs = {"fs.azure.account.auth.type": "OAuth",
"fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
"fs.azure.account.oauth2.client.id": "CLIENT_ID",
"fs.azure.account.oauth2.client.secret": dbutils.secrets.get(scope="SECRET_SCOPE", key="CLIENT_SECRET"),
"fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/TENANT-ID/oauth2/token"}

MOUNT_POINT = "/mnt/spotify"

# dbutils.fs.mount raises when the mount point is already in use, so mount only
# when it is absent. This keeps the cell safe to re-run.
if not any(mount.mountPoint == MOUNT_POINT for mount in dbutils.fs.mounts()):
    dbutils.fs.mount(
        source = "abfss://spotifydata@spotifydata02.dfs.core.windows.net", # container@storage-account
        mount_point = MOUNT_POINT,
        extra_configs = configs)

Out[2]: True

In [0]:
# List the top-level contents of the mounted container as a table.
display(dbutils.fs.ls("/mnt/spotify"))

path,name,size,modificationTime
dbfs:/mnt/spotify/raw-data/,raw-data/,0,1693827752000
dbfs:/mnt/spotify/transformed-data/,transformed-data/,0,1693840233000


In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *     

In [0]:
# Import files
RAW_DATA_DIR = '/mnt/spotify/raw-data'


def read_artist_csv(file_stem):
    """Read one artist's CSV file from the raw-data folder.

    Args:
        file_stem: File name without the ``.csv`` extension, e.g. ``'Drake'``.

    Returns:
        A Spark DataFrame read with the first line as the header and with
        column types inferred by Spark.
    """
    return spark.read.csv(f'{RAW_DATA_DIR}/{file_stem}.csv', header = True, inferSchema = True)


the_weeknd = read_artist_csv('The_Weeknd')
taylor_swift = read_artist_csv('Taylor_Swift')
drake = read_artist_csv('Drake')
billie_eilish = read_artist_csv('Billie_Eilish')
ed_sheeran = read_artist_csv('Ed_Sheeran')
ariana_grande = read_artist_csv('Ariana_Grande')
travis_scott = read_artist_csv('Travis_Scott')
# Variable name (sic) is kept because the union cell refers to it.
justin_beiber = read_artist_csv('Justin_Bieber')
dj_khaled = read_artist_csv('DJ_Khaled')
dua_lipa = read_artist_csv('Dua_Lipa')

## Combining the Datasets

In [0]:
# Stack the ten per-artist frames into one.
# Each file had its schema inferred independently, so columns are matched by
# name (unionByName) rather than by position (union). A file whose columns are
# ordered differently is aligned correctly, and one whose column names differ
# fails loudly instead of being silently misaligned.
artist_frames = [
    the_weeknd,
    taylor_swift,
    drake,
    billie_eilish,
    ed_sheeran,
    ariana_grande,
    travis_scott,
    justin_beiber,
    dj_khaled,
    dua_lipa,
]

combined = artist_frames[0]
for artist_frame in artist_frames[1:]:
    combined = combined.unionByName(artist_frame)

# Shuffle the rows so artists are interleaved. The seed fixes the random sort
# key, so a fresh run over the same input (and partitioning) yields the same order.
new_df = combined.orderBy(rand(seed=42))

In [0]:
# Preview the combined frame: first 20 rows, long cell values truncated (the show() defaults).
new_df.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+--------------------+-------------------+------------+---------------+--------------------+
|             TrackID|           TrackName|           AlbumName|          ArtistName|        ReleaseDate|Duration(ms)|PopularityIndex|                 URL|
+--------------------+--------------------+--------------------+--------------------+-------------------+------------+---------------+--------------------+
|12PNcnMsjsZ3eHm62...|All I Do Is Win (...|             Victory|           DJ Khaled|2010-03-02 00:00:00|      232506|             71|https://open.spot...|
|3ZCTVFBt2Brf31RLE...| everything i wanted| everything i wanted|       Billie Eilish|2019-11-13 00:00:00|      245425|             86|https://open.spot...|
|0HaRLPnr887lcQM2Y...|For My Hand (feat...|        Love, Damini|Burna Boy, Ed She...|2022-07-07 00:00:00|      159123|             79|https://open.spot...|
|3WxmlTZ85sCYFnuIX...|         party favor|    dont smile at me|

In [0]:
# Print the column names, inferred types and nullability of the combined frame.
new_df.printSchema()

root
 |-- TrackID: string (nullable = true)
 |-- TrackName: string (nullable = true)
 |-- AlbumName: string (nullable = true)
 |-- ArtistName: string (nullable = true)
 |-- ReleaseDate: timestamp (nullable = true)
 |-- Duration(ms): integer (nullable = true)
 |-- PopularityIndex: integer (nullable = true)
 |-- URL: string (nullable = true)



In [0]:
# Source column -> normalised column name.
# URL is intentionally absent: renaming it to itself has no effect.
column_renames = {
    'TrackID': 'trackID',
    'TrackName': 'trackname',
    'AlbumName': 'album',
    'ArtistName': 'artists',
    'ReleaseDate': 'releasedate',
    'Duration(ms)': 'duration',
    'PopularityIndex': 'popularity',
}

for old_name, new_name in column_renames.items():
    # withColumnRenamed returns the frame unchanged when old_name is not present,
    # so re-running this cell after the rename is harmless.
    new_df = new_df.withColumnRenamed(old_name, new_name)

In [0]:
# Preview the frame after the column renames: first 20 rows, long cell values truncated.
new_df.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+--------------------+-------------------+--------+----------+--------------------+
|             trackID|           trackname|               album|             artists|        releasedate|duration|popularity|                 URL|
+--------------------+--------------------+--------------------+--------------------+-------------------+--------+----------+--------------------+
|12PNcnMsjsZ3eHm62...|All I Do Is Win (...|             Victory|           DJ Khaled|2010-03-02 00:00:00|  232506|        71|https://open.spot...|
|3ZCTVFBt2Brf31RLE...| everything i wanted| everything i wanted|       Billie Eilish|2019-11-13 00:00:00|  245425|        86|https://open.spot...|
|0HaRLPnr887lcQM2Y...|For My Hand (feat...|        Love, Damini|Burna Boy, Ed She...|2022-07-07 00:00:00|  159123|        79|https://open.spot...|
|3WxmlTZ85sCYFnuIX...|         party favor|    dont smile at me|       Billie Eilish|2017-12-22 00:00:00|  204770|    

In [0]:
# Remove the URL column; the remaining columns are kept as they are.
new_df = new_df.drop('URL')

In [0]:
# Regex fragments for the punctuation to blank out: ( ) ? " [ ] ' /
characters_to_replace = ['\\(', '\\)', '\\?', '"', '\\[', '\\]', "'", '\\/']

# Join every fragment, plus the hyphen, into a single alternation so each column
# is rewritten once rather than once per character. The hyphen was previously
# replaced again on every loop iteration although it does not depend on the loop.
# Every match is still replaced by exactly one space, so the output is unchanged;
# runs of spaces are not collapsed and values are not trimmed.
unwanted_chars_pattern = "|".join(characters_to_replace + ["-"])

for column_name in ("trackname", "album"):
    new_df = new_df.withColumn(
        column_name,
        regexp_replace(col(column_name), unwanted_chars_pattern, " "),
    )

In [0]:
# Preview the frame after the punctuation clean-up: first 20 rows, long cell values truncated.
new_df.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+--------------------+-------------------+--------+----------+
|             trackID|           trackname|               album|             artists|        releasedate|duration|popularity|
+--------------------+--------------------+--------------------+--------------------+-------------------+--------+----------+
|12PNcnMsjsZ3eHm62...|All I Do Is Win  ...|             Victory|           DJ Khaled|2010-03-02 00:00:00|  232506|        71|
|3ZCTVFBt2Brf31RLE...| everything i wanted| everything i wanted|       Billie Eilish|2019-11-13 00:00:00|  245425|        86|
|0HaRLPnr887lcQM2Y...|For My Hand  feat...|        Love, Damini|Burna Boy, Ed She...|2022-07-07 00:00:00|  159123|        79|
|3WxmlTZ85sCYFnuIX...|         party favor|    dont smile at me|       Billie Eilish|2017-12-22 00:00:00|  204770|        70|
|3GYlZ7tbxLOxe6ewM...|                  TV|        Guitar Songs|       Billie Eilish|2022-07-21 00:00:00|  281380|    

In [0]:
# Convert the track length from milliseconds to seconds, rounded to two decimals.
# `round` is pyspark.sql.functions.round (brought in by the wildcard import), not
# Python's builtin, because it is applied to a Column expression.
# NOTE(review): the column is overwritten in place, so running this cell twice
# divides the already-converted values by 1000 again.
duration_seconds = round(col("duration") / 1000, 2)
new_df = new_df.withColumn("duration", duration_seconds)

In [0]:
# Target Spark type for every column of the final data set.
target_types = {
    "trackID": StringType(),
    "trackname": StringType(),
    "album": StringType(),
    "artists": StringType(),
    "releasedate": DateType(),     # timestamp -> date, drops the time part
    "duration": FloatType(),
    "popularity": IntegerType(),
}

# Replace each column with itself cast to the target type; replacing an existing
# column with withColumn keeps the column order.
for column_name, spark_type in target_types.items():
    new_df = new_df.withColumn(column_name, col(column_name).cast(spark_type))

In [0]:
# Preview the frame after the unit conversion and casts: first 20 rows, long cell values truncated.
new_df.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+--------------------+-----------+--------+----------+
|             trackID|           trackname|               album|             artists|releasedate|duration|popularity|
+--------------------+--------------------+--------------------+--------------------+-----------+--------+----------+
|12PNcnMsjsZ3eHm62...|All I Do Is Win  ...|             Victory|           DJ Khaled| 2010-03-02|  232.51|        71|
|3ZCTVFBt2Brf31RLE...| everything i wanted| everything i wanted|       Billie Eilish| 2019-11-13|  245.43|        86|
|0HaRLPnr887lcQM2Y...|For My Hand  feat...|        Love, Damini|Burna Boy, Ed She...| 2022-07-07|  159.12|        79|
|3WxmlTZ85sCYFnuIX...|         party favor|    dont smile at me|       Billie Eilish| 2017-12-22|  204.77|        70|
|3GYlZ7tbxLOxe6ewM...|                  TV|        Guitar Songs|       Billie Eilish| 2022-07-21|  281.38|        87|
|4XbOWk2QMKiAvMBmC...|   Out Here Grindin |           We

### Writing Data to ADLS Gen2

In [0]:
# Write the transformed data back to the mounted container as CSV with a header row.
# Repartitioning to one partition makes Spark emit a single part file in the
# output directory; "overwrite" replaces whatever is already at that path.
output_path = "/mnt/spotify/transformed-data/spotify_top_artists"
single_partition_df = new_df.repartition(1)
single_partition_df.write.mode("overwrite").option("header","true").csv(output_path)