In [42]:
import pyspark
from pyspark.sql import types
from pyspark.sql import SparkSession


In [3]:
# Show where the imported pyspark package lives on disk (presumably a sanity
# check of which Spark installation this kernel picked up).
# NOTE(review): the rendered output exposes a local home-directory path;
# consider clearing it before sharing the notebook.
pyspark.__file__

'/home/febridev/spark/spark-3.0.3-bin-hadoop3.2/python/pyspark/__init__.py'

In [43]:
# Build the notebook's SparkSession in local mode under the application name
# 'test'; getOrCreate() hands back an already-active session when one exists.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [9]:
# Load the raw artists dataset from the data lake. Parquet files carry their
# own column names and types, so the CSV-only "header" option is not needed.
df_artists = spark.read.parquet("gs://dtc_data_lake_applied-mystery-341809/raw/artists.parquet")

In [10]:
# Print the column names, types and nullability of the raw artists DataFrame.
df_artists.printSchema()

root
 |-- id: string (nullable = true)
 |-- followers: double (nullable = true)
 |-- genres: string (nullable = true)
 |-- name: string (nullable = true)
 |-- popularity: long (nullable = true)



In [14]:
# Display the schema as a StructType object (presumably used as a template
# for the hand-written schema_artists definition that follows).
df_artists.schema

StructType(List(StructField(id,StringType,true),StructField(followers,DoubleType,true),StructField(genres,StringType,true),StructField(name,StringType,true),StructField(popularity,LongType,true)))

In [18]:
# Explicit schema for the artists data. Only id, followers and name are
# declared; the raw file's other columns (genres, popularity) are left out.
# Every field is nullable.
schema_artists = types.StructType([
    types.StructField(column, dtype, True)
    for column, dtype in (
        ('id', types.StringType()),
        ('followers', types.DoubleType()),
        ('name', types.StringType()),
    )
])

In [57]:
# Re-read the raw artists file with the explicit schema, so the DataFrame
# contains only the columns declared in schema_artists. The CSV-only "header"
# option is dropped because it has no effect on a Parquet source.
df_artists = (
    spark.read
    .schema(schema_artists)
    .parquet("gs://dtc_data_lake_applied-mystery-341809/raw/artists.parquet")
)

In [58]:
# Redistribute the artists rows into 8 partitions ahead of the write in the
# next cell; df_artists is rebound to the repartitioned DataFrame.
df_artists = df_artists.repartition(8)

In [59]:
# Persist the trimmed artists data as Parquet under transform/artists,
# overwriting whatever an earlier run left at that location.
(
    df_artists.write
    .mode('overwrite')
    .parquet("gs://dtc_data_lake_applied-mystery-341809/transform/artists")
)

In [60]:
# Read the transformed artists dataset back from the data lake. No reader
# options are needed: "header" is a CSV option and Parquet ignores it.
df_artists = spark.read.parquet("gs://dtc_data_lake_applied-mystery-341809/transform/artists")

# Preview the first 10 rows of the reloaded data.
df_artists.show(10)

+--------------------+---------+--------------------+
|                  id|followers|                name|
+--------------------+---------+--------------------+
|59qz10hjQPAs0spos...|     18.0|    Boys of the Band|
|0LcrfJ63GdIW4n2UZ...|      0.0|David Mackersie e...|
|18lEjlk2JBPdyObT3...|    785.0|Ministerio Doble ...|
|1UhZGGHbPPHAHt2ce...|    214.0|  Andre Tschaskowski|
|5D95DPWHohrzVbdub...|  10557.0|       Advent Sorrow|
|11CbG4ImkEw99aUng...|   3614.0|                  MX|
|61MH29rMIyOfuK7KX...|  27878.0| The Vintage Caravan|
|5lInFfKjIAVzBIOg4...|    445.0|     Charles Grigsby|
|3kJakFcxRwLW9f47x...|      4.0|        Willy Rustad|
|508weSx4HBumrGggF...|   1110.0|              Hunxho|
+--------------------+---------+--------------------+
only showing top 10 rows



In [61]:
# Load the raw tracks dataset from the data lake. Parquet files carry their
# own column names and types, so the CSV-only "header" option is not needed.
df_tracks = spark.read.parquet("gs://dtc_data_lake_applied-mystery-341809/raw/tracks.parquet")

In [46]:
# Preview the first 10 rows of the tracks DataFrame.
df_tracks.show(10)

+--------------------+--------------------+----------+-----------+--------+-------------------+--------------------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+
|                  id|                name|popularity|duration_ms|explicit|            artists|          id_artists|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|
+--------------------+--------------------+----------+-----------+--------+-------------------+--------------------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+
|35iwgR4jXetI318WE...|               Carve|         6|     126903|       0|            ['Uli']|['45tIt06XoI0Iio4...|  1922-02-22|       0.645| 0.445|  0| -13.338|   1|      0.451|       0.674|           0.744|   0.151|  0.127|104.851|             3|


In [47]:
# Print the column names, types and nullability of the tracks DataFrame.
df_tracks.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- popularity: long (nullable = true)
 |-- duration_ms: long (nullable = true)
 |-- explicit: long (nullable = true)
 |-- artists: string (nullable = true)
 |-- id_artists: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: long (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: long (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: long (nullable = true)



In [49]:
# Display the schema as a StructType object (presumably used as a template
# for the hand-written schema_tracks definition that follows).
df_tracks.schema

StructType(List(StructField(id,StringType,true),StructField(name,StringType,true),StructField(popularity,LongType,true),StructField(duration_ms,LongType,true),StructField(explicit,LongType,true),StructField(artists,StringType,true),StructField(id_artists,StringType,true),StructField(release_date,StringType,true),StructField(danceability,DoubleType,true),StructField(energy,DoubleType,true),StructField(key,LongType,true),StructField(loudness,DoubleType,true),StructField(mode,LongType,true),StructField(speechiness,DoubleType,true),StructField(acousticness,DoubleType,true),StructField(instrumentalness,DoubleType,true),StructField(liveness,DoubleType,true),StructField(valence,DoubleType,true),StructField(tempo,DoubleType,true),StructField(time_signature,LongType,true)))

In [62]:
# Explicit schema for the tracks data, mirroring the column order and types
# reported by df_tracks.printSchema() on the raw file. Every field is nullable.
schema_tracks = types.StructType([
    types.StructField('id',types.StringType(),True),
    types.StructField('name',types.StringType(),True),
    types.StructField('popularity',types.LongType(),True),
    types.StructField('duration_ms',types.LongType(),True),
    types.StructField('explicit',types.LongType(),True),
    types.StructField('artists',types.StringType(),True),
    types.StructField('id_artists',types.StringType(),True),
    types.StructField('release_date',types.StringType(),True),
    types.StructField('danceability',types.DoubleType(),True),
    types.StructField('energy',types.DoubleType(),True),
    types.StructField('key',types.LongType(),True),
    types.StructField('loudness',types.DoubleType(),True),
    types.StructField('mode',types.LongType(),True),
    types.StructField('speechiness',types.DoubleType(),True),
    types.StructField('acousticness',types.DoubleType(),True),
    types.StructField('instrumentalness',types.DoubleType(),True),
    types.StructField('liveness',types.DoubleType(),True),
    types.StructField('valence',types.DoubleType(),True),
    types.StructField('tempo',types.DoubleType(),True),
    # time_signature is stored as a long in the raw file (a small integer such
    # as 3 in the preview), not a timestamp, so it must be read as LongType.
    types.StructField('time_signature',types.LongType(),True)
])

In [64]:
# Re-read the raw tracks file with the explicit schema_tracks schema. The
# CSV-only "header" option is dropped because it has no effect on a Parquet
# source.
df_tracks = (
    spark.read
    .schema(schema_tracks)
    .parquet("gs://dtc_data_lake_applied-mystery-341809/raw/tracks.parquet")
)

In [65]:
# Redistribute the tracks rows into 8 partitions ahead of the write in the
# next cell; df_tracks is rebound to the repartitioned DataFrame.
df_tracks = df_tracks.repartition(8)

In [66]:
# Write the repartitioned tracks data as Parquet under transform/tracks,
# overwriting any existing output at that location.
df_tracks.write.parquet("gs://dtc_data_lake_applied-mystery-341809/transform/tracks",mode='overwrite')