# Setup

In [None]:
# Installing required packages
!pip install pyspark
!pip install findspark
!pip install pyarrow==1.0.0
#!pip install pandas
#!pip install numpy==1.19.5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=030e385361dec39bf421a289b18a4d2000cf9b586062c5dd6aeba4ca1dafc967
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 k

In [None]:
# Initialise findspark before the pyspark imports in the next cell.
# NOTE(review): presumably this puts the Spark installation on sys.path; confirm
# whether it is still required when pyspark is installed via pip.
import findspark
findspark.init()

In [None]:
import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

## Spark Session

In [None]:
# Create (or reuse) the Spark context.
# `getOrCreate()` keeps this cell re-runnable: calling `SparkContext()` a second
# time in the same kernel raises because only one context may be active.
sc = SparkContext.getOrCreate()

# Create (or reuse) the Spark session used by every cell below.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting
# rather than a real Spark option — confirm whether it can be dropped.
spark = (
    SparkSession.builder
    .appName("Python Spark DataFrames basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [None]:
# Display the session object to confirm it was created
spark

## Load Part 1

In [None]:
# Read the previously cleaned CSV into a pandas DataFrame using `read_csv`.
# NOTE(review): the preview below shows leftover "Unnamed: 0" / "Unnamed: 0.1"
# columns, which look like saved index columns — consider dropping them or
# reading with `index_col=0` after confirming nothing depends on them.
music = pd.read_csv('updated_music_streaming.csv')

In [None]:
# Preview the first few records to sanity-check the load
music.head()

Unnamed: 0.1,Unnamed: 0,Artist,Track,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration,time_signature,Genre,duration_in min
0,0,Bruno Mars,That's What I Like (feat. Gucci Mane),60.0,85.4745,56.34718,1.0,-4.964,1,2.788204,1.716867,0.423093,7.387916,90.979782,134.071,3.909933,4,5,3.909933
1,1,Boston,Hitch a Ride,54.0,34.689047,81.377467,3.0,-7.23,1,1.941019,0.110442,0.40251,9.017306,56.765163,116.454,4.19555,4,10,4.19555
2,2,The Raincoats,No Side to Fall In,35.0,40.284054,61.353237,6.0,-8.334,1,3.217158,48.795181,0.019578,38.670175,79.367548,147.681,1.827783,4,6,1.827783
3,3,Deno,Lingo (feat. J.I & Chunkz),66.0,85.366903,59.651178,10.0,-6.528,0,3.538874,2.128514,0.423093,11.142597,56.765163,107.033,2.899467,4,5,2.899467
4,4,Red Hot Chili Peppers,Nobody Weird Like Me - Remastered,53.0,11.555842,97.496971,2.0,-4.279,1,20.75067,0.016968,1.616367,16.202813,7.288751,199.06,3.832667,4,10,3.832667


Please note: I have already saved the cleaned and feature-engineered CSV file from the SparkSQL notebook, so those steps are not repeated here.

# Loading Part 2 (Loading Data Into Spark DataFrame)

In [None]:
# Convert the pandas DataFrame into a Spark DataFrame; all tasks below query `sdf`
sdf = spark.createDataFrame(music)

In [None]:
# Inspect the column names and types of the Spark DataFrame
sdf.printSchema()

root
 |-- Unnamed: 0: long (nullable = true)
 |-- Artist: string (nullable = true)
 |-- Track: string (nullable = true)
 |-- Popularity: double (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: double (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: long (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- duration: double (nullable = true)
 |-- time_signature: long (nullable = true)
 |-- Genre: long (nullable = true)
 |-- duration_in min: double (nullable = true)



In [None]:
# Register the DataFrame as the temporary SQL view "songs".
# `createOrReplaceTempView` keeps the cell re-runnable; `createTempView` raises
# if a view with this name already exists in the session.
sdf.createOrReplaceTempView("songs")

# Tasks

## b) Remove any songs that exceed 5 minutes

In [None]:
# Keep songs of at most 5 minutes (`duration` holds minutes, matching the
# `duration_in min` column in the preview). The task removes songs that
# *exceed* 5 minutes, so a song of exactly 5 minutes must be kept: `<=`, not `<`.
sdf.filter(sdf['duration'] <= 5).show(5)

+----------+--------------------+--------------------+----------+------------------+-----------------+----+--------+----+------------------+------------------+------------------+------------------+-----------------+-------+------------------+--------------+-----+------------------+
|Unnamed: 0|              Artist|               Track|Popularity|      danceability|           energy| key|loudness|mode|       speechiness|      acousticness|  instrumentalness|          liveness|          valence|  tempo|          duration|time_signature|Genre|   duration_in min|
+----------+--------------------+--------------------+----------+------------------+-----------------+----+--------+----+------------------+------------------+------------------+------------------+-----------------+-------+------------------+--------------+-----+------------------+
|         0|          Bruno Mars|That's What I Lik...|      60.0|  85.4744996772111|56.34718008790636| 1.0|  -4.964|   1|2.7882037533512065| 1.71686746

## c) Display songs by J. Cole, Novo Amor and Anson Seabra


In [None]:
# Show up to 10 songs whose artist is one of the three requested names
sdf.where(sdf.Artist.isin(["J. Cole", "Novo Amor", "Anson Seabra"])).show(10)

+----------+------------+--------------------+----------+-----------------+-----------------+----+--------+----+------------------+------------------+------------------+------------------+------------------+-------+------------------+--------------+-----+------------------+
|Unnamed: 0|      Artist|               Track|Popularity|     danceability|           energy| key|loudness|mode|       speechiness|      acousticness|  instrumentalness|          liveness|           valence|  tempo|          duration|time_signature|Genre|   duration_in min|
+----------+------------+--------------------+----------+-----------------+-----------------+----+--------+----+------------------+------------------+------------------+------------------+------------------+-------+------------------+--------------+-----+------------------+
|      1525|     J. Cole|p u n c h i n ‚Äò...|      81.0|76.32881428878846|73.96850188728361|11.0|  -6.579|   0| 38.12332439678284|19.779116465863453|0.4230927942698738|16.101

## d) How many songs are included in every category?


In [None]:
# Count the number of songs in each Genre code.
# No ordering is requested, so the row order of the output is not meaningful.
sdf.groupBy("Genre").count().show()

+-----+-----+
|Genre|count|
+-----+-----+
|    0|  586|
|    7|  465|
|    6| 2263|
|    9| 1828|
|    5| 1210|
|    1| 1268|
|   10| 4264|
|    3|  371|
|    8| 1704|
|    2| 1182|
|    4|  376|
+-----+-----+



## e) Which artists dominated the charts?


In [None]:
# Domination of the charts by an artist is determined by how often they appear,
# as well as how popular their songs are.
# In my opinion, the most popular artists are the most dominant in the charts,
# therefore the artists are ranked by their highest sum of popularity.
from pyspark.sql import functions as F
from pyspark.sql.functions import desc

# Explicit aggregate expressions are used instead of a dict: a dict literal with
# the key 'Popularity' twice keeps only the last entry, so the count was
# silently dropped and only the sum was computed.
dominant_artist = (
    sdf.groupBy('Artist')
    .agg(
        F.count('Popularity'),  # how often the artist appears
        F.sum('Popularity'),    # total popularity across their songs
    )
    .orderBy(desc('sum(Popularity)'))
)

# Show separately so `dominant_artist` stays a DataFrame
# (`.show()` returns None).
dominant_artist.show(10)

+------------------+------------------+
|            Artist|   sum(Popularity)|
+------------------+------------------+
|    Britney Spears| 2637.241221979766|
|   Backstreet Boys|            2615.0|
|The Rolling Stones|1838.3294319910071|
|         Metallica|            1710.0|
|                U2|            1648.0|
|          Westlife|1553.7236659392977|
|             AC/DC|            1549.0|
|          Coldplay|            1441.0|
|       The Beatles|1409.6588639820143|
|           Nirvana|            1385.0|
+------------------+------------------+
only showing top 10 rows



## f) What songs would be considered for the “Billboard Top 10 Songs of the Year”? (mention their artists as well)

In [None]:
# The most popular songs overall are the candidates for the Billboard top 10.
# `.distinct()` removes exact duplicate (Track, Artist, Popularity) rows so the
# same song cannot occupy two of the ten slots.
sdf.select("Track", "Artist", "Popularity") \
    .distinct() \
    .orderBy(desc("Popularity")) \
    .show(10)

+--------------------+--------------------+----------+
|               Track|              Artist|Popularity|
+--------------------+--------------------+----------+
|MONTERO (Call Me ...|           Lil Nas X|     100.0|
|             Beggin'|            Måneskin|     100.0|
|            good 4 u|      Olivia Rodrigo|      99.0|
|Kiss Me More (fea...|            Doja Cat|      98.0|
|STAY (with Justin...|The Kid LAROI, Ju...|      97.0|
|Astronaut In The ...|         Masked Wolf|      97.0|
|          Bad Habits|          Ed Sheeran|      97.0|
|STAY (with Justin...|The Kid LAROI, Ju...|      97.0|
|              Butter|                 BTS|      96.0|
|             RAPSTAR|              Polo G|      96.0|
+--------------------+--------------------+----------+
only showing top 10 rows



## g) Recommend at least 5 songs that can be played at a party


In [None]:
# I think the main parameters for a party song are: Liveness, Loudness, Energy and Danceability. Valence is not considered because party songs
# don't necessarily have to be happy. Duration is irrelevant. Speechiness is not a factor because there are EDM songs without words that
# work great in parties. The same reasoning goes for Instrumentalness. Tempo could be considered (as high tempo is synonymous with high energy
# which is great for parties) however there exist low tempo party songs. I think the most important factors are that the song is lively, loud,
# energetic and danceable.
# Popularity could be a factor, however some unknown songs can be great party songs, so it is only used to order the results.

# I wanted to keep the values as high as possible while still giving at least 5 results

# Filter and sort on the full DataFrame first, then project the display columns:
# sorting by "Popularity" after it has been dropped by `select` only works
# because Spark resolves the missing column behind the scenes.
party_songs = (
    sdf.filter(
        (sdf["Liveness"] >= 85)
        & (sdf["Loudness"] >= -10)
        & (sdf["Energy"] >= 85)
        & (sdf["Danceability"] >= 60)
    )
    .orderBy("Popularity", ascending=False)
    .select("Track", "Artist", "Liveness", "Loudness", "Energy", "Danceability")
)

# Show separately so `party_songs` stays a DataFrame (`.show()` returns None).
party_songs.show()


+--------------------+----------+-----------------+--------+-----------------+-----------------+
|               Track|    Artist|         Liveness|Loudness|           Energy|     Danceability|
+--------------------+----------+-----------------+--------+-----------------+-----------------+
|I Like It, I Love It|Tim McGraw| 94.7373747596397|  -5.298|96.09527528309252| 62.7716806541855|
|         Got Me Good|     Ciara|89.17113652464326|  -4.465|95.99515413650516|69.87303636754895|
|     Good Times Roll|      GRiZ|97.26748304827446|  -5.445|87.38473552999129|66.42995480955456|
|Lovin' Is Everywhere| Engelwood|89.06993219309787|  -6.411|89.58740075491345|62.23369916074887|
|Stars - Live at S...|Simply Red|97.36868737981986|  -7.222|91.28946024689876|61.15773617387562|
|             Trigger| FEVER 333|88.56391053537091|  -4.125| 98.5983039477768|61.37292877125027|
+--------------------+----------+-----------------+--------+-----------------+-----------------+

