# Exploring data

In [9]:
from pyspark.sql import SparkSession
import os
import configparser
import boto3

## Load credentials

In [10]:
# Read the AWS key pair from the [default] section of the local dl.cfg file
# and export it through environment variables.
# NOTE(review): presumably the s3a:// reads further down pick these
# variables up for authentication -- confirm.
config = configparser.ConfigParser()
# Use a context manager so the file handle is closed as soon as parsing is
# done; a bare open() would leave it open for the kernel's lifetime.
with open('dl.cfg') as cfg_file:
    config.read_file(cfg_file)

os.environ['AWS_ACCESS_KEY_ID'] = config.get("default", "AWS_ACCESS_KEY_ID")
os.environ['AWS_SECRET_ACCESS_KEY'] = config.get("default", "AWS_SECRET_ACCESS_KEY")

## Create Spark session with the hadoop-aws package

In [11]:
# Get the active SparkSession or create one, asking Spark to load the
# hadoop-aws artifact through the spark.jars.packages setting.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

## Song table

In [12]:
# Load the song table from its parquet location (s3a:// URI).
song_table = spark.read.parquet(
    "s3a://data-lake-udacity-sparkify/song/songs.parquet"
)
# Print a preview; n=20 and truncate=True are show()'s defaults, spelled out.
song_table.show(n=20, truncate=True)

+------------------+--------------------+---------+----+------------------+
|           song_id|               title| duration|year|         artist_id|
+------------------+--------------------+---------+----+------------------+
|SOBTCUI12A8AE48B70|Faust: Ballet Mus...| 94.56281|   0|ARSUVLW12454A4C8B8|
|SOVNKJI12A8C13CB0D|Take It To Da Hou...|227.10812|2001|ARWUNH81187FB4A3E0|
|SOYVBGZ12A6D4F92A8|Piano Sonata No. ...|221.70077|   0|ARLRWBW1242077EB29|
|SODBHKO12A58A77F36|Fingers Of Love (...|335.93424|   0|ARKGS2Z1187FB494B5|
|SOGXFIF12A58A78CC4|Hanging On (Mediu...|204.06812|   0|AR5LZJD1187FB4C5E5|
|SOZCRVP12A81C21F40|Welcome To The Do...| 46.94159|2008|AR4503S1187FB43199|
|SOOBEML12A8C138C91|Johnny Leary's Po...|  197.642|   0|ARP4O0W1187FB5A06B|
|SOUOPFM12AB0185809|You'd Be So Nice ...|405.41995|   0|ARSXDJO1269FCD9405|
|SOVJXVJ12A8C13517D|Where The Thunder...|298.84036|   0|ARCCRTI11F4C845308|
|SOKTJDS12AF72A25E5|Drown In My Own T...|  192.522|   0|ARA23XO1187B9AF18F|
|SOHHANU12A5

In [13]:
# Register the DataFrame as a temporary view so spark.sql() can query it by name.
song_table.createOrReplaceTempView(name="song_table")

In [23]:
# Count rows per song title and print the 10 titles with the highest counts;
# any count above 1 means a title appears on more than one row.
spark.sql(
    '''
          SELECT title, count(*)
          FROM song_table 
          GROUP BY title
          ORDER BY count(*) desc
          LIMIT 10
          '''
).show(n=20, truncate=True)

+--------------------+--------+
|               title|count(1)|
+--------------------+--------+
|Always And Never ...|       1|
|Seem Like A Thug ...|       1|
|                Hate|       1|
|          Ludwig Van|       1|
|Stop Coming To My...|       1|
|       Rainbow Yoshi|       1|
|       The Telescope|       1|
|Talking With Myse...|       1|
|My Eyes Burn (Alb...|       1|
|You Are The Only ...|       1|
+--------------------+--------+



## Artist table

In [22]:
# Load the artist table from its parquet location (s3a:// URI).
artist_table = spark.read.parquet(
    "s3a://data-lake-udacity-sparkify/artist/artists.parquet"
)
# Print a preview; n=20 and truncate=True are show()'s defaults, spelled out.
artist_table.show(n=20, truncate=True)

+------------------+--------------------+--------------------+---------+----------+
|         artist_id|                name|            location|lattitude| longitude|
+------------------+--------------------+--------------------+---------+----------+
|ARSUVLW12454A4C8B8|Royal Philharmoni...|           Tennessee| 35.83073| -85.97874|
|ARXQC081187FB4AD42|William Shatner_ ...|                  UK| 54.31407|  -2.23001|
|ARWUNH81187FB4A3E0|         Trick Daddy|     Miami , Florida|     null|      null|
|ARTC1LV1187B9A4858|  The Bonzo Dog Band|Goldsmith's Colle...|  51.4536|  -0.01802|
|ARA23XO1187B9AF18F|     The Smithereens|Carteret, New Jersey| 40.57885| -74.21956|
|ARLRWBW1242077EB29|     Mikhail Pletnev|                    |     null|      null|
|AR5LZJD1187FB4C5E5|        Britt Nicole|                  NC| 35.66693| -80.46935|
|ARV3PXE1187B98E680|   John Brown's Body|     NY - Upstate NY| 40.71455| -74.00712|
|AR6PJ8R1187FB5AD70|Shakira Featuring...|Barranquilla, Col...|     null|    

In [24]:
# Register the DataFrame as a temporary view so spark.sql() can query it by name.
artist_table.createOrReplaceTempView(name="artist_table")

In [25]:
# Count rows per artist name and print the 10 names with the highest counts;
# any count above 1 means a name appears on more than one row.
spark.sql(
    '''
          SELECT name, count(*)
          FROM artist_table 
          GROUP BY name
          ORDER BY count(*) desc
          LIMIT 10
          '''
).show(n=20, truncate=True)

+---------------+--------+
|           name|count(1)|
+---------------+--------+
|      Xcultures|       2|
|Black Eyed Peas|       2|
|       Karunesh|       2|
| Polygon Window|       2|
|     Aphex Twin|       2|
|      The Kooks|       2|
|       Mastodon|       2|
|   Jamie Cullum|       2|
|  Tracy Chapman|       2|
|     Jag Panzer|       2|
+---------------+--------+



## User table

In [28]:
# Load the user table from its parquet location (s3a:// URI).
user_table = spark.read.parquet(
    "s3a://data-lake-udacity-sparkify/user/user.parquet"
)
# Print a preview; n=20 and truncate=True are show()'s defaults, spelled out.
# NOTE(review): the printed rows include first and last names -- clear the
# output before sharing if these are real users.
user_table.show(n=20, truncate=True)

+------+----------+---------+------+-----+
|userId| firstName| lastName|gender|level|
+------+----------+---------+------+-----+
|    88|  Mohammad|Rodriguez|     M| paid|
|    88|  Mohammad|Rodriguez|     M| free|
|    68|    Jordan|Rodriguez|     F| free|
|    29|Jacqueline|    Lynch|     F| free|
|    11| Christian|   Porter|     F| free|
|    53|   Celeste| Williams|     F| free|
|    69|  Anabelle|  Simpson|     F| free|
|    75|    Joseph|Gutierrez|     M| free|
|    40|    Tucker| Garrison|     M| free|
|     2|   Jizelle| Benjamin|     F| free|
|    14|  Theodore|   Harris|     M| free|
|    52|  Theodore|    Smith|     M| free|
|    56|    Cienna|  Freeman|     F| free|
|    12|    Austin|  Rosales|     M| free|
|    19|   Zachary|   Thomas|     M| free|
|    23|    Morris|  Gilmore|     M| free|
|    55|    Martin|  Johnson|     M| free|
|    66|     Kevin| Arellano|     M| free|
|    64|    Hannah|  Calhoun|     F| free|
|   100|     Adler|  Barrera|     M| free|
+------+---

In [29]:
# Register the DataFrame as a temporary view so spark.sql() can query it by name.
user_table.createOrReplaceTempView(name="user_table")

In [30]:
# Preview 10 rows of the user_table view through SQL. The query has no
# ORDER BY, so it does not itself pin down which 10 rows come back.
spark.sql(
    '''
          SELECT *
          FROM user_table 
          LIMIT 10
          '''
).show(n=20, truncate=True)

+------+----------+---------+------+-----+
|userId| firstName| lastName|gender|level|
+------+----------+---------+------+-----+
|    88|  Mohammad|Rodriguez|     M| paid|
|    88|  Mohammad|Rodriguez|     M| free|
|    68|    Jordan|Rodriguez|     F| free|
|    29|Jacqueline|    Lynch|     F| free|
|    11| Christian|   Porter|     F| free|
|    53|   Celeste| Williams|     F| free|
|    69|  Anabelle|  Simpson|     F| free|
|    75|    Joseph|Gutierrez|     M| free|
|    40|    Tucker| Garrison|     M| free|
|     2|   Jizelle| Benjamin|     F| free|
+------+----------+---------+------+-----+



## Time table

In [33]:
# Load the time table from its parquet location (s3a:// URI).
time_table = spark.read.parquet(
    "s3a://data-lake-udacity-sparkify/time/time.parquet"
)
# Print a preview; n=20 and truncate=True are show()'s defaults, spelled out.
time_table.show(n=20, truncate=True)

+----+---+----+-------+----+-----+
|hour|day|week|weekday|year|month|
+----+---+----+-------+----+-----+
|   0| 15|  46|      5|2018|   11|
|   0| 15|  46|      5|2018|   11|
|   0| 15|  46|      5|2018|   11|
|   3| 15|  46|      5|2018|   11|
|   5| 15|  46|      5|2018|   11|
|   5| 15|  46|      5|2018|   11|
|   5| 15|  46|      5|2018|   11|
|   6| 15|  46|      5|2018|   11|
|   6| 15|  46|      5|2018|   11|
|   6| 15|  46|      5|2018|   11|
|   6| 15|  46|      5|2018|   11|
|   6| 15|  46|      5|2018|   11|
|   6| 15|  46|      5|2018|   11|
|   6| 15|  46|      5|2018|   11|
|   6| 15|  46|      5|2018|   11|
|   6| 15|  46|      5|2018|   11|
|   6| 15|  46|      5|2018|   11|
|   7| 15|  46|      5|2018|   11|
|   7| 15|  46|      5|2018|   11|
|   7| 15|  46|      5|2018|   11|
+----+---+----+-------+----+-----+
only showing top 20 rows



In [34]:
# Register the DataFrame as a temporary view so spark.sql() can query it by name.
time_table.createOrReplaceTempView(name="time_table")

In [35]:
# Preview 10 rows of the time_table view through SQL. The query has no
# ORDER BY, so it does not itself pin down which 10 rows come back.
spark.sql(
    '''
          SELECT *
          FROM time_table 
          LIMIT 10
          '''
).show(n=20, truncate=True)

+----+---+----+-------+----+-----+
|hour|day|week|weekday|year|month|
+----+---+----+-------+----+-----+
|   0| 15|  46|      5|2018|   11|
|   0| 15|  46|      5|2018|   11|
|   0| 15|  46|      5|2018|   11|
|   3| 15|  46|      5|2018|   11|
|   5| 15|  46|      5|2018|   11|
|   5| 15|  46|      5|2018|   11|
|   5| 15|  46|      5|2018|   11|
|   6| 15|  46|      5|2018|   11|
|   6| 15|  46|      5|2018|   11|
|   6| 15|  46|      5|2018|   11|
+----+---+----+-------+----+-----+



## Songplays table

In [None]:
# Load the songplays table from its parquet location (s3a:// URI).
songplays_table = spark.read.parquet(
    "s3a://data-lake-udacity-sparkify/songplays/songplays.parquet"
)
# Print a preview; n=20 and truncate=True are show()'s defaults, spelled out.
songplays_table.show(n=20, truncate=True)

In [None]:
# Register the DataFrame as a temporary view so spark.sql() can query it by name.
songplays_table.createOrReplaceTempView(name="songplays_table")

In [None]:
# Preview 10 rows of the songplays_table view through SQL. The query has no
# ORDER BY, so it does not itself pin down which 10 rows come back.
spark.sql(
    '''
          SELECT *
          FROM songplays_table 
          LIMIT 10
          '''
).show(n=20, truncate=True)