In [1]:
from pyspark.sql import SparkSession 
from pyspark.sql import functions as F 

# Create (or reuse) the SparkSession shared by every cell below.
# The application is named "Demo App" and the Spark UI port is set to 4050.
spark = SparkSession.builder.appName("Demo App").config("spark.ui.port", "4050").getOrCreate()

# Keep notebook output readable: only ERROR-level log messages are emitted.
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Creating a DataFrame from a list of lists

In [2]:
# One inner list per grocery row: [item name, quantity, price].
my_grocery_list = [
    list(row)
    for row in (
        ("Banana", 2, 1.74),
        ("Apple", 4, 2.04),
        ("Carrot", 1, 1.09),
        ("Cake", 1, 10.99),
    )
]

In [3]:
# Build a DataFrame from the in-memory rows. Only the column names are given;
# the column types are left for Spark to infer from the Python values.
df = spark.createDataFrame(
    data=my_grocery_list,
    schema=['Item', 'Quantity', 'Price'],
)

# Show the rows, then the inferred schema.
df.show()
df.printSchema()

+------+--------+-----+
|  Item|Quantity|Price|
+------+--------+-----+
|Banana|       2| 1.74|
| Apple|       4| 2.04|
|Carrot|       1| 1.09|
|  Cake|       1|10.99|
+------+--------+-----+

root
 |-- Item: string (nullable = true)
 |-- Quantity: long (nullable = true)
 |-- Price: double (nullable = true)



                                                                                

In [4]:
# List the objects under the broadcast_logs prefix to confirm which files are available.
!sudo aws s3 ls s3://data-engg-suman/dataset/all-book-sataset/broadcast_logs/

                           PRE ReferenceTables/
2023-01-11 17:09:16   33922161 BroadcastLogs_2018_Q3_M8_sample.CSV
2023-01-11 17:14:40      18219 Call_Signs.csv
2023-01-11 17:13:51      72704 data dictionary.doc


# Reading a CSV file 

In [5]:
import os 

# S3 prefix holding the broadcast-log sample file and its reference tables.
DATA_PATH = 's3://data-engg-suman/dataset/all-book-sataset/broadcast_logs/'

# Read the pipe-delimited sample file. The first line is a header row and the
# column types are inferred by Spark. Timestamps use the canonical
# four-digit-year pattern 'yyyy-MM-dd'.
logs = spark.read.csv(
    os.path.join(DATA_PATH, 'BroadcastLogs_2018_Q3_M8_sample.CSV'),
    sep='|',
    header=True,
    inferSchema=True,
    timestampFormat='yyyy-MM-dd',
)

                                                                                

In [6]:
# Preview the first five rows of the freshly loaded frame.
logs.show(n=5)

+--------------+------------+-------------------+----------+-------------------+----------------------+----------+---------------+-----------------+----------------+---------------+------------------+--------------+--------------------+------------+----------------+----------------+-------------------+------------+--------------------+----------------+--------+--------------------+------------------+----------------------+-------------+---------+---------+---------+---------+
|BroadcastLogID|LogServiceID|            LogDate|SequenceNO|AudienceTargetAgeID|AudienceTargetEthnicID|CategoryID|ClosedCaptionID|CountryOfOriginID|DubDramaCreditID|EthnicProgramID|ProductionSourceID|ProgramClassID|FilmClassificationID|ExhibitionID|        Duration|         EndTime|       LogEntryDate|ProductionNO|        ProgramTitle|       StartTime|Subtitle|NetworkAffiliationID|SpecialAttentionID|BroadcastOriginPointID|CompositionID|Producer1|Producer2|Language1|Language2|
+--------------+------------+---------

In [7]:
# Inspect the column names and the types produced by inferSchema=True.
logs.printSchema()

root
 |-- BroadcastLogID: integer (nullable = true)
 |-- LogServiceID: integer (nullable = true)
 |-- LogDate: timestamp (nullable = true)
 |-- SequenceNO: integer (nullable = true)
 |-- AudienceTargetAgeID: integer (nullable = true)
 |-- AudienceTargetEthnicID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- ClosedCaptionID: integer (nullable = true)
 |-- CountryOfOriginID: integer (nullable = true)
 |-- DubDramaCreditID: integer (nullable = true)
 |-- EthnicProgramID: integer (nullable = true)
 |-- ProductionSourceID: integer (nullable = true)
 |-- ProgramClassID: integer (nullable = true)
 |-- FilmClassificationID: integer (nullable = true)
 |-- ExhibitionID: integer (nullable = true)
 |-- Duration: string (nullable = true)
 |-- EndTime: string (nullable = true)
 |-- LogEntryDate: timestamp (nullable = true)
 |-- ProductionNO: string (nullable = true)
 |-- ProgramTitle: string (nullable = true)
 |-- StartTime: string (nullable = true)
 |-- Subtitle: string 

In [8]:
# Selection variant 1: pass Column objects (built with F.col) as separate arguments.
logs.select(
    *[F.col(name) for name in ('BroadcastLogID', 'LogServiceID', 'LogDate')]
).show(n=5, truncate=False)

+--------------+------------+-------------------+
|BroadcastLogID|LogServiceID|LogDate            |
+--------------+------------+-------------------+
|1196192316    |3157        |2018-08-01 00:00:00|
|1196192317    |3157        |2018-08-01 00:00:00|
|1196192318    |3157        |2018-08-01 00:00:00|
|1196192319    |3157        |2018-08-01 00:00:00|
|1196192320    |3157        |2018-08-01 00:00:00|
+--------------+------------+-------------------+
only showing top 5 rows



In [9]:
# Selection variant 2: pass a single list of column names.
logs.select(
    [
        'BroadcastLogID',
        'LogServiceID',
        'LogDate',
    ]
).show(n=5, truncate=False)

+--------------+------------+-------------------+
|BroadcastLogID|LogServiceID|LogDate            |
+--------------+------------+-------------------+
|1196192316    |3157        |2018-08-01 00:00:00|
|1196192317    |3157        |2018-08-01 00:00:00|
|1196192318    |3157        |2018-08-01 00:00:00|
|1196192319    |3157        |2018-08-01 00:00:00|
|1196192320    |3157        |2018-08-01 00:00:00|
+--------------+------------+-------------------+
only showing top 5 rows



In [21]:
# Selection variant 3: pass the column names directly as separate string arguments.
logs.select(
    'BroadcastLogID',
    'LogServiceID',
    'LogDate',
).show(n=5, truncate=False)

+--------------+------------+-------------------+
|BroadcastLogID|LogServiceID|LogDate            |
+--------------+------------+-------------------+
|1196192316    |3157        |2018-08-01 00:00:00|
|1196192317    |3157        |2018-08-01 00:00:00|
|1196192318    |3157        |2018-08-01 00:00:00|
|1196192319    |3157        |2018-08-01 00:00:00|
|1196192320    |3157        |2018-08-01 00:00:00|
+--------------+------------+-------------------+
only showing top 5 rows



                                                                                

In [15]:
import numpy as np 

# Partition the column names into len(columns) // 3 groups so the wide frame
# can be displayed a few columns at a time.
# max(1, ...) keeps the section count positive: np.array_split raises
# ValueError when asked for 0 sections, which would happen for a frame with
# fewer than three columns.
column_split = np.array_split(
    np.array(logs.columns),
    max(1, len(logs.columns) // 3),
)

In [17]:
# Display the first five rows of each column group, untruncated.
for column_group in column_split:
    logs.select(*column_group).show(n=5, truncate=False)

+--------------+------------+-------------------+
|BroadcastLogID|LogServiceID|LogDate            |
+--------------+------------+-------------------+
|1196192316    |3157        |2018-08-01 00:00:00|
|1196192317    |3157        |2018-08-01 00:00:00|
|1196192318    |3157        |2018-08-01 00:00:00|
|1196192319    |3157        |2018-08-01 00:00:00|
|1196192320    |3157        |2018-08-01 00:00:00|
+--------------+------------+-------------------+
only showing top 5 rows

+----------+-------------------+----------------------+
|SequenceNO|AudienceTargetAgeID|AudienceTargetEthnicID|
+----------+-------------------+----------------------+
|1         |4                  |null                  |
|2         |null               |null                  |
|3         |null               |null                  |
|4         |null               |null                  |
|5         |null               |null                  |
+----------+-------------------+----------------------+
only showing top 5 ro

## Deleting columns

In [18]:
# drop() returns a new DataFrame without the named columns; `logs` itself is
# left untouched.
logs_new = logs.drop(
    "BroadcastLogID",
    "SequenceNO",
)

In [20]:
# Check whether each dropped column is still listed in the new frame.
tuple(name in logs_new.columns for name in ("BroadcastLogID", "SequenceNO"))

(False, False)

In [21]:
# Run the same membership check against the original frame for comparison.
tuple(name in logs.columns for name in ("BroadcastLogID", "SequenceNO"))

(True, True)

## New columns with withColumn()

In [53]:
# Look at a few raw Duration values, then at the column's data type.
logs.select('Duration').show(n=3)
logs.select('Duration').dtypes

+----------------+
|        Duration|
+----------------+
|02:00:00.0000000|
|00:00:30.0000000|
|00:00:15.0000000|
+----------------+
only showing top 3 rows



[('Duration', 'string')]

In [54]:
# Slice the Duration string into its hour / minute / second fields.
# substr(start, length) is 1-based: hours start at 1, minutes at 4, seconds
# at 7; each field is two characters wide and is cast to int.
logs.select(
    F.col("Duration"),
    *[
        F.col("Duration").substr(start, 2).cast("int").alias(label)
        for label, start in (
            ("dur_hours", 1),
            ("dur_minutes", 4),
            ("dur_seconds", 7),
        )
    ],
).distinct().show(5)



+----------------+---------+-----------+-----------+
|        Duration|dur_hours|dur_minutes|dur_seconds|
+----------------+---------+-----------+-----------+
|00:04:52.0000000|        0|          4|         52|
|00:09:52.0000000|        0|          9|         52|
|01:34:00.0000000|        1|         34|          0|
|01:59:57.0000000|        1|         59|         57|
|00:38:10.0000000|        0|         38|         10|
+----------------+---------+-----------+-----------+
only showing top 5 rows



                                                                                

In [65]:
# Add Duration_seconds: the Duration string (e.g. "02:00:00.0000000")
# converted to a whole number of seconds. The fractional part is not read.
# substr(start, length) is 1-based: hours at 1, minutes at 4, seconds at 7.
# withColumn names the resulting column, so no alias is needed on the
# expression.
logs = logs.withColumn(
    'Duration_seconds',
    F.col("Duration").substr(1, 2).cast("int") * 60 * 60
    + F.col("Duration").substr(4, 2).cast("int") * 60
    + F.col("Duration").substr(7, 2).cast("int"),
)

# Print the schema of the frame after the new column has been added.
logs.printSchema()



root
 |-- BroadcastLogID: integer (nullable = true)
 |-- LogServiceID: integer (nullable = true)
 |-- LogDate: timestamp (nullable = true)
 |-- SequenceNO: integer (nullable = true)
 |-- AudienceTargetAgeID: integer (nullable = true)
 |-- AudienceTargetEthnicID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- ClosedCaptionID: integer (nullable = true)
 |-- CountryOfOriginID: integer (nullable = true)
 |-- DubDramaCreditID: integer (nullable = true)
 |-- EthnicProgramID: integer (nullable = true)
 |-- ProductionSourceID: integer (nullable = true)
 |-- ProgramClassID: integer (nullable = true)
 |-- FilmClassificationID: integer (nullable = true)
 |-- ExhibitionID: integer (nullable = true)
 |-- Duration: string (nullable = true)
 |-- EndTime: string (nullable = true)
 |-- LogEntryDate: timestamp (nullable = true)
 |-- ProductionNO: string (nullable = true)
 |-- ProgramTitle: string (nullable = true)
 |-- StartTime: string (nullable = true)
 |-- Subtitle: string 

In [66]:
# Spot-check the conversion: distinct (Duration, Duration_seconds) pairs.
logs.select("Duration", "Duration_seconds").distinct().show(n=5)



+----------------+----------------+
|        Duration|Duration_seconds|
+----------------+----------------+
|00:28:08.0000000|            1688|
|00:32:00.0000000|            1920|
|00:30:00.0000000|            1800|
|00:01:39.0000000|              99|
|00:29:50.0000000|            1790|
+----------------+----------------+
only showing top 5 rows



                                                                                

## Renaming all columns in one go

In [69]:
# Rename every column to its lowercase form in one call; toDF takes the full
# list of new names in positional order.
logs = logs.toDF(*map(str.lower, logs.columns))

In [70]:
# Display the column names after the rename.
logs.columns

['broadcastlogid',
 'logserviceid',
 'logdate',
 'sequenceno',
 'audiencetargetageid',
 'audiencetargetethnicid',
 'categoryid',
 'closedcaptionid',
 'countryoforiginid',
 'dubdramacreditid',
 'ethnicprogramid',
 'productionsourceid',
 'programclassid',
 'filmclassificationid',
 'exhibitionid',
 'duration',
 'endtime',
 'logentrydate',
 'productionno',
 'programtitle',
 'starttime',
 'subtitle',
 'networkaffiliationid',
 'specialattentionid',
 'broadcastoriginpointid',
 'compositionid',
 'producer1',
 'producer2',
 'language1',
 'language2',
 'duration_seconds']

## Reordering columns

In [71]:
# Reorder the columns alphabetically by selecting them in sorted order, then
# print the resulting schema.
logs.select(*sorted(logs.columns)).printSchema()

root
 |-- audiencetargetageid: integer (nullable = true)
 |-- audiencetargetethnicid: integer (nullable = true)
 |-- broadcastlogid: integer (nullable = true)
 |-- broadcastoriginpointid: integer (nullable = true)
 |-- categoryid: integer (nullable = true)
 |-- closedcaptionid: integer (nullable = true)
 |-- compositionid: integer (nullable = true)
 |-- countryoforiginid: integer (nullable = true)
 |-- dubdramacreditid: integer (nullable = true)
 |-- duration: string (nullable = true)
 |-- duration_seconds: integer (nullable = true)
 |-- endtime: string (nullable = true)
 |-- ethnicprogramid: integer (nullable = true)
 |-- exhibitionid: integer (nullable = true)
 |-- filmclassificationid: integer (nullable = true)
 |-- language1: integer (nullable = true)
 |-- language2: integer (nullable = true)
 |-- logdate: timestamp (nullable = true)
 |-- logentrydate: timestamp (nullable = true)
 |-- logserviceid: integer (nullable = true)
 |-- networkaffiliationid: integer (nullable = true)
 |-- 

## Diagnosing a data frame

In [72]:
# Print summary statistics one column at a time so each table stays narrow.
for column_name in logs.columns:
    logs.describe(column_name).show()

                                                                                

+-------+--------------------+
|summary|      broadcastlogid|
+-------+--------------------+
|  count|              238945|
|   mean|1.2168651122760174E9|
| stddev|1.4969134241430923E7|
|    min|          1195788151|
|    max|          1249431576|
+-------+--------------------+



                                                                                

+-------+------------------+
|summary|      logserviceid|
+-------+------------------+
|  count|            238945|
|   mean| 3450.890284375065|
| stddev|199.50673962555322|
|    min|              3157|
|    max|              3925|
+-------+------------------+

+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+



                                                                                

+-------+-----------------+
|summary|       sequenceno|
+-------+-----------------+
|  count|           238945|
|   mean|466.3855824562138|
| stddev|311.5109104839795|
|    min|                1|
|    max|             1463|
+-------+-----------------+



                                                                                

+-------+-------------------+
|summary|audiencetargetageid|
+-------+-------------------+
|  count|              16112|
|   mean| 3.4929245283018866|
| stddev| 1.0415963394745127|
|    min|                  1|
|    max|                  4|
+-------+-------------------+



                                                                                

+-------+----------------------+
|summary|audiencetargetethnicid|
+-------+----------------------+
|  count|                  1710|
|   mean|    120.56432748538012|
| stddev|     71.98694059436131|
|    min|                     4|
|    max|                   337|
+-------+----------------------+



                                                                                

+-------+------------------+
|summary|        categoryid|
+-------+------------------+
|  count|             25506|
|   mean|18.485297577040697|
| stddev| 9.655852252020846|
|    min|                 1|
|    max|                29|
+-------+------------------+



                                                                                

+-------+-------------------+
|summary|    closedcaptionid|
+-------+-------------------+
|  count|             224117|
|   mean| 1.0316174141185184|
| stddev|0.24947032900378938|
|    min|                  1|
|    max|                  3|
+-------+-------------------+



                                                                                

+-------+------------------+
|summary| countryoforiginid|
+-------+------------------+
|  count|             17822|
|   mean|  4.06390977443609|
| stddev|2.6159675821007307|
|    min|                 2|
|    max|                11|
+-------+------------------+



                                                                                

+-------+------------------+
|summary|  dubdramacreditid|
+-------+------------------+
|  count|               263|
|   mean|5.1254752851711025|
| stddev|3.4513297114385635|
|    min|                 1|
|    max|                10|
+-------+------------------+



                                                                                

+-------+------------------+
|summary|   ethnicprogramid|
+-------+------------------+
|  count|              1895|
|   mean| 4.316622691292876|
| stddev|2.2966421456800292|
|    min|                 1|
|    max|                 6|
+-------+------------------+



                                                                                

+-------+------------------+
|summary|productionsourceid|
+-------+------------------+
|  count|             16223|
|   mean|11.193737286568453|
| stddev|6.0790222393313345|
|    min|                 1|
|    max|                19|
+-------+------------------+



                                                                                

+-------+----------------+
|summary|  programclassid|
+-------+----------------+
|  count|          238945|
|   mean|8.49341061750612|
| stddev|7.90211684690576|
|    min|               3|
|    max|              30|
+-------+----------------+



                                                                                

+-------+--------------------+
|summary|filmclassificationid|
+-------+--------------------+
|  count|                1885|
|   mean|   1.953315649867374|
| stddev|  1.6420515263133841|
|    min|                   1|
|    max|                   8|
+-------+--------------------+



                                                                                

+-------+------------------+
|summary|      exhibitionid|
+-------+------------------+
|  count|              9263|
|   mean|  4.52067364784627|
| stddev|1.4418772626833436|
|    min|                 1|
|    max|                 8|
+-------+------------------+



                                                                                

+-------+----------------+
|summary|        duration|
+-------+----------------+
|  count|          236724|
|   mean|            null|
| stddev|            null|
|    min|00:00:01.0000000|
|    max|06:30:09.0000000|
+-------+----------------+



                                                                                

+-------+----------------+
|summary|         endtime|
+-------+----------------+
|  count|          169979|
|   mean|            null|
| stddev|            null|
|    min|00:00:00.0000000|
|    max|23:59:59.0000000|
+-------+----------------+

+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+



                                                                                

+-------+------------------+
|summary|      productionno|
+-------+------------------+
|  count|              3519|
|   mean| 35710.61538461538|
| stddev|3749.1340008607654|
|    min|            030641|
|    max|            c34183|
+-------+------------------+



                                                                                

+-------+------------------+
|summary|      programtitle|
+-------+------------------+
|  count|            238703|
|   mean|            1999.0|
| stddev|              null|
|    min|  !NO 5PM A ID(:5)|
|    max|�t� avec Jo�l 2/Un|
+-------+------------------+



                                                                                

+-------+----------------+
|summary|       starttime|
+-------+----------------+
|  count|          238945|
|   mean|            null|
| stddev|            null|
|    min|00:00:00.0000000|
|    max|23:59:59.0000000|
+-------+----------------+



                                                                                

+-------+--------------------+
|summary|            subtitle|
+-------+--------------------+
|  count|               15468|
|   mean|   3463.573913043478|
| stddev|  16251.272419144225|
|    min|                #001|
|    max|�tre dans le trou...|
+-------+--------------------+



                                                                                

+-------+--------------------+
|summary|networkaffiliationid|
+-------+--------------------+
|  count|              108807|
|   mean|   6.082200593711802|
| stddev|   2.990848675228531|
|    min|                   1|
|    max|                   9|
+-------+--------------------+



                                                                                

+-------+------------------+
|summary|specialattentionid|
+-------+------------------+
|  count|              2395|
|   mean| 1.704384133611691|
| stddev|0.5394635034869686|
|    min|                 1|
|    max|                 3|
+-------+------------------+



                                                                                

+-------+----------------------+
|summary|broadcastoriginpointid|
+-------+----------------------+
|  count|                  9978|
|   mean|    2.1390058127881337|
| stddev|    0.9323192037553317|
|    min|                     1|
|    max|                     3|
+-------+----------------------+



                                                                                

+-------+------------------+
|summary|     compositionid|
+-------+------------------+
|  count|              9978|
|   mean|3.4141110442974543|
| stddev| 0.933860340365462|
|    min|                 1|
|    max|                 4|
+-------+------------------+



                                                                                

+-------+---------+
|summary|producer1|
+-------+---------+
|  count|     2607|
|   mean|     null|
| stddev|     null|
|    min|      AMI|
|    max|   VISION|
+-------+---------+



                                                                                

+-------+---------+
|summary|producer2|
+-------+---------+
|  count|       38|
|   mean|     null|
| stddev|     null|
|    min|    NEWSW|
|    max|       QC|
+-------+---------+



                                                                                

+-------+------------------+
|summary|         language1|
+-------+------------------+
|  count|             12586|
|   mean|101.75377403464167|
| stddev|45.684036854682155|
|    min|                94|
|    max|               437|
+-------+------------------+



                                                                                

+-------+------------------+
|summary|         language2|
+-------+------------------+
|  count|              2629|
|   mean|175.49334347660707|
| stddev|  86.0360379288395|
|    min|                 3|
|    max|               438|
+-------+------------------+





+-------+------------------+
|summary|  duration_seconds|
+-------+------------------+
|  count|            236724|
|   mean|124.30587942076004|
| stddev| 573.7742807594921|
|    min|                 1|
|    max|             23409|
+-------+------------------+



                                                                                

In [75]:
# summary() lets us choose the statistics: count, extremes and the quartiles
# at 25% and 75%.
logs.select(F.col('duration_seconds')).summary(
    "count",
    "min",
    "25%",
    "75%",
    "max",
).show()



+-------+----------------+
|summary|duration_seconds|
+-------+----------------+
|  count|          236724|
|    min|               1|
|    25%|              15|
|    75%|              30|
|    max|           23409|
+-------+----------------+



                                                                                

## Exercise 4.3

In [76]:
import os 

# S3 prefix holding the broadcast-log sample file.
DATA_PATH = 's3://data-engg-suman/dataset/all-book-sataset/broadcast_logs/'

# Re-read the sample file with explicit options: pipe delimiter, header row
# and inferred column types. Timestamps use the canonical four-digit-year
# pattern 'yyyy-MM-dd'.
logs = spark.read.csv(
    os.path.join(DATA_PATH, 'BroadcastLogs_2018_Q3_M8_sample.CSV'),
    sep='|',
    header=True,
    inferSchema=True,
    timestampFormat='yyyy-MM-dd',
)

                                                                                

In [77]:
# Read the same file again with no options at all (no sep, header or
# inferSchema), to compare the result with `logs`.
logs2 = spark.read.csv(os.path.join(DATA_PATH, 'BroadcastLogs_2018_Q3_M8_sample.CSV'))

In [79]:
# Schema of the frame read with sep='|', header=True and inferSchema=True.
logs.printSchema()

root
 |-- BroadcastLogID: integer (nullable = true)
 |-- LogServiceID: integer (nullable = true)
 |-- LogDate: timestamp (nullable = true)
 |-- SequenceNO: integer (nullable = true)
 |-- AudienceTargetAgeID: integer (nullable = true)
 |-- AudienceTargetEthnicID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- ClosedCaptionID: integer (nullable = true)
 |-- CountryOfOriginID: integer (nullable = true)
 |-- DubDramaCreditID: integer (nullable = true)
 |-- EthnicProgramID: integer (nullable = true)
 |-- ProductionSourceID: integer (nullable = true)
 |-- ProgramClassID: integer (nullable = true)
 |-- FilmClassificationID: integer (nullable = true)
 |-- ExhibitionID: integer (nullable = true)
 |-- Duration: string (nullable = true)
 |-- EndTime: string (nullable = true)
 |-- LogEntryDate: timestamp (nullable = true)
 |-- ProductionNO: string (nullable = true)
 |-- ProgramTitle: string (nullable = true)
 |-- StartTime: string (nullable = true)
 |-- Subtitle: string 

In [80]:
# Schema of the frame read without any CSV options, for comparison.
logs2.printSchema()

root
 |-- _c0: string (nullable = true)



In [82]:
# Show two full-width rows of the frame that was read without options.
logs2.show(n=2, truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|_c0                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+-------------------------------------------------------------------------------------------------------------------------------------------

## Exercise 4.4

In [83]:
import os 

# S3 prefix holding the broadcast-log sample file.
DATA_PATH = 's3://data-engg-suman/dataset/all-book-sataset/broadcast_logs/'

# Load the sample file for this exercise: pipe delimiter, header row and
# inferred column types. Timestamps use the canonical four-digit-year
# pattern 'yyyy-MM-dd'.
logs = spark.read.csv(
    os.path.join(DATA_PATH, 'BroadcastLogs_2018_Q3_M8_sample.CSV'),
    sep='|',
    header=True,
    inferSchema=True,
    timestampFormat='yyyy-MM-dd',
)


                                                                                

In [115]:
# Keep every column whose name does not end in "id" (case-insensitive).
# A comprehension with a plain boolean condition replaces a filter() whose
# lambda returned the name itself or None.
cols_needed = [name for name in logs.columns if not name.lower().endswith('id')]
cols_needed

['LogDate',
 'SequenceNO',
 'Duration',
 'EndTime',
 'LogEntryDate',
 'ProductionNO',
 'ProgramTitle',
 'StartTime',
 'Subtitle',
 'Producer1',
 'Producer2',
 'Language1',
 'Language2']

In [116]:
# Project the frame down to the columns that are not identifiers.
logs_clean = logs.select(*cols_needed)

In [117]:
# Confirm that no column ending in "ID" remains in logs_clean.
logs_clean.printSchema()

root
 |-- LogDate: timestamp (nullable = true)
 |-- SequenceNO: integer (nullable = true)
 |-- Duration: string (nullable = true)
 |-- EndTime: string (nullable = true)
 |-- LogEntryDate: timestamp (nullable = true)
 |-- ProductionNO: string (nullable = true)
 |-- ProgramTitle: string (nullable = true)
 |-- StartTime: string (nullable = true)
 |-- Subtitle: string (nullable = true)
 |-- Producer1: string (nullable = true)
 |-- Producer2: string (nullable = true)
 |-- Language1: integer (nullable = true)
 |-- Language2: integer (nullable = true)



In [118]:
# Same result in a single expression: select the columns whose name does not
# end with the exact suffix "ID" (case-sensitive here).
logs_clean2 = logs.select(
    [name for name in logs.columns if not name.endswith("ID")]
)

In [119]:
# Confirm that no column ending in "ID" remains in logs_clean2.
logs_clean2.printSchema()

root
 |-- LogDate: timestamp (nullable = true)
 |-- SequenceNO: integer (nullable = true)
 |-- Duration: string (nullable = true)
 |-- EndTime: string (nullable = true)
 |-- LogEntryDate: timestamp (nullable = true)
 |-- ProductionNO: string (nullable = true)
 |-- ProgramTitle: string (nullable = true)
 |-- StartTime: string (nullable = true)
 |-- Subtitle: string (nullable = true)
 |-- Producer1: string (nullable = true)
 |-- Producer2: string (nullable = true)
 |-- Language1: integer (nullable = true)
 |-- Language2: integer (nullable = true)

