### Read {CSV,XML} files of FWM Set top box (STB) data into dataframes

Read a file list into a DF, write it to a parquet file .

Noam 2023-03-06


In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import*

spark = SparkSession.builder.appName('fwm').getOrCreate()
sc = spark.sparkContext

In [2]:
# Read a CSV into a dataframe
def load_PD_file(filename, schema = None) :
    dataPath = "/datasets/Fourthwall_STB_data/" + filename
    dfreader = spark.read.format("csv")\
      .option("header","false")\
      .option("delimiter", "|")
    if schema:
        dfreader = dfreader.schema(schema)
    return dfreader.load(dataPath)
  
# inferSchema means we will automatically figure out column types 
# at a cost of reading the data more than once

In [3]:
schema =  StructType([StructField('prog_code',StringType()),
                     StructField('title',StringType()),
                     StructField('genre',StringType()),
                     StructField('air_date',StringType()),
                     StructField('air_time',StringType()),
                     StructField('Duration',FloatType())
                                       ])
x = load_PD_file("SintecMedia.rpt_programs.date_2015-12-30.2016-11-29.pd", schema  )
              

In [4]:
from pyspark.sql.functions import col,to_date,udf
def strtime_to_fload(val:str)-> float:
    h = 66# int(val)//10000
    #m = h /100
    return h
to_float_time = udf(strtime_to_fload, FloatType())

x.printSchema()
x2 = x.withColumn('airdate',to_date(col('air_date'),'yyyymmdd'))\
      .withColumn('airtime',to_float_time(col('air_time')))\
      .drop('air_date')
x2.printSchema()
x2.show(5)

root
 |-- prog_code: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- air_date: string (nullable = true)
 |-- air_time: string (nullable = true)
 |-- Duration: float (nullable = true)

root
 |-- prog_code: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- air_time: string (nullable = true)
 |-- Duration: float (nullable = true)
 |-- airdate: date (nullable = true)
 |-- airtime: float (nullable = true)

+--------------+-----------------+-----------+--------+--------+----------+-------+
|     prog_code|            title|      genre|air_time|Duration|   airdate|airtime|
+--------------+-----------------+-----------+--------+--------+----------+-------+
|EP000000510016|A Different World|     Sitcom|  140000|    30.0|2015-01-30|   null|
|EP000000510017|A Different World|     Sitcom|  143000|    30.0|2015-01-30|   null|
|EP000000510027|A Different World|     Sitcom|  220000|    30.0|2

In [5]:
y = load_PD_file("FWM_20151229_R.pd")
y.printSchema()
y.show(10)

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)

+-----+------------+--------+------+---+-----+-------+-----+
|  _c0|         _c1|     _c2|   _c3|_c4|  _c5|    _c6|  _c7|
+-----+------------+--------+------+---+-----+-------+-----+
|01540|0000000050f3|20151229|094118|  T|  923| SMUS27|67563|
|01540|0000000050f3|20151229|094150|  T|65532|UNKNOWN|99993|
|01540|0000000050f3|20151229|101200|  T|  923| SMUS27|67563|
|01540|0000000050f3|20151229|101453|  T|65532|UNKNOWN|99993|
|01540|0000000050f3|20151229|113616|  T|  923| SMUS27|67563|
|01540|0000000050f3|20151229|113625|  T|65532|UNKNOWN|99993|
|01540|0000000050f3|20151229|113630|  T|  923| SMUS27|67563|
|01540|0000000050f3|20151229|121842|  T|65532|UNKNOWN|99993|
|01540|0000000050f3|20151229|131252|  T|  9

In [6]:
%%time
z = load_PD_file("SintecMedia.rpt_prog_view.date_2015-12-26.2016-11-29.pd")
z.printSchema()
z = z.toDF(*['MSO','device_id','playback_date','unk','unk2', 'station_num'])
z.show(10)

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)

+-----+------------+-------------+------+-----+--------------+
|  MSO|   device_id|playback_date|   unk| unk2|   station_num|
+-----+------------+-------------+------+-----+--------------+
|01540|0000000050f3|     20151226|202215|68678|SH018614720000|
|01540|0000000050f3|     20151226|202232|72772|SP003217880000|
|01540|0000000050f3|     20151226|202257|36205|EP015685700032|
|01540|0000000050f3|     20151226|210000|36205|EP002036520381|
|01540|0000000057f6|     20151226|000000|11590|EP005927330225|
|01540|0000000057f6|     20151226|072616|11590|SH016004130000|
|01540|0000000057f6|     20151226|072648|11628|EP019152141537|
|01540|0000000057f6|     20151226|075815|11590|SH016004130000|
|01540|0000000057f6|     20151226|075836|11428|EP000018937336|
|01540|0000000057f6|     201

In [7]:
%%time 
z = spark.read.parquet("../data/prog_view.parquet")

CPU times: user 3.73 ms, sys: 1.59 ms, total: 5.32 ms
Wall time: 416 ms


In [8]:
%%time
z.count()
z.show(10)

+-----+------------+-------------+------+-----+--------------+
|  MSO|   device_id|playback_date|   unk| unk2|   station_num|
+-----+------------+-------------+------+-----+--------------+
|08360|001bd75e12e3|     20151226|063440|11713|SH007227490000|
|08360|001bd75e12e3|     20151226|070000|11713|SH007227490000|
|08360|001bd75e12e3|     20151226|080000|11713|EP000018937336|
|08360|001bd75e12e3|     20151226|182527|11713|EP000009937693|
|08360|001bd75e12e3|     20151226|182608|49603|EP021213560001|
|08360|001bd75e12e3|     20151226|183000|49603|EP021213560002|
|08360|001bd75e12e3|     20151226|184055|14902|EP017277730029|
|08360|001bd75e12e3|     20151226|190000|14902|EP020481370022|
|08360|001bd75e12e3|     20151226|193000|14902|EP020481370014|
|08360|001bd75e12e3|     20151226|195448|14909|EP013320550286|
+-----+------------+-------------+------+-----+--------------+
only showing top 10 rows

CPU times: user 1.85 ms, sys: 1.02 ms, total: 2.87 ms
Wall time: 1.09 s


In [9]:
z

DataFrame[MSO: string, device_id: string, playback_date: string, unk: string, unk2: string, station_num: string]

# Read from an XML file

To read XML file, a library is needed in the cluster. The installation procedure depends on the cluster you have - e.g. Databricks or standlone.

Once the data is read into the dataframe, you can do anything with it -- such as save it to Parquet format for repeated use.