In [3]:
!ls ../data

retail-sales-index-at-constant-prices-by-industry-quarterly.csv
retail-sales-index-at-constant-prices-quarterly.csv


In [99]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean as _mean, stddev as _stddev, col
from pyspark.sql.types import *

In [9]:
spark = SparkSession \
    .builder \
    .appName("ReadData") \
    .getOrCreate()

In [12]:
spark.version

'3.2.0'

In [18]:
df = spark.read \
    .csv('../data/retail-sales-index-at-constant-prices-by-industry-quarterly.csv',header=True)

In [19]:
type(df)

pyspark.sql.dataframe.DataFrame

In [44]:
df.printSchema()

root
 |-- quarter: string (nullable = true)
 |-- level_one: string (nullable = true)
 |-- level_2: string (nullable = true)
 |-- value: string (nullable = true)



In [21]:
df.show(5)

+-------+-------+--------------------+-----+
|quarter|level_1|             level_2|value|
+-------+-------+--------------------+-----+
|1985-Q1|  Total|   Department Stores|   na|
|1985-Q1|  Total|        Supermarkets|   na|
|1985-Q1|  Total|Mini-marts & Conv...| 91.4|
|1985-Q1|  Total|      Food Retailers|   na|
|1985-Q1|  Total|      Motor Vehicles| 25.6|
+-------+-------+--------------------+-----+
only showing top 5 rows



In [23]:
df = df.withColumnRenamed('level_1','level_one')

In [24]:
df.show(2)

+-------+---------+-----------------+-----+
|quarter|level_one|          level_2|value|
+-------+---------+-----------------+-----+
|1985-Q1|    Total|Department Stores|   na|
|1985-Q1|    Total|     Supermarkets|   na|
+-------+---------+-----------------+-----+
only showing top 2 rows



In [57]:
df_stat = df.select(
    _mean(col('value')).alias('mean'),
    _stddev(col('value')).alias('std')
    ).collect()

In [58]:
print(df_stat)

[Row(mean=86.08110964332887, std=31.680109202126996)]


In [75]:
mean = df_stat[0]['mean']
std = df_stat[0]['std']
print('Mean:',mean)
print('STD:',std)

Mean: 86.08110964332887
STD: 31.680109202126996


In [118]:
df.na.fill(mean,subset=['value']).show()

+-------+---------+--------------------+-----+----------+
|quarter|level_one|             level_2|value|       day|
+-------+---------+--------------------+-----+----------+
|1985-Q1|    Total|   Department Stores|    0|2021-10-28|
|1985-Q1|    Total|        Supermarkets|    0|2021-10-28|
|1985-Q1|    Total|Mini-marts & Conv...|   91|2021-10-28|
|1985-Q1|    Total|      Food Retailers|    0|2021-10-28|
|1985-Q1|    Total|      Motor Vehicles|   25|2021-10-28|
|1985-Q1|    Total|Petrol Service St...|   67|2021-10-28|
|1985-Q1|    Total|Medical Goods & T...|    0|2021-10-28|
|1985-Q1|    Total|Wearing Apparel &...|    0|2021-10-28|
|1985-Q1|    Total|Furniture & House...|   47|2021-10-28|
|1985-Q1|    Total|  Recreational Goods|    0|2021-10-28|
|1985-Q1|    Total| Watches & Jewellery|    0|2021-10-28|
|1985-Q1|    Total|Computer & Teleco...|    0|2021-10-28|
|1985-Q1|    Total|Optical Goods & B...|    0|2021-10-28|
|1985-Q1|    Total|              Others|    0|2021-10-28|
|1985-Q2|    T

In [120]:
df.filter("value = '0' and quarter = '1985-Q1'").count()

10

In [116]:
df = df.withColumn('value',regexp_replace('value','na','0'))
df = df.withColumn('value',col('value').astype(IntegerType()))
df = df.withColumn('day',date_add(current_date(),+1))
df.show(5)

+-------+---------+--------------------+-----+----------+
|quarter|level_one|             level_2|value|       day|
+-------+---------+--------------------+-----+----------+
|1985-Q1|    Total|   Department Stores|    0|2021-10-28|
|1985-Q1|    Total|        Supermarkets|    0|2021-10-28|
|1985-Q1|    Total|Mini-marts & Conv...|   91|2021-10-28|
|1985-Q1|    Total|      Food Retailers|    0|2021-10-28|
|1985-Q1|    Total|      Motor Vehicles|   25|2021-10-28|
+-------+---------+--------------------+-----+----------+
only showing top 5 rows



In [117]:
df.write.mode('append').partitionBy('day').format("parquet").save('../data/retail')

In [121]:
df.drop('day').show(5)

+-------+---------+--------------------+-----+
|quarter|level_one|             level_2|value|
+-------+---------+--------------------+-----+
|1985-Q1|    Total|   Department Stores|    0|
|1985-Q1|    Total|        Supermarkets|    0|
|1985-Q1|    Total|Mini-marts & Conv...|   91|
|1985-Q1|    Total|      Food Retailers|    0|
|1985-Q1|    Total|      Motor Vehicles|   25|
+-------+---------+--------------------+-----+
only showing top 5 rows



In [123]:
df.distinct().count()

1834