In [1]:
import findspark
findspark.init('/home/lab/spark-3.0.1-bin-hadoop2.7')


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, StructField, StringType, IntegerType, FloatType, DateType

In [3]:
spark = (SparkSession.builder.appName("Sparkpractice").getOrCreate())

## Data Preparation

In [4]:
schema = StructType([
    StructField("Order ID", StringType(), True),
    StructField("Product", StringType(), True),
    StructField("Quantity Ordered", StringType(), True),
    StructField("Price Each", StringType(), True),
    StructField("Order Date", StringType(), True),
    StructField("Purchase Address", StringType(), True)
])

In [5]:
files_path ="/home/lab/Desktop/salesdate/salesdata"
orders_df = (spark.read.format("csv")
            .option("header", True)
            .schema(schema)
            .load(files_path))

In [6]:
orders_df.show(10)

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  295665|  Macbook Pro Laptop|               1|      1700|12/30/19 00:01|136 Church St, Ne...|
|  295666|  LG Washing Machine|               1|     600.0|12/29/19 07:03|562 2nd St, New Y...|
|  295667|USB-C Charging Cable|               1|     11.95|12/12/19 18:21|277 Main St, New ...|
|  295668|    27in FHD Monitor|               1|    149.99|12/22/19 15:13|410 6th St, San F...|
|  295669|USB-C Charging Cable|               1|     11.95|12/18/19 12:38|43 Hill St, Atlan...|
|  295670|AA Batteries (4-p...|               1|      3.84|12/31/19 22:58|200 Jefferson St,...|
|  295671|USB-C Charging Cable|               1|     11.95|12/16/19 15:10|928 12th St, Port...|
|  295672|USB-C Charging Cable|         

In [7]:
orders_df.select("Order ID", "Product", "Order Date").show(10)

+--------+--------------------+--------------+
|Order ID|             Product|    Order Date|
+--------+--------------------+--------------+
|  295665|  Macbook Pro Laptop|12/30/19 00:01|
|  295666|  LG Washing Machine|12/29/19 07:03|
|  295667|USB-C Charging Cable|12/12/19 18:21|
|  295668|    27in FHD Monitor|12/22/19 15:13|
|  295669|USB-C Charging Cable|12/18/19 12:38|
|  295670|AA Batteries (4-p...|12/31/19 22:58|
|  295671|USB-C Charging Cable|12/16/19 15:10|
|  295672|USB-C Charging Cable|12/13/19 09:29|
|  295673|Bose SoundSport H...|12/15/19 23:26|
|  295674|AAA Batteries (4-...|12/28/19 11:51|
+--------+--------------------+--------------+
only showing top 10 rows



In [8]:
orders_df.printSchema()

root
 |-- Order ID: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: string (nullable = true)
 |-- Price Each: string (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)



In [9]:
#spark.stop()

### Working with Structured Operations



## Reading a JASON File

In [10]:
from pyspark.sql.types import ArrayType, FloatType, DateType, BooleanType

In [11]:
persons_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("first_name", StringType(), True),
    StructField("last_name", StringType(), True),
    StructField("fav_movies", ArrayType(StringType()), True),
    StructField("salary", FloatType(), True),
    StructField("image_url", StringType(), True),
    StructField("date_of_birth", DateType(), True),
    StructField("active", BooleanType(), True)
    
])

In [12]:
json_file_path = '/home/lab/Desktop/salesdate/data/persons.json'

person_df = spark.read.json(json_file_path, persons_schema, multiLine=True)

In [13]:
person_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- fav_movies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- salary: float (nullable = true)
 |-- image_url: string (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- active: boolean (nullable = true)



In [14]:
person_df.show(10)

+---+----------+---------+--------------------+-------+--------------------+-------------+------+
| id|first_name|last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+
|  1|     Drucy|    Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|
|  2|   Emelyne|    Blaza|[Musketeer, The, ...|3006.04|http://dummyimage...|   1991-11-02| false|
|  3|       Max|   Rettie|[The Forgotten Sp...|1422.88|http://dummyimage...|   1990-03-03| false|
|  4|    Ilario|     Kean|[Up Close and Per...|3561.36|http://dummyimage...|   1987-06-09|  true|
|  5|     Toddy|   Drexel|[Walk in the Clou...|4934.87|http://dummyimage...|   1992-10-28|  true|
|  6|    Oswald| Petrolli|[Wing and the Thi...|1153.23|http://dummyimage...|   1986-09-02| false|
|  7|    Adrian|   Clarey|[Walking Tall, Pa...|1044.73|http://dummyimage...|   1971-08-24| false|
|  8|  Dominica|  Go

In [15]:
person_df.show(10, truncate=False)

+---+----------+---------+-------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|id |first_name|last_name|fav_movies                                                   |salary |image_url                                      |date_of_birth|active|
+---+----------+---------+-------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|1  |Drucy     |Poppy    |[I giorni contati]                                           |1463.36|http://dummyimage.com/126x166.png/cc0000/ffffff|1991-02-16   |true  |
|2  |Emelyne   |Blaza    |[Musketeer, The, Topralli]                                   |3006.04|http://dummyimage.com/158x106.bmp/cc0000/ffffff|1991-11-02   |false |
|3  |Max       |Rettie   |[The Forgotten Space, Make It Happen]                        |1422.88|http://dummyimage.com/237x140.jpg/ff4444/ffffff|1990-03-03   |false |
|4  

## Columns and Expressions

In [16]:
from pyspark.sql.functions import col, expr

In [17]:
person_df.select (col("first_name"), col("last_name"), col("date_of_birth")).show(5)

+----------+---------+-------------+
|first_name|last_name|date_of_birth|
+----------+---------+-------------+
|     Drucy|    Poppy|   1991-02-16|
|   Emelyne|    Blaza|   1991-11-02|
|       Max|   Rettie|   1990-03-03|
|    Ilario|     Kean|   1987-06-09|
|     Toddy|   Drexel|   1992-10-28|
+----------+---------+-------------+
only showing top 5 rows



In [18]:
person_df.select (expr("first_name"), expr("last_name"), expr("date_of_birth")).show(5)

+----------+---------+-------------+
|first_name|last_name|date_of_birth|
+----------+---------+-------------+
|     Drucy|    Poppy|   1991-02-16|
|   Emelyne|    Blaza|   1991-11-02|
|       Max|   Rettie|   1990-03-03|
|    Ilario|     Kean|   1987-06-09|
|     Toddy|   Drexel|   1992-10-28|
+----------+---------+-------------+
only showing top 5 rows



In [19]:
person_df.select("first_name", "last_name", "date_of_birth").show(5)

+----------+---------+-------------+
|first_name|last_name|date_of_birth|
+----------+---------+-------------+
|     Drucy|    Poppy|   1991-02-16|
|   Emelyne|    Blaza|   1991-11-02|
|       Max|   Rettie|   1990-03-03|
|    Ilario|     Kean|   1987-06-09|
|     Toddy|   Drexel|   1992-10-28|
+----------+---------+-------------+
only showing top 5 rows



In [20]:
from pyspark.sql.functions import concat_ws

In [21]:
person_df.select(concat_ws(' ', col("first_name"), col("last_name")).alias("full_name"),
                col("salary"),
                (col("salary") * 0.10 + col("salary")).alias("salary_increase")).show(10)

+----------------+-------+------------------+
|       full_name| salary|   salary_increase|
+----------------+-------+------------------+
|     Drucy Poppy|1463.36|1609.6959838867188|
|   Emelyne Blaza|3006.04|  3306.64404296875|
|      Max Rettie|1422.88|1565.1680053710938|
|     Ilario Kean|3561.36|3917.4961181640624|
|    Toddy Drexel|4934.87|  5428.35712890625|
| Oswald Petrolli|1153.23| 1268.552978515625|
|   Adrian Clarey|1044.73| 1149.202978515625|
|Dominica Goodnow|1147.76|1262.5360107421875|
|   Emory Slocomb|1082.11|1190.3209838867188|
|   Jeremias Bode|3472.63|  3819.89287109375|
+----------------+-------+------------------+
only showing top 10 rows



In [22]:
person_df.select(concat_ws(' ', expr("first_name"), expr("last_name")).alias("full_name"),
                col("salary"),
                (col("salary") * 0.10 + col("salary")).alias("salary_increase")).show(10)

+----------------+-------+------------------+
|       full_name| salary|   salary_increase|
+----------------+-------+------------------+
|     Drucy Poppy|1463.36|1609.6959838867188|
|   Emelyne Blaza|3006.04|  3306.64404296875|
|      Max Rettie|1422.88|1565.1680053710938|
|     Ilario Kean|3561.36|3917.4961181640624|
|    Toddy Drexel|4934.87|  5428.35712890625|
| Oswald Petrolli|1153.23| 1268.552978515625|
|   Adrian Clarey|1044.73| 1149.202978515625|
|Dominica Goodnow|1147.76|1262.5360107421875|
|   Emory Slocomb|1082.11|1190.3209838867188|
|   Jeremias Bode|3472.63|  3819.89287109375|
+----------------+-------+------------------+
only showing top 10 rows



In [23]:
person_df.select(concat_ws(' ', "first_name", "last_name").alias("full_name"),
                "salary",
                (col("salary") * 0.10 + col("salary")).alias("salary_increase")).show(10)

+----------------+-------+------------------+
|       full_name| salary|   salary_increase|
+----------------+-------+------------------+
|     Drucy Poppy|1463.36|1609.6959838867188|
|   Emelyne Blaza|3006.04|  3306.64404296875|
|      Max Rettie|1422.88|1565.1680053710938|
|     Ilario Kean|3561.36|3917.4961181640624|
|    Toddy Drexel|4934.87|  5428.35712890625|
| Oswald Petrolli|1153.23| 1268.552978515625|
|   Adrian Clarey|1044.73| 1149.202978515625|
|Dominica Goodnow|1147.76|1262.5360107421875|
|   Emory Slocomb|1082.11|1190.3209838867188|
|   Jeremias Bode|3472.63|  3819.89287109375|
+----------------+-------+------------------+
only showing top 10 rows



## Filter and Where Condition

In [24]:
person_df.filter('salary <= 3000').show(10)

+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
| id|first_name|  last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
|  1|     Drucy|      Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|
|  3|       Max|     Rettie|[The Forgotten Sp...|1422.88|http://dummyimage...|   1990-03-03| false|
|  6|    Oswald|   Petrolli|[Wing and the Thi...|1153.23|http://dummyimage...|   1986-09-02| false|
|  7|    Adrian|     Clarey|[Walking Tall, Pa...|1044.73|http://dummyimage...|   1971-08-24| false|
|  8|  Dominica|    Goodnow|    [Hearts Divided]|1147.76|http://dummyimage...|   1973-08-27| false|
|  9|     Emory|    Slocomb|[Snake and Crane ...|1082.11|http://dummyimage...|   1974-06-08|  true|
| 11|   Timothy|     Ervine|[Land of the Lost...|1147.61|http://dummyimage...|   1971-06-02| false|


In [25]:
person_df.where('salary <= 3000').show(10)

+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
| id|first_name|  last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
|  1|     Drucy|      Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|
|  3|       Max|     Rettie|[The Forgotten Sp...|1422.88|http://dummyimage...|   1990-03-03| false|
|  6|    Oswald|   Petrolli|[Wing and the Thi...|1153.23|http://dummyimage...|   1986-09-02| false|
|  7|    Adrian|     Clarey|[Walking Tall, Pa...|1044.73|http://dummyimage...|   1971-08-24| false|
|  8|  Dominica|    Goodnow|    [Hearts Divided]|1147.76|http://dummyimage...|   1973-08-27| false|
|  9|     Emory|    Slocomb|[Snake and Crane ...|1082.11|http://dummyimage...|   1974-06-08|  true|
| 11|   Timothy|     Ervine|[Land of the Lost...|1147.61|http://dummyimage...|   1971-06-02| false|


In [26]:
person_df.where((col("salary") <= 3000) & (col("active") == True)).show(10)

+---+----------+---------+--------------------+-------+--------------------+-------------+------+
| id|first_name|last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+
|  1|     Drucy|    Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|
|  9|     Emory|  Slocomb|[Snake and Crane ...|1082.11|http://dummyimage...|   1974-06-08|  true|
| 16|   Margaux| Archbold|[And Now a Word f...|1013.75|http://dummyimage...|   1988-07-29|  true|
| 26|     Clive|      Lax|             [Rabid]|2126.87|http://dummyimage...|   1981-10-26|  true|
| 33|  Sherline|  Primett|   [Jungle Fighters]|2309.39|http://dummyimage...|   1972-07-23|  true|
| 34|     Davis|    Pinks|          [Hounddog]|1337.14|http://dummyimage...|   1989-07-27|  true|
| 37|    Carlen|  Sharply|[Dr. Jekyll and M...|2051.85|http://dummyimage...|   2002-06-01|  true|
| 40|    Jordan|   L

In [27]:
from pyspark.sql.functions import year

In [28]:
person_df.filter((year("date_of_birth") == 2000)).show(5)

+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
| id|first_name|  last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
| 15|    Feodor|Nancekivell|   [Monsoon Wedding]|2218.46|http://dummyimage...|   2000-10-07| false|
| 25|     Kelcy|     Wogdon|    [Iron Mask, The]|4512.51|http://dummyimage...|   2000-10-20|  true|
| 32|      Redd|   Akenhead|[Century of the D...| 2470.9|http://dummyimage...|   2000-06-05| false|
| 69|  Annabell|    Doughty|[Entertaining Ang...|2022.57|http://dummyimage...|   2000-09-03|  true|
| 88|     Jobie|    Maughan|[Devils on the Do...| 3899.2|http://dummyimage...|   2000-02-07| false|
+---+----------+-----------+--------------------+-------+--------------------+-------------+------+



In [29]:
person_df.filter((year("date_of_birth") == 2000) | (year("date_of_birth")== 1989)).show(5)

+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
| id|first_name|  last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
| 14|   Ambrosi|   Vidineev|[Wall Street: Mon...|4550.88|http://dummyimage...|   1989-07-20|  true|
| 15|    Feodor|Nancekivell|   [Monsoon Wedding]|2218.46|http://dummyimage...|   2000-10-07| false|
| 18|     Alfie|   Hatliffe|     [Lord of Tears]| 3893.1|http://dummyimage...|   1989-06-21|  true|
| 25|     Kelcy|     Wogdon|    [Iron Mask, The]|4512.51|http://dummyimage...|   2000-10-20|  true|
| 32|      Redd|   Akenhead|[Century of the D...| 2470.9|http://dummyimage...|   2000-06-05| false|
+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
only showing top 5 rows



In [30]:
from pyspark.sql.functions import array_contains

In [31]:
person_df.where(array_contains(person_df.fav_movies, "Lord of Tears")).show()

+---+----------+---------+---------------+------+--------------------+-------------+------+
| id|first_name|last_name|     fav_movies|salary|           image_url|date_of_birth|active|
+---+----------+---------+---------------+------+--------------------+-------------+------+
| 18|     Alfie| Hatliffe|[Lord of Tears]|3893.1|http://dummyimage...|   1989-06-21|  true|
+---+----------+---------+---------------+------+--------------------+-------------+------+



## Distinct, Drop Duplicates, Order By

In [32]:
from pyspark.sql.functions import count, desc

In [33]:
person_df.select("active").show(10)

+------+
|active|
+------+
|  true|
| false|
| false|
|  true|
|  true|
| false|
| false|
| false|
|  true|
|  true|
+------+
only showing top 10 rows



In [34]:
person_df.select("active").distinct().show(10)

+------+
|active|
+------+
|  true|
| false|
+------+



In [35]:
person_df.select(col("first_name"),
                year(col("date_of_birth")).alias("year"),
                col("active")).orderBy("year", "first_name").show(10)

+----------+----+------+
|first_name|year|active|
+----------+----+------+
|    Adrian|1971| false|
|   Feodora|1971|  true|
|       Sky|1971| false|
|   Timothy|1971| false|
|    Lucita|1972|  true|
|      Rodi|1972| false|
|  Sherline|1972|  true|
|     Toddy|1972|  true|
|  Dominica|1973| false|
|    Kelila|1973|  true|
+----------+----+------+
only showing top 10 rows



In [36]:
dropped_df = (person_df.select(col("first_name"),
                year(col("date_of_birth")).alias("year"),
                col("active")).dropDuplicates(["year", "active"]).orderBy("year", "first_name"))

In [37]:
dropped_df.show()

+----------+----+------+
|first_name|year|active|
+----------+----+------+
|    Adrian|1971| false|
|   Feodora|1971|  true|
|      Rodi|1972| false|
|  Sherline|1972|  true|
|  Dominica|1973| false|
|    Kelila|1973|  true|
|   Balduin|1974| false|
|     Emory|1974|  true|
|    Janean|1975|  true|
|       Bev|1976|  true|
| Franciska|1976| false|
|     Johny|1977| false|
|    Daveta|1978| false|
|   Guthrie|1978|  true|
|      Maxi|1979| false|
|   Melinda|1979|  true|
|    Carter|1980| false|
|   Loralyn|1980|  true|
|     Clive|1981|  true|
|   Leanora|1981| false|
+----------+----+------+
only showing top 20 rows



In [38]:
person_df.select(col("first_name"),
                year(col("date_of_birth")).alias("year"),
                col("active")).orderBy("year", ascending = False).show(10)

+----------+----+------+
|first_name|year|active|
+----------+----+------+
|     Daron|2002|  true|
|    Virgie|2002|  true|
|    Carlen|2002|  true|
|   Lorilee|2002| false|
|    Maxine|2001| false|
|    Feodor|2000| false|
|     Kelcy|2000|  true|
|  Annabell|2000|  true|
|      Redd|2000| false|
|     Jobie|2000| false|
+----------+----+------+
only showing top 10 rows



## Rows and Union

In [39]:
from pyspark.sql import Row

In [40]:
person_row = Row(101, "Robert", "Owens",["Men in Black III","Home Alone"], 4300.64, "http://someimage.com", "1964-08-18", True)

In [41]:
person_row_list = [Row(102, "Gboyega", "Abiz",["Men in Black III","Home Alone"], 4300.64, "http://someimage.com", "1964-08-18", True)\
                  ,Row(103, "NIke", "Oko",["Men in Black III","Home Alone"], 4300.64, "http://someimage.com", "1964-08-18", True)]

In [42]:
person_row_list.append(person_row)

In [43]:
print(person_row_list)

[<Row(102, 'Gboyega', 'Abiz', ['Men in Black III', 'Home Alone'], 4300.64, 'http://someimage.com', '1964-08-18', True)>, <Row(103, 'NIke', 'Oko', ['Men in Black III', 'Home Alone'], 4300.64, 'http://someimage.com', '1964-08-18', True)>, <Row(101, 'Robert', 'Owens', ['Men in Black III', 'Home Alone'], 4300.64, 'http://someimage.com', '1964-08-18', True)>]


In [44]:
person_row_list[1]

<Row(103, 'NIke', 'Oko', ['Men in Black III', 'Home Alone'], 4300.64, 'http://someimage.com', '1964-08-18', True)>

In [45]:
new_person_df = spark.createDataFrame(person_row_list)

In [46]:
new_person_df.show()

+---+-------+-----+--------------------+-------+--------------------+----------+----+
| _1|     _2|   _3|                  _4|     _5|                  _6|        _7|  _8|
+---+-------+-----+--------------------+-------+--------------------+----------+----+
|102|Gboyega| Abiz|[Men in Black III...|4300.64|http://someimage.com|1964-08-18|true|
|103|   NIke|  Oko|[Men in Black III...|4300.64|http://someimage.com|1964-08-18|true|
|101| Robert|Owens|[Men in Black III...|4300.64|http://someimage.com|1964-08-18|true|
+---+-------+-----+--------------------+-------+--------------------+----------+----+



In [47]:
new_person_df = spark.createDataFrame(person_row_list,["id","first_name","last_name", "fav_movies", "salary", "image_url", "date_of_birth", "active"])

In [48]:
new_person_df.show()

+---+----------+---------+--------------------+-------+--------------------+-------------+------+
| id|first_name|last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+
|102|   Gboyega|     Abiz|[Men in Black III...|4300.64|http://someimage.com|   1964-08-18|  true|
|103|      NIke|      Oko|[Men in Black III...|4300.64|http://someimage.com|   1964-08-18|  true|
|101|    Robert|    Owens|[Men in Black III...|4300.64|http://someimage.com|   1964-08-18|  true|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+



In [49]:
add_persons_df = person_df.union(new_person_df)
add_persons_df.sort(desc("id")).show(10)
#dtemp = add_persons_df.where(col("first_name") == "Robert")
#dtemp.show()


+---+----------+---------+--------------------+------------------+--------------------+-------------+------+
| id|first_name|last_name|          fav_movies|            salary|           image_url|date_of_birth|active|
+---+----------+---------+--------------------+------------------+--------------------+-------------+------+
|103|      NIke|      Oko|[Men in Black III...|           4300.64|http://someimage.com|   1964-08-18|  true|
|102|   Gboyega|     Abiz|[Men in Black III...|           4300.64|http://someimage.com|   1964-08-18|  true|
|101|    Robert|    Owens|[Men in Black III...|           4300.64|http://someimage.com|   1964-08-18|  true|
|100|    Virgie| Domanski|[Horseman, The, S...| 2165.929931640625|http://dummyimage...|   2002-01-05|  true|
| 99|   Rozalie|   Wannop|[Suddenly, The No...|1259.6400146484375|http://dummyimage...|   1997-03-25| false|
| 98|     Davin|     Labb|[Viva Riva!, Kill...| 1452.739990234375|http://dummyimage...|   1988-01-27|  true|
| 97|      Rodi|   

## Adding, Renaming and Dropping Columns


In [50]:
from pyspark.sql.functions import round

In [51]:
aug_persons_df1 = person_df.withColumn("salary_increase", expr("salary * 0.10 + salary"))
aug_persons_df1.show(10)

+---+----------+---------+--------------------+-------+--------------------+-------------+------+------------------+
| id|first_name|last_name|          fav_movies| salary|           image_url|date_of_birth|active|   salary_increase|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+------------------+
|  1|     Drucy|    Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|1609.6959838867188|
|  2|   Emelyne|    Blaza|[Musketeer, The, ...|3006.04|http://dummyimage...|   1991-11-02| false|  3306.64404296875|
|  3|       Max|   Rettie|[The Forgotten Sp...|1422.88|http://dummyimage...|   1990-03-03| false|1565.1680053710938|
|  4|    Ilario|     Kean|[Up Close and Per...|3561.36|http://dummyimage...|   1987-06-09|  true|3917.4961181640624|
|  5|     Toddy|   Drexel|[Walk in the Clou...|4934.87|http://dummyimage...|   1992-10-28|  true|  5428.35712890625|
|  6|    Oswald| Petrolli|[Wing and the Thi...|1153.23|http://du

In [52]:
aug_persons_df1.columns

['id',
 'first_name',
 'last_name',
 'fav_movies',
 'salary',
 'image_url',
 'date_of_birth',
 'active',
 'salary_increase']

In [53]:
aug_persons_df2 = (aug_persons_df1
                  .withColumn("birth_year", year("date_of_birth"))
                  .withColumnRenamed("fav_movies", "movies")
                  .withColumn("salary_x10", round(col("salary_increase"),2))
                  .drop("salary_increase"))

In [54]:
aug_persons_df2.show(10)

+---+----------+---------+--------------------+-------+--------------------+-------------+------+----------+----------+
| id|first_name|last_name|              movies| salary|           image_url|date_of_birth|active|birth_year|salary_x10|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+----------+----------+
|  1|     Drucy|    Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|      1991|    1609.7|
|  2|   Emelyne|    Blaza|[Musketeer, The, ...|3006.04|http://dummyimage...|   1991-11-02| false|      1991|   3306.64|
|  3|       Max|   Rettie|[The Forgotten Sp...|1422.88|http://dummyimage...|   1990-03-03| false|      1990|   1565.17|
|  4|    Ilario|     Kean|[Up Close and Per...|3561.36|http://dummyimage...|   1987-06-09|  true|      1987|    3917.5|
|  5|     Toddy|   Drexel|[Walk in the Clou...|4934.87|http://dummyimage...|   1992-10-28|  true|      1992|   5428.36|
|  6|    Oswald| Petrolli|[Wing and the 

## Working With Missing or Bad data

In [55]:
bad_movies_list = [Row(None, None, None),
                  Row(None, None, 2020),
                  Row("John Doe", "Awesome Movie", None),
                  Row(None, "Awesome Movie", 2021),
                  Row("Mary Jane", None, 2019),
                  Row("Victor Ikpeba", "Not another teen movie", 2001)]

In [56]:
bad_movies_list

[<Row(None, None, None)>,
 <Row(None, None, 2020)>,
 <Row('John Doe', 'Awesome Movie', None)>,
 <Row(None, 'Awesome Movie', 2021)>,
 <Row('Mary Jane', None, 2019)>,
 <Row('Victor Ikpeba', 'Not another teen movie', 2001)>]

In [57]:
bad_movies_column = ["actor_name", "movie_name", "produced_year"]

In [58]:
bad_movie_df = spark.createDataFrame(bad_movies_list, schema=bad_movies_column)

In [59]:
bad_movie_df.show()

+-------------+--------------------+-------------+
|   actor_name|          movie_name|produced_year|
+-------------+--------------------+-------------+
|         null|                null|         null|
|         null|                null|         2020|
|     John Doe|       Awesome Movie|         null|
|         null|       Awesome Movie|         2021|
|    Mary Jane|                null|         2019|
|Victor Ikpeba|Not another teen ...|         2001|
+-------------+--------------------+-------------+



In [60]:
bad_movie_df.dropna().show()

bad_movie_df.na.drop().show() #same as above

+-------------+--------------------+-------------+
|   actor_name|          movie_name|produced_year|
+-------------+--------------------+-------------+
|Victor Ikpeba|Not another teen ...|         2001|
+-------------+--------------------+-------------+

+-------------+--------------------+-------------+
|   actor_name|          movie_name|produced_year|
+-------------+--------------------+-------------+
|Victor Ikpeba|Not another teen ...|         2001|
+-------------+--------------------+-------------+



In [61]:
bad_movie_df.na.drop("any").show() 

+-------------+--------------------+-------------+
|   actor_name|          movie_name|produced_year|
+-------------+--------------------+-------------+
|Victor Ikpeba|Not another teen ...|         2001|
+-------------+--------------------+-------------+



In [62]:
bad_movie_df.na.drop("all").show() #all columns null

+-------------+--------------------+-------------+
|   actor_name|          movie_name|produced_year|
+-------------+--------------------+-------------+
|         null|                null|         2020|
|     John Doe|       Awesome Movie|         null|
|         null|       Awesome Movie|         2021|
|    Mary Jane|                null|         2019|
|Victor Ikpeba|Not another teen ...|         2001|
+-------------+--------------------+-------------+



In [63]:
bad_movie_df.where(col("actor_name").isNull() !=True).show()

+-------------+--------------------+-------------+
|   actor_name|          movie_name|produced_year|
+-------------+--------------------+-------------+
|     John Doe|       Awesome Movie|         null|
|    Mary Jane|                null|         2019|
|Victor Ikpeba|Not another teen ...|         2001|
+-------------+--------------------+-------------+



In [64]:
bad_movie_df.describe().show()
bad_movie_df.describe("actor_name").show()

+-------+-------------+--------------------+-----------------+
|summary|   actor_name|          movie_name|    produced_year|
+-------+-------------+--------------------+-----------------+
|  count|            3|                   3|                4|
|   mean|         null|                null|          2015.25|
| stddev|         null|                null|9.535023160258524|
|    min|     John Doe|       Awesome Movie|             2001|
|    max|Victor Ikpeba|Not another teen ...|             2021|
+-------+-------------+--------------------+-----------------+

+-------+-------------+
|summary|   actor_name|
+-------+-------------+
|  count|            3|
|   mean|         null|
| stddev|         null|
|    min|     John Doe|
|    max|Victor Ikpeba|
+-------+-------------+



## Challenge part2

In [101]:
schema = StructType([
    StructField("Order ID", StringType(), True),
    StructField("Product", StringType(), True),
    StructField("Quantity Ordered", StringType(), True),
    StructField("Price Each", StringType(), True),
    StructField("Order Date", StringType(), True),
    StructField("Purchase Address", StringType(), True)
])

files_path ="/home/lab/Desktop/salesdate/salesdata"
sales_df = (spark.read.format("csv")
            .option("header", True)
            .schema(schema)
            .load(files_path))

In [102]:
sales_df.show(10)

sales_df.count()  ###186850

sales_df.filter(col("Order Date").isNull() == True).count()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  295665|  Macbook Pro Laptop|               1|      1700|12/30/19 00:01|136 Church St, Ne...|
|  295666|  LG Washing Machine|               1|     600.0|12/29/19 07:03|562 2nd St, New Y...|
|  295667|USB-C Charging Cable|               1|     11.95|12/12/19 18:21|277 Main St, New ...|
|  295668|    27in FHD Monitor|               1|    149.99|12/22/19 15:13|410 6th St, San F...|
|  295669|USB-C Charging Cable|               1|     11.95|12/18/19 12:38|43 Hill St, Atlan...|
|  295670|AA Batteries (4-p...|               1|      3.84|12/31/19 22:58|200 Jefferson St,...|
|  295671|USB-C Charging Cable|               1|     11.95|12/16/19 15:10|928 12th St, Port...|
|  295672|USB-C Charging Cable|         

545

In [103]:
sales_df.describe("Order ID","Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address").show() 

+-------+-----------------+------------+-------------------+-----------------+--------------+--------------------+
|summary|         Order ID|     Product|   Quantity Ordered|       Price Each|    Order Date|    Purchase Address|
+-------+-----------------+------------+-------------------+-----------------+--------------+--------------------+
|  count|           186305|      186305|             186305|           186305|        186305|              186305|
|   mean|230417.5693788653|        null| 1.1243828986286637|184.3997347673138|          null|                null|
| stddev|51512.73710999602|        null|0.44279262402866804|332.7313298843436|          null|                null|
|    min|           141234|20in Monitor|                  1|           109.99|01/01/19 03:07|1 11th St, Atlant...|
|    max|         Order ID|      iPhone|   Quantity Ordered|       Price Each|    Order Date|    Purchase Address|
+-------+-----------------+------------+-------------------+-----------------+--

In [68]:
from pyspark.sql.functions import column

In [104]:
sales_df.filter(col("Order ID").isNull() == True).show(10)
sales_df.filter(col("Order ID").isNull() == True).count()

+--------+-------+----------------+----------+----------+----------------+
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
+--------+-------+----------------+----------+----------+----------------+
|    null|   null|            null|      null|      null|            null|
|    null|   null|            null|      null|      null|            null|
|    null|   null|            null|      null|      null|            null|
|    null|   null|            null|      null|      null|            null|
|    null|   null|            null|      null|      null|            null|
|    null|   null|            null|      null|      null|            null|
|    null|   null|            null|      null|      null|            null|
|    null|   null|            null|      null|      null|            null|
|    null|   null|            null|      null|      null|            null|
|    null|   null|            null|      null|      null|            null|
+--------+-------+-------

545

In [105]:
#sales_df = sales_df.dropna() #same as putting "any"
sales_df = sales_df.na.drop("any")
sales_df.show(10)
sales_df.count()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  295665|  Macbook Pro Laptop|               1|      1700|12/30/19 00:01|136 Church St, Ne...|
|  295666|  LG Washing Machine|               1|     600.0|12/29/19 07:03|562 2nd St, New Y...|
|  295667|USB-C Charging Cable|               1|     11.95|12/12/19 18:21|277 Main St, New ...|
|  295668|    27in FHD Monitor|               1|    149.99|12/22/19 15:13|410 6th St, San F...|
|  295669|USB-C Charging Cable|               1|     11.95|12/18/19 12:38|43 Hill St, Atlan...|
|  295670|AA Batteries (4-p...|               1|      3.84|12/31/19 22:58|200 Jefferson St,...|
|  295671|USB-C Charging Cable|               1|     11.95|12/16/19 15:10|928 12th St, Port...|
|  295672|USB-C Charging Cable|         

186305

In [106]:
sales_df.filter(col("Order ID").isNull() == True).show(10)
sales_df.filter(col("Order ID").isNull() == True).count()

+--------+-------+----------------+----------+----------+----------------+
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
+--------+-------+----------------+----------+----------+----------------+
+--------+-------+----------------+----------+----------+----------------+



0

In [107]:
sales_df.describe("Order ID","Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address").show() 

+-------+-----------------+------------+-------------------+-----------------+--------------+--------------------+
|summary|         Order ID|     Product|   Quantity Ordered|       Price Each|    Order Date|    Purchase Address|
+-------+-----------------+------------+-------------------+-----------------+--------------+--------------------+
|  count|           186305|      186305|             186305|           186305|        186305|              186305|
|   mean|230417.5693788653|        null| 1.1243828986286637|184.3997347673138|          null|                null|
| stddev|51512.73710999602|        null|0.44279262402866804|332.7313298843436|          null|                null|
|    min|           141234|20in Monitor|                  1|           109.99|01/01/19 03:07|1 11th St, Atlant...|
|    max|         Order ID|      iPhone|   Quantity Ordered|       Price Each|    Order Date|    Purchase Address|
+-------+-----------------+------------+-------------------+-----------------+--

In [108]:
sales_df.filter(col("Order ID") == "Order ID").show(10) #.count() to count occurence
sales_df.filter(col("Order ID") == "Order ID").count()

+--------+-------+----------------+----------+----------+----------------+
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
+--------+-------+----------------+----------+----------+----------------+
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
+--------+-------+-------

355

In [110]:
sales_df_temp = sales_df.distinct()

In [111]:
sales_df_temp.filter(col("Order ID") == "Order ID").show(10) #.count() to count occurence

sales_df_temp.filter(col("Order ID") == "Order ID").count()

+--------+-------+----------------+----------+----------+----------------+
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
+--------+-------+----------------+----------+----------+----------------+
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
+--------+-------+----------------+----------+----------+----------------+



1

In [112]:
sales_df_temp = sales_df_temp.filter(col("Order ID") != "Order ID")

In [113]:
sales_df_temp.filter(col("Order ID") == "Order ID").show(10) #.count() to count occurence

sales_df_temp.filter(col("Order ID") == "Order ID").count()

+--------+-------+----------------+----------+----------+----------------+
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
+--------+-------+----------------+----------+----------+----------------+
+--------+-------+----------------+----------+----------+----------------+



0

In [114]:
sales_df_temp.describe("Order ID","Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address").show() 

+-------+------------------+------------+------------------+------------------+--------------+--------------------+
|summary|          Order ID|     Product|  Quantity Ordered|        Price Each|    Order Date|    Purchase Address|
+-------+------------------+------------+------------------+------------------+--------------+--------------------+
|  count|            185686|      185686|            185686|            185686|        185686|              185686|
|   mean|230411.37622653297|        null|1.1245435843305365|184.51925546352427|          null|                null|
| stddev| 51511.71718332086|        null|0.4430687383832874| 332.8438383900525|          null|                null|
|    min|            141234|20in Monitor|                 1|            109.99|01/01/19 03:07|1 11th St, Atlant...|
|    max|            319670|      iPhone|                 9|            999.99|12/31/19 23:53|999 Wilson St, Sa...|
+-------+------------------+------------+------------------+------------

### Extrac the City and State from Purchase Address

In [115]:
from pyspark.sql.functions import split

In [116]:
sales_df_temp.select("Purchase Address").show(10, False)

+----------------------------------------+
|Purchase Address                        |
+----------------------------------------+
|283 Washington St, Boston, MA 02215     |
|968 8th St, Austin, TX 73301            |
|857 Center St, Boston, MA 02215         |
|679 Chestnut St, San Francisco, CA 94016|
|58 Dogwood St, San Francisco, CA 94016  |
|355 Park St, Boston, MA 02215           |
|542 9th St, New York City, NY 10001     |
|708 Walnut St, New York City, NY 10001  |
|538 Hickory St, San Francisco, CA 94016 |
|199 8th St, San Francisco, CA 94016     |
+----------------------------------------+
only showing top 10 rows



In [117]:
sales_df_temp.select("Purchase Address", split(col("Purchase Address"), ',')).show(10, False)

+----------------------------------------+--------------------------------------------+
|Purchase Address                        |split(Purchase Address, ,, -1)              |
+----------------------------------------+--------------------------------------------+
|283 Washington St, Boston, MA 02215     |[283 Washington St,  Boston,  MA 02215]     |
|968 8th St, Austin, TX 73301            |[968 8th St,  Austin,  TX 73301]            |
|857 Center St, Boston, MA 02215         |[857 Center St,  Boston,  MA 02215]         |
|679 Chestnut St, San Francisco, CA 94016|[679 Chestnut St,  San Francisco,  CA 94016]|
|58 Dogwood St, San Francisco, CA 94016  |[58 Dogwood St,  San Francisco,  CA 94016]  |
|355 Park St, Boston, MA 02215           |[355 Park St,  Boston,  MA 02215]           |
|542 9th St, New York City, NY 10001     |[542 9th St,  New York City,  NY 10001]     |
|708 Walnut St, New York City, NY 10001  |[708 Walnut St,  New York City,  NY 10001]  |
|538 Hickory St, San Francisco, 

In [118]:
sales_df_temp.select("Purchase Address", split(col("Purchase Address"), ',').getItem(1)).show(10, False)

+----------------------------------------+---------------------------------+
|Purchase Address                        |split(Purchase Address, ,, -1)[1]|
+----------------------------------------+---------------------------------+
|283 Washington St, Boston, MA 02215     | Boston                          |
|968 8th St, Austin, TX 73301            | Austin                          |
|857 Center St, Boston, MA 02215         | Boston                          |
|679 Chestnut St, San Francisco, CA 94016| San Francisco                   |
|58 Dogwood St, San Francisco, CA 94016  | San Francisco                   |
|355 Park St, Boston, MA 02215           | Boston                          |
|542 9th St, New York City, NY 10001     | New York City                   |
|708 Walnut St, New York City, NY 10001  | New York City                   |
|538 Hickory St, San Francisco, CA 94016 | San Francisco                   |
|199 8th St, San Francisco, CA 94016     | San Francisco                   |

In [119]:
sales_df_temp.select("Purchase Address", split(col("Purchase Address"), ',').getItem(2)).show(10, False)

+----------------------------------------+---------------------------------+
|Purchase Address                        |split(Purchase Address, ,, -1)[2]|
+----------------------------------------+---------------------------------+
|283 Washington St, Boston, MA 02215     | MA 02215                        |
|968 8th St, Austin, TX 73301            | TX 73301                        |
|857 Center St, Boston, MA 02215         | MA 02215                        |
|679 Chestnut St, San Francisco, CA 94016| CA 94016                        |
|58 Dogwood St, San Francisco, CA 94016  | CA 94016                        |
|355 Park St, Boston, MA 02215           | MA 02215                        |
|542 9th St, New York City, NY 10001     | NY 10001                        |
|708 Walnut St, New York City, NY 10001  | NY 10001                        |
|538 Hickory St, San Francisco, CA 94016 | CA 94016                        |
|199 8th St, San Francisco, CA 94016     | CA 94016                        |

In [120]:
sales_df_temp.select("Purchase Address", split(split(col("Purchase Address"), ',').getItem(2), ' ')).show(10, False)

+----------------------------------------+-----------------------------------------------+
|Purchase Address                        |split(split(Purchase Address, ,, -1)[2],  , -1)|
+----------------------------------------+-----------------------------------------------+
|283 Washington St, Boston, MA 02215     |[, MA, 02215]                                  |
|968 8th St, Austin, TX 73301            |[, TX, 73301]                                  |
|857 Center St, Boston, MA 02215         |[, MA, 02215]                                  |
|679 Chestnut St, San Francisco, CA 94016|[, CA, 94016]                                  |
|58 Dogwood St, San Francisco, CA 94016  |[, CA, 94016]                                  |
|355 Park St, Boston, MA 02215           |[, MA, 02215]                                  |
|542 9th St, New York City, NY 10001     |[, NY, 10001]                                  |
|708 Walnut St, New York City, NY 10001  |[, NY, 10001]                                  |

In [121]:
sales_df_temp.select("Purchase Address", split(split(col("Purchase Address"), ',').getItem(2), ' ').getItem(1)).show(10, False)

+----------------------------------------+--------------------------------------------------+
|Purchase Address                        |split(split(Purchase Address, ,, -1)[2],  , -1)[1]|
+----------------------------------------+--------------------------------------------------+
|283 Washington St, Boston, MA 02215     |MA                                                |
|968 8th St, Austin, TX 73301            |TX                                                |
|857 Center St, Boston, MA 02215         |MA                                                |
|679 Chestnut St, San Francisco, CA 94016|CA                                                |
|58 Dogwood St, San Francisco, CA 94016  |CA                                                |
|355 Park St, Boston, MA 02215           |MA                                                |
|542 9th St, New York City, NY 10001     |NY                                                |
|708 Walnut St, New York City, NY 10001  |NY                

In [122]:
sales_df_temp = (sales_df_temp.withColumn("City", split(col("Purchase Address"), ',').getItem(1))
                              .withColumn("State", split(split(col("Purchase Address"), ',').getItem(2), ' ').getItem(1) ))

In [123]:
sales_df_temp.show(10)

+--------+--------------------+----------------+----------+--------------+--------------------+--------------+-----+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|          City|State|
+--------+--------------------+----------------+----------+--------------+--------------------+--------------+-----+
|  295900|AA Batteries (4-p...|               1|      3.84|12/27/19 18:56|283 Washington St...|        Boston|   MA|
|  295923|Lightning Chargin...|               1|     14.95|12/21/19 13:41|968 8th St, Austi...|        Austin|   TX|
|  295991|Lightning Chargin...|               1|     14.95|12/15/19 20:16|857 Center St, Bo...|        Boston|   MA|
|  296076|  Macbook Pro Laptop|               1|      1700|12/03/19 15:19|679 Chestnut St, ...| San Francisco|   CA|
|  297015|AAA Batteries (4-...|               3|      2.99|12/13/19 08:43|58 Dogwood St, Sa...| San Francisco|   CA|
|  297237|Bose SoundSport H...|               1|     99.99|12/16

In [124]:
sales_df_temp.select("Order ID","Product", "City", "State" ).show(10)

+--------+--------------------+--------------+-----+
|Order ID|             Product|          City|State|
+--------+--------------------+--------------+-----+
|  295900|AA Batteries (4-p...|        Boston|   MA|
|  295923|Lightning Chargin...|        Austin|   TX|
|  295991|Lightning Chargin...|        Boston|   MA|
|  296076|  Macbook Pro Laptop| San Francisco|   CA|
|  297015|AAA Batteries (4-...| San Francisco|   CA|
|  297237|Bose SoundSport H...|        Boston|   MA|
|  297376|    27in FHD Monitor| New York City|   NY|
|  297438|Lightning Chargin...| New York City|   NY|
|  297541|AAA Batteries (4-...| San Francisco|   CA|
|  297635|    Wired Headphones| San Francisco|   CA|
+--------+--------------------+--------------+-----+
only showing top 10 rows



### Rename and Change DataTypes

In [125]:
from pyspark.sql.functions import to_timestamp, year, month
from pyspark.sql.types import IntegerType, FloatType


In [88]:
sales_df_temp = (sales_df_temp.withColumn("OrderID", col("Order ID").cast(IntegerType()))
                              .withColumn("Quantity", col("Quantity Ordered").cast(IntegerType()))
                              .withColumn("Price", col("Price Each").cast(FloatType()))
                              .withColumn("OrderDate", to_timestamp(col("Order Date"),"MM/dd/yy HH:mm"))
                              .withColumnRenamed("Purchase Address","StoreAddress")
                )

In [126]:
sales_df_temp.show(10)

+--------+--------------------+----------------+----------+--------------+--------------------+--------------+-----+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|          City|State|
+--------+--------------------+----------------+----------+--------------+--------------------+--------------+-----+
|  295900|AA Batteries (4-p...|               1|      3.84|12/27/19 18:56|283 Washington St...|        Boston|   MA|
|  295923|Lightning Chargin...|               1|     14.95|12/21/19 13:41|968 8th St, Austi...|        Austin|   TX|
|  295991|Lightning Chargin...|               1|     14.95|12/15/19 20:16|857 Center St, Bo...|        Boston|   MA|
|  296076|  Macbook Pro Laptop|               1|      1700|12/03/19 15:19|679 Chestnut St, ...| San Francisco|   CA|
|  297015|AAA Batteries (4-...|               3|      2.99|12/13/19 08:43|58 Dogwood St, Sa...| San Francisco|   CA|
|  297237|Bose SoundSport H...|               1|     99.99|12/16

In [127]:
sales_df_temp = (sales_df_temp.withColumn("OrderID", col("Order ID").cast(IntegerType()))
                              .withColumn("Quantity", col("Quantity Ordered").cast(IntegerType()))
                              .withColumn("Price", col("Price Each").cast(FloatType()))
                              .withColumn("OrderDate", to_timestamp(col("Order Date"),"MM/dd/yy HH:mm"))
                              .withColumnRenamed("Purchase Address","StoreAddress")
                              .drop("Order ID")
                              .drop("Quantity Ordered")
                              .drop("Price Each")
                              .drop("Order Date")
                )

In [128]:
sales_df_temp.show(10)

+--------------------+--------------------+--------------+-----+-------+--------+------+-------------------+
|             Product|        StoreAddress|          City|State|OrderID|Quantity| Price|          OrderDate|
+--------------------+--------------------+--------------+-----+-------+--------+------+-------------------+
|AA Batteries (4-p...|283 Washington St...|        Boston|   MA| 295900|       1|  3.84|2019-12-27 18:56:00|
|Lightning Chargin...|968 8th St, Austi...|        Austin|   TX| 295923|       1| 14.95|2019-12-21 13:41:00|
|Lightning Chargin...|857 Center St, Bo...|        Boston|   MA| 295991|       1| 14.95|2019-12-15 20:16:00|
|  Macbook Pro Laptop|679 Chestnut St, ...| San Francisco|   CA| 296076|       1|1700.0|2019-12-03 15:19:00|
|AAA Batteries (4-...|58 Dogwood St, Sa...| San Francisco|   CA| 297015|       3|  2.99|2019-12-13 08:43:00|
|Bose SoundSport H...|355 Park St, Bost...|        Boston|   MA| 297237|       1| 99.99|2019-12-16 10:28:00|
|    27in FHD Monit

In [129]:
sales_df_temp.printSchema()

root
 |-- Product: string (nullable = true)
 |-- StoreAddress: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: float (nullable = true)
 |-- OrderDate: timestamp (nullable = true)



### Add New Columns Year and Month

In [130]:
#sales_df_temp = (sales_df_temp.withColumn("Year", year("OrderDate")))
sales_df_temp = (sales_df_temp.withColumn("Year", year("OrderDate")))

In [131]:
sales_df_temp.show(10)

+--------------------+--------------------+--------------+-----+-------+--------+------+-------------------+----+
|             Product|        StoreAddress|          City|State|OrderID|Quantity| Price|          OrderDate|Year|
+--------------------+--------------------+--------------+-----+-------+--------+------+-------------------+----+
|AA Batteries (4-p...|283 Washington St...|        Boston|   MA| 295900|       1|  3.84|2019-12-27 18:56:00|2019|
|Lightning Chargin...|968 8th St, Austi...|        Austin|   TX| 295923|       1| 14.95|2019-12-21 13:41:00|2019|
|Lightning Chargin...|857 Center St, Bo...|        Boston|   MA| 295991|       1| 14.95|2019-12-15 20:16:00|2019|
|  Macbook Pro Laptop|679 Chestnut St, ...| San Francisco|   CA| 296076|       1|1700.0|2019-12-03 15:19:00|2019|
|AAA Batteries (4-...|58 Dogwood St, Sa...| San Francisco|   CA| 297015|       3|  2.99|2019-12-13 08:43:00|2019|
|Bose SoundSport H...|355 Park St, Bost...|        Boston|   MA| 297237|       1| 99.99|

In [132]:
sales_df_temp = (sales_df_temp.withColumn("ReportYear", year("OrderDate"))
                              .withColumn("Month", month("OrderDate")))

In [133]:
sales_df_temp.show(10)

+--------------------+--------------------+--------------+-----+-------+--------+------+-------------------+----+----------+-----+
|             Product|        StoreAddress|          City|State|OrderID|Quantity| Price|          OrderDate|Year|ReportYear|Month|
+--------------------+--------------------+--------------+-----+-------+--------+------+-------------------+----+----------+-----+
|AA Batteries (4-p...|283 Washington St...|        Boston|   MA| 295900|       1|  3.84|2019-12-27 18:56:00|2019|      2019|   12|
|Lightning Chargin...|968 8th St, Austi...|        Austin|   TX| 295923|       1| 14.95|2019-12-21 13:41:00|2019|      2019|   12|
|Lightning Chargin...|857 Center St, Bo...|        Boston|   MA| 295991|       1| 14.95|2019-12-15 20:16:00|2019|      2019|   12|
|  Macbook Pro Laptop|679 Chestnut St, ...| San Francisco|   CA| 296076|       1|1700.0|2019-12-03 15:19:00|2019|      2019|   12|
|AAA Batteries (4-...|58 Dogwood St, Sa...| San Francisco|   CA| 297015|       3|  

### Write output to parquet

In [134]:
sales_final_df = sales_df_temp.select("OrderID", "Product","Quantity", "Price", "OrderDate", "StoreAddress", "City","State", "ReportYear", "Month")

In [135]:
sales_final_df.show(10)
sales_final_df.printSchema()

+-------+--------------------+--------+------+-------------------+--------------------+--------------+-----+----------+-----+
|OrderID|             Product|Quantity| Price|          OrderDate|        StoreAddress|          City|State|ReportYear|Month|
+-------+--------------------+--------+------+-------------------+--------------------+--------------+-----+----------+-----+
| 295900|AA Batteries (4-p...|       1|  3.84|2019-12-27 18:56:00|283 Washington St...|        Boston|   MA|      2019|   12|
| 295923|Lightning Chargin...|       1| 14.95|2019-12-21 13:41:00|968 8th St, Austi...|        Austin|   TX|      2019|   12|
| 295991|Lightning Chargin...|       1| 14.95|2019-12-15 20:16:00|857 Center St, Bo...|        Boston|   MA|      2019|   12|
| 296076|  Macbook Pro Laptop|       1|1700.0|2019-12-03 15:19:00|679 Chestnut St, ...| San Francisco|   CA|      2019|   12|
| 297015|AAA Batteries (4-...|       3|  2.99|2019-12-13 08:43:00|58 Dogwood St, Sa...| San Francisco|   CA|      2019

In [136]:
output_path = '/home/lab/Desktop/salesdate/data/output'
sales_final_df.write.mode("overwrite").partitionBy("ReportYear","Month").parquet(output_path)