# Creating a spark session

In [70]:
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").config(conf=SparkConf()).getOrCreate()

# loading the data and assigning the schema.

path_text_orders="file:///D://data-master/retail_db/orders"

orders_text=spark.read.format("text").load(path_text_orders)

orders_table=orders_text.selectExpr("cast(split(value,',') [0] as int) order_customer_id",
 "cast(split(value,',') [1] as date) order_date",
 "cast(split(value,',') [2] as int) order_id",
 "cast(split(value,',') [3] as string) order_status")

orders_table.show(2)

+-----------------+----------+--------+---------------+
|order_customer_id|order_date|order_id|   order_status|
+-----------------+----------+--------+---------------+
|                1|2013-07-25|   11599|         CLOSED|
|                2|2013-07-25|     256|PENDING_PAYMENT|
+-----------------+----------+--------+---------------+
only showing top 2 rows



In [None]:
## to stop the warnings and info and creating spark 1.6


sc.setLogLevel("ERROR")

'''launch pysaprk'''

'''load the data'''

path_text_orders="/user/pruthviraj/sqoop_text/orders"

orders_text=sqlContext.read.format("text").load(path_text_orders)

orders_table=orders_text.selectExpr("cast(split(value,',') [0] as int) order_customer_id",
"cast(split(value,',') [1] as date) order_date",
"cast(split(value,',') [2] as int) order_id",
"cast(split(value,',') [3] as string) order_status")

orders_table.count()

![tilte](https://pysparktutorials.files.wordpress.com/2018/05/12.jpg)

# Substring

In [71]:
#substring on a cloumn


orders_table.select(orders_table.order_status,\
                    orders_table.order_status.substr(1,4).alias("substr")).show(5)

+---------------+------+
|   order_status|substr|
+---------------+------+
|         CLOSED|  CLOS|
|PENDING_PAYMENT|  PEND|
|       COMPLETE|  COMP|
|         CLOSED|  CLOS|
|       COMPLETE|  COMP|
+---------------+------+
only showing top 5 rows



# Startswith 

In [72]:
# the startswith will only provide the binary value

orders_table.select(orders_table.order_status,\
                    orders_table.order_status.startswith("CL").alias("starts_with")).show(5)

+---------------+-----------+
|   order_status|starts_with|
+---------------+-----------+
|         CLOSED|       true|
|PENDING_PAYMENT|      false|
|       COMPLETE|      false|
|         CLOSED|       true|
|       COMPLETE|      false|
+---------------+-----------+
only showing top 5 rows



# LIKE operation

In [73]:
#like operation 

orders_table.select(orders_table.order_status,\
                    orders_table.order_status.like("CL%").alias("like_mehtod")).show(5)

+---------------+-----------+
|   order_status|like_mehtod|
+---------------+-----------+
|         CLOSED|       true|
|PENDING_PAYMENT|      false|
|       COMPLETE|      false|
|         CLOSED|       true|
|       COMPLETE|      false|
+---------------+-----------+
only showing top 5 rows



# Rlike operation

In [74]:
#rlike operation 

orders_table.select(orders_table.order_status,\
                    orders_table.order_status.rlike("_").alias("rlike_mehtod")).show(5)

+---------------+------------+
|   order_status|rlike_mehtod|
+---------------+------------+
|         CLOSED|       false|
|PENDING_PAYMENT|        true|
|       COMPLETE|       false|
|         CLOSED|       false|
|       COMPLETE|       false|
+---------------+------------+
only showing top 5 rows



# ISIN opertion

In [75]:
# isin operation
orders_table.select(orders_table.order_status,\
                    orders_table.order_status.isin("CLOSED","COMPLETE").alias("isin_method")).show(5)

+---------------+-----------+
|   order_status|isin_method|
+---------------+-----------+
|         CLOSED|       true|
|PENDING_PAYMENT|      false|
|       COMPLETE|       true|
|         CLOSED|       true|
|       COMPLETE|       true|
+---------------+-----------+
only showing top 5 rows



# Format Number

In [76]:
#formatting the number

from pyspark.sql import functions as f

orders_table.select(orders_table.order_customer_id,\
                    f.format_number(orders_table.order_customer_id,4).alias("format_num")).show(5)

+-----------------+----------+
|order_customer_id|format_num|
+-----------------+----------+
|                1|    1.0000|
|                2|    2.0000|
|                3|    3.0000|
|                4|    4.0000|
|                5|    5.0000|
+-----------------+----------+
only showing top 5 rows



# Format String

In [77]:
#formatting the string

from pyspark.sql import functions as f

orders_table.select(orders_table.order_customer_id,\
                    orders_table.order_status,\
                    f.format_string("%d,%s",orders_table.order_customer_id,orders_table.order_status).\
                    alias("format_string")).show(5)

+-----------------+---------------+-----------------+
|order_customer_id|   order_status|    format_string|
+-----------------+---------------+-----------------+
|                1|         CLOSED|         1,CLOSED|
|                2|PENDING_PAYMENT|2,PENDING_PAYMENT|
|                3|       COMPLETE|       3,COMPLETE|
|                4|         CLOSED|         4,CLOSED|
|                5|       COMPLETE|       5,COMPLETE|
+-----------------+---------------+-----------------+
only showing top 5 rows



# date and time 

In [78]:
path_text_orders="file:///D://data-master/retail_db/orders"

orders_text=spark.read.format("text").load(path_text_orders)

from pyspark.sql import functions as f

orders_table=orders_text.selectExpr("cast(split(value,',') [0] as int) order_customer_id",
 "cast(split(value,',') [1] as string) order_date",
 "cast(split(value,',') [2] as int) order_id",
 "cast(split(value,',') [3] as string) order_status")


# format date, date subtraction, date addition

In [79]:

#date format, date subtractin and date date addition

from pyspark.sql import functions as f

orders_table.select(orders_table.order_date,\
                    f.date_format(orders_table.order_date,"mm-dd-yy").alias("date_format")).show(3)


orders_table.select(orders_table.order_date,\
                    f.date_sub(orders_table.order_date,10).alias("date_sub")).show(3)

orders_table.select(orders_table.order_date,\
                    f.date_sub(orders_table.order_date,-10).alias("date_add")).show(3)

+--------------------+-----------+
|          order_date|date_format|
+--------------------+-----------+
|2013-07-25 00:00:...|   00-25-13|
|2013-07-25 00:00:...|   00-25-13|
|2013-07-25 00:00:...|   00-25-13|
+--------------------+-----------+
only showing top 3 rows

+--------------------+----------+
|          order_date|  date_sub|
+--------------------+----------+
|2013-07-25 00:00:...|2013-07-15|
|2013-07-25 00:00:...|2013-07-15|
|2013-07-25 00:00:...|2013-07-15|
+--------------------+----------+
only showing top 3 rows

+--------------------+----------+
|          order_date|  date_add|
+--------------------+----------+
|2013-07-25 00:00:...|2013-08-04|
|2013-07-25 00:00:...|2013-08-04|
|2013-07-25 00:00:...|2013-08-04|
+--------------------+----------+
only showing top 3 rows



# datedifference, hour

In [80]:

#Find hour

from pyspark.sql import functions as f


orders_table.select(orders_table.order_date,\
                    f.hour(orders_table.order_date).alias("hour")).show(3)

orders_table.select(orders_table.order_date.cast("date").alias("date1"),\
                    f.date_sub(orders_table.order_date,-10).alias("date2"),\
                    f.datediff("order_date",f.date_sub(orders_table.order_date,-10)).alias("daitediff")).show(3)

+--------------------+----+
|          order_date|hour|
+--------------------+----+
|2013-07-25 00:00:...|   0|
|2013-07-25 00:00:...|   0|
|2013-07-25 00:00:...|   0|
+--------------------+----+
only showing top 3 rows

+----------+----------+---------+
|     date1|     date2|daitediff|
+----------+----------+---------+
|2013-07-25|2013-08-04|      -10|
|2013-07-25|2013-08-04|      -10|
|2013-07-25|2013-08-04|      -10|
+----------+----------+---------+
only showing top 3 rows



# day of week, month ,year

In [81]:
#day of week,month,year

from pyspark.sql import functions as f


orders_table.select(orders_table.order_date,\
                    f.dayofweek(orders_table.order_date).alias("day_of_week")).show(3)

orders_table.select(orders_table.order_date,\
                    f.dayofmonth(orders_table.order_date).alias("day_of_month")).show(3)

orders_table.select(orders_table.order_date,\
                    f.dayofyear(orders_table.order_date).alias("day_of_year")).show(3)

+--------------------+-----------+
|          order_date|day_of_week|
+--------------------+-----------+
|2013-07-25 00:00:...|          5|
|2013-07-25 00:00:...|          5|
|2013-07-25 00:00:...|          5|
+--------------------+-----------+
only showing top 3 rows

+--------------------+------------+
|          order_date|day_of_month|
+--------------------+------------+
|2013-07-25 00:00:...|          25|
|2013-07-25 00:00:...|          25|
|2013-07-25 00:00:...|          25|
+--------------------+------------+
only showing top 3 rows

+--------------------+-----------+
|          order_date|day_of_year|
+--------------------+-----------+
|2013-07-25 00:00:...|        206|
|2013-07-25 00:00:...|        206|
|2013-07-25 00:00:...|        206|
+--------------------+-----------+
only showing top 3 rows

