# creating a session

In [2]:
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").config(conf=SparkConf()).getOrCreate()

# Loading Text data

format using text

In [4]:
#available from spark 1.6 +

from pyspark.sql import functions as f

#load data to the orders_text

path_text_orders="file:///D://data-master/retail_db/orders"

orders_text=spark.read.format("text").load(path_text_orders)

#First way is to use selectExper methos

orders_text_scltexpr=orders_text.selectExpr("cast(split(value,',') [0] as int) order_customer_id",
                                            "cast(split(value,',') [1] as date) order_date",
                                            "cast(split(value,',') [2] as int) order_id",
                                            "cast(split(value,',') [3] as string) order_status")



#second way is to use withColumn method . this will add extra cloumns to the dataframe , which will ensure to remove it
orders_text_with_col=orders_text.withColumn("order_customer_id",f.split(orders_text.value,",")[0].cast("int")).\
                                 withColumn("order_date",f.split(orders_text.value,",")[1].cast("date")).\
                                 withColumn("order_id",f.split(orders_text.value,",")[2].cast("int")).\
                                 withColumn("order_status",f.split(orders_text.value,",")[3].cast("string")) 

orders_text_with_col=orders_text_with_col.select("order_customer_id","order_date","order_id","order_status")

# the third way is to select the columns and aliasing them along with cast

orders_text_split=orders_text.select(
             f.split(orders_text.value,',')[0].cast('int').alias('order_customer_id'),
             f.split(orders_text.value,',')[1].cast('date').alias('order_date'),
             f.split(orders_text.value,',')[2].cast('int').alias('order_id'),
             f.split(orders_text.value,',')[3].cast('string').alias('order_status')
             )


orders_text.show(2)
orders_text_with_col.show(2)
orders_text_scltexpr.show(2)
orders_text_split.show(2)

+--------------------+
|               value|
+--------------------+
|1,2013-07-25 00:0...|
|2,2013-07-25 00:0...|
+--------------------+
only showing top 2 rows

+-----------------+----------+--------+---------------+
|order_customer_id|order_date|order_id|   order_status|
+-----------------+----------+--------+---------------+
|                1|2013-07-25|   11599|         CLOSED|
|                2|2013-07-25|     256|PENDING_PAYMENT|
+-----------------+----------+--------+---------------+
only showing top 2 rows

+-----------------+----------+--------+---------------+
|order_customer_id|order_date|order_id|   order_status|
+-----------------+----------+--------+---------------+
|                1|2013-07-25|   11599|         CLOSED|
|                2|2013-07-25|     256|PENDING_PAYMENT|
+-----------------+----------+--------+---------------+
only showing top 2 rows

+-----------------+----------+--------+---------------+
|order_customer_id|order_date|order_id|   order_status|
+--

format using CSV

In [6]:
#available from spark 2 +

#load data to the orders_csv 

path_csv_orders="file:///D://data-master/retail_db/orders"

orders_csv=spark.read.format("csv").load(path_csv_orders)

#First way is to use selectExper methos

orders_csv_slctexpr=orders_csv.selectExpr("cast(_c0 as int) order_customer_id",
                                 "cast(_c1 as date) order_date",
                                 "cast(_c2 as int) order_id",
                                 "cast(_c3 as string) order_status")

#second way is to use withColumn method . this will add extra cloumns to the dataframe , which will ensure to remove it

orders_csv_with_col=orders_csv.withColumn("order_customer_id",orders_csv._c0.cast("int")).\
                               withColumn("order_date",orders_csv._c1.cast("date")).\
                               withColumn("order_id",orders_csv._c2.cast("int")).\
                               withColumn("order_status",orders_csv._c3.cast("string"))

orders_csv_with_col=orders_csv_with_col.select("order_customer_id","order_date","order_id","order_status")

# the third way is to select the columns and aliasing them along with cast

orders_csv_cast=orders_csv.select(orders_csv._c0.cast("int").alias("order_customer_id"),
                              orders_csv._c1.cast("date").alias("order_date"),
                              orders_csv._c2.cast("int").alias("order_id"),
                              orders_csv._c3.cast("string").alias("order_status"))
orders_csv.show(2)
orders_csv_slctexpr.show(2)
orders_csv_with_col.show(2)
orders_csv_cast.show(2)

+---+--------------------+-----+---------------+
|_c0|                 _c1|  _c2|            _c3|
+---+--------------------+-----+---------------+
|  1|2013-07-25 00:00:...|11599|         CLOSED|
|  2|2013-07-25 00:00:...|  256|PENDING_PAYMENT|
+---+--------------------+-----+---------------+
only showing top 2 rows

+-----------------+----------+--------+---------------+
|order_customer_id|order_date|order_id|   order_status|
+-----------------+----------+--------+---------------+
|                1|2013-07-25|   11599|         CLOSED|
|                2|2013-07-25|     256|PENDING_PAYMENT|
+-----------------+----------+--------+---------------+
only showing top 2 rows

+-----------------+----------+--------+---------------+
|order_customer_id|order_date|order_id|   order_status|
+-----------------+----------+--------+---------------+
|                1|2013-07-25|   11599|         CLOSED|
|                2|2013-07-25|     256|PENDING_PAYMENT|
+-----------------+----------+--------+-

Using structures with schema

In [7]:
# second method is to use the stucture to define the schema  

path_csv_orders="file:///D://data-master/retail_db/orders"

from pyspark.sql.types import *

schema = StructType([StructField("order_customer_id", IntegerType(), True),
                       StructField("order_date", DateType(), True),
                     StructField("order_id", IntegerType(), True),
                     StructField("order_status", StringType(), True)])

orders_csv=spark.read.csv(path_csv_orders,schema=schema, inferSchema=False)

orders_csv.show()

+-----------------+----------+--------+---------------+
|order_customer_id|order_date|order_id|   order_status|
+-----------------+----------+--------+---------------+
|                1|2013-07-25|   11599|         CLOSED|
|                2|2013-07-25|     256|PENDING_PAYMENT|
|                3|2013-07-25|   12111|       COMPLETE|
|                4|2013-07-25|    8827|         CLOSED|
|                5|2013-07-25|   11318|       COMPLETE|
|                6|2013-07-25|    7130|       COMPLETE|
|                7|2013-07-25|    4530|       COMPLETE|
|                8|2013-07-25|    2911|     PROCESSING|
|                9|2013-07-25|    5657|PENDING_PAYMENT|
|               10|2013-07-25|    5648|PENDING_PAYMENT|
|               11|2013-07-25|     918| PAYMENT_REVIEW|
|               12|2013-07-25|    1837|         CLOSED|
|               13|2013-07-25|    9149|PENDING_PAYMENT|
|               14|2013-07-25|    9842|     PROCESSING|
|               15|2013-07-25|    2568|       CO

# Loading Parquet data

In [8]:
#loading parquet data
orders=spark.read.format("parquet").load("file:///d://pyspark/retail_db_parquet/orders")
orders.show()

+--------+-------------+-----------------+---------------+
|order_id|   order_date|order_customer_id|   order_status|
+--------+-------------+-----------------+---------------+
|       1|1374735600000|            11599|         CLOSED|
|       2|1374735600000|              256|PENDING_PAYMENT|
|       3|1374735600000|            12111|       COMPLETE|
|       4|1374735600000|             8827|         CLOSED|
|       5|1374735600000|            11318|       COMPLETE|
|       6|1374735600000|             7130|       COMPLETE|
|       7|1374735600000|             4530|       COMPLETE|
|       8|1374735600000|             2911|     PROCESSING|
|       9|1374735600000|             5657|PENDING_PAYMENT|
|      10|1374735600000|             5648|PENDING_PAYMENT|
|      11|1374735600000|              918| PAYMENT_REVIEW|
|      12|1374735600000|             1837|         CLOSED|
|      13|1374735600000|             9149|PENDING_PAYMENT|
|      14|1374735600000|             9842|     PROCESSIN

# loading JSON data

In [9]:
#loading json data
orders=spark.read.format("json").load("file:///d://pyspark/retail_db_json_orders")
orders.show()

+-----------------+----------+--------+---------------+
|order_customer_id|order_date|order_id|   order_status|
+-----------------+----------+--------+---------------+
|                2|2013-07-25|     256|PENDING_PAYMENT|
|                4|2013-07-25|    8827|         CLOSED|
|                5|2013-07-25|   11318|       COMPLETE|
|               10|2013-07-25|    5648|PENDING_PAYMENT|
|               12|2013-07-25|    1837|         CLOSED|
|               13|2013-07-25|    9149|PENDING_PAYMENT|
|               14|2013-07-25|    9842|     PROCESSING|
|               18|2013-07-25|    1205|         CLOSED|
|               22|2013-07-25|     333|       COMPLETE|
|               25|2013-07-25|    9503|         CLOSED|
|               28|2013-07-25|     656|       COMPLETE|
|               31|2013-07-25|    6983| PAYMENT_REVIEW|
|               32|2013-07-25|    3960|       COMPLETE|
|               36|2013-07-25|    5649|        PENDING|
|               37|2013-07-25|    5863|         

# loading CSV data

In [10]:
#Loading csv data
movies=spark.read.csv("file:///d://ml-20m/movies.csv",header=True)
movies.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen