In [10]:
import findspark
findspark.init()
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql import SQLContext

# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("SQL-use")
    .getOrCreate()
)

# Silence every Python warning from here on.
# NOTE(review): this also hides deprecation warnings raised by later cells.
import warnings
warnings.filterwarnings(action="ignore")

# Last expression of the cell: display the session summary.
spark

In [3]:
# Sample measurements used throughout the notebook.
# Each record is [filament type, bulb power, life in hours].
filament_data = [
    ["filamentA", "100W", 605],
    ["filamentB", "100W", 683],
    ["filamentB", "100W", 691],
    ["filamentB", "200W", 561],
    ["filamentA", "200W", 530],
    ["filamentA", "100W", 619],
    ["filamentB", "100W", 686],
    ["filamentB", "200W", 600],
    ["filamentB", "100W", 696],
    ["filamentA", "200W", 579],
    ["filamentA", "200W", 520],
    ["filamentA", "100W", 622],
    ["filamentA", "100W", 668],
    ["filamentB", "200W", 569],
    ["filamentB", "200W", 555],
    ["filamentA", "200W", 541],
]


# Distribute the local list as an RDD split into 4 partitions,
# then pull the first 4 records back to the driver as a quick check.
filament_rdd = spark.sparkContext.parallelize(
    filament_data,
    numSlices=4,
)
filament_rdd.take(num=4)

                                                                                

[['filamentA', '100W', 605],
 ['filamentB', '100W', 683],
 ['filamentB', '100W', 691],
 ['filamentB', '200W', 561]]

### Creating DataFrame

- A DataFrame can be created from an RDD or by reading a file.

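- `SQLContext` is the legacy (Spark 1.x) entry point to PySpark SQL; since Spark 2.0, `SparkSession` is the unified entry point and `SQLContext` is kept for backward compatibility.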

- Sometimes we need to create a schema with `StructType`.

- `StructType` takes a list of `StructField` objects, each of which defines one column (its name, data type, and nullability).


```python
from pyspark.sql.types import * # to import Spark SQL data types
from pyspark.sql import Row # to create a row object
from pyspark.sql import SQLContext # SQLContext object is the entry point to PySparkSQL.
```

**`StructField(name: str, dataType: DataType, nullable: bool = True, metadata: Optional[Dict[str, Any]] = None)`**

- name: name of the column

- dataType: DataType of the field

- nullable: whether the column can have null value or not

- metadata (optional)

- Used to define a single column of the table's schema.

In [5]:
# One StructField per column: column name, data type, and whether nulls are allowed.
# All three columns are declared as strings here.
col_filament_type = StructField(name="FilamentType", dataType=StringType(), nullable=True)
col_bulb_power = StructField(name="BulbPower", dataType=StringType(), nullable=True)
col_life_in_hours = StructField(name="LifeInHours", dataType=StringType(), nullable=True)

# The schema is the ordered collection of the column definitions above.
filament_data_schema = StructType(
    fields=[col_filament_type, col_bulb_power, col_life_in_hours]
)

filament_data_schema


StructType([StructField('FilamentType', StringType(), True), StructField('BulbPower', StringType(), True), StructField('LifeInHours', StringType(), True)])

- To build row objects, the `Row` class is required.

`from pyspark.sql import Row`

In [7]:
# Turn each 3-element record into a Row, converting every field to str
# so the values match the all-string schema.
filament_rdd_rows = filament_rdd.map(
    lambda record: Row(*(str(record[i]) for i in range(3)))
)
filament_rdd_rows.take(num=4)

[<Row('filamentA', '100W', '605')>,
 <Row('filamentB', '100W', '683')>,
 <Row('filamentB', '100W', '691')>,
 <Row('filamentB', '200W', '561')>]

In [13]:
# Wrap the running SparkContext in a SQLContext.
# NOTE(review): SQLContext is deprecated since Spark 3.0; spark.createDataFrame(...)
# is the supported equivalent. Kept here because the notebook demonstrates SQLContext.
sqlcontext = SQLContext(sparkContext=spark.sparkContext)

# Combine the RDD of Row objects with the explicit schema to build the DataFrame.
filament_df_raw = sqlcontext.createDataFrame(
    filament_rdd_rows,
    schema=filament_data_schema,
)

# Preview the first 4 rows.
filament_df_raw.show(n=4)

+------------+---------+-----------+
|FilamentType|BulbPower|LifeInHours|
+------------+---------+-----------+
|   filamentA|     100W|        605|
|   filamentB|     100W|        683|
|   filamentB|     100W|        691|
|   filamentB|     200W|        561|
+------------+---------+-----------+
only showing top 4 rows



In [14]:
# Print the schema as a tree; every column is still a string at this point.
filament_df_raw.printSchema()

root
 |-- FilamentType: string (nullable = true)
 |-- BulbPower: string (nullable = true)
 |-- LifeInHours: string (nullable = true)



**Changing Data Type of a Column**

- To change the data type of a column, use the column's `cast` method.

- `cast` is used together with `withColumn`, which replaces the column with the converted values.


In [15]:
# In the DataFrame above, the LifeInHours column has a string data type.
# Replace it with the same values cast to float; the raw DataFrame is left untouched.
filament_df = filament_df_raw.withColumn(
    "LifeInHours",
    filament_df_raw["LifeInHours"].cast("float"),
)

# Confirm that LifeInHours is now a float column.
filament_df.printSchema()

root
 |-- FilamentType: string (nullable = true)
 |-- BulbPower: string (nullable = true)
 |-- LifeInHours: float (nullable = true)



In [16]:
# Display the DataFrame after the cast (show() prints at most 20 rows by default).
filament_df.show()

+------------+---------+-----------+
|FilamentType|BulbPower|LifeInHours|
+------------+---------+-----------+
|   filamentA|     100W|      605.0|
|   filamentB|     100W|      683.0|
|   filamentB|     100W|      691.0|
|   filamentB|     200W|      561.0|
|   filamentA|     200W|      530.0|
|   filamentA|     100W|      619.0|
|   filamentB|     100W|      686.0|
|   filamentB|     200W|      600.0|
|   filamentB|     100W|      696.0|
|   filamentA|     200W|      579.0|
|   filamentA|     200W|      520.0|
|   filamentA|     100W|      622.0|
|   filamentA|     100W|      668.0|
|   filamentB|     200W|      569.0|
|   filamentB|     200W|      555.0|
|   filamentA|     200W|      541.0|
+------------+---------+-----------+



                                                                                

In [17]:
# Keep only the rows whose bulb power is 100W.
filament_100w_df = filament_df.where(filament_df["BulbPower"] == "100W")

filament_100w_df.show()

+------------+---------+-----------+
|FilamentType|BulbPower|LifeInHours|
+------------+---------+-----------+
|   filamentA|     100W|      605.0|
|   filamentB|     100W|      683.0|
|   filamentB|     100W|      691.0|
|   filamentA|     100W|      619.0|
|   filamentB|     100W|      686.0|
|   filamentB|     100W|      696.0|
|   filamentA|     100W|      622.0|
|   filamentA|     100W|      668.0|
+------------+---------+-----------+



In [19]:
# Keep 100W bulbs that lasted longer than 650 hours.
# Each comparison is parenthesised because & binds tighter than == and >.
filament_100w_650_plus_df = filament_df.where(
    (filament_df["BulbPower"] == "100W")
    & (filament_df["LifeInHours"] > 650.0)
)
filament_100w_650_plus_df.show()

                                                                                

+------------+---------+-----------+
|FilamentType|BulbPower|LifeInHours|
+------------+---------+-----------+
|   filamentB|     100W|      683.0|
|   filamentB|     100W|      691.0|
|   filamentB|     100W|      686.0|
|   filamentB|     100W|      696.0|
|   filamentA|     100W|      668.0|
+------------+---------+-----------+

