In [1]:
import pyspark
import pandas as pd

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *

# Create SparkSession

In [3]:
# Build the session named "DataFrame", or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("DataFrame")
    .getOrCreate()
)

In [4]:
# Raw rows used to build the demo DataFrames.
# Employee rows: (emp_id, name, dept, salary)
emp = [
    (1, "Smith", "Finance", 1000),
    (2, "Rose", "Marketing", 2000),
    (3, "Williams", "Marketing", 1000),
    (4, "Jones", "Sales", 2000),
    (5, "Brown", "Sales", 1000),
    (6, "Katie", "Finance", 2000),
    (7, "Linda", "IT", 2000),
    (8, "Michael", "IT", 1000),
    (9, "Johnson", "Marketing", 1000),
    (10, "Tom", "Finance", 2000),
]

# Department rows: (name, dept_id)
dept = [
    ("Finance", "fi"),
    ("Marketing", "ma"),
    ("Sales", "sa"),
    ("Computer Science", "cs"),
]
# Turn the raw rows into Spark DataFrames, naming the columns explicitly.
df = spark.createDataFrame(
    data=emp,
    schema=["emp_id", "name", "dept", "salary"],
)
deptdf = spark.createDataFrame(
    data=dept,
    schema=["name", "dept_id"],
)

In [5]:
# Display the employee DataFrame.
df.show()

+------+--------+---------+------+
|emp_id|    name|     dept|salary|
+------+--------+---------+------+
|     1|   Smith|  Finance|  1000|
|     2|    Rose|Marketing|  2000|
|     3|Williams|Marketing|  1000|
|     4|   Jones|    Sales|  2000|
|     5|   Brown|    Sales|  1000|
|     6|   Katie|  Finance|  2000|
|     7|   Linda|       IT|  2000|
|     8| Michael|       IT|  1000|
|     9| Johnson|Marketing|  1000|
|    10|     Tom|  Finance|  2000|
+------+--------+---------+------+



In [6]:
# Display the department DataFrame.
deptdf.show()

+----------------+-------+
|            name|dept_id|
+----------------+-------+
|         Finance|     fi|
|       Marketing|     ma|
|           Sales|     sa|
|Computer Science|     cs|
+----------------+-------+



# Basic Operations on DataFrame

In [7]:
# Count: number of rows in the DataFrame
df.count()

10

In [8]:
# Columns: list of the DataFrame's column names
df.columns

['emp_id', 'name', 'dept', 'salary']

In [9]:
# Dtypes: list of (column name, data type) pairs
df.dtypes

[('emp_id', 'bigint'),
 ('name', 'string'),
 ('dept', 'string'),
 ('salary', 'bigint')]

In [10]:
# Schema: how Spark stores the schema of the DataFrame (column name, data type, nullable?)
df.schema

StructType([StructField('emp_id', LongType(), True), StructField('name', StringType(), True), StructField('dept', StringType(), True), StructField('salary', LongType(), True)])

In [11]:
# Print the schema as a tree: one line per column with its type and nullability
df.printSchema()

root
 |-- emp_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- salary: long (nullable = true)



In [12]:
# Project the DataFrame down to the "name" and "dept" columns.
df.select(col("name"), col("dept")).show()

+--------+---------+
|    name|     dept|
+--------+---------+
|   Smith|  Finance|
|    Rose|Marketing|
|Williams|Marketing|
|   Jones|    Sales|
|   Brown|    Sales|
|   Katie|  Finance|
|   Linda|       IT|
| Michael|       IT|
| Johnson|Marketing|
|     Tom|  Finance|
+--------+---------+



In [13]:
# Filter: keep only the rows that satisfy a column predicate.
# where() is an alias of filter().
df.where(col("dept") == "Sales").show()
df.where(col("emp_id") == 2).show()

+------+-----+-----+------+
|emp_id| name| dept|salary|
+------+-----+-----+------+
|     4|Jones|Sales|  2000|
|     5|Brown|Sales|  1000|
+------+-----+-----+------+

+------+----+---------+------+
|emp_id|name|     dept|salary|
+------+----+---------+------+
|     2|Rose|Marketing|  2000|
+------+----+---------+------+



In [14]:
# Drop a column: drop() returns a new DataFrame without "salary",
# which is bound to `dropped` here.
dropped = df.drop(col("salary"))
dropped.show()

+------+--------+---------+
|emp_id|    name|     dept|
+------+--------+---------+
|     1|   Smith|  Finance|
|     2|    Rose|Marketing|
|     3|Williams|Marketing|
|     4|   Jones|    Sales|
|     5|   Brown|    Sales|
|     6|   Katie|  Finance|
|     7|   Linda|       IT|
|     8| Michael|       IT|
|     9| Johnson|Marketing|
|    10|     Tom|  Finance|
+------+--------+---------+



In [15]:
# Show df again: it still has the "salary" column, because the drop above
# produced a separate DataFrame (`dropped`).
df.show()

+------+--------+---------+------+
|emp_id|    name|     dept|salary|
+------+--------+---------+------+
|     1|   Smith|  Finance|  1000|
|     2|    Rose|Marketing|  2000|
|     3|Williams|Marketing|  1000|
|     4|   Jones|    Sales|  2000|
|     5|   Brown|    Sales|  1000|
|     6|   Katie|  Finance|  2000|
|     7|   Linda|       IT|  2000|
|     8| Michael|       IT|  1000|
|     9| Johnson|Marketing|  1000|
|    10|     Tom|  Finance|  2000|
+------+--------+---------+------+



In [16]:
# Aggregation
# Use groupBy to group the rows by department, then agg to compute several
# aggregates of "salary" per group.
# The aggregate functions are referenced through the `F` namespace: the bare
# names sum / max / min only resolve to Spark functions because the wildcard
# import from pyspark.sql.functions shadows Python's builtins of the same name.
(
    df.groupBy("dept")
    .agg(
        F.count("salary").alias("count"),
        F.sum("salary").alias("sum"),
        F.avg("salary").alias("avg"),
        F.max("salary").alias("max"),
        F.min("salary").alias("min"),
    )
    .show()
)

+---------+-----+----+------------------+----+----+
|     dept|count| sum|               avg| max| min|
+---------+-----+----+------------------+----+----+
|  Finance|    3|5000|1666.6666666666667|2000|1000|
|Marketing|    3|4000|1333.3333333333333|2000|1000|
|    Sales|    2|3000|            1500.0|2000|1000|
|       IT|    2|3000|            1500.0|2000|1000|
+---------+-----+----+------------------+----+----+



In [17]:
# Sort: highest salary first, ties broken by ascending emp_id; show the first 5 rows.
# orderBy() is an alias of sort().
df.orderBy(col("salary").desc(), col("emp_id").asc()).show(5)

+------+-----+---------+------+
|emp_id| name|     dept|salary|
+------+-----+---------+------+
|     2| Rose|Marketing|  2000|
|     4|Jones|    Sales|  2000|
|     6|Katie|  Finance|  2000|
|     7|Linda|       IT|  2000|
|    10|  Tom|  Finance|  2000|
+------+-----+---------+------+
only showing top 5 rows



In [18]:
# Derived columns: withColumn adds a new column computed from existing ones.
# Here "bonus" is 0.2 times each row's salary.
df.withColumn("bonus", df["salary"] * 0.2).show()

+------+--------+---------+------+-----+
|emp_id|    name|     dept|salary|bonus|
+------+--------+---------+------+-----+
|     1|   Smith|  Finance|  1000|200.0|
|     2|    Rose|Marketing|  2000|400.0|
|     3|Williams|Marketing|  1000|200.0|
|     4|   Jones|    Sales|  2000|400.0|
|     5|   Brown|    Sales|  1000|200.0|
|     6|   Katie|  Finance|  2000|400.0|
|     7|   Linda|       IT|  2000|400.0|
|     8| Michael|       IT|  1000|200.0|
|     9| Johnson|Marketing|  1000|200.0|
|    10|     Tom|  Finance|  2000|400.0|
+------+--------+---------+------+-----+



In [19]:
# Re-display the department DataFrame for reference.
deptdf.show()

+----------------+-------+
|            name|dept_id|
+----------------+-------+
|         Finance|     fi|
|       Marketing|     ma|
|           Sales|     sa|
|Computer Science|     cs|
+----------------+-------+



In [20]:
# Re-display the employee DataFrame for reference.
df.show()

+------+--------+---------+------+
|emp_id|    name|     dept|salary|
+------+--------+---------+------+
|     1|   Smith|  Finance|  1000|
|     2|    Rose|Marketing|  2000|
|     3|Williams|Marketing|  1000|
|     4|   Jones|    Sales|  2000|
|     5|   Brown|    Sales|  1000|
|     6|   Katie|  Finance|  2000|
|     7|   Linda|       IT|  2000|
|     8| Michael|       IT|  1000|
|     9| Johnson|Marketing|  1000|
|    10|     Tom|  Finance|  2000|
+------+--------+---------+------+



## JOIN

In [21]:
# INNER JOIN: keep only the employee rows whose dept equals a department name.
# Both inputs have a "name" column, so the result contains two columns called "name".
df.join(deptdf, on=df["dept"] == deptdf["name"], how="inner").show()

+------+--------+---------+------+---------+-------+
|emp_id|    name|     dept|salary|     name|dept_id|
+------+--------+---------+------+---------+-------+
|     1|   Smith|  Finance|  1000|  Finance|     fi|
|     6|   Katie|  Finance|  2000|  Finance|     fi|
|    10|     Tom|  Finance|  2000|  Finance|     fi|
|     2|    Rose|Marketing|  2000|Marketing|     ma|
|     3|Williams|Marketing|  1000|Marketing|     ma|
|     9| Johnson|Marketing|  1000|Marketing|     ma|
|     4|   Jones|    Sales|  2000|    Sales|     sa|
|     5|   Brown|    Sales|  1000|    Sales|     sa|
+------+--------+---------+------+---------+-------+



In [22]:
# LEFT OUTER JOIN: keep every employee row; the department columns are NULL
# where no department name matches the employee's dept.
df.join(deptdf, on=df["dept"] == deptdf["name"], how="left_outer").show()

+------+--------+---------+------+---------+-------+
|emp_id|    name|     dept|salary|     name|dept_id|
+------+--------+---------+------+---------+-------+
|     1|   Smith|  Finance|  1000|  Finance|     fi|
|     2|    Rose|Marketing|  2000|Marketing|     ma|
|     3|Williams|Marketing|  1000|Marketing|     ma|
|     4|   Jones|    Sales|  2000|    Sales|     sa|
|     5|   Brown|    Sales|  1000|    Sales|     sa|
|     6|   Katie|  Finance|  2000|  Finance|     fi|
|     7|   Linda|       IT|  2000|     NULL|   NULL|
|     8| Michael|       IT|  1000|     NULL|   NULL|
|    10|     Tom|  Finance|  2000|  Finance|     fi|
|     9| Johnson|Marketing|  1000|Marketing|     ma|
+------+--------+---------+------+---------+-------+



In [23]:
# RIGHT OUTER JOIN: keep every department row; the employee columns are NULL
# for a department that no employee belongs to.
df.join(deptdf, on=df["dept"] == deptdf["name"], how="right_outer").show()

+------+--------+---------+------+----------------+-------+
|emp_id|    name|     dept|salary|            name|dept_id|
+------+--------+---------+------+----------------+-------+
|    10|     Tom|  Finance|  2000|         Finance|     fi|
|     6|   Katie|  Finance|  2000|         Finance|     fi|
|     1|   Smith|  Finance|  1000|         Finance|     fi|
|     9| Johnson|Marketing|  1000|       Marketing|     ma|
|     3|Williams|Marketing|  1000|       Marketing|     ma|
|     2|    Rose|Marketing|  2000|       Marketing|     ma|
|     5|   Brown|    Sales|  1000|           Sales|     sa|
|     4|   Jones|    Sales|  2000|           Sales|     sa|
|  NULL|    NULL|     NULL|  NULL|Computer Science|     cs|
+------+--------+---------+------+----------------+-------+



In [24]:
# FULL OUTER JOIN: keep every row from both sides, with NULLs filling
# whichever side has no match.
df.join(deptdf, on=df["dept"] == deptdf["name"], how="full_outer").show()

+------+--------+---------+------+----------------+-------+
|emp_id|    name|     dept|salary|            name|dept_id|
+------+--------+---------+------+----------------+-------+
|  NULL|    NULL|     NULL|  NULL|Computer Science|     cs|
|     1|   Smith|  Finance|  1000|         Finance|     fi|
|     6|   Katie|  Finance|  2000|         Finance|     fi|
|    10|     Tom|  Finance|  2000|         Finance|     fi|
|     7|   Linda|       IT|  2000|            NULL|   NULL|
|     8| Michael|       IT|  1000|            NULL|   NULL|
|     2|    Rose|Marketing|  2000|       Marketing|     ma|
|     3|Williams|Marketing|  1000|       Marketing|     ma|
|     9| Johnson|Marketing|  1000|       Marketing|     ma|
|     4|   Jones|    Sales|  2000|           Sales|     sa|
|     5|   Brown|    Sales|  1000|           Sales|     sa|
+------+--------+---------+------+----------------+-------+



## SQL Queries
To run SQL queries against a DataFrame, first register it as a temporary view with `createOrReplaceTempView`.

In [25]:
# Register the DataFrame as a temporary view named "employee" so it can be queried with SQL
df.createOrReplaceTempView("employee")

# Run a SQL query against the view: fetch the employee named Brown
spark.sql('select * from employee where name ="Brown"').show()

+------+-----+-----+------+
|emp_id| name| dept|salary|
+------+-----+-----+------+
|     5|Brown|Sales|  1000|
+------+-----+-----+------+



In [26]:
# Distinct employee ids, queried through SQL on "employee".
distinct_emp_ids = spark.sql("select distinct emp_id from employee")
distinct_emp_ids.show()

+------+
|emp_id|
+------+
|     1|
|     2|
|     3|
|     5|
|     4|
|     6|
|     7|
|     8|
|     9|
|    10|
+------+



In [27]:
# Employees ordered by salary, highest first. The query specifies no further
# ordering among rows that share the same salary.
salary_ranking_sql = "select \
          emp_id, name, salary from employee \
          order by salary desc"
spark.sql(salary_ranking_sql).show()

+------+--------+------+
|emp_id|    name|salary|
+------+--------+------+
|    10|     Tom|  2000|
|     2|    Rose|  2000|
|     4|   Jones|  2000|
|     7|   Linda|  2000|
|     6|   Katie|  2000|
|     3|Williams|  1000|
|     5|   Brown|  1000|
|     8| Michael|  1000|
|     1|   Smith|  1000|
|     9| Johnson|  1000|
+------+--------+------+



## Read a HIVE Table (or Temp View) as a DataFrame

In [39]:
# Load "employee" by name as a DataFrame. In this notebook "employee" is the
# temporary view registered earlier with createOrReplaceTempView.
# Note: this rebinds `df`.
df = spark.read.table("employee")

In [40]:
# Display the DataFrame loaded from "employee".
df.show()

+------+--------+---------+------+
|emp_id|    name|     dept|salary|
+------+--------+---------+------+
|     1|   Smith|  Finance|  1000|
|     2|    Rose|Marketing|  2000|
|     3|Williams|Marketing|  1000|
|     4|   Jones|    Sales|  2000|
|     5|   Brown|    Sales|  1000|
|     6|   Katie|  Finance|  2000|
|     7|   Linda|       IT|  2000|
|     8| Michael|       IT|  1000|
|     9| Johnson|Marketing|  1000|
|    10|     Tom|  Finance|  2000|
+------+--------+---------+------+



## Create dataframe from CSV

In [45]:
# Read the stock-price CSV: comma-separated, first line is the header, and
# column types are inferred from the data. Note: this rebinds `df`.
# NOTE(review): the path is absolute and machine-specific; adjust it to where the CSV lives.
df = (
    spark.read
    .options(sep=",", header=True, inferSchema=True)
    .csv("D:/Data_Engineering/Apache_Spark/data/stocks_price_final.csv")
)

In [47]:
# Display the DataFrame read from the CSV file.
df.show()

+---+------+----------+---------+---------+---------+---------+-------+---------+----------+-------------+--------------------+--------+
|_c0|symbol|      date|     open|     high|      low|    close| volume| adjusted|market.cap|       sector|            industry|exchange|
+---+------+----------+---------+---------+---------+---------+-------+---------+----------+-------------+--------------------+--------+
|  1|   TXG|2019-09-12|       54|       58|       51|    52.75|7326300|    52.75|    $9.31B|Capital Goods|Biotechnology: La...|  NASDAQ|
|  2|   TXG|2019-09-13|    52.75|   54.355|49.150002|    52.27|1025200|    52.27|    $9.31B|Capital Goods|Biotechnology: La...|  NASDAQ|
|  3|   TXG|2019-09-16|52.450001|       56|52.009998|55.200001| 269900|55.200001|    $9.31B|Capital Goods|Biotechnology: La...|  NASDAQ|
|  4|   TXG|2019-09-17|56.209999|60.900002|   55.423|56.779999| 602800|56.779999|    $9.31B|Capital Goods|Biotechnology: La...|  NASDAQ|
|  5|   TXG|2019-09-18|56.849998|    62.2