In [2]:
import pyspark
import pandas as pd

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# NOTE: the wildcard import below shadows Python builtins such as sum, max and min;
# prefer the explicit F alias in new code.
from pyspark.sql.functions import *

# Create SparkSession

In [3]:
# Build (or reuse) the Spark session for this notebook under the app name "DataFrame".
spark = (
    SparkSession.builder
    .appName("DataFrame")
    .getOrCreate()
)

In [7]:
# Sample data: employee records (emp_id, name, dept, salary) and a
# department lookup table (name, dept_id).
# NOTE(review): emp uses "IT" while dept lists "Computer Science" -- presumably
# intentional so that later joins have unmatched rows on both sides; confirm.
emp = [
    (1, "Smith", "Finance", 1000),
    (2, "Rose", "Marketing", 2000),
    (3, "Williams", "Marketing", 1000),
    (4, "Jones", "Sales", 2000),
    (5, "Brown", "Sales", 1000),
    (6, "Katie", "Finance", 2000),
    (7, "Linda", "IT", 2000),
    (8, "Michael", "IT", 1000),
    (9, "Johnson", "Marketing", 1000),
    (10, "Tom", "Finance", 2000),
]

dept = [
    ("Finance", "fi"),
    ("Marketing", "ma"),
    ("Sales", "sa"),
    ("Computer Science", "cs"),
]

# Only column names are given here; no explicit column types are supplied.
df = spark.createDataFrame(emp, schema=["emp_id", "name", "dept", "salary"])
deptdf = spark.createDataFrame(dept, schema=["name", "dept_id"])

In [8]:
# Display the employee DataFrame as a text table
df.show()

+------+--------+---------+------+
|emp_id|    name|     dept|salary|
+------+--------+---------+------+
|     1|   Smith|  Finance|  1000|
|     2|    Rose|Marketing|  2000|
|     3|Williams|Marketing|  1000|
|     4|   Jones|    Sales|  2000|
|     5|   Brown|    Sales|  1000|
|     6|   Katie|  Finance|  2000|
|     7|   Linda|       IT|  2000|
|     8| Michael|       IT|  1000|
|     9| Johnson|Marketing|  1000|
|    10|     Tom|  Finance|  2000|
+------+--------+---------+------+



In [9]:
# Display the department lookup DataFrame as a text table
deptdf.show()

+----------------+-------+
|            name|dept_id|
+----------------+-------+
|         Finance|     fi|
|       Marketing|     ma|
|           Sales|     sa|
|Computer Science|     cs|
+----------------+-------+



# Basic Operations on DataFrame

In [11]:
# Count: number of rows in the DataFrame
df.count()

10

In [12]:
# Columns: list of the DataFrame's column names, in order
df.columns

['emp_id', 'name', 'dept', 'salary']

In [13]:
# Dtypes: list of (column name, Spark type name) pairs, one per column
df.dtypes

[('emp_id', 'bigint'),
 ('name', 'string'),
 ('dept', 'string'),
 ('salary', 'bigint')]

In [14]:
# Schema: how Spark stores the schema of the DataFrame (column name, data type, nullable?)
df.schema

StructType([StructField('emp_id', LongType(), True), StructField('name', StringType(), True), StructField('dept', StringType(), True), StructField('salary', LongType(), True)])

In [15]:
# Print the schema as a tree: one line per column with its name, type and nullability
df.printSchema()

root
 |-- emp_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- salary: long (nullable = true)



In [16]:
# Select: project only the employee name and department columns
(
    df
    .select("name", "dept")
    .show()
)

+--------+---------+
|    name|     dept|
+--------+---------+
|   Smith|  Finance|
|    Rose|Marketing|
|Williams|Marketing|
|   Jones|    Sales|
|   Brown|    Sales|
|   Katie|  Finance|
|   Linda|       IT|
| Michael|       IT|
| Johnson|Marketing|
|     Tom|  Finance|
+--------+---------+



In [17]:
# Filter: keep only the rows that satisfy a column predicate
in_sales = df["dept"] == "Sales"
is_emp_2 = df["emp_id"] == 2

df.filter(in_sales).show()
df.filter(is_emp_2).show()

+------+-----+-----+------+
|emp_id| name| dept|salary|
+------+-----+-----+------+
|     4|Jones|Sales|  2000|
|     5|Brown|Sales|  1000|
+------+-----+-----+------+

+------+----+---------+------+
|emp_id|name|     dept|salary|
+------+----+---------+------+
|     2|Rose|Marketing|  2000|
+------+----+---------+------+



In [19]:
# Drop a column: drop() returns a new DataFrame without "salary".
# df itself is not modified, so the result is bound to a separate name.
dropped = df.drop("salary")
dropped.show()

+------+--------+---------+
|emp_id|    name|     dept|
+------+--------+---------+
|     1|   Smith|  Finance|
|     2|    Rose|Marketing|
|     3|Williams|Marketing|
|     4|   Jones|    Sales|
|     5|   Brown|    Sales|
|     6|   Katie|  Finance|
|     7|   Linda|       IT|
|     8| Michael|       IT|
|     9| Johnson|Marketing|
|    10|     Tom|  Finance|
+------+--------+---------+



In [20]:
# Re-display the original DataFrame: the salary column is still present after the drop above
df.show()

+------+--------+---------+------+
|emp_id|    name|     dept|salary|
+------+--------+---------+------+
|     1|   Smith|  Finance|  1000|
|     2|    Rose|Marketing|  2000|
|     3|Williams|Marketing|  1000|
|     4|   Jones|    Sales|  2000|
|     5|   Brown|    Sales|  1000|
|     6|   Katie|  Finance|  2000|
|     7|   Linda|       IT|  2000|
|     8| Michael|       IT|  1000|
|     9| Johnson|Marketing|  1000|
|    10|     Tom|  Finance|  2000|
+------+--------+---------+------+



In [22]:
# Aggregation
# Use groupBy to group the rows by department, then agg to compute several
# salary aggregates per group. The aggregate functions are referenced through
# the F alias so it is explicit that these are Spark column functions and not
# the Python builtins sum/max/min that the wildcard import shadows.
(
    df.groupBy("dept")
    .agg(
        F.count("salary").alias("count"),
        F.sum("salary").alias("sum"),
        F.avg("salary").alias("avg"),
        F.max("salary").alias("max"),
        F.min("salary").alias("min"),
    )
    .show()
)

+---------+-----+----+------------------+----+----+
|     dept|count| sum|               avg| max| min|
+---------+-----+----+------------------+----+----+
|  Finance|    3|5000|1666.6666666666667|2000|1000|
|Marketing|    3|4000|1333.3333333333333|2000|1000|
|    Sales|    2|3000|            1500.0|2000|1000|
|       IT|    2|3000|            1500.0|2000|1000|
+---------+-----+----+------------------+----+----+



In [27]:
# Sort: highest salary first, ties broken by ascending emp_id; print only the first 5 rows
ordering = [desc("salary"), asc("emp_id")]
df.sort(*ordering).show(5)

+------+-----+---------+------+
|emp_id| name|     dept|salary|
+------+-----+---------+------+
|     2| Rose|Marketing|  2000|
|     4|Jones|    Sales|  2000|
|     6|Katie|  Finance|  2000|
|     7|Linda|       IT|  2000|
|    10|  Tom|  Finance|  2000|
+------+-----+---------+------+
only showing top 5 rows



In [28]:
# Derived columns: withColumn adds a new column computed from an existing one
bonus_rate = 0.2  # bonus expressed as a fraction of salary
df.withColumn("bonus", col("salary") * bonus_rate).show()

+------+--------+---------+------+-----+
|emp_id|    name|     dept|salary|bonus|
+------+--------+---------+------+-----+
|     1|   Smith|  Finance|  1000|200.0|
|     2|    Rose|Marketing|  2000|400.0|
|     3|Williams|Marketing|  1000|200.0|
|     4|   Jones|    Sales|  2000|400.0|
|     5|   Brown|    Sales|  1000|200.0|
|     6|   Katie|  Finance|  2000|400.0|
|     7|   Linda|       IT|  2000|400.0|
|     8| Michael|       IT|  1000|200.0|
|     9| Johnson|Marketing|  1000|200.0|
|    10|     Tom|  Finance|  2000|400.0|
+------+--------+---------+------+-----+



In [None]:
# Joins
