In [1]:
# import classes for python application
from pyspark.sql import SparkSession, DataFrame, functions, Row
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *

import pandas as pd
import os
import sys
import pyarrow as pa
import pyarrow.parquet as pq
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# create spark session, configuration and context
spark = SparkSession.builder.appName("DataFrame").getOrCreate()
conf = SparkConf().setMaster("local").setAppName("Assignment 2")
sc = SparkContext.getOrCreate(conf=conf)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/12 10:20:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/02/12 10:20:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# load text data and create dataframe
df1 = spark.read.text('emps.txt')
#show schema
df1.show()
#print schema
df1.printSchema()

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+
|               value|
+--------------------+
|Michael, 29, 3000.30|
|   Andy, 30, 2500.25|
| Justin, 19, 4000.99|
+--------------------+

root
 |-- value: string (nullable = true)



                                                                                

In [3]:
#load text data 
emps_rdd = sc.textFile('emps.txt')
#transform data to tuple for dataframe
emps = emps_rdd.map(lambda e: tuple(e.split(",")))

# create spark dataframe and schema during import
df2 = spark.createDataFrame(emps, schema=['name', 'age', 'salary'])
# update types for columns
df2 = df2.withColumn("age", df2.age.cast(IntegerType()))
df2 = df2.withColumn("salary", df2.salary.cast(FloatType()))
#show schema
df2.show()
#print schema
df2.printSchema()

                                                                                

+-------+---+-------+
|   name|age| salary|
+-------+---+-------+
|Michael| 29| 3000.3|
|   Andy| 30|2500.25|
| Justin| 19|4000.99|
+-------+---+-------+

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: float (nullable = true)



In [4]:
#select content of new df
df2.select("*").show()

+-------+---+-------+
|   name|age| salary|
+-------+---+-------+
|Michael| 29| 3000.3|
|   Andy| 30|2500.25|
| Justin| 19|4000.99|
+-------+---+-------+



In [5]:
#display all details 
df2.select("*").describe().show()

[Stage 4:>                                                          (0 + 1) / 1]

+-------+-------+-----------------+------------------+
|summary|   name|              age|            salary|
+-------+-------+-----------------+------------------+
|  count|      3|                3|                 3|
|   mean|   null|             26.0|3167.1800130208335|
| stddev|   null|6.082762530298219| 764.1608609506821|
|    min|   Andy|               19|           2500.25|
|    max|Michael|               30|           4000.99|
+-------+-------+-----------------+------------------+



                                                                                

In [6]:
#transform data into Global temp table and show table
table = pa.Table.from_pandas(df2.toPandas())
pq.write_table(table, 'df.parquet')
df_parquet = spark.read.parquet('df.parquet')
df_parquet.show()

                                                                                

+-------+---+-------+
|   name|age| salary|
+-------+---+-------+
|Michael| 29| 3000.3|
|   Andy| 30|2500.25|
| Justin| 19|4000.99|
+-------+---+-------+



In [7]:
# select names of employees with salary greater than 3500 using filter
df_parquet.filter(df_parquet.salary > 3500).show()

+------+---+-------+
|  name|age| salary|
+------+---+-------+
|Justin| 19|4000.99|
+------+---+-------+



In [8]:
#select names of employees with salary greater than 3500 using select
df_parquet.select(df_parquet.name, df_parquet.salary > 3500).show()

+-------+---------------+
|   name|(salary > 3500)|
+-------+---------------+
|Michael|          false|
|   Andy|          false|
| Justin|           true|
+-------+---------------+



In [9]:
#parse json string and convert to pandas dataframe
df3 = spark.read.format('json').load("employee.json")
df3 = df3.drop('id')
df3 = df3.withColumn("age", df3.age.cast('bigint'))
df3

DataFrame[age: bigint, name: string]

In [10]:
#group by age, select count using pyspark.sql functions, change names and show dataframe
df3.groupBy('age')\
        .count()\
        .select(functions.col('count')\
        .alias('numberOfEmployees'), df3.age.alias('ageGroup')).show()

+-----------------+--------+
|numberOfEmployees|ageGroup|
+-----------------+--------+
|                1|      39|
|                2|      33|
|                1|      35|
|                1|      38|
+-----------------+--------+



In [11]:
#load text in an RDD instance and view data
result = emps_rdd.collect()
result

['Michael, 29, 3000.30', 'Andy, 30, 2500.25', 'Justin, 19, 4000.99']

In [12]:
# split lines into tuples and create rows for spark dataframe
emps = emps_rdd.map(lambda e: tuple(e.split(",")))\
                .map(lambda e: Row(name=e[0], age=int(e[1]), salary=float(e[2])))

In [13]:
# create and show spark dataframe and print schema
emps_df = spark.createDataFrame(emps)
emps_df.show()
emps_df.printSchema()

+-------+---+-------+
|   name|age| salary|
+-------+---+-------+
|Michael| 29| 3000.3|
|   Andy| 30|2500.25|
| Justin| 19|4000.99|
+-------+---+-------+

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: double (nullable = true)



In [14]:
# select names of employees with salary greater than 3500
emps_df.filter(emps_df.salary > 3500).show()

+------+---+-------+
|  name|age| salary|
+------+---+-------+
|Justin| 19|4000.99|
+------+---+-------+



In [15]:
#select names of employees with salary greater than 3500 with temporary table
emps_df.createOrReplaceTempView("tableEmps")
spark.sql("SELECT * FROM tableEmps WHERE salary > 3500").show()

+------+---+-------+
|  name|age| salary|
+------+---+-------+
|Justin| 19|4000.99|
+------+---+-------+



In [16]:
#full answers are in problem4_01.py and problem4_02.py script files

emps_df.count()

3

In [17]:
emps_df.toPandas().to_csv('emps_update.csv')

In [18]:
spark.read.csv('emps_update.csv', header=True).show()

+---+-------+---+-------+
|_c0|   name|age| salary|
+---+-------+---+-------+
|  0|Michael| 29| 3000.3|
|  1|   Andy| 30|2500.25|
|  2| Justin| 19|4000.99|
+---+-------+---+-------+



22/02/12 10:20:32 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , name, age, salary
 Schema: _c0, name, age, salary
Expected: _c0 but found: 
CSV file: file:///home/f_dev/big_data/emps_update.csv


In [19]:
#stop spark context and spark
sc.stop()
spark.stop()