In [2]:
from pyspark.sql import SparkSession

In [3]:
# Obtain the notebook's SparkSession: a local master with a single worker
# thread, registered under the application name 'SparkApp'. getOrCreate()
# returns an already-running session if one exists.
spark = (
    SparkSession.builder
    .master('local[1]')
    .appName('SparkApp')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/20 08:11:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the text file as an RDD with one string element per line.
# NOTE(review): the path is relative — presumably resolved against the
# notebook's working directory; confirm where data/test.txt lives.
rdd2 = spark.sparkContext.textFile("data/test.txt")

In [5]:
# Count the elements (lines) in the RDD; this is an action, so it triggers the read.
rdd2.count()

                                                                                

8

In [6]:
# Peek at the first element: each record is a raw comma-separated string,
# not yet split into fields.
rdd2.first()

'536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,17850,United Kingdom'

In [7]:
# Request up to 10 elements. The RDD holds only 8 lines (see the count
# above), so take(10) returns all 8 rather than failing.
rdd2.take(10)

['536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,17850,United Kingdom',
 '536365,71053,WHITE METAL LANTERN,6,12/1/10 8:26,3.39,17850,United Kingdom',
 '536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 8:26,2.75,17850,United Kingdom',
 '536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 8:26,3.39,17850,United Kingdom',
 '536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 8:26,3.39,17850,United Kingdom',
 '536365,22752,SET 7 BABUSHKA NESTING BOXES,2,12/1/10 8:26,7.65,17850,United Kingdom',
 '536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,12/1/10 8:26,4.25,17850,United Kingdom',
 '536366,22633,HAND WARMER UNION JACK,6,12/1/10 8:28,1.85,17850,United Kingdom']

In [8]:
# Raw sample rows for the flat DataFrame (the DataFrame itself is created in
# a later cell). Each tuple is one person; the six positional fields are
# named by the `columns` list defined in the next cell.
data = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1),
]

In [9]:
# Column names for the flat DataFrame, matched by position to the fields of
# each tuple in `data`.
columns = [
    "firstname",
    "middlename",
    "lastname",
    "dob",
    "gender",
    "salary",
]

In [10]:
# Build the flat DataFrame from the sample rows; column types are inferred
# from the Python values because only column names are supplied.
df = spark.createDataFrame(data=data, schema=columns)

In [11]:
# Show up to 5 rows without truncating cell contents. `truncate=False` is the
# explicit, self-describing spelling of the positional `0` used previously;
# the rendered table is the same.
df.show(n=5, truncate=False)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|dob       |gender|salary|
+---------+----------+--------+----------+------+------+
|James    |          |Smith   |1991-04-01|M     |3000  |
|Michael  |Rose      |        |2000-05-19|M     |4000  |
|Robert   |          |Williams|1978-09-05|M     |4000  |
|Maria    |Anne      |Jones   |1967-12-01|F     |4000  |
|Jen      |Mary      |Brown   |1980-02-17|F     |-1    |
+---------+----------+--------+----------+------+------+



In [12]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Sample rows for the nested-schema DataFrame. Each row is
# ((firstname, middlename, lastname), dob, gender, salary): the name parts
# are grouped into an inner tuple so they can map onto a struct column.
dataDF = [
    (('James', '', 'Smith'), '1991-04-01', 'M', 3000),
    (('Michael', 'Rose', ''), '2000-05-19', 'M', 4000),
    (('Robert', '', 'Williams'), '1978-09-05', 'M', 4000),
    (('Maria', 'Anne', 'Jones'), '1967-12-01', 'F', 4000),
    (('Jen', 'Mary', 'Brown'), '1980-02-17', 'F', -1),
]

# Explicit schema for `dataDF`: a nested `name` struct (three nullable string
# subfields) followed by three flat, nullable columns.
schema = StructType([
    StructField(
        'name',
        StructType([
            StructField(part, StringType(), True)
            for part in ('firstname', 'middlename', 'lastname')
        ]),
    ),
    StructField('dob', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', IntegerType(), True),
])

In [13]:
# Display the schema object's repr to check the nested structure.
schema

StructType([StructField('name', StructType([StructField('firstname', StringType(), True), StructField('middlename', StringType(), True), StructField('lastname', StringType(), True)]), True), StructField('dob', StringType(), True), StructField('gender', StringType(), True), StructField('salary', IntegerType(), True)])

In [14]:
# Build the nested-schema DataFrame from `dataDF` using the explicit schema.
# NOTE: this rebinds `df`, replacing the flat DataFrame created earlier from
# `data`/`columns`; every later cell that uses `df` sees this nested version.
df = spark.createDataFrame(data=dataDF, schema=schema)

In [15]:
# Print the schema tree to confirm `name` is a struct with three string subfields.
df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [16]:
# Rename the top-level `dob` column to `birthdate`, rebinding `df` to the result.
# NOTE(review): per the PySpark docs, withColumnRenamed is a no-op when the
# source column is absent, so re-running this cell should be harmless — confirm.
df = df.withColumnRenamed("dob", "birthdate")

In [17]:
# Show the rows; the struct column renders as {firstname, middlename, lastname}.
df.show()

+--------------------+----------+------+------+
|                name| birthdate|gender|salary|
+--------------------+----------+------+------+
|    {James, , Smith}|1991-04-01|     M|  3000|
|   {Michael, Rose, }|2000-05-19|     M|  4000|
|{Robert, , Williams}|1978-09-05|     M|  4000|
|{Maria, Anne, Jones}|1967-12-01|     F|  4000|
|  {Jen, Mary, Brown}|1980-02-17|     F|    -1|
+--------------------+----------+------+------+



In [18]:
# Rename two columns by chaining withColumnRenamed calls (one call per
# column), then print the schema to confirm the new names.
df = (
    df
    .withColumnRenamed("birthdate", "DateOfBirth")
    .withColumnRenamed("salary", "salary_usd")
)
df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary_usd: integer (nullable = true)



In [19]:
# Replacement struct type for the `name` column: same three string fields,
# with the first and last subfields renamed to fname / lname.
schema2 = StructType([
    StructField(subfield, StringType())
    for subfield in ("fname", "middlename", "lname")
])

In [20]:
from pyspark.sql.functions import col

In [21]:
# Rename the nested subfields by casting the whole `name` struct to
# `schema2`; the other columns pass through unchanged. The result is only
# inspected here, not assigned, so `df` itself keeps its original subfields.
df.select(
    col("name").cast(schema2),
    "DateOfBirth",
    "gender",
    "salary_usd",
).printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- fname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary_usd: integer (nullable = true)



In [22]:
# List the top-level column names; nested subfields of `name` are not included.
df.columns

['name', 'DateOfBirth', 'gender', 'salary_usd']

In [24]:
# NOTE(review): wildcard import. pyspark.sql.functions exports names such as
# sum, min, max, abs and round, which shadow the Python builtins for the rest
# of the notebook. `col` is already imported explicitly in an earlier cell;
# prefer explicit imports (or `import pyspark.sql.functions as F`) once it is
# confirmed which names later cells actually use.
from pyspark.sql.functions import *

# Flatten the `name` struct into three top-level columns via dotted paths
# and aliases, keeping the remaining columns as-is. Only the resulting
# schema is printed; nothing is assigned.
df.select(
    col("name.firstname").alias("fname"),
    col("name.middlename").alias("mname"),
    col("name.lastname").alias("lname"),
    "DateOfBirth",
    "gender",
    "salary_usd",
).printSchema()

root
 |-- fname: string (nullable = true)
 |-- mname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary_usd: integer (nullable = true)



In [33]:
# Flatten `name` with withColumn instead of select: each subfield is appended
# as a new top-level column, then the original struct is dropped. The new
# columns therefore land after the existing ones.
df4 = (
    df
    .withColumn("fname", col("name.firstname"))
    .withColumn("mname", col("name.middlename"))
    .withColumn("lname", col("name.lastname"))
    .drop("name")
)
df4.printSchema()

root
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary_usd: integer (nullable = true)
 |-- fname: string (nullable = true)
 |-- mname: string (nullable = true)
 |-- lname: string (nullable = true)



In [34]:
# Confirm that `df` is unchanged: the transformations above returned new
# DataFrames and left the nested `name` struct in place.
df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary_usd: integer (nullable = true)



In [35]:
# Show the flattened rows; the name parts are now ordinary string columns.
df4.show()

+-----------+------+----------+-------+-----+--------+
|DateOfBirth|gender|salary_usd|  fname|mname|   lname|
+-----------+------+----------+-------+-----+--------+
| 1991-04-01|     M|      3000|  James|     |   Smith|
| 2000-05-19|     M|      4000|Michael| Rose|        |
| 1978-09-05|     M|      4000| Robert|     |Williams|
| 1967-12-01|     F|      4000|  Maria| Anne|   Jones|
| 1980-02-17|     F|        -1|    Jen| Mary|   Brown|
+-----------+------+----------+-------+-----+--------+



In [36]:
# Rename every column at once with toDF. Names are applied by position, so
# this list must follow df4's current column order
# (DateOfBirth, gender, salary_usd, fname, mname, lname).
new_columns = [
    "dob",
    "sex",
    "salary_amt",
    "first_name",
    "middle_name",
    "last_name",
]
df4 = df4.toDF(*new_columns)

In [37]:
# Verify the bulk rename: all six columns should now carry the new names.
df4.printSchema()

root
 |-- dob: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- salary_amt: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)

