In [0]:
# Distribute a tiny in-memory list of (name, age) tuples as an RDD.
# NOTE(review): the second field is labelled "age" further down, but the
# values look like years -- confirm the intended meaning.
myRDD = sc.parallelize([
    ('sampath', 1998),
    ('rohith', 1998),
    ('rahul', 1998),
])

In [0]:
# Return every element of the RDD as a Python list (fine for this three-row demo).
myRDD.collect()

[('sampath', 1998), ('rohith', 1998), ('rahul', 1998)]

In [0]:
# or: iterate over the rows on the driver.
# toLocalIterator() yields the elements in order without first running a
# separate count() job just to size a take() call over the same data.

for row in myRDD.toLocalIterator():
  print(row)

('sampath', 1998)
('rohith', 1998)
('rahul', 1998)


In [0]:
from pyspark.sql import types

# Explicit schema for the (name, age) tuples: one string column and one
# 64-bit integer column. Both fields keep the default nullable=True.
DFSchema = types.StructType(
  fields=[
    types.StructField(name='name', dataType=types.StringType()),
    types.StructField(name='age', dataType=types.LongType()),
  ]
)

In [0]:
# Three routes from the RDD to a DataFrame:
#   df1 - SparkSession.createDataFrame with the explicit schema
#   df2 - RDD.toDF with the same explicit schema
#   df3 - RDD.toDF with column names only (column types are inferred)
df1 = spark.createDataFrame(myRDD, schema=DFSchema)
df2 = myRDD.toDF(schema=DFSchema)
df3 = myRDD.toDF(schema=['name', 'age'])

In [0]:
# Print df1 (built with spark.createDataFrame and the explicit schema) as a text table.
df1.show()

+-------+----+
|   name| age|
+-------+----+
|sampath|1998|
| rohith|1998|
|  rahul|1998|
+-------+----+



In [0]:
# Print df2 (built with myRDD.toDF and the explicit schema) as a text table.
df2.show()

+-------+----+
|   name| age|
+-------+----+
|sampath|1998|
| rohith|1998|
|  rahul|1998|
+-------+----+



In [0]:
# Print df3 (built with myRDD.toDF and column names only) as a text table.
df3.show()

+-------+----+
|   name| age|
+-------+----+
|sampath|1998|
| rohith|1998|
|  rahul|1998|
+-------+----+



In [0]:
# Print each column of df1 with its type and nullability.
df1.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [0]:
# Number of rows in df1.
df1.count()

3

In [0]:
# Column names of df1 as a Python list.
df1.columns

['name', 'age']

In [0]:
# Report how many partitions back df1's underlying RDD.
num_partitions = df1.rdd.getNumPartitions()
print("Partitions", num_partitions)

Partitions 4


In [0]:
# DataFrames are immutable: repartition() returns a NEW DataFrame and leaves
# df1's partitioning untouched, so the result has to be bound to a name to be
# usable. The bare expression on the last line keeps the cell's displayed repr.
df1_repartitioned = df1.repartition(2)
df1_repartitioned

DataFrame[name: string, age: bigint]

In [0]:
# Register df1 as the temporary view "sample" so it can be queried from SQL.
df1.createOrReplaceTempView("sample")

In [0]:
%sql

-- Read back every row of the "sample" temp view registered from df1.
SELECT * FROM sample

name,age
sampath,1998
rohith,1998
rahul,1998


In [0]:
# Keep only the row whose name is exactly 'rohith', using a SQL-string predicate.
rohith_rows = df1.select("name", "age").where("name = 'rohith'")
display(rohith_rows)

name,age
rohith,1998


In [0]:
from pyspark.sql import functions as F

# Same kind of filter expressed with a Column predicate: names starting with "r".
starts_with_r = F.col("name").like("r%")
names_with_r = df1.select("*").where(starts_with_r)
display(names_with_r)

name,age
rohith,1998
rahul,1998


In [0]:
# Aggregate the whole DataFrame down to one Row holding (min(age), max(age)),
# then unpack that Row positionally.
age_bounds = df1.select(F.min("age"), F.max("age")).head()
minValue, maxValue = age_bounds

print("Min: {}, Max: {}".format(minValue, maxValue))

Min: 1998, Max: 1998


In [0]:
# Derive the doubled column with a native Column expression.
doubled_age = (F.col("age") * 2).alias("ageX2")
result_df = df1.select("name", "age", doubled_age)

display(result_df)

name,age,ageX2
sampath,1998,3996
rohith,1998,3996
rahul,1998,3996


In [0]:
# F is pyspark.sql.functions (imported in an earlier cell); referencing F.udf
# avoids depending on a `udf` name injected by the notebook environment.
@F.udf("long")
def multiplyBy2(age):
  """Spark UDF (declared return type "long") that doubles ``age``.

  Args:
    age: value of the ``age`` column for one row. May be ``None`` because
      the column is nullable.

  Returns:
    ``age * 2``, or ``None`` when ``age`` is ``None`` so that nulls propagate
    the same way they do for the native expression ``F.col("age") * 2``
    instead of raising ``TypeError`` inside the executor.
  """
  if age is None:
    return None
  return age * 2

# Same result as the native column expression, but computed row by row
# through the Python UDF.
doubled_via_udf = multiplyBy2("age").alias("ageX2")
result_df = df1.select("name", "age", doubled_via_udf)

display(result_df)

name,age,ageX2
sampath,1998,3996
rohith,1998,3996
rahul,1998,3996


In [0]:
from pyspark.sql import Row

def transformRow(row):
  """Build a new Row from ``row`` with an extra ``ageX2`` field.

  Args:
    row: a record exposing ``name`` and ``age`` attributes.

  Returns:
    Row with fields ``name``, ``age`` and ``ageX2`` (``row.age * 2``).
  """
  # Keyword expansion of a dict is equivalent to spelling the fields out as
  # Row(name=..., age=..., ageX2=...).
  fields = {"name": row.name, "age": row.age, "ageX2": row.age * 2}
  return Row(**fields)

# Drop down to the RDD API, transform each Row in Python, then go back to a
# DataFrame (the schema is inferred from the returned Rows).
doubled_rows = df1.rdd.map(transformRow)
result_df = doubled_rows.toDF()

display(result_df)

name,age,ageX2
sampath,1998,3996
rohith,1998,3996
rahul,1998,3996


In [0]:
# Advanced

def rowExpander(x):
  """Generator that emits two Rows for every input record.

  Args:
    x: a record exposing ``name`` and ``age`` attributes.

  Yields:
    First a Row with the original ``age``, then a Row with ``age * 2``;
    both carry the same ``name``.
  """
  yield Row(name=x.name, age=x.age)
  yield Row(name=x.name, age=x.age * 2)
  
# flatMap flattens the Rows yielded by rowExpander, so each input row
# contributes two output rows.
expanded_rows = df1.rdd.flatMap(rowExpander)
result_df = expanded_rows.toDF()

display(result_df)

name,age
sampath,1998
sampath,3996
rohith,1998
rohith,3996
rahul,1998
rahul,3996
