In [2]:
# Import statements
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, StructField, IntegerType
import pyspark.sql.functions as func

In [3]:
# Build the SparkSession for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("sampleSession")
    .getOrCreate()
)

In [4]:
# Explicit schema for the DataFrame: four columns, each declared nullable
myschema = (
    StructType()
    .add("UserID", IntegerType(), True)
    .add("Name", StringType(), True)
    .add("Age", IntegerType(), True)
    .add("Friends", IntegerType(), True)
)

In [5]:
# Load fakefriends.csv into a DataFrame, applying the explicit schema
# and treating the first line of the file as a header row
people = spark.read.csv("fakefriends.csv", schema=myschema, header=True)

In [6]:
# Transformation: keep rows with Age below 30, add an insert_ts column
# holding the current timestamp, and sort the result by UserID
output = (
    people
    .select("UserID", "Name", "Age", "Friends")
    .where(func.col("Age") < 30)
    .withColumn("insert_ts", func.current_timestamp())
    .orderBy("UserID")
)

In [7]:
# Display the filtered, timestamped result as a text table.
# Long values such as insert_ts appear truncated ("...") in the printed output.
output.show()

+------+-----+---+-------+--------------------+
|UserID| Name|Age|Friends|           insert_ts|
+------+-----+---+-------+--------------------+
|     1| John| 28|    150|2024-07-03 09:11:...|
|     2| Jane| 22|    200|2024-07-03 09:11:...|
|     4|Alice| 19|    300|2024-07-03 09:11:...|
|     5|  Tom| 25|    100|2024-07-03 09:11:...|
|     7|James| 27|    180|2024-07-03 09:11:...|
|     8|Linda| 24|    220|2024-07-03 09:11:...|
|    10|David| 23|    140|2024-07-03 09:11:...|
+------+-----+---+-------+--------------------+



In [9]:
# Number of rows remaining after the Age < 30 filter
output.count()

7

In [11]:
# Print the physical plan Spark uses to produce `people` (a CSV file scan)
people.explain()

== Physical Plan ==
FileScan csv [UserID#0,Name#1,Age#2,Friends#3] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/G:/studies/pyspark/fakefriends.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<UserID:int,Name:string,Age:int,Friends:int>




In [14]:
# Preview the unfiltered rows loaded from the CSV file
people.show()

+------+------+---+-------+
|UserID|  Name|Age|Friends|
+------+------+---+-------+
|     1|  John| 28|    150|
|     2|  Jane| 22|    200|
|     3|   Bob| 35|     50|
|     4| Alice| 19|    300|
|     5|   Tom| 25|    100|
|     6|  Emma| 30|    250|
|     7| James| 27|    180|
|     8| Linda| 24|    220|
|     9|Robert| 31|    170|
|    10| David| 23|    140|
+------+------+---+-------+



In [16]:
# Total number of rows loaded from the CSV file, before any filtering
people.count()

10

In [17]:
# Print each column's name, data type, and nullability for `people`
people.printSchema()

root
 |-- UserID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Friends: integer (nullable = true)

