In [1]:
from pyspark.sql import SparkSession
spark=SparkSession.builder \
    .appName("Read CSV example") \
    .getOrCreate()

In [3]:
raw_data = [
    ("U001","Abhishek",28,"Hyderabad",50000),
    ("U002","Neha",32,"Delhi",62000),
    ("U003","Ravi",25,"Bangalore",45000),
    ("U004","Pooja",29,"Mumbai",58000)
]



In [4]:
from pyspark.sql.types import(
    StructType,
    StructField,
    StringType,
    IntegerType,LongType
)

In [5]:
user_schema = StructType([
    StructField("user_id",StringType(),nullable=False),
    StructField("user_name",StringType(),nullable=True),
    StructField("age",IntegerType(),nullable=True),
    StructField("city",StringType(),nullable=True),
    StructField("salary",LongType(),nullable=True)
])

In [6]:
df_users=spark.createDataFrame(raw_data,schema=user_schema)

In [7]:
df_users.printSchema()

root
 |-- user_id: string (nullable = false)
 |-- user_name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: long (nullable = true)



In [8]:
df_users.show()

+-------+---------+---+---------+------+
|user_id|user_name|age|     city|salary|
+-------+---------+---+---------+------+
|   U001| Abhishek| 28|Hyderabad| 50000|
|   U002|     Neha| 32|    Delhi| 62000|
|   U003|     Ravi| 25|Bangalore| 45000|
|   U004|    Pooja| 29|   Mumbai| 58000|
+-------+---------+---+---------+------+



In [9]:
raw_data2=[
    ("P001","Abhishek","Thirty","Hyderabad",50000)

]

In [10]:
df_users=spark.createDataFrame(raw_data2,schema=user_schema)

PySparkTypeError: [FIELD_DATA_TYPE_UNACCEPTABLE_WITH_NAME] field age: IntegerType() can not accept object 'Thirty' in type <class 'str'>.

In [11]:
from pyspark.sql.types import ArrayType

In [12]:
interest_data = [
    ("U001",["AI","ML","Cloud"]),
    ("U002",["Testing","Automation"]),
    ("U003",["Data Engineering","Spark","Kafka"]),
    ("U004",["UI/UX"])
]

In [13]:
interest_schema = StructType([
    StructField("user_id",StringType(),nullable=False),
    StructField("interests",ArrayType(StringType()),nullable=True)
])

In [14]:
df_interests=spark.createDataFrame(interest_data,schema=interest_schema)
df_interests.printSchema()
df_interests.show(truncate=False)

root
 |-- user_id: string (nullable = false)
 |-- interests: array (nullable = true)
 |    |-- element: string (containsNull = true)

+-------+--------------------------------+
|user_id|interests                       |
+-------+--------------------------------+
|U001   |[AI, ML, Cloud]                 |
|U002   |[Testing, Automation]           |
|U003   |[Data Engineering, Spark, Kafka]|
|U004   |[UI/UX]                         |
+-------+--------------------------------+



In [15]:
from pyspark.sql.functions import explode
df_interests.select(
    "user_id",
    explode("interests").alias("interest")
).show()


+-------+----------------+
|user_id|        interest|
+-------+----------------+
|   U001|              AI|
|   U001|              ML|
|   U001|           Cloud|
|   U002|         Testing|
|   U002|      Automation|
|   U003|Data Engineering|
|   U003|           Spark|
|   U003|           Kafka|
|   U004|           UI/UX|
+-------+----------------+



In [16]:
from pyspark.sql.types import MapType

In [17]:
device_data = [
    ("U001",{"mobile":120,"laptop":300}),
    ("U002",{"tablet":80}),
    ("U003",{"mobile":200,"desktop":400}),
    ("U004",{"laptop":250})
]


In [18]:
device_schema=StructType([
    StructField("user_id",StringType(),nullable=False),
    StructField("device_usage",MapType(StringType(),LongType()),nullable=True)
])

In [19]:
df_devices=spark.createDataFrame(device_data,schema=device_schema)
df_devices.printSchema()
df_devices.show(truncate=False)

root
 |-- user_id: string (nullable = false)
 |-- device_usage: map (nullable = true)
 |    |-- key: string
 |    |-- value: long (valueContainsNull = true)

+-------+-------------------------------+
|user_id|device_usage                   |
+-------+-------------------------------+
|U001   |{mobile -> 120, laptop -> 300} |
|U002   |{tablet -> 80}                 |
|U003   |{mobile -> 200, desktop -> 400}|
|U004   |{laptop -> 250}                |
+-------+-------------------------------+



In [20]:
nested_data = [
    ("U001",("Hyderabad","Telangana",500081)),
    ("U002",("Delhi","Delhi",110001)),
    ("U003",("Bangalore","Karnataka",560001))
]

In [26]:
address_schema=StructType([
    StructField("city",StringType(),True),
    StructField("state",StringType(),True),
    StructField("pincode",StringType(),True)
])
profile_schema=StructType([
    StructField("user_id",StringType(),False),
    StructField("address",address_schema,True)
])

In [27]:
df_profiles=spark.createDataFrame(nested_data,profile_schema)
df_profiles.printSchema()
df_profiles.show(truncate=False)

root
 |-- user_id: string (nullable = false)
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- pincode: string (nullable = true)

+-------+------------------------------+
|user_id|address                       |
+-------+------------------------------+
|U001   |{Hyderabad, Telangana, 500081}|
|U002   |{Delhi, Delhi, 110001}        |
|U003   |{Bangalore, Karnataka, 560001}|
+-------+------------------------------+



In [30]:
df_profiles.select(
    "user_id",
    "address.city",
    "address.state",

).show()

+-------+---------+---------+
|user_id|     city|    state|
+-------+---------+---------+
|   U001|Hyderabad|Telangana|
|   U002|    Delhi|    Delhi|
|   U003|Bangalore|Karnataka|
+-------+---------+---------+



In [29]:
from pyspark.sql.functions import col
df_users.withColumn(
    "salary_int",
    col("salary").cast("int")
)

DataFrame[user_id: string, user_name: string, age: int, city: string, salary: bigint, salary_int: int]

In [31]:
from pyspark.sql.functions import to_date
df_orders.withColumn(
    "order_date",
    to_date("order_date","yyyy-MM-dd")
)

NameError: name 'df_orders' is not defined