In [0]:
import random
import pyspark.sql.functions as f
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

**Ex 01**

In [0]:
# Sample DataFrame
data = [(1,25), (2,35), (3,33), (4,39), (5,29), (6,22), (7,45), (8,48), (9,37), (10,55)]
df = spark.createDataFrame(data, ["id", "age"])
display(df)

id,age
1,25
2,35
3,33
4,39
5,29
6,22
7,45
8,48
9,37
10,55


**a) Add a Random Integer Column**

In [0]:
# Define a UDF for generating random integers
random_udf = udf(lambda: random.randint(1, 100), IntegerType())

# Add a new column with random integers
df_with_random = df.withColumn('random_int', random_udf())

display(df_with_random)

id,age,random_int
1,25,92
2,35,100
3,33,85
4,39,46
5,29,39
6,22,95
7,45,72
8,48,92
9,37,79
10,55,4


**b) Random Integers Within a Specific Range**

In [0]:
# UDF to generate random integers in a given range (e.g., 50 to 100)
random_in_range_udf = udf(lambda: random.randint(50, 100), IntegerType())

# Add a new column with random integers in the range 50 to 100
df_with_random_range = df.withColumn('random_in_range', random_in_range_udf())

display(df_with_random_range)

id,age,random_in_range
1,25,98
2,35,98
3,33,57
4,39,72
5,29,88
6,22,87
7,45,98
8,48,100
9,37,76
10,55,73


**c) Replace Existing Column Values with Random Integers**

In [0]:
# Replace the `id` column with random integers between 10 and 20
df_with_replaced_values = df.withColumn('age', random_in_range_udf())

display(df_with_replaced_values)

id,age
1,51
2,79
3,87
4,62
5,55
6,94
7,77
8,88
9,67
10,94


**d) Random Integers Based on an Existing Column Value**

In [0]:
# UDF to generate random integers based on the existing column value
random_based_on_column_udf = udf(lambda x: random.randint(x, x + 10), IntegerType())

# Add a new column with random integers based on the `id` column
df_with_random_based = df.withColumn('random_based_on_id', random_based_on_column_udf(df['id']))

display(df_with_random_based)

id,age,random_based_on_id
1,25,9
2,35,4
3,33,3
4,39,9
5,29,13
6,22,9
7,45,16
8,48,13
9,37,12
10,55,10


**e) Random Integers for Grouped Data**

In [0]:
# Group data and assign random values per group
df_grouped = df.groupBy().agg(
    f.lit(random.randint(1, 100)).alias('random_group_value')
)

display(df_grouped)

random_group_value
60


**f) Using randint for Conditional Column Values**

In [0]:
# UDF for conditional random values
conditional_random_udf = udf(lambda x: random.randint(1, 50) if x % 2 == 0 else random.randint(51, 100), IntegerType())

# Add a column with random values based on a condition
df_with_conditional_random = df.withColumn('conditional_random', conditional_random_udf(df['id']))

display(df_with_conditional_random)

id,age,conditional_random
1,25,100
2,35,34
3,33,72
4,39,47
5,29,77
6,22,31
7,45,74
8,48,48
9,37,81
10,55,6


**EX 02**

In [0]:
# Sample DataFrame with 10 rows
data = [(1, "Alice", "HR", 5000, 1),
        (2, "Bob", "IT", 6000, 2),
        (3, "Cathy", "HR", 5500, 3),
        (4, "David", "IT", 7000, 4),
        (5, "Eve", "HR", 8000, 5),
        (6, "Frank", "IT", 7500, 6),
        (7, "Grace", "HR", 6200, 7),
        (8, "Hank", "IT", 6800, 8),
        (9, "Ivy", "HR", 7800, 9),
        (10, "Jack", "IT", 6700, 10)]

columns = ["ID", "Name", "Department", "Salary", "cust_value"]

# Create DataFrame
df1 = spark.createDataFrame(data, columns)
display(df1)

ID,Name,Department,Salary,cust_value
1,Alice,HR,5000,1
2,Bob,IT,6000,2
3,Cathy,HR,5500,3
4,David,IT,7000,4
5,Eve,HR,8000,5
6,Frank,IT,7500,6
7,Grace,HR,6200,7
8,Hank,IT,6800,8
9,Ivy,HR,7800,9
10,Jack,IT,6700,10


In [0]:
# Define UDF for random salary values (e.g., between 4000 and 10000)
@udf(IntegerType())
def random_salary():
    return random.randint(4000, 10000)

# Define UDF for random cust_value values (e.g., between 1 and 20)
@udf(IntegerType())
def random_cust_value():
    return random.randint(1, 20)

#### **1) Replace random integers for single column**

In [0]:
# Replace the `cust_value` column with random integers
df_with_random_values = df1.withColumn("cust_value", random_in_range_udf())

# Show the result
display(df_with_random_values)

ID,Name,Department,Salary,cust_value
1,Alice,HR,5000,92
2,Bob,IT,6000,74
3,Cathy,HR,5500,97
4,David,IT,7000,72
5,Eve,HR,8000,70
6,Frank,IT,7500,78
7,Grace,HR,6200,85
8,Hank,IT,6800,83
9,Ivy,HR,7800,78
10,Jack,IT,6700,63


#### **2) Replace random integers for multiple columns**

In [0]:
# Update the columns with random values
df_updated = df1.withColumn("Salary", random_salary())\
                .withColumn("cust_value", random_cust_value())

# Show the result
display(df_updated)

ID,Name,Department,Salary,cust_value
1,Alice,HR,4470,17
2,Bob,IT,9752,16
3,Cathy,HR,6735,4
4,David,IT,6395,17
5,Eve,HR,6224,4
6,Frank,IT,6357,7
7,Grace,HR,4259,17
8,Hank,IT,4269,5
9,Ivy,HR,7306,15
10,Jack,IT,9428,11
