#### **rand()**
- The **rand()** function in PySpark generates a column with **random values uniformly distributed in the range [0.0, 1.0)**.
- The **spark.range()** method in PySpark creates a DataFrame with a single `id` column holding a **sequence of numbers** (similar to Python's built-in **range()**).

In [0]:
from pyspark.sql.functions import rand

In [0]:
# Build a ten-row, single-column DataFrame holding the ids 1 through 10
data = [(n,) for n in range(1, 11)]
df = spark.createDataFrame(data, schema=["id"])
display(df)

id
1
2
3
4
5
6
7
8
9
10


#### **1) Assigning Random Decimal Values to a New Column**

In [0]:
from pyspark.sql.functions import rand
# Append a "randomValue" column of uniform random floats in [0.0, 1.0)
df_with_random = df.withColumn(
    "randomValue",
    rand(),
)
display(df_with_random)

id,randomValue
1,0.2798473157901637
2,0.9295428002575244
3,0.0297845266635771
4,0.813936480443932
5,0.8380057665230152
6,0.1642475382860678
7,0.7980734069812483
8,0.9011207953998212
9,0.1918690072961981
10,0.3666338515820985


In [0]:
# Append three random columns, each produced by its own rand() call,
# so every column holds different values in [0.0, 1.0)
df_with_random_mult = (
    df
    .withColumn("randomValue_01", rand())
    .withColumn("randomValue_02", rand())
    .withColumn("randomValue_03", rand())
)
display(df_with_random_mult)

id,randomValue_01,randomValue_02,randomValue_03
1,0.1359369624418737,0.3911864294589773,0.2361550279990836
2,0.6087107863494011,0.3010735703306754,0.2838274211633239
3,0.948172961543561,0.6294746601757544,0.8508272305123705
4,0.6754506643460293,0.5718517159450239,0.56443329527368
5,0.371262235836714,0.5013299549229663,0.512755233167723
6,0.7793689048510651,0.963512378492176,0.3931929552689192
7,0.1482005855566628,0.0050352265928018,0.7898927389441236
8,0.6362535155793646,0.815665146338009,0.2360847472315597
9,0.1879233988319296,0.7992995139682154,0.8691918635662662
10,0.0476406907407372,0.0421321600820707,0.9253449274192896


#### **2) Filtering Rows Based on Random Values**

In [0]:
# Attach a random number to every row, then keep only rows whose draw is below 0.5
filtered_df = (
    df
    .withColumn("RandomNumber", rand())
    .filter("RandomNumber < 0.5")
)
display(filtered_df)

id,RandomNumber
1,0.1840886121832148
4,0.0171028409056959
6,0.0403214332980521
7,0.3674548229648664
8,0.4423708164469996
10,0.1208471975489149


In [0]:
# Boolean flag per row: True when that row's random draw is greater than 0.5
(
    df
    .withColumn("random_value", rand() > 0.5)
    .display()
)

id,random_value
1,False
2,False
3,True
4,True
5,False
6,True
7,False
8,False
9,True
10,False


In [0]:
# Boolean flag per row: True when that row's random draw is less than 0.5
(
    df
    .withColumn("random_value", rand() < 0.5)
    .display()
)

id,random_value
1,False
2,False
3,False
4,False
5,False
6,True
7,False
8,False
9,False
10,True


#### **3) Assigning Random Integer Values to a New Column**

     df.withColumn('rand_int', round(rand(), 0)).display()
                           (or)
     df.withColumn("random_int", (rand() * 100).cast("integer")).show()

In [0]:
from pyspark.sql.functions import rand, round

# Add a 'rand_int' column by rounding rand() to zero decimal places.
# Each value ends up as 0.0 or 1.0 - the column still displays as a float, not an integer type.
(
    df
    .withColumn("rand_int", round(rand(), 0))
    .display()
)

id,rand_int
1,0.0
2,0.0
3,1.0
4,0.0
5,1.0
6,0.0
7,1.0
8,0.0
9,1.0
10,1.0


In [0]:
# Randomly assign every row to one of three groups, numbered 1 to 3
from pyspark.sql.functions import floor

# rand() * 3 lies in [0, 3); floor() maps it to 0, 1 or 2, and + 1 shifts that to 1..3
df_with_groups = df.withColumn(
    "GroupID",
    floor(rand() * 3) + 1,
)
display(df_with_groups)

id,GroupID
1,2
2,2
3,1
4,3
5,2
6,3
7,3
8,1
9,3
10,2


In [0]:
# Single-column ("id") DataFrame with the values 0 through 8 (end bound is exclusive)
df_range = spark.range(0, 9)
display(df_range)

id
0
1
2
3
4
5
6
7
8


In [0]:
# Scale rand() to [0, 100) and cast to integer to get whole numbers
(
    df_range
    .withColumn("random_int", (rand() * 100).cast("integer"))
    .display()
)

id,random_int
0,3
1,49
2,4
3,26
4,91
5,69
6,3
7,90
8,16


In [0]:
# Two separate rand() draws scaled to [0, 1000): one kept as a float, one cast to integer.
# The columns come from different rand() calls, so random_int is NOT rand_01 cast to integer.
df_cast = (
    df_range
    .withColumn("rand_01", rand() * 1000)
    .withColumn("random_int", (rand() * 1000).cast("integer"))
)
display(df_cast)

id,rand_01,random_int
0,686.6590264512935,63
1,114.69002382346416,905
2,698.6154832447789,368
3,535.625928260888,276
4,750.9944528724891,222
5,447.0103983268037,561
6,698.4892214641027,853
7,805.7483698310514,53
8,870.9678440704349,884


#### **4) How to create random decimals / integers between 0 and 100**

In [0]:
# Add a "random_decimal" column of random floats between 0 and 100
df_with_deci = df.withColumn(
    "random_decimal",
    rand() * 100,
)
display(df_with_deci)

id,random_decimal
1,33.47784823775336
2,95.8227916781665
3,45.69612799128196
4,83.46617242480953
5,37.56251572726457
6,53.76628252369895
7,80.51881103283375
8,46.92653008842241
9,86.0649836460729
10,81.39797776460824


In [0]:
# Add a "random_decimal" column of random floats between 0 and 50
df_with_deci50 = df.withColumn(
    "random_decimal",
    rand() * 50,
)
display(df_with_deci50)

id,random_decimal
1,33.97789152510946
2,5.731185128265315
3,14.433501546218867
4,19.93883746672563
5,42.56679942992203
6,42.42608453587161
7,9.05351299558433
8,35.164176421319645
9,17.425448354802253
10,34.218457419135284


In [0]:
# Add a "random_int" column: rand() scaled to 0-100, then rounded to zero decimal places.
# round() leaves the values as floats (displayed as e.g. 62.0); add a cast if a true
# integer column is required.
df_with_int = df.withColumn(
    "random_int",
    round(rand() * 100, 0),
)
display(df_with_int)

id,random_int
1,62.0
2,25.0
3,96.0
4,88.0
5,9.0
6,36.0
7,83.0
8,95.0
9,44.0
10,90.0


In [0]:
# Random floats between 10 and 20: stretch rand() to a width of 10, then shift up by 10
df_with_random_range = df.withColumn(
    "RandomInRange",
    rand() * 10 + 10,
)
display(df_with_random_range)

id,RandomInRange
1,15.343804446108118
2,16.701757067639218
3,11.855330626482475
4,17.844604778827193
5,10.967346516354608
6,17.279607341761736
7,17.303777165069107
8,19.619099470759657
9,19.00182963127152
10,16.019314848985136


#### **5) How to add random noise to existing numeric columns**

In [0]:
from pyspark.sql.functions import col

# Small DataFrame with a numeric "Value" column to perturb
data = [
    (1, 100),
    (2, 200),
    (3, 300),
    (4, 400),
]
df_numeric = spark.createDataFrame(data, ["ID", "Value"])

# Add noise drawn from rand() (a value in [0.0, 1.0)) to each Value
df_with_noise = df_numeric.withColumn(
    "ValueWithNoise",
    col("Value") + rand(),
)
display(df_with_noise)

ID,Value,ValueWithNoise
1,100,100.94450790601594
2,200,200.57186419320865
3,300,300.46456837603057
4,400,400.2812182938175


#### **6) seed**
- Passing a value for **seed** to the **rand()** function makes it generate the **same random numbers** each time the code is run (assuming the same data and partitioning).

       rand(seed=23)
       rand(seed=42)
       df.withColumn('rand_value', rand(seed=23))
       df.withColumn('rand_value', rand(seed=42))
       df.withColumn('rand_value', rand(123))

In [0]:
# Sample [team, points] records used by the seeded rand() examples
data = [
    ['Mahesh', 18], ['Nitesh', 33], ['Lakshan', 12], ['Kishore', 15], ['Harish', 19],
    ['Watson', 24], ['Mohit', 28], ['Jagadish', 40], ['Tharun', 24], ['Supriya', 13],
]
columns = ['team', 'points']

# Build the DataFrame from the records and column names, then show it
df_seed = spark.createDataFrame(data, columns)
display(df_seed)

team,points
Mahesh,18
Nitesh,33
Lakshan,12
Kishore,15
Harish,19
Watson,24
Mohit,28
Jagadish,40
Tharun,24
Supriya,13


In [0]:
from pyspark.sql.functions import rand

# Add 'rand_seed_decimal': random floats between 0 and 100, drawn with a fixed seed (23)
(
    df_seed
    .withColumn("rand_seed_decimal", rand(seed=23) * 100)
    .display()
)

team,points,rand_seed_decimal
Mahesh,18,93.88044512577216
Nitesh,33,39.432553969527554
Lakshan,12,23.26036139908492
Kishore,15,2.339183228862929
Harish,19,82.53753350983487
Watson,24,88.94415403143505
Mohit,28,80.81524027081029
Jagadish,40,59.56629641640896
Tharun,24,27.62195585886885
Supriya,13,70.43214981152886


In [0]:
from pyspark.sql.functions import rand, round

# Add 'rand_seed_int': the seeded (seed=23) draw scaled to 0-100 and rounded to zero decimals.
# The rounded values are still displayed as floats (e.g. 94.0).
(
    df_seed
    .withColumn("rand_seed_int", round(rand(seed=23) * 100, 0))
    .display()
)

team,points,rand_seed_int
Mahesh,18,94.0
Nitesh,33,39.0
Lakshan,12,23.0
Kishore,15,2.0
Harish,19,83.0
Watson,24,89.0
Mohit,28,81.0
Jagadish,40,60.0
Tharun,24,28.0
Supriya,13,70.0


In [0]:
# Side-by-side comparison: two unseeded rand() columns next to two rand(seed=23) columns.
# The seeded columns share the same seed, so they contain identical values row for row.
df_with_rand_seed = (
    df
    .withColumn("randomValue_01", rand())
    .withColumn("randomValue_02", rand())
    .withColumn("randomSeed_03", rand(seed=23))
    .withColumn("randomSeed_04", rand(seed=23))
)
display(df_with_rand_seed)

id,randomValue_01,randomValue_02,randomSeed_03,randomSeed_04
1,0.0519837726930869,0.117065844334993,0.9388044512577216,0.9388044512577216
2,0.0842352543933203,0.4316392558230483,0.3943255396952755,0.3943255396952755
3,0.107205117566074,0.5525204832945797,0.2326036139908491,0.2326036139908491
4,0.2429811455697493,0.052983398894692,0.0233918322886292,0.0233918322886292
5,0.4618149231591518,0.8528891267293843,0.8253753350983487,0.8253753350983487
6,0.0748055569842764,0.5184338363971495,0.8894415403143504,0.8894415403143504
7,0.9391304253932776,0.3419045136148039,0.808152402708103,0.808152402708103
8,0.0967440838912826,0.9667652171714154,0.5956629641640896,0.5956629641640896
9,0.910023586236264,0.1443620957205192,0.2762195585886885,0.2762195585886885
10,0.2293001876631754,0.2646626067937685,0.7043214981152885,0.7043214981152885


#### **7) How to partition using range() & rand()?**

In [0]:
from pyspark.sql import functions as f
from pyspark.sql.functions import spark_partition_id

# spark.range(start, end, step, numPartitions): even ids 0..18 spread over 2 partitions.
# Each row gets a seeded random value scaled by 3 plus the id of the partition it lives in.
# NOTE(review): "spart_partition" looks like a typo for "spark_partition" - kept unchanged
# so the column name still matches the recorded output.
(
    spark.range(0, 20, 2, 2)
    .withColumn("rand", f.rand(seed=42) * 3)
    .withColumn("spart_partition", spark_partition_id())
    .display()
)

id,rand,spart_partition
0,1.8575681106759028,0
2,1.5288056527339444,0
4,2.4975778166614573,0
6,0.7896842712351707,0
8,2.0108603088792405,0
10,2.4052597283576684,1
12,1.969665884997696,1
14,0.7546787347780908,1
16,0.6220285128333222,1
18,1.917876413783678,1
