In [1]:
from lib.session import get_spark_session

spark  = get_spark_session("challenge 21 - ")

spark

In [8]:
from pyspark.sql.functions import to_date, dayofmonth, weekofyear, dayofyear, dayofweek
# get day of month, week number, day of year, day of week from date strings
data = [("2023-05-18","01 Jan 2010",), ("2023-12-31", "01 Jan 2010",)]
df = spark.createDataFrame(data, ["date_str_1", "date_str_2"])

df = df.withColumn("date_1", to_date('date_str_1', 'yyyy-MM-dd'))
df = df.withColumn("date_2", to_date('date_str_2', 'dd MMM yyyy'))

(
    df.
    withColumn("dayofmonth", dayofmonth('date_1')).
    withColumn("dayofmonth", weekofyear('date_1')).
    withColumn("dayofmonth", dayofyear('date_1')).
    withColumn("dayofmonth", dayofweek('date_1')).
    show()
)

+----------+-----------+----------+----------+----------+
|date_str_1| date_str_2|    date_1|    date_2|dayofmonth|
+----------+-----------+----------+----------+----------+
|2023-05-18|01 Jan 2010|2023-05-18|2010-01-01|         5|
|2023-12-31|01 Jan 2010|2023-12-31|2010-01-01|         1|
+----------+-----------+----------+----------+----------+



In [10]:
# convert MMM yyyy to date as of 4th of month
from pyspark.sql.functions import to_date, date_add

df = spark.createDataFrame([('Jan 2010',), ('Feb 2011',), ('Mar 2012',)], ['MonthYear'])

df.withColumn('asDate', date_add(to_date('MonthYear', 'MMM yyyy'),3)).show()

+---------+----------+
|MonthYear|    asDate|
+---------+----------+
| Jan 2010|2010-01-04|
| Feb 2011|2011-02-04|
| Mar 2012|2012-03-04|
+---------+----------+



In [13]:
# filter valid emails
from pyspark.sql.functions import col
data = ['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com']

# Convert the list to DataFrame
df = spark.createDataFrame(data, "string")

pattern = "^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"

df.where(col('value').rlike(pattern)).show()


+-----------------+
|            value|
+-----------------+
|rameses@egypt.com|
|        matt@t.co|
|narendra@modi.com|
+-----------------+



In [14]:
# pivot dataframe
data = [
(2021, 1, "US", 5000),
(2021, 1, "EU", 4000),
(2021, 2, "US", 5500),
(2021, 2, "EU", 4500),
(2021, 3, "US", 6000),
(2021, 3, "EU", 5000),
(2021, 4, "US", 7000),
(2021, 4, "EU", 6000),
]

# Create DataFrame
columns = ["year", "quarter", "region", "revenue"]
df = spark.createDataFrame(data, columns)


df_pivot = df.groupBy("year", "quarter").pivot("region").sum('revenue').show()

+----+-------+----+----+
|year|quarter|  EU|  US|
+----+-------+----+----+
|2021|      2|4500|5500|
|2021|      1|4000|5000|
|2021|      3|5000|6000|
|2021|      4|6000|7000|
+----+-------+----+----+



In [19]:
# replace space with least frequent character in string
df = spark.createDataFrame([('dbc deb abed gade',),], ["string"])

from collections import Counter
from pyspark.sql.functions import udf

def replace_least_freq_char(s):
    counter = Counter(s.replace(" ", ""))
    least_freq = min(counter, key = counter.get)
    return s.replace(" ",least_freq)
    

func = udf(replace_least_freq_char, 'string')

df.withColumn('new_string', func('string')).show(truncate=False)

+-----------------+-----------------+
|string           |new_string       |
+-----------------+-----------------+
|dbc deb abed gade|dbccdebcabedcgade|
+-----------------+-----------------+



In [12]:
from pyspark.sql.functions import expr, explode, sequence, rand

#generate a df with saturndays dates and random numbers
start_date = "2000-01-01"
end_date = "2000-3-04"

df = (
    spark.
    range(1).
    select(
        sequence(
            expr(f"date '{start_date}'"),
            expr(f"date '{end_date}'"),
            expr("interval 1 day")
        ).alias("dates")
    ).
    withColumn("day", explode("dates")).
    where("dayofweek(day) = 7").
    withColumn("randint", (rand(seed=42) * 10).cast('int')).
    select("day", "randint")
)

df.show()

+----------+-------+
|       day|randint|
+----------+-------+
|2000-01-01|      5|
|2000-01-08|      0|
|2000-01-15|      6|
|2000-01-22|      6|
|2000-01-29|      1|
|2000-02-05|      3|
|2000-02-12|      2|
|2000-02-19|      7|
|2000-02-26|      0|
|2000-03-04|      4|
+----------+-------+



In [14]:
print(df.dtypes)
df.columns

[('day', 'date'), ('randint', 'int')]


['day', 'randint']

In [16]:
# Suppose you have the following DataFrame
df = spark.createDataFrame([('Alice', 1, 30),('Bob', 2, 35)], ["name", "age", "qty"])

df.show()

# Rename lists for specific columns
old_names = ["qty", "age"]
new_names = ["user_qty", "user_age"]

for o,n in zip(old_names, new_names):
    df = df.withColumnRenamed(o,n)

df.show()

+-----+---+---+
| name|age|qty|
+-----+---+---+
|Alice|  1| 30|
|  Bob|  2| 35|
+-----+---+---+

+-----+--------+--------+
| name|user_age|user_qty|
+-----+--------+--------+
|Alice|       1|      30|
|  Bob|       2|      35|
+-----+--------+--------+



In [36]:
from pyspark.sql.functions import sum, col
df = spark.createDataFrame([
("A", 1, None),
("B", None, "123" ),
("B", 3, "456"),
("D", None, None),
], ["Name", "Value", "id"])


df.select(*( sum(col(c).isNull().cast("int")).alias(c) for c in df.columns)).show()

+----+-----+---+
|Name|Value| id|
+----+-----+---+
|   0|    2|  2|
+----+-----+---+



In [48]:
from pyspark.sql.functions import mean
df = spark.createDataFrame([
("A", 1, None),
("B", None, 123 ),
("B", 3, 456),
("D", 6, None),
], ["Name", "var1", "var2"])

df.show()


(
    df.
    fillna(
            df.
            agg(*( mean(col(c)).alias(c) for c in df.columns if c in ["var1", "var2"])).
            first().
            asDict()
           )
).show()







+----+----+----+
|Name|var1|var2|
+----+----+----+
|   A|   1|null|
|   B|null| 123|
|   B|   3| 456|
|   D|   6|null|
+----+----+----+

+----+----+----+
|Name|var1|var2|
+----+----+----+
|   A|   1| 289|
|   B|   3| 123|
|   B|   3| 456|
|   D|   6| 289|
+----+----+----+



In [52]:
from pyspark.sql.functions import format_number

df = spark.createDataFrame([(1, 0.000000123), (2, 0.000023456), (3, 0.000345678)], ["id", "your_column"])

df.show()

df.withColumn("your_column", format_number("your_column",10)).show()

+---+-----------+
| id|your_column|
+---+-----------+
|  1|    1.23E-7|
|  2|  2.3456E-5|
|  3| 3.45678E-4|
+---+-----------+

+---+------------+
| id| your_column|
+---+------------+
|  1|0.0000001230|
|  2|0.0000234560|
|  3|0.0003456780|
+---+------------+



In [66]:
from pyspark.sql.functions import lit, concat
data = [(0.1, .08), (0.2, .06), (0.33, .02)]
df = spark.createDataFrame(data, ["numbers_1", "numbers_2"])

df.show()


df.select(*(concat((col(c) * lit(100)).cast('decimal(10,2)'), lit("%")).alias(c) for c in df.columns)).show()

+---------+---------+
|numbers_1|numbers_2|
+---------+---------+
|      0.1|     0.08|
|      0.2|     0.06|
|     0.33|     0.02|
+---------+---------+

+---------+---------+
|numbers_1|numbers_2|
+---------+---------+
|   10.00%|    8.00%|
|   20.00%|    6.00%|
|   33.00%|    2.00%|
+---------+---------+



In [71]:
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql import Window

# Sample data
data = [("Alice", 1), ("Bob", 2), ("Charlie", 3), ("Dave", 4), ("Eve", 5),
("Frank", 6), ("Grace", 7), ("Hannah", 8), ("Igor", 9), ("Jack", 10)]

# Create DataFrame
df = spark.createDataFrame(data, ["Name", "Number"])

df.show()



w = Window.orderBy(monotonically_increasing_id())

(
    df.
    withColumn("id", row_number().over(w)).
    where("id % 5 == 0")
).show()

+-------+------+
|   Name|Number|
+-------+------+
|  Alice|     1|
|    Bob|     2|
|Charlie|     3|
|   Dave|     4|
|    Eve|     5|
|  Frank|     6|
|  Grace|     7|
| Hannah|     8|
|   Igor|     9|
|   Jack|    10|
+-------+------+

+----+------+---+
|Name|Number| id|
+----+------+---+
| Eve|     5|  5|
|Jack|    10| 10|
+----+------+---+



In [77]:
from pyspark.sql import Row
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql import Window

# Sample Data
data = [
Row(id=1, column1=5),
Row(id=2, column1=8),
Row(id=3, column1=12),
Row(id=4, column1=1),
Row(id=5, column1=15),
Row(id=6, column1=7),
]

df = spark.createDataFrame(data)
df.show()



w = Window().orderBy(col("column1").desc())

df.withColumn("ord", row_number().over(w)).where("ord = 1").select("id").show()

+---+-------+
| id|column1|
+---+-------+
|  1|      5|
|  2|      8|
|  3|     12|
|  4|      1|
|  5|     15|
|  6|      7|
+---+-------+

+---+
| id|
+---+
|  5|
+---+



In [100]:
from pyspark.sql import functions as F
from functools import reduce

# Sample data
data = [(10, 25, 70),
(40, 5, 20),
(70, 80, 100),
(10, 2, 60),
(40, 50, 20)]

# Create DataFrame
df = spark.createDataFrame(data, ["col1", "col2", "col3"])

# Display original DataFrame
df.show()

df.withColumn('row_sum', reduce(lambda a,b: a+b, [F.col(c) for c in df.columns])).show()

+----+----+----+
|col1|col2|col3|
+----+----+----+
|  10|  25|  70|
|  40|   5|  20|
|  70|  80| 100|
|  10|   2|  60|
|  40|  50|  20|
+----+----+----+

+----+----+----+-------+
|col1|col2|col3|row_sum|
+----+----+----+-------+
|  10|  25|  70|    105|
|  40|   5|  20|     65|
|  70|  80| 100|    250|
|  10|   2|  60|     72|
|  40|  50|  20|    110|
+----+----+----+-------+



In [123]:
from pyspark.sql.functions import udf, array
from pyspark.sql.types import FloatType
# Sample Data
data = [(1, 2, 3), (4, 5, 6), (7, 8, 9), (10, 11, 12)]

# Create DataFrame
df = spark.createDataFrame(data, ["col1", "col2", "col3"])

df.show()



def find_min_by_max(row):
    return min(row)/max(row)
    

min_by_max_func = udf(find_min_by_max, FloatType())

df.withColumn("min_by_max", min_by_max_func(array(df.columns))).show()

+----+----+----+
|col1|col2|col3|
+----+----+----+
|   1|   2|   3|
|   4|   5|   6|
|   7|   8|   9|
|  10|  11|  12|
+----+----+----+

+----+----+----+----------+
|col1|col2|col3|min_by_max|
+----+----+----+----------+
|   1|   2|   3|0.33333334|
|   4|   5|   6| 0.6666667|
|   7|   8|   9| 0.7777778|
|  10|  11|  12| 0.8333333|
+----+----+----+----------+



In [131]:
# Create a sample DataFrame
data = [("2023-01-01", "Store1", 100),
("2023-01-02", "Store1", 150),
("2023-01-03", "Store1", 200),
("2023-01-04", "Store1", 250),
("2023-01-05", "Store1", 300),
("2023-01-01", "Store2", 50),
("2023-01-02", "Store2", 60),
("2023-01-03", "Store2", 80),
("2023-01-04", "Store2", 90),
("2023-01-05", "Store2", 120)]

df = spark.createDataFrame(data, ["Date", "Store", "Sales"])

df.show()


from pyspark.sql import Window
from pyspark.sql.functions import lag, lead

w = Window().partitionBy("Store").orderBy("Date")

(
    df.
    withColumn("lag", lag("Sales").over(w)).
    withColumn("lead", lead("Sales").over(w))
).show()

+----------+------+-----+
|      Date| Store|Sales|
+----------+------+-----+
|2023-01-01|Store1|  100|
|2023-01-02|Store1|  150|
|2023-01-03|Store1|  200|
|2023-01-04|Store1|  250|
|2023-01-05|Store1|  300|
|2023-01-01|Store2|   50|
|2023-01-02|Store2|   60|
|2023-01-03|Store2|   80|
|2023-01-04|Store2|   90|
|2023-01-05|Store2|  120|
+----------+------+-----+

+----------+------+-----+----+----+
|      Date| Store|Sales| lag|lead|
+----------+------+-----+----+----+
|2023-01-01|Store1|  100|null| 150|
|2023-01-02|Store1|  150| 100| 200|
|2023-01-03|Store1|  200| 150| 250|
|2023-01-04|Store1|  250| 200| 300|
|2023-01-05|Store1|  300| 250|null|
|2023-01-01|Store2|   50|null|  60|
|2023-01-02|Store2|   60|  50|  80|
|2023-01-03|Store2|   80|  60|  90|
|2023-01-04|Store2|   90|  80| 120|
|2023-01-05|Store2|  120|  90|null|
+----------+------+-----+----+----+



In [138]:
# Create a numeric DataFrame
data = [(1, 2, 3),
(2, 3, 4),
(1, 2, 3),
(4, 5, 6),
(2, 3, 4)]
df = spark.createDataFrame(data, ["Column1", "Column2", "Column3"])

# Print DataFrame
df.show()

df.select("Column1").unionAll(df.select("Column2")).unionAll(df.select("Column3")).groupBy("Column1").agg(count(lit(1))).show()

+-------+-------+-------+
|Column1|Column2|Column3|
+-------+-------+-------+
|      1|      2|      3|
|      2|      3|      4|
|      1|      2|      3|
|      4|      5|      6|
|      2|      3|      4|
+-------+-------+-------+

+-------+--------+
|Column1|count(1)|
+-------+--------+
|      1|       2|
|      2|       4|
|      4|       3|
|      3|       4|
|      5|       1|
|      6|       1|
+-------+--------+



In [8]:
from pyspark.sql.functions import when, col
# Create a numeric DataFrame
data = [(1, 2, 3, 4),
(2, 3, 4, 5),
(1, 2, 3, 4),
(4, 5, 6, 7)]

df = spark.createDataFrame(data, ["col_1", "col_2", "col_3", "col_4"])

# Print DataFrame
df.show()


from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql import Window

w = Window.orderBy(monotonically_increasing_id())

(
    df.
    withColumn("row_num", row_number().over(w) - 1).
    select([when(col("row_num") == i,0).otherwise(col("col_" + str(i+1))).alias("col_" + str(i+1)) for i in range(4)])
).show()

+-----+-----+-----+-----+
|col_1|col_2|col_3|col_4|
+-----+-----+-----+-----+
|    1|    2|    3|    4|
|    2|    3|    4|    5|
|    1|    2|    3|    4|
|    4|    5|    6|    7|
+-----+-----+-----+-----+

+-----+-----+-----+-----+
|col_1|col_2|col_3|col_4|
+-----+-----+-----+-----+
|    0|    2|    3|    4|
|    2|    0|    4|    5|
|    1|    2|    0|    4|
|    4|    5|    6|    0|
+-----+-----+-----+-----+

