In [1]:
from lib.session import get_spark_session

spark  = get_spark_session("challenge 0 - ")

spark

In [8]:
from pyspark.sql.functions import to_date, dayofmonth, weekofyear, dayofyear, dayofweek
# get day of month, week number, day of year, day of week from date strings
data = [("2023-05-18","01 Jan 2010",), ("2023-12-31", "01 Jan 2010",)]
df = spark.createDataFrame(data, ["date_str_1", "date_str_2"])

df = df.withColumn("date_1", to_date('date_str_1', 'yyyy-MM-dd'))
df = df.withColumn("date_2", to_date('date_str_2', 'dd MMM yyyy'))

(
    df.
    withColumn("dayofmonth", dayofmonth('date_1')).
    withColumn("dayofmonth", weekofyear('date_1')).
    withColumn("dayofmonth", dayofyear('date_1')).
    withColumn("dayofmonth", dayofweek('date_1')).
    show()
)

+----------+-----------+----------+----------+----------+
|date_str_1| date_str_2|    date_1|    date_2|dayofmonth|
+----------+-----------+----------+----------+----------+
|2023-05-18|01 Jan 2010|2023-05-18|2010-01-01|         5|
|2023-12-31|01 Jan 2010|2023-12-31|2010-01-01|         1|
+----------+-----------+----------+----------+----------+



In [10]:
# convert MMM yyyy to date as of 4th of month
from pyspark.sql.functions import to_date, date_add

df = spark.createDataFrame([('Jan 2010',), ('Feb 2011',), ('Mar 2012',)], ['MonthYear'])

df.withColumn('asDate', date_add(to_date('MonthYear', 'MMM yyyy'),3)).show()

+---------+----------+
|MonthYear|    asDate|
+---------+----------+
| Jan 2010|2010-01-04|
| Feb 2011|2011-02-04|
| Mar 2012|2012-03-04|
+---------+----------+



In [13]:
# filter valid emails
from pyspark.sql.functions import col
data = ['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com']

# Convert the list to DataFrame
df = spark.createDataFrame(data, "string")

pattern = "^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"

df.where(col('value').rlike(pattern)).show()


+-----------------+
|            value|
+-----------------+
|rameses@egypt.com|
|        matt@t.co|
|narendra@modi.com|
+-----------------+



In [14]:
# pivot dataframe
data = [
(2021, 1, "US", 5000),
(2021, 1, "EU", 4000),
(2021, 2, "US", 5500),
(2021, 2, "EU", 4500),
(2021, 3, "US", 6000),
(2021, 3, "EU", 5000),
(2021, 4, "US", 7000),
(2021, 4, "EU", 6000),
]

# Create DataFrame
columns = ["year", "quarter", "region", "revenue"]
df = spark.createDataFrame(data, columns)


df_pivot = df.groupBy("year", "quarter").pivot("region").sum('revenue').show()

+----+-------+----+----+
|year|quarter|  EU|  US|
+----+-------+----+----+
|2021|      2|4500|5500|
|2021|      1|4000|5000|
|2021|      3|5000|6000|
|2021|      4|6000|7000|
+----+-------+----+----+



In [19]:
# replace space with least frequent character in string
df = spark.createDataFrame([('dbc deb abed gade',),], ["string"])

from collections import Counter
from pyspark.sql.functions import udf

def replace_least_freq_char(s):
    counter = Counter(s.replace(" ", ""))
    least_freq = min(counter, key = counter.get)
    return s.replace(" ",least_freq)
    

func = udf(replace_least_freq_char, 'string')

df.withColumn('new_string', func('string')).show(truncate=False)

+-----------------+-----------------+
|string           |new_string       |
+-----------------+-----------------+
|dbc deb abed gade|dbccdebcabedcgade|
+-----------------+-----------------+

