In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window

# Create (or reuse) a Spark session running against the 'local' master and
# keep a handle on its SparkContext.
spark = (
    SparkSession.builder
    .master('local')
    .getOrCreate()
)
sc = spark.sparkContext

In [2]:
# Monthly closing dates from 2022-01-31 through 2023-12-31, one row per date.
closing_dates_sql = "SELECT sequence(to_date('2022-01-31'), to_date('2023-12-31'), interval 1 month) as closing_date"
df = spark.sql(closing_dates_sql).select(
    F.explode(F.col('closing_date')).alias('closing_date')
)

# Monthly candidate join dates from 2020-01-31 through 2023-12-31, one row per date.
join_dates_sql = "SELECT sequence(to_date('2020-01-31'), to_date('2023-12-31'), interval 1 month) as join_date"
df2 = spark.sql(join_dates_sql).select(
    F.explode(F.col('join_date')).alias('join_date')
)

In [3]:
# Pair every closing date with every candidate join date (Cartesian product).
# The following cell filters these pairs down to the relevant look-back range.
# NOTE: this cell rebinds `df`, so re-running it without re-running the cell
# that builds `df`/`df2` would join the already-joined frame again.
df = df.crossJoin(df2)

In [4]:
# Filter the frame so each closing date keeps only the join dates that are not
# after it and are fewer than 24 months before it.
join_not_after_closing = F.col('closing_date') >= F.col('join_date')
join_within_24_months = F.months_between('closing_date', 'join_date') < 24
df = df.filter(join_not_after_closing & join_within_24_months)

In [5]:
# Calendar attributes of the closing date: its year, and a month code made of
# the year followed by the two-digit month (e.g. 202312).
new_columns = dict(
    year=F.extract(F.lit('YEAR'), 'closing_date'),
    cod_month=F.concat(
        F.year('closing_date'),
        F.date_format('closing_date', 'MM'),
    ),
)
df = df.withColumns(new_columns)

# Unpartitioned windows sorted in DESCENDING order, so rank 1 always belongs
# to the most recent closing date / year / month code.
wA = Window.orderBy(F.col('closing_date').desc())
wB = Window.orderBy(F.col('year').desc())
wC = Window.orderBy(F.col('cod_month').desc())

# Dense ranks (no gaps between consecutive distinct values) over each window.
new_columns_ranks = dict(
    closing_date_rank=F.dense_rank().over(wA),
    year_rank=F.dense_rank().over(wB),
    month_rank=F.dense_rank().over(wC),
)
df = df.withColumns(new_columns_ranks)


In [6]:
def _as_flag(condition):
    """Return an integer column that is 1 where `condition` holds, else 0."""
    return F.when(condition, 1).otherwise(0)


def _closing_month_end(months_back):
    """Last day of the month `months_back` months before the closing date."""
    return F.last_day(F.add_months(F.col('closing_date'), -months_back))


closing = F.col('closing_date')
joined = F.col('join_date')

# True when the join date falls in the calendar year before the closing year.
join_in_prior_year = F.year('closing_date') - 1 == F.year('join_date')

flag_columns = {
    # Join date is the closing date itself.
    'flag_current_date': _as_flag(closing == joined),
    # Join date is in the same calendar year as the closing date.
    'flag_ytd': _as_flag(F.col('year') == F.extract(F.lit('YEAR'), 'join_date')),
    # Same calendar month, one year earlier.
    'flag_yoy': _as_flag(
        join_in_prior_year & (F.month('closing_date') == F.month('join_date'))
    ),
    # Prior year, months up to and including the closing month.
    'flag_ytd_py': _as_flag(
        join_in_prior_year & (F.month('closing_date') >= F.month('join_date'))
    ),
    # Month end immediately before the closing date.
    'flag_mom': _as_flag(_closing_month_end(1) == joined),
    # Join date no earlier than the month end 11 months before closing.
    'flag_l12m': _as_flag(_closing_month_end(11) <= joined),
    # Join date between the month ends 23 and 12 months before closing (inclusive).
    'flag_l12m_py': _as_flag(
        (_closing_month_end(23) <= joined) & (_closing_month_end(12) >= joined)
    ),
}
df = df.withColumns(flag_columns)


In [7]:
# Inspect every row generated for the 2023-12-31 closing date.
# The row limit passed to `show` is taken from the filtered frame itself, so
# the whole calendar is not counted just to print a single closing date.
latest_closing = df.where(df.closing_date == '2023-12-31')
latest_closing.show(latest_closing.count())

+------------+----------+----+---------+-----------------+---------+----------+-----------------+--------+--------+-----------+--------+---------+------------+
|closing_date| join_date|year|cod_month|closing_date_rank|year_rank|month_rank|flag_current_date|flag_ytd|flag_yoy|flag_ytd_py|flag_mom|flag_l12m|flag_l12m_py|
+------------+----------+----+---------+-----------------+---------+----------+-----------------+--------+--------+-----------+--------+---------+------------+
|  2023-12-31|2022-01-31|2023|   202312|                1|        1|         1|                0|       0|       0|          1|       0|        0|           1|
|  2023-12-31|2022-02-28|2023|   202312|                1|        1|         1|                0|       0|       0|          1|       0|        0|           1|
|  2023-12-31|2022-03-31|2023|   202312|                1|        1|         1|                0|       0|       0|          1|       0|        0|           1|
|  2023-12-31|2022-04-30|2023|   202312|

In [8]:
# List each column's name together with its Spark SQL type to confirm the
# schema produced by the cells above.
df.dtypes

[('closing_date', 'date'),
 ('join_date', 'date'),
 ('year', 'int'),
 ('cod_month', 'string'),
 ('closing_date_rank', 'int'),
 ('year_rank', 'int'),
 ('month_rank', 'int'),
 ('flag_current_date', 'int'),
 ('flag_ytd', 'int'),
 ('flag_yoy', 'int'),
 ('flag_ytd_py', 'int'),
 ('flag_mom', 'int'),
 ('flag_l12m', 'int'),
 ('flag_l12m_py', 'int')]