In [95]:
# Notebook parameters.
top_n = 5  # number of top customers to keep (passed to limit() when ranking)
top_year = "1992"  # year to analyse; kept as a string and converted with int() where needed
output_path = "/user/k_haritonov/hometask_2"  # destination path given to DataFrameWriter.save()

In [96]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window

In [97]:
# Task: for the selected year top_year, find the top_n customers with the
# largest number of (unique) orders.
#
# Expected output columns:
# top_year - the selected year
# row_number - row number (row 1 is the customer with the largest number of orders)
# customer_name
# customer_nation_name
# customer_region_name
# orders_count - number of orders (the original task text says "per day date";
#                this notebook counts orders per customer within top_year)

spark.sql("show tables in tpch_flat_orc_2").show()

+---------------+---------+-----------+
|       database|tableName|isTemporary|
+---------------+---------+-----------+
|tpch_flat_orc_2| customer|      false|
|tpch_flat_orc_2| lineitem|      false|
|tpch_flat_orc_2|   nation|      false|
|tpch_flat_orc_2|   orders|      false|
|tpch_flat_orc_2|     part|      false|
|tpch_flat_orc_2| partsupp|      false|
|tpch_flat_orc_2|   region|      false|
|tpch_flat_orc_2| supplier|      false|
+---------------+---------+-----------+

In [98]:
# Load the four source tables used below; the tuple is unpacked in the same
# order as the table names are listed.
customer, orders, nation, region = (
    spark.read.table(qualified_name)
    for qualified_name in (
        "tpch_flat_orc_2.customer",
        "tpch_flat_orc_2.orders",
        "tpch_flat_orc_2.nation",
        "tpch_flat_orc_2.region",
    )
)

In [99]:
# filter orders by date and limit
# Select the top_n customers by number of orders placed during top_year.
#
# The year is matched with an explicit half-open date range
# [Jan 1 of top_year, Jan 1 of the following year). Comparing o_orderdate with
# the bare strings "1992"/"1993" relies on implicit string/date coercion: when
# the string side is coerced to a date, "1992" becomes 1992-01-01 and a strict
# ">" drops every order placed on the first day of the year.
# NOTE(review): assumes o_orderdate is a date (or ISO 'yyyy-MM-dd' string) column - confirm.
# NOTE(review): this counts rows per customer, i.e. it assumes one row per order
# in `orders`; if that does not hold, use a distinct count over the order key.
top_orders = (
    orders
    .filter(
        (orders.o_orderdate >= F.lit('{}-01-01'.format(int(top_year))).cast('date'))
        & (orders.o_orderdate < F.lit('{}-01-01'.format(int(top_year) + 1)).cast('date'))
    )
    .groupby(orders.o_custkey)
    .agg(F.count(orders.o_custkey).alias('count_orders'))
    # o_custkey is a tie-breaker so that limit() keeps a reproducible set of customers.
    .orderBy(F.col('count_orders').desc(), F.col('o_custkey').asc())
    .limit(top_n)
    .alias('top_orders')
)

In [100]:
# join with customer
# Attach customer attributes to each top customer by matching the customer keys
# (default join type, i.e. inner).
orders_customers = (
    top_orders
    .join(customer, on=(top_orders['o_custkey'] == customer['c_custkey']))
    .alias('orders_customers')
)

In [101]:
# join with nations
# Attach nation attributes to each customer row by matching the nation keys
# (default join type, i.e. inner).
orders_customers_nation = (
    orders_customers
    .join(nation, on=(orders_customers['c_nationkey'] == nation['n_nationkey']))
    .alias('orders_customers_nation')
)

In [102]:
# join with customer, then add year and row_number columns, rename columns and select only needed
# Join with region, add the top_year and row_number columns, then rename the
# columns to the names required by the task and keep only those.
df_homework_2 = (
    orders_customers_nation
    .join(region, orders_customers_nation.n_regionkey == region.r_regionkey)
    .withColumn('top_year', F.lit(top_year))
    .withColumn(
        'row_number',
        F.row_number().over(
            # No partitionBy: the ranking is global over the handful of rows left
            # after the top-N limit. o_custkey breaks ties in the order count so
            # the numbering is reproducible between runs.
            Window.orderBy(
                orders_customers_nation.count_orders.desc(),
                orders_customers_nation.o_custkey.asc(),
            )
        ),
    )
    .withColumnRenamed('c_name', 'customer_name')
    .withColumnRenamed('n_name', 'customer_nation_name')
    .withColumnRenamed('r_name', 'customer_region_name')
    # The task description names this column orders_count, not count_orders.
    .withColumnRenamed('count_orders', 'orders_count')
    .select(
        'top_year',
        'row_number',
        'customer_name',
        'customer_nation_name',
        'customer_region_name',
        'orders_count',
    )
)

In [103]:
# Save dataframe to HDFS
# Sort data by partitions and save to .csv file with overwrite mode
# Save the dataframe to HDFS as a single tab-separated CSV file (overwrite mode).
# repartition(1) collapses the data into one partition so one file is written;
# it is a shuffle and gives no row-order guarantee, so the rows are sorted
# inside that partition to keep the file in rank order.
df_homework_2 \
    .repartition(1) \
    .sortWithinPartitions('row_number') \
    .write.save(output_path, format='csv', sep='\t', mode='overwrite')