In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os

k8s_host=os.environ.get('KUBERNETES_SERVICE_HOST')
k8s_port=os.environ.get('KUBERNETES_SERVICE_PORT')

sparkConf = SparkConf()
#sparkConf.setMaster("k8s://https://api.crc.testing:6443")
sparkConf.setMaster("k8s://https://" + k8s_host + ":" + k8s_port)
sparkConf.setAppName("KUBERNETES-IS-AWESOME")
sparkConf.set("spark.kubernetes.container.image", "apache/spark-py:v3.3.1")
# sparkConf.set("spark.kubernetes.container.image", "jupyter/pyspark-notebook:python-3.9")
sparkConf.set("spark.kubernetes.namespace", "spark")
sparkConf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
sparkConf.set("spark.driver.host", os.environ.get('POD_IP'))
#sparkConf.set("spark.driver.bindAddress", "")

sparkConf.set("spark.executor.memory", "8g")

spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
sc = spark.sparkContext

#spark = SparkSession \
#    .builder \
#    .appName("Python Spark SQL basic example") \
#    .master("k8s://https://api.crc.testing:6443") \
#    .config("spark.kubernetes.namespace=spark") \
#    .config("spark.kubernetes.driver.container.image=apache/spark-py:v3.3.1") \
#    .config("spark.kubernetes.executor.container.image=apache/spark-py:v3.3.1") \
#    .config("spark.kubernetes.authenticate.driver.serviceAccountName=spark") \
#    .config("spark.some.config.option", "some-value") \
#    .getOrCreate()


In [2]:
# Create data
import random
import pandas as pd

num_members = 531
num_contracts = 23
num_scenarios = 4000

positions = [(mem, con, random.randint(-20, 20)) for mem in range(num_members) for con in range(num_contracts)]
# print(positions)

df_positions = pd.DataFrame(data = positions, columns = ["member_id", "contract_id", "value"])

print(df_positions)

# import random
# import pandas as pd

spark_df_positions=spark.createDataFrame(df_positions) 
spark_df_positions.printSchema()
spark_df_positions.show()

scenarios = [(sce, random.normalvariate(1, 0.1)) for sce in range(num_scenarios)]
# print(scenarios)

df_scenarios = pd.DataFrame(data = scenarios, columns = ["scenario_id", "shift_value"])
print(df_scenarios)


spark_df_scenarios=spark.createDataFrame(df_scenarios) 
spark_df_scenarios.printSchema()
spark_df_scenarios.show()

       member_id  contract_id  value
0              0            0      6
1              0            1      7
2              0            2    -17
3              0            3    -12
4              0            4      6
...          ...          ...    ...
12208        530           18     -2
12209        530           19      8
12210        530           20     -8
12211        530           21      0
12212        530           22     -5

[12213 rows x 3 columns]


  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


root
 |-- member_id: long (nullable = true)
 |-- contract_id: long (nullable = true)
 |-- value: long (nullable = true)

+---------+-----------+-----+
|member_id|contract_id|value|
+---------+-----------+-----+
|        0|          0|    6|
|        0|          1|    7|
|        0|          2|  -17|
|        0|          3|  -12|
|        0|          4|    6|
|        0|          5|  -13|
|        0|          6|  -15|
|        0|          7|   -9|
|        0|          8|  -20|
|        0|          9|  -20|
|        0|         10|   10|
|        0|         11|  -12|
|        0|         12|   16|
|        0|         13|   15|
|        0|         14|   -4|
|        0|         15|  -10|
|        0|         16|  -10|
|        0|         17|  -12|
|        0|         18|  -13|
|        0|         19|  -10|
+---------+-----------+-----+
only showing top 20 rows

      scenario_id  shift_value
0               0     0.937657
1               1     0.824433
2               2     1.023633
3        

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


+-----------+------------------+
|scenario_id|       shift_value|
+-----------+------------------+
|          0| 0.937657070250954|
|          1|0.8244331574643247|
|          2|1.0236328378512518|
|          3|1.2242555865273401|
|          4|1.0405805774149617|
|          5|0.8619712553171287|
|          6|0.9356813321588868|
|          7|1.1076484714781865|
|          8|0.9802510812313805|
|          9| 1.025266640262752|
|         10|1.0412450773775173|
|         11|1.0240022353255966|
|         12|1.0156201612802138|
|         13|1.0878026456143535|
|         14|0.8900369882860498|
|         15|0.9603567221378804|
|         16|0.9819657668045946|
|         17|0.9945288796696822|
|         18|1.0127986678972682|
|         19|0.9691530928390348|
+-----------+------------------+
only showing top 20 rows



In [3]:
# # Logic
# df_pnl = df_positions.merge(df_scenarios, how="cross")
# df_pnl["pnl_value"] = df_pnl["value"] * df_pnl["shift_value"]
# print(str(df_pnl.size))

# df_pnl = df_pnl.groupby(["member_id", "scenario_id"], as_index=False)["pnl_value"].sum() # Result
# print(df_pnl)

# print("done")

print("joining...")
joined = spark_df_positions.crossJoin(spark_df_scenarios)
#result = joined.withColumn('pnl_value', joined.value - joined.shift_value).groupBy("member_id", "scenario_id").sum("pnl_value").collect()
joined.withColumn('pnl_value', joined.value - joined.shift_value).groupBy("member_id", "scenario_id").sum("pnl_value").show(truncate=False)



joining...
+---------+-----------+-------------------+
|member_id|scenario_id|sum(pnl_value)     |
+---------+-----------+-------------------+
|0        |2066       |-111.8235109707301 |
|0        |2109       |-117.30318827276739|
|0        |2307       |-117.42045630292708|
|0        |2417       |-115.87180253361502|
|0        |2466       |-117.48207300346841|
|0        |2618       |-116.46043388467683|
|0        |2660       |-118.00436797780603|
|0        |3484       |-119.44736546293842|
|0        |3492       |-115.4269800202625 |
|0        |3698       |-119.01501255130553|
|0        |3952       |-114.33355150114969|
|5        |3963       |-52.986456845070585|
|7        |3856       |-60.47210554659992 |
|9        |3944       |-1.0436778756890135|
|10       |3963       |50.01354315492943  |
|12       |3954       |-104.04676043644272|
|13       |3971       |100.81259851546605 |
|15       |3996       |26.260057278221666 |
|24       |3948       |-0.5503287455731574|
|24       |3972      

In [2]:
spark.stop()