In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os

k8s_host=os.environ.get('KUBERNETES_SERVICE_HOST')
k8s_port=os.environ.get('KUBERNETES_SERVICE_PORT')

sparkConf = SparkConf()
#sparkConf.setMaster("k8s://https://api.crc.testing:6443")
sparkConf.setMaster("k8s://https://" + k8s_host + ":" + k8s_port)
sparkConf.setAppName("KUBERNETES-IS-AWESOME")
sparkConf.set("spark.kubernetes.container.image", "apache/spark-py:v3.3.1")
# sparkConf.set("spark.kubernetes.container.image", "jupyter/pyspark-notebook:python-3.9")
sparkConf.set("spark.kubernetes.namespace", "spark")
sparkConf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
sparkConf.set("spark.driver.host", os.environ.get('POD_IP'))
#sparkConf.set("spark.driver.bindAddress", "")

sparkConf.set("spark.executor.memory", "8g")

spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
sc = spark.sparkContext

#spark = SparkSession \
#    .builder \
#    .appName("Python Spark SQL basic example") \
#    .master("k8s://https://api.crc.testing:6443") \
#    .config("spark.kubernetes.namespace=spark") \
#    .config("spark.kubernetes.driver.container.image=apache/spark-py:v3.3.1") \
#    .config("spark.kubernetes.executor.container.image=apache/spark-py:v3.3.1") \
#    .config("spark.kubernetes.authenticate.driver.serviceAccountName=spark") \
#    .config("spark.some.config.option", "some-value") \
#    .getOrCreate()


In [2]:
# Create data
import random
import pandas as pd

num_members = 531
num_contracts = 23
num_scenarios = 4000

positions = [(mem, con, random.randint(-20, 20)) for mem in range(num_members) for con in range(num_contracts)]
# print(positions)

df_positions = pd.DataFrame(data = positions, columns = ["member_id", "contract_id", "value"])

print(df_positions)

# import random
# import pandas as pd

spark_df_positions=spark.createDataFrame(df_positions) 
spark_df_positions.printSchema()
spark_df_positions.show()

scenarios = [(sce, random.normalvariate(1, 0.1)) for sce in range(num_scenarios)]
# print(scenarios)

df_scenarios = pd.DataFrame(data = scenarios, columns = ["scenario_id", "shift_value"])
print(df_scenarios)


spark_df_scenarios=spark.createDataFrame(df_scenarios) 
spark_df_scenarios.printSchema()
spark_df_scenarios.show()

       member_id  contract_id  value
0              0            0    -16
1              0            1      2
2              0            2     19
3              0            3      9
4              0            4    -19
...          ...          ...    ...
12208        530           18    -14
12209        530           19     16
12210        530           20     -4
12211        530           21     11
12212        530           22     -3

[12213 rows x 3 columns]


  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


root
 |-- member_id: long (nullable = true)
 |-- contract_id: long (nullable = true)
 |-- value: long (nullable = true)

+---------+-----------+-----+
|member_id|contract_id|value|
+---------+-----------+-----+
|        0|          0|  -16|
|        0|          1|    2|
|        0|          2|   19|
|        0|          3|    9|
|        0|          4|  -19|
|        0|          5|    8|
|        0|          6|   -4|
|        0|          7|    7|
|        0|          8|   12|
|        0|          9|   -1|
|        0|         10|    1|
|        0|         11|   20|
|        0|         12|  -10|
|        0|         13|  -17|
|        0|         14|  -20|
|        0|         15|  -14|
|        0|         16|    5|
|        0|         17|   10|
|        0|         18|   20|
|        0|         19|  -10|
+---------+-----------+-----+
only showing top 20 rows

      scenario_id  shift_value
0               0     1.009992
1               1     0.920933
2               2     1.118965
3        

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


+-----------+------------------+
|scenario_id|       shift_value|
+-----------+------------------+
|          0|1.0099917509032859|
|          1|0.9209329803179851|
|          2|1.1189652046284522|
|          3|0.9583074925593636|
|          4|1.0152159693960154|
|          5|1.1448962363647288|
|          6|0.9618061383530386|
|          7| 0.944918570714833|
|          8|1.2532354613039307|
|          9|1.0597543607977933|
|         10|1.0348139003497112|
|         11|0.9575370352604211|
|         12|0.9971601374679556|
|         13|1.0949663240034806|
|         14|1.1067421983752088|
|         15|0.9836003768848569|
|         16|0.9838419773837204|
|         17|1.0134200736299552|
|         18|1.0142543197861544|
|         19|0.9080217795774223|
+-----------+------------------+
only showing top 20 rows



In [3]:
# # Logic
# df_pnl = df_positions.merge(df_scenarios, how="cross")
# df_pnl["pnl_value"] = df_pnl["value"] * df_pnl["shift_value"]
# print(str(df_pnl.size))

# df_pnl = df_pnl.groupby(["member_id", "scenario_id"], as_index=False)["pnl_value"].sum() # Result
# print(df_pnl)

# print("done")

print("joining...")
joined = spark_df_positions.crossJoin(spark_df_scenarios)
#result = joined.withColumn('pnl_value', joined.value - joined.shift_value).groupBy("member_id", "scenario_id").sum("pnl_value").collect()
joined.withColumn('pnl_value', joined.value - joined.shift_value).groupBy("member_id", "scenario_id").sum("pnl_value").show(truncate=False)



joining...
+---------+-----------+-------------------+
|member_id|scenario_id|sum(pnl_value)     |
+---------+-----------+-------------------+
|0        |181        |-69.06301855639256 |
|0        |189        |-68.17167759926518 |
|0        |428        |-67.02355588374093 |
|0        |444        |-63.87552376207421 |
|0        |624        |-65.44182704964109 |
|0        |1054       |-60.37780907844815 |
|0        |1319       |-65.41806552760416 |
|0        |1916       |-64.68727883337124 |
|0        |1937       |-66.9926416005281  |
|0        |2042       |-65.31226314983057 |
|6        |1994       |-13.3000625314491  |
|7        |1885       |2.5458140745032374 |
|8        |2036       |-50.73460380001232 |
|10       |1992       |-24.491608277131025|
|10       |2047       |-25.501975781729236|
|19       |2030       |-74.40332647370346 |
|21       |1989       |46.28487378829883  |
|24       |1999       |-90.31235385247517 |
|25       |2013       |-71.4998841485357  |
|25       |2021      

In [4]:
spark.stop()