In [0]:
import json
import pandas as pd
from azure.cosmos import CosmosClient
import analytics.general_utils as gen_utils

def write_cosmosdb_table(df_write, database_name, table_name, key_name, environment="dev", df_comparison = None, group_key = None, chunks_size = 1000):
  """
  Write a Spark DataFrame into Cosmos DB.

  If a comparison Spark DataFrame is provided, only the rows of 'df_write' that differ
  from it (ignoring the 'timestamp' column) are upserted. In addition, the rows whose
  key appears in the comparison DataFrame but not in 'df_write' are deleted from Cosmos DB.

  Every upserted document gets an 'id' field equal to its 'key_name' value.

  Parameters:
    - df_write (DataFrame): Spark DataFrame to write
    - database_name (String): name of the Database in Cosmos DB
    - table_name (String): name of the table (container) in Cosmos DB
    - key_name (String): partition key of the table in Cosmos DB
    - environment (String): execution environment -> "dev", "desarrollo", "pro", "produccion", "pre-pro", "pre-produccion".
                            Any other value is used verbatim as the environment code.
    - df_comparison (DataFrame): Spark DataFrame used for comparison (Optional)
    - group_key (String): column name used to group the data during the upserting / deleting processes into Cosmos DB.
                          When given, the upsert runs inside 'applyInPandas' and documents are sent as they are
                          (null fields included).
    - chunks_size (Integer): approximate chunk size used when upserting the data if no 'group_key' is provided.
                             In this mode the data is brought to the driver chunk by chunk and null fields are
                             removed from each document before upserting.

  Returns:
    None

  Raises:
    ValueError: if no 'group_key' is provided and 'chunks_size' is not a positive number.
  """

  # Environment aliases -> code embedded in both the secret scope name and the account host name
  envs_code = {"dev": "4",
               "desarrollo": "4",
               "pro": "3",
               "produccion": "3",
               "pre-pro": "6",
               "pre-produccion": "6"}
  env_code = envs_code.get(environment, environment)

  key_cosmos = dbutils.secrets.get(scope="cosmosdb{}".format(env_code), key="key")
  endpoint = 'https://{}satcosmosdb.documents.azure.com:443/'.format(env_code)

  def _get_container():
    # Build the client where it is going to be used (driver or worker) and return the target container
    client = CosmosClient(endpoint, key_cosmos)
    return client.get_database_client(database_name).get_container_client(table_name)

  def _to_db_table(key, dataframe):
    # Upsert every row of one group; a single client is shared by the whole group
    container = _get_container()
    for item in json.loads(dataframe.to_json(orient="records")):
      item['id'] = item[key_name]
      container.upsert_item(item)
    # The returned frame is discarded by the caller: only the side effect matters
    return pd.DataFrame()

  def _del_db_table(key, dataframe):
    # Delete every row of one group; a single client is shared by the whole group
    container = _get_container()
    for item in json.loads(dataframe.to_json(orient="records")):
      try:
        container.delete_item(item=item[key_name], partition_key=item[key_name])
      except Exception as error:
        # Deletion stays best-effort (e.g. the item may no longer exist), but the failure is reported
        print("Could not delete item {} from Cosmos DB: {}".format(item[key_name], error))
    return pd.DataFrame()

  # Get df_write_final, df_delete
  if df_comparison is None:
    df_write_final = df_write

  else:
    df_write_final = df_write.drop('timestamp').subtract(df_comparison.drop('timestamp'))
    df_delete = df_comparison.drop('timestamp').join(df_write.drop('timestamp'), key_name, "left_anti")

    # Delete items from Cosmos DB (the count is computed once and reused)
    delete_count = df_delete.count()
    if delete_count > 0:
      print("Trying to delete {} items in Cosmos DB. Database: {}; Table: {}.".format(delete_count, database_name, table_name))
      delete_groups = df_delete.groupBy() if group_key is None else df_delete.groupBy(group_key)
      delete_groups.applyInPandas(_del_db_table, schema='').collect()

  # Upsert items in Cosmos DB
  write_count = df_write_final.count()
  if write_count == 0:
    return

  print("Upserting {} items in Cosmos DB. Database: {}; Table: {}.".format(write_count, database_name, table_name))

  # Join back with df_write to recover the complete rows (df_write_final may lack 'timestamp')
  df_upsert = df_write_final.select(key_name).join(df_write, on = key_name, how = 'inner')

  if group_key is not None:
    df_upsert.groupBy(group_key).applyInPandas(_to_db_table, schema='').collect()
    return

  if chunks_size <= 0:
    raise ValueError("'chunks_size' must be a positive number, got {}".format(chunks_size))

  # Equal weights produce chunks of roughly 'chunks_size' rows each
  partitions_num = int(write_count / chunks_size) + 1
  table_parts = df_upsert.randomSplit([1.0] * partitions_num, 1234)

  # One client / container for the whole upsert; it must target 'table_name', not the key column
  container = _get_container()

  for part in table_parts:
    for item in json.loads(part.toPandas().to_json(orient="records")):
      # Drop null fields so they are not stored in the document
      new_item = {field: value for field, value in item.items() if value is not None}
      new_item['id'] = new_item[key_name]
      container.upsert_item(new_item)

In [0]:
# df_old = (spark.createDataFrame([("000033", 119, "MED"), ("00002", 85, "MED"), ("00001", 55, "TRAD"), ("00004", 25, "TRAD"), ], ["COD_MEDIADOR", "NP_POL", "TIPO_MEDIADOR"]))
# df_new = (spark.createDataFrame([("000033", 119, "MED"), ("000022", 85, "MED"), ("00001", 55, "TRAD"), ("00004", 25, "TRAD"), ], ["COD_MEDIADOR", "NP_POL", "TIPO_MEDIADOR"]))

In [0]:
# write_cosmosdb_table(df_write=df_new,
#                database_name="COD_MEDIADOR", 
#                table_name="COD_MEDIADOR", 
#                key_name="COD_MEDIADOR", 
#                environment="dev", 
#                df_comparison = df_old, 
#                group_key = "TIPO_MEDIADOR")


# write_cosmosdb_table(df_write=df_new,
#                database_name="COD_MEDIADOR", 
#                table_name="COD_MEDIADOR", 
#                key_name="COD_MEDIADOR", 
#                environment="dev", 
#                df_comparison = None, 
#                group_key = "TIPO_MEDIADOR")

In [0]:
# Attach the function defined in this notebook to the imported 'analytics.general_utils' module,
# so it can be called as 'gen_utils.write_cosmosdb_table(...)'.
gen_utils.write_cosmosdb_table = write_cosmosdb_table