In [1]:
import re
from dotenv import load_dotenv
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from jinja2 import Template
import subprocess
from lib.sql_code_parser import SqlCodeParser

load_dotenv()


True

In [18]:
# Load the cached parse results from CSV and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the CSV was
# written together with its index; consider index_col=0 if that column is unwanted — confirm.
ddl_df = pd.read_csv('./results/parsed_code_cache.csv')
ddl_df.head()

Unnamed: 0,db_object_name,sql_operation,sql_code
0,Employee Sales by Country,DROP PROCEDURE,"/*\n** Copyright Microsoft, Inc. 1994 - 2000\n..."
1,Sales by Year,DROP PROCEDURE,"/*\n** Copyright Microsoft, Inc. 1994 - 2000\n..."
2,Ten Most Expensive Products,DROP PROCEDURE,"/*\n** Copyright Microsoft, Inc. 1994 - 2000\n..."
3,CustOrderHist,DROP PROCEDURE,"/*\n** Copyright Microsoft, Inc. 1994 - 2000\n..."
4,CustOrdersDetail,DROP PROCEDURE,"/*\n** Copyright Microsoft, Inc. 1994 - 2000\n..."


In [19]:
# Construct the project's SQL parser, pointing it at the source directory and glob
# pattern below, with caching and debug mode switched on.
# NOTE(review): the exact effect of use_cache/debug is defined in lib.sql_code_parser,
# which is not visible here — presumably use_cache reuses ./results/parsed_code_cache.csv; confirm.
sql_code_parser = SqlCodeParser(
    source_directory="source_code/sql_server",
    source_file_glob_pattern="**/*.sql",
    use_cache=True,
    debug=True
)

In [None]:
# Save the results to a CSV file (without the DataFrame index).
# NOTE(review): final_df is not defined in any cell visible here — it presumably comes
# from an earlier cell; confirm this cell still works after Restart Kernel -> Run All.
procedure_to_table_map_path="./results/tables_mapped_to_procedures.csv"
final_df.to_csv(procedure_to_table_map_path, index=False)

In [None]:
def derive_service_name(group):
    """
    Pick a service name for one cluster of table/procedure rows.

    The name is the most frequent ``table_name`` among the rows whose
    ``operation_type`` is ``'WRITE'``. When the cluster has no write rows, the
    most frequent ``table_name`` over all rows is used instead. A cluster with
    no data at all gets the placeholder ``'Service With No Name'``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single cluster; must contain the ``table_name`` and
        ``operation_type`` columns.

    Returns
    -------
    The winning ``table_name`` value, or the placeholder string.
    """
    writes_only = group[group['operation_type'] == 'WRITE']

    # Tables that are written to take priority when naming the service.
    if not writes_only.empty:
        candidate_rows = writes_only
    elif group.size > 0:
        candidate_rows = group
    else:
        return 'Service With No Name'

    table_frequencies = candidate_rows['table_name'].value_counts()
    return table_frequencies.idxmax()


def add_derived_service_name_for_each_cluster_based_on_most_common_table_name(df):
    """
    Return a copy of ``df`` with a ``service_name`` column, one name per cluster.

    For every ``cluster_label`` group, the ``table_name`` and ``operation_type``
    columns are handed to ``derive_service_name``; the resulting name is then
    joined back onto every row of that cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cluster_label``, ``table_name`` and ``operation_type``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (inner-joined on ``cluster_label``) plus ``service_name``.
    """
    service_names = (
        df.groupby('cluster_label')[['table_name', 'operation_type']]
        .apply(derive_service_name)
        .reset_index()
    )
    service_names.columns = ['cluster_label', 'service_name']

    # Discard a service_name left over from a previous run; otherwise the merge
    # would emit service_name_x / service_name_y instead of a single column.
    base_df = df.drop(columns=['service_name'], errors='ignore')

    # Merge into the frame that was passed in, not a notebook-level global.
    return pd.merge(base_df, service_names, on='cluster_label')


# Rebind cluster_df to the frame augmented with service_name, then preview up to 30 rows.
# NOTE(review): cluster_df must already exist from an earlier cell that is not visible
# here, and this cell overwrites its own input — confirm it behaves under Restart & Run All.
cluster_df = add_derived_service_name_for_each_cluster_based_on_most_common_table_name(cluster_df)
cluster_df.head(30)


Unnamed: 0,table_name,dml_operation,procedure_name,operation_type,combined_feature,cluster_label,service_name
0,Products,SELECT,Ten Most Expensive Products,READ,Products Ten Most Expensive Products READ,2,Products
1,Products,SELECT,CustOrdersDetail,READ,Products CustOrdersDetail READ,2,Products
2,Products,SELECT,CustOrderHist,READ,Products CustOrderHist READ,2,Products
3,Products,SELECT,SalesByCategory,READ,Products SalesByCategory READ,2,Products
4,Products,INSERT,InsertProduct,WRITE,Products InsertProduct WRITE,2,Products
5,Products,UPDATE,UpdateProduct,WRITE,Products UpdateProduct WRITE,2,Products
6,Products,DELETE,DeleteProduct,WRITE,Products DeleteProduct WRITE,2,Products
7,Employees,SELECT,Employee Sales by Country,READ,Employees Employee Sales by Country READ,1,Orders
8,Orders,SELECT,Employee Sales by Country,READ,Orders Employee Sales by Country READ,1,Orders
9,Order Subtotals,SELECT,Employee Sales by Country,READ,Order Subtotals Employee Sales by Country READ,1,Orders


In [None]:
def create_service_diagram(service_name, procedures, read_tables, write_tables):
    """
    Given a service name, procedures, read tables, and write tables, create a PlantUML diagram.

    The diagram source is rendered from a Jinja2 template, written to
    ``./results/<service_name>.puml`` and then passed to the ``plantuml``
    command-line tool.

    Parameters
    ----------
    service_name : str
        Name of the service; also used as the diagram file's base name.
    procedures : iterable of str
        Procedure names, listed as API methods and as ``<<proc>>`` classes.
    read_tables : iterable of str
        Table names placed in the ``<service_name>_READS`` package.
    write_tables : iterable of str
        Table names placed in the ``<service_name>_WRITES`` package.

    Raises
    ------
    subprocess.CalledProcessError
        If ``plantuml`` exits with a non-zero status.
    FileNotFoundError
        If the ``plantuml`` executable cannot be found.
    """
    import shutil

    # Every identifier is quoted because service, procedure and table names may
    # contain spaces (e.g. "Ten Most Expensive Products"), which PlantUML does
    # not accept in a bare class/package reference.
    # NOTE(review): a table present in both read_tables and write_tables is declared
    # in both packages — confirm PlantUML renders that as intended.
    diagram_template_text = """
  @startuml "{{ service_name }}"

  class "{{ service_name }}" <<domain service>> {
    {%- for procedure in procedures %}
    + {{ procedure }}() <<api>>
    {%- endfor %}
  }

  package "{{ service_name }}_PROCS" {
    {% for procedure in procedures %}
    class "{{ procedure }}" <<proc>> {
    }
    {% endfor %}
  }

  package "{{ service_name }}_READS" {
    {% for table in read_tables %}
    class "{{ table }}" <<table>> {
    }
    {% endfor %}
  }

  package "{{ service_name }}_WRITES" {
    {% for table in write_tables %}
    class "{{ table }}" <<table>> {
    }
    {% endfor %}
  }

  "{{ service_name }}" --> "{{ service_name }}_PROCS" : calls
  "{{ service_name }}_PROCS" --> "{{ service_name }}_READS" : reads
  "{{ service_name }}_PROCS" --> "{{ service_name }}_WRITES" : writes

  @enduml
  """

    template = Template(diagram_template_text)
    rendered_text = template.render(
        service_name=service_name,
        procedures=procedures,
        read_tables=read_tables,
        write_tables=write_tables,
    )

    diagram_path = f"./results/{service_name}.puml"
    with open(diagram_path, 'w') as file:
        file.write(rendered_text)

    # Pass an argument list instead of a shell string: a path containing spaces or
    # shell metacharacters is then handed to plantuml verbatim rather than being
    # split or interpreted by the shell. shutil.which resolves the launcher
    # (including script wrappers on Windows); fall back to the bare name otherwise.
    plantuml_executable = shutil.which('plantuml') or 'plantuml'
    subprocess.run([plantuml_executable, diagram_path], check=True)


