In [None]:
# Set up the environment
# This cell only needed the first time you use this notebook on a system
import sys,os,os.path
os.environ['IBM_DB_HOME']='C:\Program Files\IBM\SQLLIB'
# Check to see if the libraries already have been installed
import importlib
# Check for ibm_db_sa.  If it exists, it's safe to assume that the other requirements
# are already installed.
spec = importlib.util.find_spec("ibm_db_sa")
if spec is None:
    print("Installing prerequisites.")
    !pip install ipython-sql
    !pip install "ibm-db==2.0.8a"
    !pip install ibm_db_sa
else:
    print("sql magic, ibm_db and ibm_db_sa already installed.")
spec = importlib.util.find_spec("sqlparse")
if spec is None:
    print("Installing prerequisites.")
    !pip install sqlparse
else:
    print("sqlparse already installed.")
# Restart the Kernel if this is your first time installing the above. The next steps will fail unless you do this.

In [None]:
# Import the modules and load the SQL magic
import ibm_db
import ibm_db_sa
import sqlalchemy
%load_ext sql
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import pandas as pd
from IPython.display import display, HTML
import datetime
from IPython.display import display, HTML, Markdown
import sqlparse
from urllib.parse import unquote_plus

In [None]:
# Name of the local Python file that holds the connection settings and password.
# NOTE(review): presumably this file defines the User, Host, PW, insts, dbs and ports
# names used by the connection cell further down -- confirm against the file.
filename = 'ember_variables.py'
# Execute that file with the %run magic so the names it defines become available here.
%run $filename

# SQL Performance Report

In [None]:
# Print the current local date and time so the report output shows when it was run.
print(datetime.datetime.now())

In [None]:
# Connect to the database. Change the values of user, host, and password to match your environment.
# For a connection to a local database, use 'localhost' as the host name.
# Also change the port number and database name.
# NOTE(review): User, Host, PW, insts, dbs and ports are not defined in this notebook;
# presumably they come from the settings file executed earlier with %run -- confirm.
user=User
host=Host
inst=insts[0]

password=PW
db=dbs['SAMPLE'][0]
port=ports['SAMPLE']

# Used later as the source schema argument of the explain call in the query-details section.
schema='DB2INST1'

# Open the connection through the SQL magic; the $name references are filled in from
# the variables assigned above.
# NOTE(review): the password is placed into the URL as-is. If it can contain characters
# such as '@' or '/', it presumably needs URL-encoding first -- confirm.
%sql db2+ibm_db://$user:$password@$host:$port/$db
#%sql db2+ibm_db://$user:$pw_parse@$host:$port/$db

print("Database: "+db)
print("Host: "+host)


In [None]:
# Hide code cells.
# The HTML below carries a small script: code_toggle() hides or shows every 'div.input'
# element and flips the code_show flag, and it is registered to run once when the
# document is ready. The trailing text gives the reader a link that calls code_toggle().
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

## Problem SQL

In [None]:
# First, run a query to identify the top problem queries and gather metrics about those SQL statements.

In [None]:
%%sql prob_sql << WITH SUM_TAB (SUM_RR, SUM_CPU, SUM_EXEC, SUM_SORT, SUM_NUM_EXEC) AS (
        -- Totals across the package cache. NULLIF turns a zero total into NULL so the
        -- percentage divisions below yield NULL instead of a divide-by-zero error.
        SELECT  nullif(FLOAT(SUM(ROWS_READ)),0),
                nullif(FLOAT(SUM(TOTAL_CPU_TIME)),0),
                nullif(FLOAT(SUM(STMT_EXEC_TIME)),0),
                nullif(FLOAT(SUM(TOTAL_SECTION_SORT_TIME)),0),
                nullif(FLOAT(SUM(NUM_EXECUTIONS)),0)
            FROM TABLE(MON_GET_PKG_CACHE_STMT ( 'D', NULL, NULL, -2)) AS T
            WHERE stmt_text not like '%monreport.dbsummary%'
        )
SELECT substr(stmt_text,1,25) as STATEMENT,
        ROWS_READ,
        coalesce(DECIMAL(100*(FLOAT(ROWS_READ)/SUM_TAB.SUM_RR),5,2),0) AS PCT_TOT_RR,
        TOTAL_CPU_TIME,
        coalesce(DECIMAL(100*(FLOAT(TOTAL_CPU_TIME)/SUM_TAB.SUM_CPU),5,2),0) AS PCT_TOT_CPU,
        STMT_EXEC_TIME,
        coalesce(DECIMAL(100*(FLOAT(STMT_EXEC_TIME)/SUM_TAB.SUM_EXEC),5,2),0) AS PCT_TOT_EXEC_TIME,
        TOTAL_SECTION_SORT_TIME,
        coalesce(DECIMAL(100*(FLOAT(TOTAL_SECTION_SORT_TIME)/SUM_TAB.SUM_SORT),5,2),0) AS PCT_TOT_SRT,
        NUM_EXECUTIONS,
        coalesce(DECIMAL(100*(FLOAT(NUM_EXECUTIONS)/SUM_TAB.SUM_NUM_EXEC),5,2),0) AS PCT_TOT_EXECS,
        -- NULLIF guards the average against a statement with zero executions,
        -- in which case the average is NULL rather than an error
        DECIMAL(FLOAT(STMT_EXEC_TIME)/NULLIF(FLOAT(NUM_EXECUTIONS),0),10,2) AS AVG_EXEC_TIME,
        INSERT_TIMESTAMP,
        hex(EXECUTABLE_ID) as EXECUTABLE_ID,
        RTRIM(STMT_TEXT) as FULL_STATEMENT
    FROM TABLE(MON_GET_PKG_CACHE_STMT ( 'D', NULL, NULL, -2)) AS T, SUM_TAB
    -- Keep a statement when it accounts for more than 10 percent of any one total
    WHERE (DECIMAL(100*(FLOAT(ROWS_READ)/SUM_TAB.SUM_RR),5,2) > 10
            OR DECIMAL(100*(FLOAT(TOTAL_CPU_TIME)/SUM_TAB.SUM_CPU),5,2) >10
            OR DECIMAL(100*(FLOAT(STMT_EXEC_TIME)/SUM_TAB.SUM_EXEC),5,2) >10
            OR DECIMAL(100*(FLOAT(TOTAL_SECTION_SORT_TIME)/SUM_TAB.SUM_SORT),5,2) >10
            OR DECIMAL(100*(FLOAT(NUM_EXECUTIONS)/SUM_TAB.SUM_NUM_EXEC),5,2) >10 )
        AND stmt_text not like '%monreport.dbsummary%'
    ORDER BY ROWS_READ DESC
    FETCH FIRST 20 ROWS ONLY
    WITH UR

In [None]:
# Turn the result set of the problem-SQL query into a pandas DataFrame.
df = prob_sql.DataFrame()

# Percentage and average columns are converted to plain floats.
float_columns = [
    'pct_tot_rr',
    'pct_tot_cpu',
    'pct_tot_exec_time',
    'pct_tot_srt',
    'pct_tot_execs',
    'avg_exec_time',
]
for column in float_columns:
    df[column] = df[column].astype(float)

# Raw counters are replaced by strings with thousands separators for display.
# From here on these columns hold text, not numbers.
counter_columns = [
    'rows_read',
    'total_cpu_time',
    'stmt_exec_time',
    'total_section_sort_time',
    'num_executions',
]
for column in counter_columns:
    df[column] = df[column].map('{:,}'.format)

# Show the summary table, leaving out the timestamp, executable id and full statement text.
summary_columns = [
    'STATEMENT',
    'rows_read', 'pct_tot_rr',
    'total_cpu_time', 'pct_tot_cpu',
    'stmt_exec_time', 'pct_tot_exec_time',
    'total_section_sort_time', 'pct_tot_srt',
    'num_executions', 'pct_tot_execs',
    'avg_exec_time',
]
display(df[summary_columns])


In [None]:
# Append one extra row labelled 'OTHER' whose percentages are whatever remains of 100
# after the listed statements, so each metric adds up to 100 in the chart.
pos = len(df)
# df_add is a second name for the same DataFrame object as df (no copy is made), so the
# row added here, and the columns added in later cells, are visible through df as well.
df_add = df
df_add.loc[pos] = pd.Series({'STATEMENT': 'OTHER'})
for pct_column in ('pct_tot_rr', 'pct_tot_cpu', 'pct_tot_exec_time', 'pct_tot_srt', 'pct_tot_execs'):
    df_add.at[pos, pct_column] = 100 - df[pct_column].sum()


In [None]:
# Give every row a 'queryNN' label built from its index value, left-padded with zeros
# to two characters; the appended remainder row is labelled 'other' instead.
df_add['query_num'] = ['query' + '{0:0>2}'.format(row_label) for row_label in df_add.index]
df_add.at[pos, 'query_num'] = 'other'

#display(df_add)

In [None]:
# Drop the columns listed below from the frame used for the chart.
non_chart_columns = [
    'rows_read',
    'total_cpu_time',
    'stmt_exec_time',
    'total_section_sort_time',
    'num_executions',
    'avg_exec_time',
    'insert_timestamp',
    'executable_id',
    'STATEMENT',
    'full_statement',
]
df_add2 = df_add.drop(columns=non_chart_columns)

In [None]:
# Readable names for the percentage metrics.
metric_labels = {
    'pct_tot_rr': 'Rows Read',
    'pct_tot_cpu': 'CPU Time',
    'pct_tot_exec_time': 'Execution Time',
    'pct_tot_srt': 'Sort Time',
    'pct_tot_execs': 'Number of Executions',
}
# Transpose so that each metric is a row and each query label a column, then apply
# the readable metric names to the row index.
df_add3 = df_add2.set_index('query_num').T.rename(index=metric_labels)

## Resource Utilization by Problem Queries

In [None]:
# Horizontal stacked bar chart: one bar per row of df_add3 (a metric), split into one
# coloured segment per column (a query label).
ax = df_add3.plot(
    kind='barh',
    title="Percent of Resource Consumption by Top Problem Queries",
    figsize=(15, 10),
    legend=True,
    stacked=True,
    fontsize=12,
    colormap='Paired',
)
ax.set_xlabel("Percent of total")

# plt.show has to be called; the bare name only evaluates to the function object and
# would be echoed as the cell's output instead of rendering anything.
plt.show()

## Problem Query Details

In [None]:
conn=%sql
#display(conn)
# Show column contents without truncation. None is the "no limit" value in current
# pandas, where -1 has been deprecated and is rejected by newer releases. Older releases
# only accept an integer for this option, so fall back to -1 when None is refused.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
for index, row in df.iterrows():
    # skip the "other" row added to balance out numbers for the metrics
    if row['query_num'] == 'other': 
        continue
    # Display basic information about the query
    display(Markdown("## Query "+str(index)))
    display(Markdown("### Query Characteristics"))
    display(Markdown("Executed "+str(row['num_executions'])+" times since last placed in the package cache at "+str(row['insert_timestamp'])))
    display(Markdown("Consumed "+str(row['pct_tot_rr'])+" percent of all rows read by all queries in the package cache."))
    display(Markdown("Consumed "+str(row['pct_tot_cpu'])+" percent of all cpu time used by all queries in the package cache."))
    display(Markdown("Consumed "+str(row['pct_tot_exec_time'])+" percent of all execution time used by all queries in the package cache."))
    display(Markdown("Consumed "+str(row['pct_tot_srt'])+" percent of all sort time used by all queries in the package cache."))
    display(Markdown("### Query Text"))
    formatted_sql=sqlparse.format(df['full_statement'][index], reindent=True)
    print(formatted_sql.replace("\\n","<br>"))
    # If a database connection is available, gather additional information about this query
    ## Note: explain may fail if the interval between runing the query to find problem sql and this section was too long, and the section has been cleared from the package cache
    if conn:
        #When db connection is available
        display(Markdown("### Query Explain Plan"))
        display(row.dtypes)
        exe_id=row['executable_id']
        ex_schema='SYSTOOLS'
        ex_requester=''
        ex_time=''
        src_name=''
        src_schema=schema
        src_version=''
        %sql call explain_from_section(x'{exe_id}', 'M', NULL, 0, :ex_schema, :ex_requester, :ex_time, :src_name, :src_schema, :src_version)
        expln_plan=%sql select * from vdba.last_explained
        print(expln_plan)