In [1]:
from sshtunnel import SSHTunnelForwarder 
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from sqlalchemy.sql import text
from dotenv import load_dotenv
import os
import pandas as pd
import plotly.express as px

In [2]:
load_dotenv()

True

Pre-requisites:
- MariaDB Connector/C from CS Package Repository https://mariadb.com/docs/server/connect/programming-languages/c/install/
- MariaDB community server on the machine

### Helper functions

In [4]:
def run_query(query_str):
    """
    Run a SQL statement against the ``financial`` MariaDB database through an
    SSH tunnel and return the result rows as a DataFrame.

    Connection settings are read from these environment variables:
    BASTION_SERVER_IP, SSH_USER_NAME, SSH_PRIVATE_KEY_PATH, RDS_ENDPOINT,
    DB_USERNAME and DB_PASSWORD.

    Args:
        query_str (str): SQL text to execute. It is sent to the server as-is.

    Returns:
        pandas.DataFrame: One row per result row. Columns are positional
        (0, 1, ...); callers assign their own column names.

    Raises:
        RuntimeError: If any of the required environment variables is unset
            or empty.
    """
    # Function-scope import: sqlalchemy is already a dependency of this
    # notebook, and only this helper needs URL.
    from sqlalchemy.engine import URL

    required = (
        "BASTION_SERVER_IP",
        "SSH_USER_NAME",
        "SSH_PRIVATE_KEY_PATH",
        "RDS_ENDPOINT",
        "DB_USERNAME",
        "DB_PASSWORD",
    )
    missing = [name for name in required if not os.getenv(name)]
    if missing:
        # Fail early with the variable names instead of a TypeError from
        # concatenating None further down.
        raise RuntimeError("Missing environment variables: " + ", ".join(missing))

    # Entering the context manager starts the tunnel and leaving it stops the
    # tunnel, so no explicit server.start() is needed.
    with SSHTunnelForwarder(
        (os.getenv("BASTION_SERVER_IP"), 22),  # bastion host and SSH port
        ssh_username=os.getenv("SSH_USER_NAME"),
        ssh_pkey=os.getenv("SSH_PRIVATE_KEY_PATH"),
        remote_bind_address=(os.getenv("RDS_ENDPOINT"), 3306),
    ) as server:
        # URL.create escapes special characters in the credentials, which
        # plain string concatenation does not.
        db_url = URL.create(
            "mariadb+mariadbconnector",
            username=os.getenv("DB_USERNAME"),
            password=os.getenv("DB_PASSWORD"),
            host="127.0.0.1",
            port=server.local_bind_port,
            database="financial",
        )
        engine = create_engine(db_url)
        try:
            session = sessionmaker(bind=engine)()
            try:
                result = session.execute(text(query_str))
                df = pd.DataFrame.from_records(result)
            finally:
                # Release the connection even when the query fails.
                session.close()
        finally:
            # Drop pooled connections before the tunnel they run over closes.
            engine.dispose()

    return df

In [None]:
def plot_categorical_variable_distribution(var, tb_name, show_data=False):
  """
  Draw a bar chart of how many rows fall into each value of a categorical column.

  The counting is done by the database; only the aggregated rows are fetched.

  Args:
    var (str): Column whose distinct values are counted. Interpolated
               directly into the SQL text, so it must be a trusted identifier.
    tb_name (str): Table to read from. Also interpolated directly.
    show_data (bool, optional): When True, print the counts table before
                                plotting. Defaults to False.
  """
  counts_sql = f"SELECT {var}, COUNT(*) FROM {tb_name} GROUP BY {var}"
  counts = run_query(counts_sql)

  # run_query returns positional column labels; name them for plotting.
  counts.columns = [var, 'count']

  if show_data:
    print(counts)

  px.bar(counts, x=var, y='count').show()


In [None]:
def plot_categorical_variable_with_segmentation(cat_var, seg_var, tb_name):
  """
  Draw a bar chart of a categorical column's value counts, coloured by a
  second (segmentation) column.

  Args:
    cat_var (str): Categorical column shown on the x-axis.
    seg_var (str): Column used to colour the bars.
    tb_name (str): Table to read from.

  All three arguments are interpolated directly into the SQL text, so they
  must be trusted identifiers.
  """
  # One result row per (segment, category) pair, with its row count.
  group_cols = f"{seg_var}, {cat_var}"
  counts = run_query(f"SELECT {group_cols}, COUNT(*) FROM {tb_name} GROUP BY {group_cols}")

  # Label the positional result columns in the order they were selected.
  counts.columns = [seg_var, cat_var, 'count']

  px.bar(counts, x=cat_var, y='count', color=seg_var).show()

In [None]:
def plot_numerical_variable_distribution(var, rounding_num, tb_name):
  """
  Plot the distribution of a numerical column after bucketing it with SQL ROUND().

  The database rounds each value, groups by the rounded value and returns one
  row per bucket with its count. Those pre-aggregated rows are handed to
  px.histogram with the count as the y value.

  Args:
    var (str): Numerical column to analyse.
    rounding_num (int): Second argument passed to SQL ROUND(). This notebook
                        also calls it with negative values (e.g. -4) to
                        bucket to the left of the decimal point.
    tb_name (str): Table to read from.
  """
  # The same rounding expression is used in the SELECT list and the GROUP BY.
  bucket_expr = f'ROUND({var}, {rounding_num})'
  buckets = run_query(f'SELECT {bucket_expr}, COUNT(*) FROM {tb_name} GROUP BY {bucket_expr}')

  # First column is the rounded value, second is the number of rows in it.
  buckets.columns = [var, 'count']

  px.histogram(buckets, x=var, y='count').show()

In [None]:
def plot_numerical_variable_with_segmentation(num_var, rounding_num, seg_var, tb_name):
  """
  Plot the distribution of a numerical column, bucketed with SQL ROUND() and
  coloured by a segmentation column.

  The database returns one row per (segment, rounded value) pair with its
  count. Those pre-aggregated rows are handed to px.histogram with the count
  as the y value and the segment as the colour.

  Args:
    num_var (str): Numerical column shown on the x-axis.
    rounding_num (int): Second argument passed to SQL ROUND(). This notebook
                        also calls it with negative values (e.g. -4) to
                        bucket to the left of the decimal point.
    seg_var (str): Column used to colour the bars.
    tb_name (str): Table to read from.
  """
  # The same rounding expression is used in the SELECT list and the GROUP BY.
  bucket_expr = f'ROUND({num_var}, {rounding_num})'
  segmented = run_query(
      f'SELECT {seg_var}, {bucket_expr}, COUNT(*) FROM {tb_name} GROUP BY {seg_var}, {bucket_expr}'
  )

  # Columns arrive in select order: segment, rounded value, count.
  segmented.columns = [seg_var, num_var, 'count']

  px.histogram(segmented, x=num_var, y='count', color=seg_var).show()

In [None]:
def show_numerical_variable_min_max(var, tb_name):
  """
  Print the smallest and largest value of a numerical column.

  Args:
    var (str): Numerical column to inspect.
    tb_name (str): Table to read from.
  """
  # Single-row result: column 0 holds MIN, column 1 holds MAX.
  extremes_sql = f"SELECT MIN({var}), MAX({var}) FROM {tb_name}"
  print(run_query(extremes_sql))



In [None]:
def plot_date_variable_yearly(var, tb_name, show_data=False):
  """
  Draw a bar chart of the number of rows per calendar year of a date column.

  Each date is collapsed in SQL to the first day of its year (YYYY-01-01,
  cast to DATETIME) and the rows are counted per collapsed value.

  Args:
    var (str): Date or datetime column to analyse.
    tb_name (str): Table to read from.
    show_data (bool, optional): When True, print the yearly counts before
                                plotting. Defaults to False.
  """
  # DATE_FORMAT keeps the year and pins month and day to 01-01; the CAST
  # turns that string back into a DATETIME.
  first_day_of_year = f"CAST(DATE_FORMAT({var}, '%Y-01-01') AS DATETIME)"
  yearly_sql = (
      f"SELECT year, COUNT(*) "
      f"FROM (SELECT {first_day_of_year} year FROM {tb_name}) tb "
      f"GROUP BY year"
  )
  yearly = run_query(yearly_sql)
  yearly.columns = ['year', 'count']

  if show_data:
    print(yearly)

  px.bar(yearly, x='year', y='count').show()

In [None]:
def plot_date_variable_yearly_with_segmentation(date_var, seg_var, tb_name):
  """
  Draw a bar chart of the number of rows per calendar year of a date column,
  coloured by a segmentation column.

  Each date is collapsed in SQL to the first day of its year (YYYY-01-01,
  cast to DATETIME) and the rows are counted per (segment, year) pair.

  Args:
    date_var (str): Date or datetime column used for the yearly x-axis.
    seg_var (str): Column used to colour the bars.
    tb_name (str): Table to read from.
  """
  # The subquery re-uses the original column name as the alias of the
  # collapsed date, so the outer query can group on {date_var} directly.
  first_day_of_year = f"CAST(DATE_FORMAT({date_var}, '%Y-01-01') AS DATETIME)"
  yearly_sql = (
      f"SELECT {seg_var}, {date_var}, COUNT(*) "
      f"FROM (SELECT {seg_var}, {first_day_of_year} {date_var} FROM {tb_name}) tb "
      f"GROUP BY {seg_var}, {date_var}"
  )
  yearly = run_query(yearly_sql)

  # Columns arrive in select order: segment, collapsed date, count.
  yearly.columns = [seg_var, date_var, 'count']

  px.bar(yearly, x=date_var, y='count', color=seg_var).show()

### List Tables

In [4]:
df = run_query('SHOW TABLES')
df

Unnamed: 0,0
0,account
1,card
2,client
3,disp
4,district
5,loan
6,order
7,trans


### Account 

In [17]:
df = run_query('SHOW COLUMNS FROM account')
df

Unnamed: 0,0,1,2,3,4,5
0,account_id,int(11),NO,PRI,0.0,
1,district_id,int(11),NO,MUL,0.0,
2,frequency,varchar(18),NO,,,
3,date,date,NO,,,


In [23]:
plot_categorical_variable_distribution('district_id', 'account')

Most of the accounts are from district 1.

In [25]:
plot_categorical_variable_distribution('frequency', 'account')

Most of the account frequency is monthly issuance.
- POPLATEK MESICNE: Monthly Issuance
- POPLATEK TYDNE: Weekly Issuance
- POPLATEK PO OBRATU: Issuance After Transaction

In [29]:
plot_date_variable_yearly('date', 'account', show_data=True)

        year  count
0 1993-01-01   1139
1 1994-01-01    439
2 1995-01-01    661
3 1996-01-01   1363
4 1997-01-01    898


Most of the accounts were created in 1993 and 1996.

### Card

In [10]:
df = run_query('SHOW COLUMNS FROM card')
df

Unnamed: 0,0,1,2,3,4,5
0,card_id,int(11),NO,PRI,0.0,
1,disp_id,int(11),NO,MUL,,
2,type,varchar(7),NO,,,
3,issued,date,NO,,,


In [27]:
plot_categorical_variable_distribution('type', 'card')

Most of the cards are Classic type.

In [30]:
plot_date_variable_yearly('issued', 'card')

The cards issued increase from 1993 to 1998 exponentially.

### Client

In [4]:
df = run_query('SHOW COLUMNS FROM client')
df

Unnamed: 0,0,1,2,3,4,5
0,client_id,int(11),NO,PRI,,
1,gender,varchar(1),NO,,,
2,birth_date,date,NO,,,
3,district_id,int(11),NO,MUL,,


In [31]:
plot_categorical_variable_distribution('gender', 'client')

The clients are equally distributed between males and females.

In [32]:
plot_date_variable_yearly('birth_date', 'client')

Most of the clients' birth year are between 1939 and 1980.

In [14]:
plot_date_variable_yearly_with_segmentation('birth_date', 'gender', 'client')

The genders are quite equally distributed across birth years.

In [17]:
plot_categorical_variable_distribution('district_id', 'client')

Most of the clients are from district 1.

In [19]:
plot_categorical_variable_with_segmentation('district_id', 'gender', 'client')

The genders are equally distributed across districts.

### Disposition

In [14]:
df = run_query('SHOW COLUMNS FROM disp')
df

Unnamed: 0,0,1,2,3,4,5
0,disp_id,int(11),NO,PRI,,
1,client_id,int(11),NO,MUL,,
2,account_id,int(11),NO,MUL,,
3,type,varchar(9),NO,,,


In [20]:
plot_categorical_variable_distribution('type', 'disp')

Most of the dispositions are owner.

### District

In [16]:
df = run_query('SHOW COLUMNS FROM district')
df

Unnamed: 0,0,1,2,3,4,5
0,district_id,int(11),NO,PRI,0.0,
1,A2,varchar(19),NO,,,
2,A3,varchar(15),NO,,,
3,A4,int(11),NO,,,
4,A5,int(11),NO,,,
5,A6,int(11),NO,,,
6,A7,int(11),NO,,,
7,A8,int(11),NO,,,
8,A9,int(11),NO,,,
9,A10,"decimal(4,1)",NO,,,


| Column | Description | Notes |
| ------ | ----------- | ----- |
| A1 (district_id) | District Identifier | |
| A2 | District Name | |
| A3 | Region | |
| A4 | No. of Inhabitants | |
| A5 | No. of Municipalities with inhabitants < 499 | |
| A6 | No. of Municipalities with inhabitants 500-1999 | |
| A7 | No. of Municipalities with inhabitants 2000-9999 | |
| A8 | No. of Municipalities with inhabitants > 10000 | |
| A9 | No. of Cities | |
| A10 | Ratio of urban inhabitants | |
| A11 | Average Salary | |
| A12 | Unemployment rate in 1995 | |
| A13 | Unemployment rate in 1996 | |
| A14 | No. of Entrepreneurs per 1000 inhabitants | |
| A15 | No. of Crimes committed in 1995 | |
| A16 | No. of Crimes committed in 1996 | |

In [40]:
df = run_query(f"SELECT * FROM district LIMIT 5")
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.2,0.43,167,85677,99107
1,2,Benesov,central Bohemia,88884,80,26,6,2,5,46.7,8507,1.6,1.85,132,2159,2674
2,3,Beroun,central Bohemia,75232,55,26,4,1,5,41.7,8980,1.9,2.21,111,2824,2813
3,4,Kladno,central Bohemia,149893,63,29,6,2,6,67.4,9753,4.6,5.05,109,5244,5892
4,5,Kolin,central Bohemia,95616,65,30,4,1,6,51.4,9307,3.8,4.43,118,2616,3040


In [None]:
# Number of inhabitants (A4) for each district (A2).
df = run_query("SELECT A2 district_name, A4 no_of_inhabitants FROM district")
cols = ['district_name', 'no_of_inhabitants']
df.columns = cols
fig = px.bar(df, x='district_name', y='no_of_inhabitants')
fig.show()

Hl.m. Praha has the highest number of inhabitants

In [26]:
# Municipalities with fewer than 499 inhabitants (A5) for each district (A2).
df = run_query("SELECT A2 district_name, A5 no_of_municipalities_with_inhabitants_lt_499 FROM district")
cols = ['district_name', 'no_of_municipalities_with_inhabitants_lt_499']
df.columns = cols
fig = px.bar(df, x='district_name', y='no_of_municipalities_with_inhabitants_lt_499')
fig.show()

In [27]:
# Municipalities with 500-1999 inhabitants (A6) for each district (A2).
df = run_query("SELECT A2 district_name, A6 no_of_municipalities_with_inhabitants_500_1999 FROM district")
cols = ['district_name', 'no_of_municipalities_with_inhabitants_500_1999']
df.columns = cols
fig = px.bar(df, x='district_name', y='no_of_municipalities_with_inhabitants_500_1999')
fig.show()

Brno - venkov has the highest number of municipalities with inhabitants from 500 to 1999

In [28]:
# Municipalities with 2000-9999 inhabitants (A7) for each district (A2).
df = run_query("SELECT A2 district_name, A7 no_of_municipalities_with_inhabitants_2000_9999 FROM district")
cols = ['district_name', 'no_of_municipalities_with_inhabitants_2000_9999']
df.columns = cols
fig = px.bar(df, x='district_name', y='no_of_municipalities_with_inhabitants_2000_9999')
fig.show()

In [30]:
# Municipalities with more than 10000 inhabitants (A8) for each district (A2).
df = run_query("SELECT A2 district_name, A8 no_of_municipalities_with_inhabitants_gt_10000 FROM district")
cols = ['district_name', 'no_of_municipalities_with_inhabitants_gt_10000']
df.columns = cols
fig = px.bar(df, x='district_name', y='no_of_municipalities_with_inhabitants_gt_10000')
fig.show()

In [31]:
# Number of cities (A9) for each district (A2).
df = run_query("SELECT A2 district_name, A9 no_of_cities FROM district")
cols = ['district_name', 'no_of_cities']
df.columns = cols
fig = px.bar(df, x='district_name', y='no_of_cities')
fig.show()

In [55]:
# Ratio of urban inhabitants (A10) for each district (A2).
df = run_query("SELECT A2 district_name, A10 ratio_of_urban_inhabitants FROM district")
cols = ['district_name', 'ratio_of_urban_inhabitants']
df.columns = cols
fig = px.bar(df, x='district_name', y='ratio_of_urban_inhabitants')
fig.show()

In [33]:
# Average salary (A11) for each district (A2).
df = run_query("SELECT A2 district_name, A11 avg_salary FROM district")
cols = ['district_name', 'avg_salary']
df.columns = cols
fig = px.bar(df, x='district_name', y='avg_salary')
fig.show()

Average salary is pretty similar across districts

In [34]:
# Unemployment rate in 1995 (A12) for each district (A2).
df = run_query("SELECT A2 district_name, A12 unemployment_rate_1995 FROM district")
cols = ['district_name', 'unemployment_rate_1995']
df.columns = cols
fig = px.bar(df, x='district_name', y='unemployment_rate_1995')
fig.show()

In [36]:
# Unemployment rate in 1996 (A13) for each district (A2).
df = run_query("SELECT A2 district_name, A13 unemployment_rate_1996 FROM district")
cols = ['district_name', 'unemployment_rate_1996']
df.columns = cols
fig = px.bar(df, x='district_name', y='unemployment_rate_1996')
fig.show()

In [37]:
# Entrepreneurs per 1000 inhabitants (A14) for each district (A2).
df = run_query("SELECT A2 district_name, A14 no_of_enterpreneurs_per_1000_inhabitants FROM district")
cols = ['district_name', 'no_of_enterpreneurs_per_1000_inhabitants']
df.columns = cols
fig = px.bar(df, x='district_name', y='no_of_enterpreneurs_per_1000_inhabitants')
fig.show()

In [38]:
# Crimes committed in 1995 (A15) for each district (A2).
df = run_query("SELECT A2 district_name, A15 no_crimes_committed_1995 FROM district")
cols = ['district_name', 'no_crimes_committed_1995']
df.columns = cols
fig = px.bar(df, x='district_name', y='no_crimes_committed_1995')
fig.show()

In [39]:
# Crimes committed in 1996 (A16) for each district (A2).
df = run_query("SELECT A2 district_name, A16 no_crimes_committed_1996 FROM district")
cols = ['district_name', 'no_crimes_committed_1996']
df.columns = cols
fig = px.bar(df, x='district_name', y='no_crimes_committed_1996')
fig.show()

Hl.m. Praha has the highest crimes committed in 1995 and 1996

In [41]:
# Total inhabitants (sum of A4) for each region (A3).
df = run_query("SELECT A3 region, SUM(A4) no_of_inhabitants FROM district GROUP BY A3")
cols = ['region', 'no_of_inhabitants']
df.columns = cols
fig = px.bar(df, x='region', y='no_of_inhabitants')
fig.show()

In [47]:
# Total municipalities with fewer than 499 inhabitants (sum of A5) for each region (A3).
df = run_query("SELECT A3 region, SUM(A5) no_of_municipalities_with_inhabitants_lt_499 FROM district GROUP BY A3")
cols = ['region', 'no_of_municipalities_with_inhabitants_lt_499']
df.columns = cols
fig = px.bar(df, x='region', y='no_of_municipalities_with_inhabitants_lt_499')
fig.show()

In [48]:
# Total municipalities with 500-1999 inhabitants (sum of A6) for each region (A3).
df = run_query("SELECT A3 region, SUM(A6) no_of_municipalities_with_inhabitants_500_1999 FROM district GROUP BY A3")
cols = ['region', 'no_of_municipalities_with_inhabitants_500_1999']
df.columns = cols
fig = px.bar(df, x='region', y='no_of_municipalities_with_inhabitants_500_1999')
fig.show()

In [49]:
# Total municipalities with 2000-9999 inhabitants (sum of A7) for each region (A3).
df = run_query("SELECT A3 region, SUM(A7) no_of_municipalities_with_inhabitants_2000_9999 FROM district GROUP BY A3")
cols = ['region', 'no_of_municipalities_with_inhabitants_2000_9999']
df.columns = cols
fig = px.bar(df, x='region', y='no_of_municipalities_with_inhabitants_2000_9999')
fig.show()

In [50]:
# Total municipalities with more than 10000 inhabitants (sum of A8) for each region (A3).
df = run_query("SELECT A3 region, SUM(A8) no_of_municipalities_with_inhabitants_gt_10000 FROM district GROUP BY A3")
# The first column holds regions (A3), not district names, so label it accordingly.
cols = ['region', 'no_of_municipalities_with_inhabitants_gt_10000']
df.columns = cols
fig = px.bar(df, x=cols[0], y=cols[1])
fig.show()

Prague only has municipalities with more than 10,000 inhabitants

In [52]:
# Total number of cities (sum of A9) for each region (A3).
# GROUP BY A3 is required: without it the SUM collapses the whole table into a
# single row (or the query is rejected under ONLY_FULL_GROUP_BY).
df = run_query("SELECT A3 region, SUM(A9) no_of_cities FROM district GROUP BY A3")
cols = ['region', 'no_of_cities']
df.columns = cols
fig = px.bar(df, x=cols[0], y=cols[1])
fig.show()

In [54]:
# Ratio of urban inhabitants for each region (A3).
# A10 is a per-district percentage, so adding it up across districts is not a
# ratio: regions with more districts would simply score higher. Weight each
# district's ratio by its inhabitants (A4) to get the regional ratio instead.
df = run_query(
    "SELECT A3 region, ROUND(SUM(A10 * A4) / SUM(A4), 1) ratio_of_urban_inhabitants "
    "FROM district GROUP BY A3"
)
cols = ['region', 'ratio_of_urban_inhabitants']
df.columns = cols
fig = px.bar(df, x=cols[0], y=cols[1])
fig.show()

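Prague is a single district with a 100% ratio of urban inhabitants (see the sample rows above), which is the highest possible value. Summing per-district ratios by region would make it look lowest only because the other regions add up several districts.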

In [57]:
# Mean of the district average salaries (A11), rounded to whole units, for each region (A3).
df = run_query("SELECT A3 region, ROUND(AVG(A11), 0) avg_salary FROM district GROUP BY A3")
cols = ['region', 'avg_salary']
df.columns = cols
fig = px.bar(df, x='region', y='avg_salary')
fig.show()

In [58]:
# Mean 1995 unemployment rate (A12) for each region (A3).
# The district rates shown in the sample rows are small decimals (0.2 - 4.6),
# so rounding the regional mean to 0 decimals would hide most of the
# difference between regions; keep two decimals.
df = run_query("SELECT A3 region, ROUND(AVG(A12), 2) avg_unemployment_rate_1995 FROM district GROUP BY A3")
cols = ['region', 'avg_unemployment_rate_1995']
df.columns = cols
fig = px.bar(df, x=cols[0], y=cols[1])
fig.show()

In [59]:
# Mean 1996 unemployment rate (A13) for each region (A3).
# The district rates shown in the sample rows are small decimals (0.43 - 5.05),
# so rounding the regional mean to 0 decimals would hide most of the
# difference between regions; keep two decimals.
df = run_query("SELECT A3 region, ROUND(AVG(A13), 2) avg_unemployment_rate_1996 FROM district GROUP BY A3")
cols = ['region', 'avg_unemployment_rate_1996']
df.columns = cols
fig = px.bar(df, x=cols[0], y=cols[1])
fig.show()

North Bohemia and North Moravia have the highest average unemployment rate in 1995 and 1996

In [60]:
# Total crimes committed in 1995 (sum of A15) for each region (A3).
df = run_query("SELECT A3 region, SUM(A15) no_crimes_committed_1995 FROM district GROUP BY A3")
cols = ['region', 'no_crimes_committed_1995']
df.columns = cols
fig = px.bar(df, x='region', y='no_crimes_committed_1995')
fig.show()

In [61]:
# Total crimes committed in 1996 (sum of A16) for each region (A3).
df = run_query("SELECT A3 region, SUM(A16) no_crimes_committed_1996 FROM district GROUP BY A3")
cols = ['region', 'no_crimes_committed_1996']
df.columns = cols
fig = px.bar(df, x='region', y='no_crimes_committed_1996')
fig.show()

### Loan

In [18]:
df = run_query('SHOW COLUMNS FROM loan')
df

Unnamed: 0,0,1,2,3,4,5
0,loan_id,int(11),NO,PRI,0.0,
1,account_id,int(11),NO,MUL,,
2,date,date,NO,,,
3,amount,int(11),NO,,,
4,duration,int(11),NO,,,
5,payments,"decimal(6,2)",NO,,,
6,status,varchar(1),NO,,,


In [62]:
plot_date_variable_yearly('date', 'loan')

Most of the loans are granted in 1997

In [22]:
# Loans granted per calendar month.
# Grouping and ordering by MONTH(date) returns the rows January..December, so
# the bars follow the calendar instead of whatever order the month names come
# back in.
df = run_query(
    "SELECT MONTHNAME(date) month_name, COUNT(*) "
    "FROM loan "
    "GROUP BY MONTH(date), MONTHNAME(date) "
    "ORDER BY MONTH(date)"
)
df.columns = ['month_name', 'count']
fig = px.bar(df, x='month_name', y='count')
fig.show()

The loan grants are equally distributed across the months.

In [63]:
show_numerical_variable_min_max('amount', 'loan')

      0       1
0  4980  590820


In [64]:
plot_numerical_variable_distribution('amount', -4, 'loan')

Most of the loan amounts are between 0 to 90K

In [65]:
show_numerical_variable_min_max('duration', 'loan')

    0   1
0  12  60


In [66]:
plot_numerical_variable_distribution('duration', 0, 'loan')

Most of the loan durations are between 20 and 39

In [67]:
show_numerical_variable_min_max('payments', 'loan')

        0        1
0  304.00  9910.00


In [68]:
plot_numerical_variable_distribution('payments', -2, 'loan')

Most of the loan payments are between 3000 and 3900

In [69]:
plot_categorical_variable_distribution('status', 'loan')

Most of the loans are running contract, OK thus-far.

Status definition:
- 'A' stands for contract finished, no problems
- 'B' stands for contract finished, loan not paid
- 'C' stands for running contract, OK thus-far
- 'D' stands for running contract, client in debt

In [71]:
plot_numerical_variable_with_segmentation('amount', -4, 'status', 'loan')

Most of the loans with contract finished and no problems are with lower amount. Most of the loans that contract finished and not paid have the amount between 50 - 90K.

In [72]:
plot_numerical_variable_with_segmentation('duration', 0, 'status', 'loan')

Most of the loans that contract finished, and no problems have the duration less than 40. The loans with above 40 duration are running contract and OK thus-far.

In [73]:
plot_numerical_variable_with_segmentation('payments', -2, 'status', 'loan')

The contract statuses are quite equally distributed across payments

### Order

In [17]:
df = run_query('SHOW COLUMNS FROM financial.order')
df

Unnamed: 0,0,1,2,3,4,5
0,order_id,int(11),NO,PRI,0.0,
1,account_id,int(11),NO,MUL,,
2,bank_to,varchar(2),NO,,,
3,account_to,int(11),NO,,,
4,amount,"decimal(6,1)",NO,,,
5,k_symbol,varchar(8),NO,,,


In [74]:
plot_categorical_variable_distribution('bank_to', 'financial.order')

The banks of the recipients are approximately equally distributed across all the banks.

In [75]:
show_numerical_variable_min_max('amount', 'financial.order')

     0        1
0  1.0  14882.0


In [76]:
plot_numerical_variable_distribution('amount', -2, 'financial.order')

Most of the orders' amount are between 0 to 4K.

In [77]:
plot_categorical_variable_distribution('k_symbol', 'financial.order')

Most of the orders are SIPO (household payment). There are orders without k_symbol.

- 'POJISTNE' stands for Insurance Payment
- 'SIPO' stands for Household Payment
- 'LEASING' stands for Leasing Payment
- 'UVER' stands for Loan Payment

In [78]:
plot_categorical_variable_with_segmentation('bank_to', 'k_symbol', 'financial.order')

The k_symbol is quite equally distributed across banks of the recipients.

### Transactions

In [7]:
tb_name = 'trans'
df = run_query(f'SHOW COLUMNS FROM {tb_name}')
df

Unnamed: 0,0,1,2,3,4,5
0,trans_id,int(11),NO,PRI,0.0,
1,account_id,int(11),NO,MUL,0.0,
2,date,date,NO,,,
3,type,varchar(6),NO,,,
4,operation,varchar(14),YES,,,
5,amount,int(11),NO,,,
6,balance,int(11),NO,,,
7,k_symbol,varchar(11),YES,,,
8,bank,varchar(2),YES,,,
9,account,int(10) unsigned,YES,,,


In [79]:
plot_date_variable_yearly('date', 'trans')

More transactions as year goes on.

In [10]:
# Transactions per calendar month.
# Grouping and ordering by MONTH(date) returns the rows January..December, so
# the bars follow the calendar instead of whatever order the month names come
# back in.
df = run_query(
    f"SELECT MONTHNAME(date) month_name, COUNT(*) "
    f"FROM {tb_name} "
    f"GROUP BY MONTH(date), MONTHNAME(date) "
    f"ORDER BY MONTH(date)"
)
df.columns = ['month_name', 'count']
fig = px.bar(df, x='month_name', y='count')
fig.show()

The transactions are approximately equally distributed across months with the highest in January.

In [80]:
plot_categorical_variable_distribution('type', 'trans')

The highest transaction type is VYDAJ, followed by PRIJEM and VYBER.

- 'VYDAJ' stands for Debit (withdrawal)
- 'PRIJEM' stands for Credit
- 'VYBER' stands for Withdrawal in Cash

In [81]:
plot_categorical_variable_distribution('operation', 'trans')

Most of the transaction operation is withdrawal in cash (VYBER).
- 'VYBER KARTOU' stands for Credit Card Withdrawal
- 'VKLAD' stands for Credit in Cash
- 'PREVOD Z UCTU' stands for Collection from Another Bank
- 'VYBER' stands for Withdrawal in Cash
- 'PREVOD NA UCET' stands for Remittance to Another Bank

In [10]:
show_numerical_variable_min_max('amount', tb_name)

   0      1
0  0  87400


In [8]:
plot_numerical_variable_distribution('amount', -3, tb_name)

Transaction amounts range from 0 to 87,400, with most falling between 0 and 10K.

In [11]:
show_numerical_variable_min_max('balance', tb_name)

       0       1
0 -41126  209637


In [10]:
plot_numerical_variable_distribution('balance', -4, 'trans')

Balances range from about -40K to 200K, with most falling between 0 and 50K.

In [82]:
plot_categorical_variable_distribution('k_symbol', 'trans', True)

      k_symbol   count
0         None  481881
1                53433
2       DUCHOD   30338
3     POJISTNE   18500
4  SANKC. UROK    1577
5         SIPO  118065
6       SLUZBY  155832
7         UROK  183114
8         UVER   13580


Most of the transactions don't have k_symbol.
- 'POJISTNE' stands for Insurance Payment
- 'SLUZBY' stands for Payment of Statement
- 'UROK' stands for Interest Credited
- 'SANKC. UROK' stands for Sanction Interest if Negative Balance
- 'SIPO' stands for Household Payment
- 'DUCHOD' stands for Old-age Pension Payment
- 'UVER' stands for Loan Payment

In [83]:
plot_categorical_variable_distribution('bank', 'trans', True)

    bank   count
0   None  782812
1     AB   21720
2     CD   19597
3     EF   21293
4     GH   21499
5     IJ   20525
6     KL   21234
7     MN   19623
8     OP   21094
9     QR   22285
10    ST   21711
11    UV   21167
12    WX   20178
13    YZ   21582


Most of the transactions don't have bank code.

### Summary

Account
- Most of the accounts are from district 1
- Most of the accounts' frequency is monthly issuance
  - POPLATEK MESICNE: Monthly Issuance
  - POPLATEK TYDNE: Weekly Issuance
  - POPLATEK PO OBRATU: Issuance After Transaction
- Most of the accounts are created in 1993 and 1996

Card
- Most of the cards are Classic type
- The cards issued increase from 1993 to 1998 exponentially

Client
- The clients are equally distributed between males and females
- Most of the clients' birth year are between 1939 and 1980
- The genders are quite equally distributed across birth years
- Most of the clients are from district 1
- The genders are equally distributed across districts

Disposition
- Most of the dispositions are owner

District
- Hl.m. Praha has the highest number of inhabitants
- Brno - venkov has the highest number of municipalities with inhabitants from 500 to 1999
- Average salary is pretty similar across districts
- Hl.m. Praha has the highest crimes committed in 1995 and 1996
- Prague only has municipalities with more than 10,000 inhabitants
- Prague has the highest ratio of urban inhabitants (100%); it only looks lowest when per-district ratios are summed by region
- North Bohemia and North Moravia have the highest average unemployment rate in 1995 and 1996

Loan
- Most of the loans are granted in 1997
- Most of the loan amounts are between 0 to 90K
- Most of the loan durations are between 20 and 39
- Most of the loan payments are between 3000 and 3900
- Most of the loans are running contract, OK thus-far
- Most of the loans with contract finished, and no problems are with lower amount
- Most of the loans that contract finished and not paid have the amount between 50 - 90K
- Most of the loans that contract finished, and no problems have the duration less than 40
- The loans with above 40 duration are running contract and OK thus-far

Order
- Most of the orders' amount are between 0 to 4K
- Most of the orders are SIPO (household payment)
- There are orders without k_symbol

Transaction
- More transactions as year goes on
- The highest transaction type is debit (withdrawal)
- Most of the transaction operation is withdrawal in cash (VYBER)
- The transaction amount ranges from 0 to 87,400 with most of it ranges between 0 to 10K
- The balances range from -40K to 200K with most of it ranges between 0 to 50K
- Most of the transactions don't have k_symbol
- Most of the transactions don't have bank code

To do:
- Feature evaluation: For each loan, calculate account statistics like balance, payments