In [1]:
from sshtunnel import SSHTunnelForwarder 
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from sqlalchemy.sql import text
from dotenv import load_dotenv
import os
import pandas as pd
import plotly.express as px

In [2]:
load_dotenv()

True

Need to install MariaDB Connector/C from CS Package Repository https://mariadb.com/docs/server/connect/programming-languages/c/install/

Need to install MariaDB community server on the machine

In [3]:
def run_query(query_str):
    with SSHTunnelForwarder(
        (os.getenv("BASTION_SERVER_IP"), 22), #Remote server IP and SSH port
        ssh_username = os.getenv("SSH_USER_NAME"),
        ssh_pkey=os.getenv("SSH_PRIVATE_KEY_PATH"),
        remote_bind_address=(os.getenv("RDS_ENDPOINT"), 3306)) as server: 
            
        server.start() #start ssh sever
        # print('Server connected via SSH')
        
        #connect to MariaDB
        local_port = str(server.local_bind_port)
        database_name = 'financial'
        engine = create_engine(
            'mariadb+mariadbconnector://' 
            + os.getenv('DB_USERNAME') 
            + ':' 
            + os.getenv('DB_PASSWORD') 
            + '@127.0.0.1:'
            + local_port 
            + '/' 
            + database_name
        )

        Session = sessionmaker(bind=engine)
        session = Session()
        
        # print('Database session created')
        
        #test data retrieval
        test = session.execute(text(query_str))
        df = pd.DataFrame.from_records(test)
            
        session.close()

        return df

### List Tables

In [16]:
df = run_query('SHOW TABLES')
df

Unnamed: 0,0
0,account
1,card
2,client
3,disp
4,district
5,loan
6,order
7,trans


### Account 

In [17]:
df = run_query('SHOW COLUMNS FROM account')
df

Unnamed: 0,0,1,2,3,4,5
0,account_id,int(11),NO,PRI,0.0,
1,district_id,int(11),NO,MUL,0.0,
2,frequency,varchar(18),NO,,,
3,date,date,NO,,,


In [18]:
df = run_query('SELECT * FROM account LIMIT 5')
df

Unnamed: 0,0,1,2,3
0,1,18,POPLATEK MESICNE,1995-03-24
1,2,1,POPLATEK MESICNE,1993-02-26
2,3,5,POPLATEK MESICNE,1997-07-07
3,4,12,POPLATEK MESICNE,1996-02-21
4,5,15,POPLATEK MESICNE,1997-05-30


In [19]:
df = run_query('SELECT DISTINCT frequency FROM account')
df

Unnamed: 0,0
0,POPLATEK MESICNE
1,POPLATEK TYDNE
2,POPLATEK PO OBRATU


The data is in Czech.
- POPLATEK MESICNE: MONTHLY FEE
- POPLATEK TYDNE: WEEKLY FEE
- POPLATEK PO OBRATU: FEE AFTER TRANSACTION

In [20]:
df = run_query('SELECT MIN(date) earliest_date, MAX(date) latest_date FROM account')
df

Unnamed: 0,0,1
0,1993-01-01,1997-12-29


In [4]:
df = run_query('SELECT district_id, COUNT(*) count FROM account GROUP BY district_id')
df.columns = ['district_id', 'count']
df

Unnamed: 0,district_id,count
0,1,554
1,2,42
2,3,50
3,4,48
4,5,65
...,...,...
72,73,56
73,74,135
74,75,51
75,76,55


In [6]:
fig = px.histogram(df, x='district_id', y='count')
fig.show()

Most of the account is from district 0 to 9

In [7]:
df = run_query('SELECT frequency, COUNT(*) count FROM account GROUP BY frequency')
df.columns = ['frequency', 'count']
fig = px.histogram(df, x='frequency', y='count')
fig.show()

Most of the account's frequency is POPLATEK MESICNE(MONTHLY FEE)

In [None]:
df = run_query('SELECT date, COUNT(*) count FROM account GROUP BY date')
df.columns = ['date', 'count']
df

In [None]:
fig = px.line(df, x='date', y='count')
fig.show()