In [143]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import psycopg2 as pg

Constants

In [None]:
# UUIDs
# Key looked up inside `tiledata` to find the tile holding an HP's ID.
uuid_hp = '34cfe992-c2c0-11ea-9026-02e7594ce0a0'
# connection parameters
# Each value can be overridden through an environment variable so that
# credentials do not have to be edited into (and committed with) the notebook.
# The defaults are the values this notebook has always used.
dbname = os.environ.get("EAMENA_DB_NAME", "eamena")
user = os.environ.get("EAMENA_DB_USER", "eamenar")
password = os.environ.get("EAMENA_DB_PASSWORD", "eamenar")
host = os.environ.get("EAMENA_DB_HOST", "52.50.27.140")
port = os.environ.get("EAMENA_DB_PORT", "5432")
# verbose
# When True, the collection loop prints every field it reads and the values found.
verbose = True

Connect to the database

In [145]:
# Open the database connection and the cursor used by every query below.
try:
    connection = pg.connect(
        dbname=dbname,
        user=user,
        password=password,
        host=host,
        port=port
    )
    cur = connection.cursor()
    print("Connection established successfully!")
except pg.Error as e:
    print(f"Error: {e}")
    # Without a connection `cur` is never defined, so the following cells would
    # fail with an unrelated NameError. Stop here with the real cause instead.
    raise

Connection established successfully!


Select an HP

In [147]:
# ID of the HP to inspect; the next cell resolves it to a resource UUID.
selected_hp = "EAMENA-0500002"

Get the UUID of the HP from its ID

In [148]:
# Resolve the selected HP ID to its resource instance UUID.
# The values are passed as query parameters (psycopg2 `%s` placeholders) rather
# than formatted into the SQL text, so quoting is handled by the driver.
sqll = """
SELECT
      resourceinstanceid AS resourceid
      FROM tiles
      WHERE tiledata -> %s -> 'en' ->> 'value' LIKE %s
"""
cur.execute(sqll, (uuid_hp, selected_hp))
row = cur.fetchone()
if row is None:
    # fetchone() returns None when nothing matches; fail with a clear message
    # instead of the TypeError raised by indexing None.
    raise LookupError("no resource found with ID '" + selected_hp + "'")
hpid = row[0]
print("the UUID of '" + selected_hp + "' is '" + hpid + "'")

the UUID of 'EAMENA-0500002' is 'dbc95d2d-38fb-465e-a6cb-0545eaa7584f'


Read the [output.tsv](https://github.com/eamena-project/eamena-arches-dev/blob/main/dev/data_quality/output.tsv) file, which lists the UUIDs linked to each field. This TSV file is exported automatically (by a GitHub Action) from the `template.xlsx` file located in the same directory.

In [156]:
# Location of the TSV file that lists the fields and their UUIDs.
tsv_file = "https://raw.githubusercontent.com/eamena-project/eamena-arches-dev/main/dev/data_quality/output.tsv"
# Only the three hierarchy levels and the UUID column are needed downstream.
wanted_columns = ["level1", "level2", "level3", "uuid_sql"]
df = pd.read_csv(tsv_file, sep='\t')
df = df.loc[:, wanted_columns]
# Rows with a missing value in any of these columns are dropped.
df_listed = df.dropna()
print(df_listed.to_markdown())

|    | level1               | level2               | level3                                | uuid_sql                             |
|---:|:---------------------|:---------------------|:--------------------------------------|:-------------------------------------|
|  0 | ASSESSMENT SUMMARY   | ASSESSMENT ACTIVITY  | Assessment Investigator - Actor       | 34cfea8a-c2c0-11ea-9026-02e7594ce0a0 |
|  1 | ASSESSMENT SUMMARY   | ASSESSMENT ACTIVITY  | Investigator Role Type                | d2e1ab96-cc05-11ea-a292-02e7594ce0a0 |
|  2 | ASSESSMENT SUMMARY   | ASSESSMENT ACTIVITY  | Assessment Activity Type              | 34cfea4d-c2c0-11ea-9026-02e7594ce0a0 |
|  3 | ASSESSMENT SUMMARY   | ASSESSMENT ACTIVITY  | Assessment Activity Date              | 34cfea81-c2c0-11ea-9026-02e7594ce0a0 |
|  4 | ASSESSMENT SUMMARY   | ASSESSMENT ACTIVITY  | GE Assessment(Yes/No)                 | bcd3a8ae-0404-11eb-a11c-0a5a9a4f6ef7 |
|  5 | ASSESSMENT SUMMARY   | ASSESSMENT ACTIVITY  | GE Imagery Acquisition 

Select a column (default: `level1`). Create an empty dataframe, loop over the UUIDs to collect data from the HP, and fill the dataframe.

In [158]:
# Count, for each value of the selected level, how many of its listed fields
# have at least one recorded value for the selected HP.
mylevel = 'level1'
# empty dataframe
# Built from `df` (not `df_listed`) so that every value of the level gets a
# row, starting at 0.
level_values = df[mylevel].unique()
df_res = pd.DataFrame({'field': level_values,
                       'nb_of_records': [0] * len(level_values)})
# The query text is constant; the field UUID and the HP UUID are passed as
# parameters. The previous version also aliased the subquery column with the
# field label (spaces replaced by "_"), which produced invalid SQL for labels
# containing other characters such as "-", "(" or "/"; the alias was never
# used, so it is dropped.
sqll = """
    SELECT value FROM values
    WHERE valueid::text IN
    (
    SELECT tiledata ->> %s
    FROM tiles
    WHERE resourceinstanceid::text LIKE %s
    AND tiledata -> %s IS NOT NULL
    )
"""
# loop and fill it
for df_field, df_uuid in zip(df_listed[mylevel], df_listed['uuid_sql']):
    if verbose:
        print("read: " + df_field + ' | ' + df_uuid)
    cur.execute(sqll, (df_uuid, hpid, df_uuid))
    outvalue = cur.fetchall()
    if len(outvalue) > 0:
        # `field` values are unique, so this increments exactly one row.
        df_res.loc[df_res['field'] == df_field, 'nb_of_records'] += 1
        if verbose:
            print("recorded values: " + str(outvalue))
print(df_res.to_markdown())

read: ASSESSMENT ACTIVITY | 34cfea8a-c2c0-11ea-9026-02e7594ce0a0
read: ASSESSMENT ACTIVITY | d2e1ab96-cc05-11ea-a292-02e7594ce0a0
read: ASSESSMENT ACTIVITY | 34cfea4d-c2c0-11ea-9026-02e7594ce0a0
recorded values: [('Desk-based Assessment',)]
read: ASSESSMENT ACTIVITY | 34cfea81-c2c0-11ea-9026-02e7594ce0a0
read: ASSESSMENT ACTIVITY | bcd3a8ae-0404-11eb-a11c-0a5a9a4f6ef7
read: ASSESSMENT ACTIVITY | b9643302-0407-11eb-a11c-0a5a9a4f6ef7
read: ASSESSMENT ACTIVITY | 34cfea34-c2c0-11ea-9026-02e7594ce0a0
read: ASSESSMENT ACTIVITY | a5eb59b4-0406-11eb-a11c-0a5a9a4f6ef7
read: RESOURCE NAME | 34cfe9dd-c2c0-11ea-9026-02e7594ce0a0
read: RESOURCE NAME | 34cfea97-c2c0-11ea-9026-02e7594ce0a0
recorded values: [('Toponym',)]
read: RESOURCE SUMMARY | 34cfe9ef-c2c0-11ea-9026-02e7594ce0a0
recorded values: [('Urban Heritage',)]
read: CONDITION ASSESSMENT | 34cfe9f5-c2c0-11ea-9026-02e7594ce0a0
recorded values: [('Poor',)]
read: CONDITION ASSESSMENT | 34cfea8e-c2c0-11ea-9026-02e7594ce0a0
recorded values: [('31

## Spider diagram

Show a spider diagram with the number of recorded fields for each value of the selected level

In [159]:
# Axis labels (one per value of the selected level) and their counts.
variable = df_res['field'].tolist()
value = df_res['nb_of_records'].tolist()
# Use a dedicated frame for plotting. Rebinding `df` here would replace the
# field/UUID table and break any re-run of the counting cell above.
df_polar = pd.DataFrame({'value': value, 'variable': variable})
fig = px.line_polar(df_polar, r='value', theta='variable', line_close=True)
fig.show()