In [13]:
import pandas as pd
import json
import random



In [2]:

url_part1=('https://raw.githubusercontent.com/civicliteracies/EITI_SDT_data_verification_and_validation/sqlite/4_clean/2_data_editing/output/eiti-data_part1_1.3.csv')
url_part5=('https://raw.githubusercontent.com/civicliteracies/EITI_SDT_data_verification_and_validation/sqlite/4_clean/2_data_editing/output/eiti-data_part5-0.11.8.csv')

df_part1 = pd.read_csv(url_part1)
df_part5 = pd.read_csv(url_part5)

  df_part5 = pd.read_csv(url_part5)


# Part 1 - Generating statements

1. Starting from the consolidated part 1 of the summary data file, we perform an initial mapping of the dataset columns to the relevant fields of the [BODS 0.4 schema](https://github.com/openownership/data-standard/tree/main/schema, using 
    * a [mapping reference](https://docs.google.com/spreadsheets/d/1CPeZ_5FiqIRCmHGHh7Gz1McpxmwN1EoBwkMYtRqFWFo/edit?pli=1#gid=134387124) made possible by [flattening the BODS json schema](https://github.com/civicliteracies/EITI_SDT_data_verification_and_validation/blob/sqlite/4_clean/3_bods_mapping/02_schema_flattening.ipynb) files
    * a dictionary modelled after the BODS statemement schema.
    * a loop that processes the Part 1's data using the instructions in the mapping reference.
2. We store the created JSON statements in a dictionary using the statemeent IDs as keys to facilitate matching with the future JSON files containing recordDetails info.


In [29]:
# BODS statement structure template
bods_statement_schema = {
    "statementId": "",
    "statementDate": "",
    "annotations": [],
    "publicationDetails": {
        "publicationDate": "",
        "bodsVersion": "",
        "license": "",
        "publisher": {
            "name": "",
            "url": ""
        }
    },
    "source": {
        "type": [],
        "description": "",
        "url": "",
        "retrievedAt": "",
        "assertedBy": [
            {
                "name": "",
                "uri": ""
            }
        ]
    },
    "declaration": "",
    "declarationSubject": "",
    "recordId": "",
    "recordType": "",
    "recordDetails": {}
}

# Dictionary to hold the JSON strings
statement_dict = {}

# Iterate over each row in df_part1
for index, row in df_part1.iterrows():
    bods_statement = bods_statement_schema.copy()

    # Fill the bods_statement with data from the row
    bods_statement["statementId"] = ''
    bods_statement["statementDate"] = row['eiti_data_publication_date']
    bods_statement["publicationDetails"]["publicationDate"] = row['end_date']
    bods_statement["publicationDetails"]["bodsVersion"] = '0.4'
    bods_statement["publicationDetails"]["license"] = 'http://opendatacommons.org/licenses/pddl/1.0/'
    bods_statement["publicationDetails"]["publisher"]["name"] = 'Extractive Industries Transparency Initiative'
    bods_statement["publicationDetails"]["publisher"]["url"] = 'https://eiti.org/open-data'
    bods_statement["source"]["type"] = ['officialRegister', 'verified']
    bods_statement["source"]["url"] = 'https://eiti.portaljs.com'
    bods_statement["source"]["retrievedAt"] = pd.Timestamp('today').strftime('%Y-%m-%d')
    bods_statement["source"]["assertedBy"][0]["name"] = row['submitter_name']
    bods_statement["source"]["assertedBy"][0]["uri"] = row['submitter_email']
    bods_statement["declaration"] = f"{row['iso_alpha2_code']}-{row['start_date'].replace('-', '')}-{row['end_date'].replace('-', '')}"
    bods_statement["declarationSubject"] = row['iso_alpha2_code']
    bods_statement["recordId"] = ''
    bods_statement["recordType"] = ''
    
    # Create a variable name based on the statement identifier
    variable_name = row['eiti_id_declaration']
    
    # Save the JSON string in the dictionary
    statement_dict[variable_name] = json.dumps(bods_statement, indent=2, ensure_ascii=False)

# Print a sample of 2 random iterms froom the dictionary containing JSON strings
separator = "-" * 40
random_keys = random.sample(list(statement_dict.keys()), 2)

for random_key in random_keys:
    print(f"{random_key}: {statement_dict[random_key]}\n{separator}\n")

1f61bd83-c1cd-3658-8fab-29ba86d584a7: {
  "statementId": "",
  "statementDate": NaN,
  "annotations": [],
  "publicationDetails": {
    "publicationDate": "2018-12-20",
    "bodsVersion": "0.4",
    "license": "http://opendatacommons.org/licenses/pddl/1.0/",
    "publisher": {
      "name": "Extractive Industries Transparency Initiative",
      "url": "https://eiti.org/open-data"
    }
  },
  "source": {
    "type": [
      "officialRegister",
      "verified"
    ],
    "description": "",
    "url": "https://eiti.portaljs.com",
    "retrievedAt": "2024-05-20",
    "assertedBy": [
      {
        "name": NaN,
        "uri": NaN
      }
    ]
  },
  "declaration": "AF-20171221-20181220",
  "declarationSubject": "AF",
  "recordId": "",
  "recordType": "",
  "recordDetails": {}
}
----------------------------------------

e5e7c542-e52b-3dda-adad-8cd101ea7fdb: {
  "statementId": "",
  "statementDate": "2020-12-16",
  "annotations": [],
  "publicationDetails": {
    "publicationDate": "2019-

# Part 2. Generating Entities

TODO: 
1. need to change back the dictionary construction to generate one json per recordDetails
2. There is one statement per record details. Does it mean that I have to match then duplicate the statements for each recordDetails?


In [20]:
# Extract unique entities and add entity type
unique_companies = df_part5[['company_name', 'eiti_id_company', 'iso_alpha2_code', 'country', 'company_public_listing_or_website', 'start_date', 'end_date']].dropna(subset=['eiti_id_company']).drop_duplicates().assign(entity_type='registeredEntity')
unique_projects = df_part5[['project_name', 'eiti_id_project', 'iso_alpha2_code', 'country', 'start_date', 'end_date']].dropna(subset=['eiti_id_project']).drop_duplicates().assign(entity_type='arrangement')
unique_government = df_part5[['government_entity', 'eiti_id_government', 'iso_alpha2_code', 'country', 'start_date', 'end_date']].dropna(subset=['eiti_id_government']).drop_duplicates().assign(entity_type='stateBody')


# Combine into a single DataFrame
df_combined = pd.concat([unique_companies, unique_projects, unique_government], ignore_index=True)


In [26]:
# Define the BODS entity structure based on the JSON schema
bods_entity_schema = {
    "isComponent": False,
    "entityType": {
        "type": "",
        "subtype": "",
        "details": ""
    },
    "name": "",
    "jurisdiction": {
        "name": "",
        "code": ""
    },
    "identifiers": [],
    "addresses": [],
    "uri": "",
    "publicListing": None,
    "formedByStatute": None
}

# Create a dictionary to hold the JSON files using declaration_reference as the key
entity_dict = {}

# Iterate over each row in df_combined to create JSON files
for index, row in df_combined.iterrows():
    if index % 100 == 0:  # Update progress every 100 rows
        print(f"\rProcessing row {index+1}/{len(df_combined)}", end='')

    bods_entity = bods_entity_schema.copy()

    bods_entity["isComponent"] = False
    bods_entity["entityType"]["type"] = row['entity_type']
    bods_entity["entityType"]["subtype"] = (
        'governmentDepartment' if row['entity_type'] == 'stateBody' and 'minist' in str(row['government_entity']).lower() else
        'stateAgency' if row['entity_type'] == 'stateBody' else ''
    )
    bods_entity["name"] = (
        row['company_name'] if row['entity_type'] == 'registeredEntity' else
        row['project_name'] if row['entity_type'] == 'arrangement' else
        row['government_entity']
    )
    bods_entity["jurisdiction"]["name"] = row['country']
    bods_entity["jurisdiction"]["code"] = row['iso_alpha2_code']
    bods_entity["identifiers"] = [{
        "id": (
            row['eiti_id_company'] if row['entity_type'] == 'registeredEntity' else
            row['eiti_id_project'] if row['entity_type'] == 'arrangement' else
            row['eiti_id_government']
        ),
        "scheme": "XI-EITI",
        "schemeName": "Extractive Industries Transparency Initiative",
        "uri": f"/entity_statement/{row['eiti_id_company'] if row['entity_type'] == 'registeredEntity' else row['eiti_id_project'] if row['entity_type'] == 'arrangement' else row['eiti_id_government']}"
    }]
    bods_entity["uri"] = row['company_public_listing_or_website']
    
    # Generate the declaration reference
    declaration_reference = f"{row['iso_alpha2_code']}-{row['start_date'].replace('-', '')}-{row['end_date'].replace('-', '')}"
    
    # Add the JSON string to the dictionary with the declaration_reference as the key
    entity_dict[declaration_reference] = json.dumps(bods_entity, indent=2, ensure_ascii=False)

# Ensure the progress line is cleared after completion
print(f"\rProcessing completed. {len(df_combined)} rows processed.\n")


random_keys = random.sample(list(entity_dict.keys()), 2)

for random_key in random_keys:
    print(f"{random_key}: {entity_dict[random_key]}\n{separator}\n")

Processing completed. 8242 rows processed.

ZM-20180101-20181231: {
  "isComponent": false,
  "entityType": {
    "type": "stateBody",
    "subtype": "governmentDepartment",
    "details": ""
  },
  "name": "MINISTRY OF FINANCE (MOF)",
  "jurisdiction": {
    "name": "Zambia",
    "code": "ZM"
  },
  "identifiers": [
    {
      "id": "1bc7df5f-c94f-4b40-861a-1f1f1ffd459c",
      "scheme": "XI-EITI",
      "schemeName": "Extractive Industries Transparency Initiative",
      "uri": "/entity_statement/1bc7df5f-c94f-4b40-861a-1f1f1ffd459c"
    }
  ],
  "addresses": [],
  "uri": NaN,
  "publicListing": null,
  "formedByStatute": null
}
----------------------------------------

NG-20180101-20181231: {
  "isComponent": false,
  "entityType": {
    "type": "stateBody",
    "subtype": "governmentDepartment",
    "details": ""
  },
  "name": "FEDERAL MINISTRY OF FINANCE",
  "jurisdiction": {
    "name": "Nigeria",
    "code": "NG"
  },
  "identifiers": [
    {
      "id": "9a0a0ab0-8ba1-4745-b0

In [9]:
# Initialize an empty DataFrame for df_entity
df_entity = pd.DataFrame()

# Apply the necessary transformations and keep only the transformed columns
df_entity['country_code'] = df_combined['iso_alpha2_code']
df_entity['name'] = df_combined['country']
df_entity['entity_type'] = df_combined['entity_type']

# Determine the subtype based on the government entity name
df_entity['subtype'] = df_combined.apply(
    lambda x: 'governmentDepartment' if x['entity_type'] == 'stateBody' and 'minist' in str(x['government_entity']).lower() else (
        'stateAgency' if x['entity_type'] == 'stateBody' else ''
    ), axis=1
)

# Identifiers and scheme code
df_entity['identifiers'] = df_combined.apply(
    lambda x: x['eiti_id_company'] if x['entity_type'] == 'registeredEntity' else (
        x['eiti_id_project'] if x['entity_type'] == 'arrangement' else (
            x['eiti_id_government'] if x['entity_type'] == 'stateBody' else ''
        )
    ), axis=1
)
df_entity['scheme_code'] = df_entity['identifiers']
df_entity['scheme_name'] = 'eiti'

# URI
df_entity['entity_uri'] = '/entity_statement/' + df_entity['identifiers'].fillna('')

# Fixed values
df_entity['is_component'] = 'false'

# Jurisdiction details
df_entity['country_or_subdivision_code'] = df_combined['iso_alpha2_code']

# Entity name based on the type
df_entity['entity_name'] = df_combined.apply(
    lambda x: x['company_name'] if x['entity_type'] == 'registeredEntity' else (
        x['project_name'] if x['entity_type'] == 'arrangement' else (
            x['government_entity'] if x['entity_type'] == 'stateBody' else ''
        )
    ), axis=1
)

# URI
df_entity['uri'] = df_combined['company_public_listing_or_website']

# Declaration reference
df_entity['declaration_reference'] = df_combined['iso_alpha2_code'] + '-' + df_combined['start_date'].str.replace('-','') + '-' + df_combined['end_date'].str.replace('-','')

# Display the df_entity DataFrame
display(df_entity.head())

Unnamed: 0,country_code,name,entity_type,subtype,identifiers,scheme_code,scheme_name,entity_uri,is_component,country_or_subdivision_code,entity_name,uri,declaration_reference
0,AF,Afghanistan,registeredEntity,,9efb56c2-e978-40ec-985a-13c0b18c2413,9efb56c2-e978-40ec-985a-13c0b18c2413,eiti,/entity_statement/9efb56c2-e978-40ec-985a-13c0...,False,AF,10000 EHDAS CONSTRUCTION AND GRAVEL COMPANY,,AF-20171221-20181220
1,AF,Afghanistan,registeredEntity,,09acfc31-a64f-4e57-a9e2-79bd74bad8d2,09acfc31-a64f-4e57-a9e2-79bd74bad8d2,eiti,/entity_statement/09acfc31-a64f-4e57-a9e2-79bd...,False,AF,ABAAN RAYAN LIMITED,,AF-20171221-20181220
2,AF,Afghanistan,registeredEntity,,f78c41e6-827c-4751-a1d7-f7faa7ba5a25,f78c41e6-827c-4751-a1d7-f7faa7ba5a25,eiti,/entity_statement/f78c41e6-827c-4751-a1d7-f7fa...,False,AF,ABBAS GHAZNAVI LIMITED,,AF-20171221-20181220
3,AF,Afghanistan,registeredEntity,,4a18fb8b-ec14-4e89-8f47-4beb9cf3edd7,4a18fb8b-ec14-4e89-8f47-4beb9cf3edd7,eiti,/entity_statement/4a18fb8b-ec14-4e89-8f47-4beb...,False,AF,ABDUL FATAH,n/v,AF-20171221-20181220
4,AF,Afghanistan,registeredEntity,,65d5fb0e-cb53-4d45-ba7d-0d66c20eecb2,65d5fb0e-cb53-4d45-ba7d-0d66c20eecb2,eiti,/entity_statement/65d5fb0e-cb53-4d45-ba7d-0d66...,False,AF,ABDUL RAOUF,n/v,AF-20171221-20181220


In [18]:
# Define the BODS entity structure based on the JSON schema
bods_entity_schema = {
    "isComponent": False,
    "entityType": {
        "type": "",
        "subtype": "",
        "details": ""
    },
    "name": "",
    "jurisdiction": {
        "name": "",
        "code": ""
    },
    "identifiers": [],
    "addresses": [],
    "uri": "",
    "publicListing": None,
    "formedByStatute": None
}

# Create a dictionary to hold the JSON files using declaration_reference as the key
entity_dict = {}

# Iterate over each row in df_entity to create JSON files
for index, row in df_entity.iterrows():
    if index % 100 == 0:  # Update progress every 100 rows
        print(f"\rProcessing row {index+1}/{len(df_entity)}", end='')

    bods_entity = bods_entity_schema.copy()
    bods_entity["isComponent"] = row['is_component']
    bods_entity["entityType"]["type"] = row['entity_type']
    bods_entity["entityType"]["subtype"] = row['subtype']
    bods_entity["name"] = row['entity_name']
    bods_entity["jurisdiction"]["name"] = row['name']
    bods_entity["jurisdiction"]["code"] = row['country_code']
    bods_entity["identifiers"] = [{
        "id": row['identifiers'],
        "scheme": row['scheme_code'],
        "schemeName": row['scheme_name'],
        "uri": row['entity_uri']
    }]
    bods_entity["uri"] = row['uri']
    
    # Convert to JSON
    bods_entity_str = json.dumps(bods_entity, indent=2)
    
    # Add the JSON string to the dictionary with the declaration_reference as the key
    entity_dict[row['declaration_reference']] = bods_entity_str

# Ensure the progress line is cleared after completion
print(f"\rProcessing completed. {len(df_entity)} rows processed.")

# Display the JSON dictionary
for declaration_reference, bods_entity in entity_dict.items():
    print(f"Declaration Reference: {declaration_reference}")
    print(bods_entity)
    print("\n\n")


NameError: name 'df_entity' is not defined

In [49]:
print(*entity_dict['SC-20180101-20181231'], sep='\n\n')

{
  "isComponent": "false",
  "entityType": {
    "type": "registeredEntity",
    "subtype": "",
    "details": ""
  },
  "name": "GX TECHNOLOGY",
  "jurisdiction": {
    "name": "Seychelles",
    "code": "SC"
  },
  "identifiers": [
    {
      "id": "dc174b45-bc3c-4dd2-b3dd-48fbc9034642",
      "scheme": "dc174b45-bc3c-4dd2-b3dd-48fbc9034642",
      "schemeName": "eiti",
      "uri": "/entity_statement/dc174b45-bc3c-4dd2-b3dd-48fbc9034642"
    }
  ],
  "addresses": [],
  "uri": NaN,
  "publicListing": null,
  "formedByStatute": null
}

{
  "isComponent": "false",
  "entityType": {
    "type": "registeredEntity",
    "subtype": "",
    "details": ""
  },
  "name": "JOGMEX",
  "jurisdiction": {
    "name": "Seychelles",
    "code": "SC"
  },
  "identifiers": [
    {
      "id": "21db233f-077c-4e7e-a9c5-a542fdd493ca",
      "scheme": "21db233f-077c-4e7e-a9c5-a542fdd493ca",
      "schemeName": "eiti",
      "uri": "/entity_statement/21db233f-077c-4e7e-a9c5-a542fdd493ca"
    }
  ],
  "add

In [None]:
data = []
for var_name, json_str in entity_dict.items():
    # Load the JSON string as a Python list
    json_list = json.loads(json_str)
    # Count the number of items in the list
    count_json_objects = len(json_list)
    # Append to data list
    data.append((var_name, count_json_objects))

# Create DataFrame
df = pd.DataFrame(data, columns=['Key', 'Number of JSON Objects'])

# Sort the DataFrame by 'Number of JSON Objects' in descending order
df_sorted = df.sort_values(by='Number of JSON Objects', ascending=False)

# Display the DataFrame
df_sorted

# Part 3 - Relationships

TODO 
1. Gnerate the relationship record details
2. Need to add a payment details sub-structure in relationships

# Part 4 - Matching statements to declarations

In [46]:
from collections import defaultdict

cleaned_entity_dict = {}
for declaration_reference, bods_entity_str in entity_dict.items():
    cleaned_key = declaration_reference.replace("Declaration Reference: ", "").strip()
    cleaned_entity_dict[cleaned_key] = bods_entity_str

# Create a dictionary to group entities by cleaned declaration_reference
grouped_entities = defaultdict(list)

# Iterate over each entry in cleaned_entity_dict to group entities by declaration_reference
for declaration_reference, bods_entity_str in cleaned_entity_dict.items():
    bods_entity = json.loads(bods_entity_str)
    grouped_entities[declaration_reference].append(bods_entity)

# Debug: Print the number of entities for each declaration reference
for declaration_reference, entities in grouped_entities.items():
    print(f"Declaration Reference: {declaration_reference} has {len(entities)} entities")

Declaration Reference: AF-20171221-20181220 has 1 entities
Declaration Reference: AF-20181221-20191220 has 1 entities
Declaration Reference: AL-20170101-20171231 has 1 entities
Declaration Reference: AL-20180101-20181231 has 1 entities
Declaration Reference: AR-20180101-20181231 has 1 entities
Declaration Reference: AM-20180101-20181231 has 1 entities
Declaration Reference: AM-20190101-20191231 has 1 entities
Declaration Reference: BF-20170101-20171231 has 1 entities
Declaration Reference: BF-20180101-20181231 has 1 entities
Declaration Reference: BF-20190101-20191231 has 1 entities
Declaration Reference: BF-20201201-20201231 has 1 entities
Declaration Reference: CI-20170101-20171231 has 1 entities
Declaration Reference: CI-20180101-20181231 has 1 entities
Declaration Reference: CM-20170101-20171231 has 1 entities
Declaration Reference: CD-20170101-20171231 has 1 entities
Declaration Reference: CG-20170101-20171231 has 1 entities
Declaration Reference: DE-20170101-20171231 has 1 entiti