In [1]:
import pandas as pd
import json
import random



In [2]:

url_part1=('https://raw.githubusercontent.com/civicliteracies/EITI_SDT_data_verification_and_validation/sqlite/4_clean/2_data_editing/output/eiti-data_part1_1.3.csv')
url_part5=('https://raw.githubusercontent.com/civicliteracies/EITI_SDT_data_verification_and_validation/sqlite/4_clean/2_data_editing/output/eiti-data_part5-0.11.8.csv')

df_part1 = pd.read_csv(url_part1)
df_part5 = pd.read_csv(url_part5)

  df_part5 = pd.read_csv(url_part5)


# Part 1 - Generating statements

1. Starting from the consolidated part 1 of the summary data file, we perform an initial mapping of the dataset columns to the relevant fields of the [BODS 0.4 schema](https://github.com/openownership/data-standard/tree/main/schema, using 
    * a [mapping reference](https://docs.google.com/spreadsheets/d/1CPeZ_5FiqIRCmHGHh7Gz1McpxmwN1EoBwkMYtRqFWFo/edit?pli=1#gid=134387124) made possible by [flattening the BODS json schema](https://github.com/civicliteracies/EITI_SDT_data_verification_and_validation/blob/sqlite/4_clean/3_bods_mapping/02_schema_flattening.ipynb) files
    * a dictionary modelled after the BODS statemement schema.
    * a loop that processes the Part 1's data using the instructions in the mapping reference.
2. We store the created JSON statements in a dictionary using the statemeent IDs as keys to facilitate matching with the future JSON files containing recordDetails info.


In [20]:
# BODS statement structure template
bods_statement_schema = {
    "statementId": "",
    "statementDate": "",
    "annotations": [],
    "publicationDetails": {
        "publicationDate": "",
        "bodsVersion": "",
        "license": "",
        "publisher": {
            "name": "",
            "url": ""
        }
    },
    "source": {
        "type": [],
        "description": "",
        "url": "",
        "retrievedAt": "",
        "assertedBy": [
            {
                "name": "",
                "uri": ""
            }
        ]
    },
    "declaration": "",
    "declarationSubject": "",
    "recordId": "",
    "recordType": "",
    "recordDetails": {}
}

# Dictionary to hold the JSON strings
statement_dict = {}

# Iterate over each row in df_part1
for index, row in df_part1.iterrows():
    bods_statement = bods_statement_schema.copy()

    # Fill the bods_statement with data from the row
    bods_statement["statementId"] = ''
    bods_statement["statementDate"] = row['eiti_data_publication_date']
    bods_statement["publicationDetails"]["publicationDate"] = row['end_date']
    bods_statement["publicationDetails"]["bodsVersion"] = '0.4'
    bods_statement["publicationDetails"]["license"] = 'http://opendatacommons.org/licenses/pddl/1.0/'
    bods_statement["publicationDetails"]["publisher"]["name"] = 'Extractive Industries Transparency Initiative'
    bods_statement["publicationDetails"]["publisher"]["url"] = 'https://eiti.org/open-data'
    bods_statement["source"]["type"] = ['officialRegister', 'verified']
    bods_statement["source"]["url"] = 'https://eiti.portaljs.com'
    bods_statement["source"]["retrievedAt"] = pd.Timestamp('today').strftime('%Y-%m-%d')
    bods_statement["source"]["assertedBy"][0]["name"] = row['submitter_name']
    bods_statement["source"]["assertedBy"][0]["uri"] = row['submitter_email']
    bods_statement["declaration"] = f"{row['iso_alpha2_code']}-{row['start_date'].replace('-', '')}-{row['end_date'].replace('-', '')}"
    bods_statement["declarationSubject"] = row['iso_alpha2_code']
    bods_statement["recordId"] = ''
    bods_statement["recordType"] = ''
    
    # Create a variable name based on the statement identifier
    variable_name = row['eiti_id_declaration']
    
    # Save the JSON string in the dictionary
    statement_dict[variable_name] = json.dumps(bods_statement, indent=2, ensure_ascii=False)

# Print a sample of 2 random iterms froom the dictionary containing JSON strings
separator = "-" * 40
random_keys = random.sample(list(statement_dict.keys()), 2)

print(f"the dictionnary has {len(statement_dict.keys())} ")

for random_key in random_keys:
    print(f"{random_key}: {statement_dict[random_key]}\n{separator}\n")

the dictionnary has73
0d9c43a6-f72e-398b-b1eb-771b510ea6eb: {
  "statementId": "",
  "statementDate": "n/v",
  "annotations": [],
  "publicationDetails": {
    "publicationDate": "2017-12-31",
    "bodsVersion": "0.4",
    "license": "http://opendatacommons.org/licenses/pddl/1.0/",
    "publisher": {
      "name": "Extractive Industries Transparency Initiative",
      "url": "https://eiti.org/open-data"
    }
  },
  "source": {
    "type": [
      "officialRegister",
      "verified"
    ],
    "description": "",
    "url": "https://eiti.portaljs.com",
    "retrievedAt": "2024-05-21",
    "assertedBy": [
      {
        "name": "Maher Kabsi",
        "uri": "Maher.Kabsi@bdo-ifi.com"
      }
    ]
  },
  "declaration": "CG-20170101-20171231",
  "declarationSubject": "CG",
  "recordId": "",
  "recordType": "",
  "recordDetails": {}
}
----------------------------------------

935a9737-16b1-316e-8c3e-9a12c3c27470: {
  "statementId": "",
  "statementDate": "2021-03-31",
  "annotations": [],

# Part 2. Generating Entities

TODO: 
1. need to change back the dictionary construction to generate one json per recordDetails
2. There is one statement per record details. Does it mean that I have to match then duplicate the statements for each recordDetails?


In [9]:
# Extract unique entities and add entity type
unique_companies = df_part5[['company_name', 'eiti_id_company', 'iso_alpha2_code', 'country', 'company_public_listing_or_website', 'start_date', 'end_date', 'eiti_id_declaration']].dropna(subset=['eiti_id_company']).drop_duplicates().assign(entity_type='registeredEntity')
unique_projects = df_part5[['project_name', 'eiti_id_project', 'iso_alpha2_code', 'country', 'start_date', 'end_date', 'eiti_id_declaration']].dropna(subset=['eiti_id_project']).drop_duplicates().assign(entity_type='arrangement')
unique_government = df_part5[['government_entity', 'eiti_id_government', 'iso_alpha2_code', 'country', 'start_date', 'end_date', 'eiti_id_declaration']].dropna(subset=['eiti_id_government']).drop_duplicates().assign(entity_type='stateBody')


# Combine into a single DataFrame
df_combined = pd.concat([unique_companies, unique_projects, unique_government], ignore_index=True)


In [10]:
# Define the BODS entity structure based on the JSON schema
bods_entity_schema = {
    "isComponent": False,
    "entityType": {
        "type": "",
        "subtype": "",
        "details": ""
    },
    "name": "",
    "jurisdiction": {
        "name": "",
        "code": ""
    },
    "identifiers": [],
    "addresses": [],
    "uri": "",
    "publicListing": None,
    "formedByStatute": None
}

# Create a dictionary to hold the JSON files using declaration_reference as the key
entity_dict = {}

# Iterate over each row in df_combined to create JSON files
for index, row in df_combined.iterrows():
    if index % 100 == 0:  # Update progress every 100 rows
        print(f"\rProcessing row {index+1}/{len(df_combined)}", end='')

    bods_entity = bods_entity_schema.copy()

    bods_entity["isComponent"] = False
    bods_entity["entityType"]["type"] = row['entity_type']
    bods_entity["entityType"]["subtype"] = (
        'governmentDepartment' if row['entity_type'] == 'stateBody' and 'minist' in str(row['government_entity']).lower() else
        'stateAgency' if row['entity_type'] == 'stateBody' else ''
    )
    bods_entity["name"] = (
        row['company_name'] if row['entity_type'] == 'registeredEntity' else
        row['project_name'] if row['entity_type'] == 'arrangement' else
        row['government_entity']
    )
    bods_entity["jurisdiction"]["name"] = row['country']
    bods_entity["jurisdiction"]["code"] = row['iso_alpha2_code']
    bods_entity["identifiers"] = [{
        "id": (
            row['eiti_id_company'] if row['entity_type'] == 'registeredEntity' else
            row['eiti_id_project'] if row['entity_type'] == 'arrangement' else
            row['eiti_id_government']
        ),
        "scheme": "XI-EITI",
        "schemeName": "Extractive Industries Transparency Initiative",
        "uri": f"/entity_statement/{row['eiti_id_company'] if row['entity_type'] == 'registeredEntity' else row['eiti_id_project'] if row['entity_type'] == 'arrangement' else row['eiti_id_government']}"
    }]
    bods_entity["uri"] = row['company_public_listing_or_website']
    
    # Create a variable name based on the statement identifier
    variable_name = row['eiti_id_declaration']

    # Add the JSON string to the dictionary with the declaration_reference as the key
    entity_dict[variable_name] = json.dumps(bods_entity, indent=2, ensure_ascii=False)

# Ensure the progress line is cleared after completion
print(f"\rProcessing completed. {len(df_combined)} rows processed.\n")


random_keys = random.sample(list(entity_dict.keys()), 2)

for random_key in random_keys:
    print(f"{random_key}: {entity_dict[random_key]}\n{separator}\n")

Processing completed. 8242 rows processed.

f11d93c6-1ca8-3b49-b082-f4bd12e7c5cb: {
  "isComponent": false,
  "entityType": {
    "type": "stateBody",
    "subtype": "stateAgency",
    "details": ""
  },
  "name": "SOCIÉTÉ DES HYDROCARBURES DU TCHAD (SHT)",
  "jurisdiction": {
    "name": "Chad",
    "code": "TD"
  },
  "identifiers": [
    {
      "id": "2789c7a9-d3c3-4c03-85bc-9101eecf9a7a",
      "scheme": "XI-EITI",
      "schemeName": "Extractive Industries Transparency Initiative",
      "uri": "/entity_statement/2789c7a9-d3c3-4c03-85bc-9101eecf9a7a"
    }
  ],
  "addresses": [],
  "uri": NaN,
  "publicListing": null,
  "formedByStatute": null
}
----------------------------------------

32ec6b11-e012-3080-919e-e5ba08c0d92f: {
  "isComponent": false,
  "entityType": {
    "type": "stateBody",
    "subtype": "stateAgency",
    "details": ""
  },
  "name": "ETHIOPIAN REVENUES AND CUSTOMS AUTHORITY (ERCA)",
  "jurisdiction": {
    "name": "Ethiopia",
    "code": "ET"
  },
  "identifi

In [17]:
combined_dict = {}
index = 0

for key in entity_dict.keys():
    if key in statement_dict:
        print(f"Match found for key: {key}")  # Debug message
        statement = json.loads(statement_dict[key])
        entity = json.loads(entity_dict[key])
        statement["recordDetails"] = entity
        combined_dict[index] = json.dumps(statement, indent=2, ensure_ascii=False)
        index += 1

# Print the length of the combined dictionary
print(f"Number of combined entries: {len(combined_dict)}")

# Display only 2 random items
if combined_dict:
    random_keys = random.sample(list(combined_dict.keys()), min(2, len(combined_dict)))
    for key in random_keys:
        print(f"Combined Entry {key}:")
        print(combined_dict[key])
        print("\n\n")
else:
    print("No matching keys found between statement_dict and entity_dict.")


Match found for key: 1f61bd83-c1cd-3658-8fab-29ba86d584a7
Match found for key: c40776d5-a273-3f9d-b805-075e804b9f3e
Match found for key: fb121867-d9c5-3112-90f5-f798ed67c49d
Match found for key: e821dd0c-7660-3334-a55a-732ab12351d7
Match found for key: 2df5c073-4216-3f74-a2cd-23aa44dd3c9c
Match found for key: 4c603c65-856f-307e-9317-a0aafd609fd9
Match found for key: 3feda957-7e06-3973-b6b9-4516a8d00747
Match found for key: e29c05ee-d532-38fa-a930-6f5e20b1767c
Match found for key: d060e8fb-01b0-3ff5-9c5a-76d13e6b2eb1
Match found for key: 1b63ccc9-6e10-38f3-893b-b4acc6621a33
Match found for key: bca8d920-8812-3357-938f-429117baa1c5
Match found for key: 116e309f-4a0e-3322-bb55-48bce11e119c
Match found for key: 71c9f815-7b52-3ee6-b348-4d6fce4857e5
Match found for key: 386f455d-e70c-3e0f-a9e4-2177c6c841ef
Match found for key: ccfb08b7-f235-3638-9b1e-b2627cc5d1d9
Match found for key: 0d9c43a6-f72e-398b-b1eb-771b510ea6eb
Match found for key: 3db323c5-52f0-3d2a-a301-18a90f9e9bad
Match found fo

# Part 3 - Relationships

TODO 
1. Gnerate the relationship record details
2. Need to add a payment details sub-structure in relationships

# Part 4 - Matching statements to declarations

In [7]:
from collections import defaultdict

cleaned_entity_dict = {}
for declaration_reference, bods_entity_str in entity_dict.items():
    cleaned_key = declaration_reference.replace("Declaration Reference: ", "").strip()
    cleaned_entity_dict[cleaned_key] = bods_entity_str

# Create a dictionary to group entities by cleaned declaration_reference
grouped_entities = defaultdict(list)

# Iterate over each entry in cleaned_entity_dict to group entities by declaration_reference
for declaration_reference, bods_entity_str in cleaned_entity_dict.items():
    bods_entity = json.loads(bods_entity_str)
    grouped_entities[declaration_reference].append(bods_entity)

# Debug: Print the number of entities for each declaration reference
for declaration_reference, entities in grouped_entities.items():
    print(f"Declaration Reference: {declaration_reference} has {len(entities)} entities")

Declaration Reference: AF-20171221-20181220 has 1 entities
Declaration Reference: AF-20181221-20191220 has 1 entities
Declaration Reference: AL-20170101-20171231 has 1 entities
Declaration Reference: AL-20180101-20181231 has 1 entities
Declaration Reference: AR-20180101-20181231 has 1 entities
Declaration Reference: AM-20180101-20181231 has 1 entities
Declaration Reference: AM-20190101-20191231 has 1 entities
Declaration Reference: BF-20170101-20171231 has 1 entities
Declaration Reference: BF-20180101-20181231 has 1 entities
Declaration Reference: BF-20190101-20191231 has 1 entities
Declaration Reference: BF-20201201-20201231 has 1 entities
Declaration Reference: CI-20170101-20171231 has 1 entities
Declaration Reference: CI-20180101-20181231 has 1 entities
Declaration Reference: CM-20170101-20171231 has 1 entities
Declaration Reference: CD-20170101-20171231 has 1 entities
Declaration Reference: CG-20170101-20171231 has 1 entities
Declaration Reference: DE-20170101-20171231 has 1 entiti