In [30]:
import pandas as pd
import json
import random
import copy
import uuid

# Function to generate a UUID3 based on subject and interestedParty
def generate_uuid3(subject, interestedParty):
    namespace = uuid.UUID('00000000-0000-0000-0000-000000000000')
    name = f"{subject}-{interestedParty}"
    return str(uuid.uuid3(namespace, name))

# Generate UUID4 for statementId
def generate_uuid4():
    return str(uuid.uuid4())



In [2]:

url_part1=('https://raw.githubusercontent.com/civicliteracies/EITI_SDT_data_verification_and_validation/sqlite/4_clean/2_data_editing/output/eiti-data_part1_1.3.csv')
url_part5=('https://raw.githubusercontent.com/civicliteracies/EITI_SDT_data_verification_and_validation/sqlite/4_clean/2_data_editing/output/eiti-data_part5-0.11.8.csv')

df_part1 = pd.read_csv(url_part1)
df_part5 = pd.read_csv(url_part5)

  df_part5 = pd.read_csv(url_part5)


# Part 1 - Generating statements

1. Starting from the consolidated part 1 of the summary data file, we perform an initial mapping of the dataset columns to the relevant fields of the [BODS 0.4 schema](https://github.com/openownership/data-standard/tree/main/schema, using 
    * a [mapping reference](https://docs.google.com/spreadsheets/d/1CPeZ_5FiqIRCmHGHh7Gz1McpxmwN1EoBwkMYtRqFWFo/edit?pli=1#gid=134387124) made possible by [flattening the BODS json schema](https://github.com/civicliteracies/EITI_SDT_data_verification_and_validation/blob/sqlite/4_clean/3_bods_mapping/02_schema_flattening.ipynb) files
    * a dictionary modelled after the BODS statemement schema.
    * a loop that processes the Part 1's data using the instructions in the mapping reference.
2. We store the created JSON statements in a dictionary using the statemeent IDs as keys to facilitate matching with the future JSON files containing recordDetails info.


In [3]:
# BODS statement structure template
bods_statement_schema = {
    "statementId": "",
    "statementDate": "",
    "annotations": [],
    "publicationDetails": {
        "publicationDate": "",
        "bodsVersion": "",
        "license": "",
        "publisher": {
            "name": "",
            "url": ""
        }
    },
    "source": {
        "type": [],
        "description": "",
        "url": "",
        "retrievedAt": "",
        "assertedBy": [
            {
                "name": "",
                "uri": ""
            }
        ]
    },
    "declaration": "",
    "declarationSubject": "",
    "recordId": "",
    "recordType": "",
    "recordDetails": {}
}

# Dictionary to hold the JSON strings
statement_dict = {}

# Iterate over each row in df_part1
for index, row in df_part1.iterrows():
    bods_statement = bods_statement_schema.copy()

    # Fill the bods_statement with data from the row
    bods_statement["statementId"] = ''
    bods_statement["statementDate"] = row['eiti_data_publication_date']
    bods_statement["publicationDetails"]["publicationDate"] = row['end_date']
    bods_statement["publicationDetails"]["bodsVersion"] = '0.4'
    bods_statement["publicationDetails"]["license"] = 'http://opendatacommons.org/licenses/pddl/1.0/'
    bods_statement["publicationDetails"]["publisher"]["name"] = 'Extractive Industries Transparency Initiative'
    bods_statement["publicationDetails"]["publisher"]["url"] = 'https://eiti.org/open-data'
    bods_statement["source"]["type"] = ['officialRegister', 'verified']
    bods_statement["source"]["url"] = 'https://eiti.portaljs.com'
    bods_statement["source"]["retrievedAt"] = pd.Timestamp('today').strftime('%Y-%m-%d')
    bods_statement["source"]["assertedBy"][0]["name"] = row['submitter_name']
    bods_statement["source"]["assertedBy"][0]["uri"] = row['submitter_email']
    bods_statement["declaration"] = f"{row['iso_alpha2_code']}-{row['start_date'].replace('-', '')}-{row['end_date'].replace('-', '')}"
    bods_statement["declarationSubject"] = row['iso_alpha2_code']
    bods_statement["recordId"] = ''
    bods_statement["recordType"] = ''
    
    # Create a variable name based on the statement identifier
    variable_name = row['eiti_id_declaration']
    
    # Save the JSON string in the dictionary
    statement_dict[variable_name] = json.dumps(bods_statement, indent=2, ensure_ascii=False)

# Print a sample of 2 random iterms froom the dictionary containing JSON strings
separator = "-" * 40
random_keys = random.sample(list(statement_dict.keys()), 2)

print(f"The dictionnary has {len(statement_dict.keys())} items\n")

for random_key in random_keys:
    print(f"{random_key}: {statement_dict[random_key]}\n{separator}\n")

The dictionnary has 73 items

9984e501-79e0-3630-9afd-1dc278c5a715: {
  "statementId": "",
  "statementDate": NaN,
  "annotations": [],
  "publicationDetails": {
    "publicationDate": "2018-12-31",
    "bodsVersion": "0.4",
    "license": "http://opendatacommons.org/licenses/pddl/1.0/",
    "publisher": {
      "name": "Extractive Industries Transparency Initiative",
      "url": "https://eiti.org/open-data"
    }
  },
  "source": {
    "type": [
      "officialRegister",
      "verified"
    ],
    "description": "",
    "url": "https://eiti.portaljs.com",
    "retrievedAt": "2024-05-27",
    "assertedBy": [
      {
        "name": "Freda Effah Bortier",
        "uri": "fredarry121@yahoo.com; fredabortier@gmail.com"
      }
    ]
  },
  "declaration": "GH-20180101-20181231",
  "declarationSubject": "GH",
  "recordId": "",
  "recordType": "",
  "recordDetails": {}
}
----------------------------------------

4718e6f9-6259-3876-86a7-42060858205b: {
  "statementId": "",
  "statementDate"

# Part 2. Generating Entities

TODO: 
1. need to change back the dictionary construction to generate one json per recordDetails
2. There is one statement per record details. Does it mean that I have to match then duplicate the statements for each recordDetails?


In [4]:
# Extract unique entities and add entity type
unique_companies = df_part5[['company_name', 'eiti_id_company', 'iso_alpha2_code', 'country', 'company_public_listing_or_website', 'start_date', 'end_date', 'eiti_id_declaration']].dropna(subset=['eiti_id_company']).drop_duplicates().assign(entity_type='registeredEntity')
unique_projects = df_part5[['project_name', 'eiti_id_project', 'iso_alpha2_code', 'country', 'start_date', 'end_date', 'eiti_id_declaration']].dropna(subset=['eiti_id_project']).drop_duplicates().assign(entity_type='arrangement')
unique_government = df_part5[['government_entity', 'eiti_id_government', 'iso_alpha2_code', 'country', 'start_date', 'end_date', 'eiti_id_declaration']].dropna(subset=['eiti_id_government']).drop_duplicates().assign(entity_type='stateBody')


# Combine into a single DataFrame
df_combined = pd.concat([unique_companies, unique_projects, unique_government], ignore_index=True)

print(f"The dataframe has {len(df_combined.index)} rows\n")

The dataframe has 8242 rows



In [5]:
# Define the BODS entity structure based on the JSON schema
bods_entity_schema = {
    "isComponent": False,
    "entityType": {
        "type": "",
        "subtype": "",
        "details": ""
    },
    "name": "",
    "jurisdiction": {
        "name": "",
        "code": ""
    },
    "identifiers": [],
    "addresses": [],
    "uri": "",
    "publicListing": None,
    "formedByStatute": None
}

# Create a dictionary to hold the JSON files using declaration_reference as the key
entity_dict = {}

# Iterate over each row in df_combined to create JSON files
for index, row in df_combined.iterrows():
    if index % 100 == 0:  # Update progress every 100 rows
        print(f"\rProcessing row {index+1}/{len(df_combined)}", end='')

    bods_entity = bods_entity_schema.copy()

    bods_entity["isComponent"] = False
    bods_entity["entityType"]["type"] = row['entity_type']
    bods_entity["entityType"]["subtype"] = (
        'governmentDepartment' if row['entity_type'] == 'stateBody' and 'minist' in str(row['government_entity']).lower() else
        'stateAgency' if row['entity_type'] == 'stateBody' else ''
    )
    bods_entity["name"] = (
        row['company_name'] if row['entity_type'] == 'registeredEntity' else
        row['project_name'] if row['entity_type'] == 'arrangement' else
        row['government_entity']
    )
    bods_entity["jurisdiction"]["name"] = row['country']
    bods_entity["jurisdiction"]["code"] = row['iso_alpha2_code']
    bods_entity["identifiers"] = [{
        "id": (
            row['eiti_id_company'] if row['entity_type'] == 'registeredEntity' else
            row['eiti_id_project'] if row['entity_type'] == 'arrangement' else
            row['eiti_id_government']
        ),
        "scheme": "XI-EITI",
        "schemeName": "Extractive Industries Transparency Initiative",
        "uri": f"/entity_statement/{row['eiti_id_company'] if row['entity_type'] == 'registeredEntity' else row['eiti_id_project'] if row['entity_type'] == 'arrangement' else row['eiti_id_government']}"
    }]
    bods_entity["uri"] = row['company_public_listing_or_website']
    
    # Create a variable name based on the statement identifier
    entity_dict_key = (index, row['eiti_id_declaration'])

    # Add the JSON string to the dictionary with the declaration_reference as the key
    entity_dict[entity_dict_key] = json.dumps(bods_entity, indent=2, ensure_ascii=False)

# Ensure the progress line is cleared after completion
print(f"\rProcessing completed. {len(df_combined)} rows processed.\n")
print(f"The dictionnary has {len(entity_dict.keys())} items")


Processing completed. 8242 rows processed.

The dictionnary has 8242 items


In [6]:
# Display 2 random items for quality check

random_entity = random.sample(list(entity_dict.keys()), 2)

for random_key in random_entity:
    print(f"{random_key}: {entity_dict[random_key]}\n{separator}\n")

(5204, '1fec5e66-7e78-3028-81d7-8d97e3348ded'): {
  "isComponent": false,
  "entityType": {
    "type": "arrangement",
    "subtype": "",
    "details": ""
  },
  "name": "P112",
  "jurisdiction": {
    "name": "United Kingdom",
    "code": "GB"
  },
  "identifiers": [
    {
      "id": "253faa92-f890-47bf-b14a-d8cc4467efaa",
      "scheme": "XI-EITI",
      "schemeName": "Extractive Industries Transparency Initiative",
      "uri": "/entity_statement/253faa92-f890-47bf-b14a-d8cc4467efaa"
    }
  ],
  "addresses": [],
  "uri": NaN,
  "publicListing": null,
  "formedByStatute": null
}
----------------------------------------

(82, '1f61bd83-c1cd-3658-8fab-29ba86d584a7'): {
  "isComponent": false,
  "entityType": {
    "type": "registeredEntity",
    "subtype": "",
    "details": ""
  },
  "name": "HAYAT KHAN",
  "jurisdiction": {
    "name": "Afghanistan",
    "code": "AF"
  },
  "identifiers": [
    {
      "id": "24d61c56-dead-4999-9ce8-8af5d2f37658",
      "scheme": "XI-EITI",
      

# Part 3 - Matching entities with statements

In [7]:
combined_dict = {}

for (index, eiti_id_declaration) in entity_dict.keys():
    if eiti_id_declaration in statement_dict:
        statement = json.loads(statement_dict[eiti_id_declaration])
        entity = json.loads(entity_dict[(index, eiti_id_declaration)])
        statement["recordDetails"] = entity

        # Set recordId and recordType in statement_dict
        statement["recordId"] = entity["identifiers"][0]["id"]
        statement["recordType"] = 'entity'
        
        combined_dict[index] = json.dumps(statement, indent=2, ensure_ascii=False)

# Print the length of the combined dictionary
print(f"Number of combined entries: {len(combined_dict)}")

Number of combined entries: 8242


In [8]:
# Display 2 random items for quality check

random_combined = random.sample(list(combined_dict.keys()), 2)

for random_key in random_combined:
    print(f"{random_key}: {combined_dict[random_key]}\n{separator}\n")


1737: {
  "statementId": "",
  "statementDate": "2021-03-30",
  "annotations": [],
  "publicationDetails": {
    "publicationDate": "2018-12-31",
    "bodsVersion": "0.4",
    "license": "http://opendatacommons.org/licenses/pddl/1.0/",
    "publisher": {
      "name": "Extractive Industries Transparency Initiative",
      "url": "https://eiti.org/open-data"
    }
  },
  "source": {
    "type": [
      "officialRegister",
      "verified"
    ],
    "description": "",
    "url": "https://eiti.portaljs.com",
    "retrievedAt": "2024-05-27",
    "assertedBy": [
      {
        "name": "Haitham S. El-Boukhary",
        "uri": "haitham.elboukhary@ggi-davinci.com"
      }
    ]
  },
  "declaration": "IQ-20180101-20181231",
  "declarationSubject": "IQ",
  "recordId": "1d840d41-caf9-4548-a95c-b6e519da5d1b",
  "recordType": "entity",
  "recordDetails": {
    "isComponent": false,
    "entityType": {
      "type": "registeredEntity",
      "subtype": "",
      "details": ""
    },
    "name": "N

# Part 4 - Relationships

TODO 
1. Generate the relationship record details
2. Need to add a payment details sub-structure in relationships

We defined 5 types of relationships an assigned the following attributes

| InterestedParty | Subject | directOrIndirect | descriptor |
| ---- | ---- | ---- | ---- |
| Country | Government Agency | direct | controlByLegalFramework |
| Government Agency | Company (SOE) | direct | controlByLegalFramework, rightsToProfitOrIncome |
| Government Agency | Company (Private) | direct | rightsToProfitOrIncome |
| Company | Project | direct | rightsGrantedByContract |
| Government Agency | Project | indirect | controlByLegalFramework |

In [9]:
bods_relationship_schemas = {
    "country_government": {
        "subject": "",
        "interestedParty": "",
        "interests": [{
            "type": "controlByLegalFramework",
            "directOrIndirect": "direct",
            "beneficialOwnershipOrControl": False,
        }],
        "isComponent": False
    },
    "government_soe": {
        "subject": "",
        "interestedParty": "",
        "interests": [
            {
                "type": "controlByLegalFramework",
                "directOrIndirect": "direct",
                "beneficialOwnershipOrControl": False,
            },
            {
                "type": "rightsToProfitOrIncome",
                "directOrIndirect": "direct",
                "beneficialOwnershipOrControl": False,
                "details": []
            }
        ],
        "isComponent": True
    },
    "government_company": {
        "subject": "",
        "interestedParty": "",
        "interests": [{
            "type": "rightsToProfitOrIncome",
            "directOrIndirect": "direct",
            "beneficialOwnershipOrControl": False,
            "details": []
        }],
        "isComponent": True
    },
    "company_project": {
        "subject": "",
        "interestedParty": "",
        "interests": [{
            "type": "rightsGrantedByContract",
            "directOrIndirect": "direct",
            "beneficialOwnershipOrControl": False,
            "details": []
        }],
        "isComponent": True
    },
    "government_project": {
        "subject": "",
        "interestedParty": "",
        "interests": [{
            "type": "controlByLegalFramework",
            "directOrIndirect": "indirect",
            "beneficialOwnershipOrControl": False,
        }],
        "isComponent": False,
        "componentRecords": []
    }
}

In [16]:
relationship_dicts = {
    "country_government": {},
    "government_soe": {},
    "government_company": {},
    "company_project": {},
    "government_project": {},
}

def populate_relationships(df, relationship_type, schema, subject_col, interested_party_col, start_date_col):
    relationship_list = []
    total_rows = len(df)

    for index, row in df.iterrows():
        if index % 100 == 0:  # Update progress every 100 rows
            print(f"\rProcessing row {index+1}/{total_rows}", end='\r')

        if pd.notna(row[subject_col]) and pd.notna(row[interested_party_col]):
            relationship = copy.deepcopy(schema)
            relationship["subject"] = row[subject_col]
            relationship["interestedParty"] = row[interested_party_col]
            for interest in relationship["interests"]:
                interest["startDate"] = row[start_date_col]
                if "details" in interest:
                    detail = {
                        "revenue_stream_name": row["revenue_stream_name"],
                        "revenue_value": row["revenue_value"],
                        "reporting_currency": row["reporting_currency"]
                    }
                    if pd.notna(row["in_kind_volume"]):
                        detail["in_kind_volume"] = row["in_kind_volume"]
                    if pd.notna(row["in_kind_unit"]):
                        detail["in_kind_unit"] = row["in_kind_unit"]
                    interest["details"].append(detail)
            relationship_list.append(((index, row['eiti_id_declaration']), relationship))
    
    relationship_dicts[relationship_type] = {k: json.dumps(v, indent=2, ensure_ascii=False) for k, v in relationship_list}

# Pre-filter DataFrame to avoid repetitive filtering
df_soes = df_part5[df_part5['company_type'] == "State-owned enterprises & public corporations"]
df_private = df_part5[df_part5['company_type'] == "Private"]

# Populate relationships
populate_relationships(df_part5, "country_government", bods_relationship_schemas["country_government"], "government_entity", "iso_alpha2_code", "start_date")
populate_relationships(df_soes, "government_soe", bods_relationship_schemas["government_soe"], "government_entity", "company_name", "start_date")
populate_relationships(df_private, "government_company", bods_relationship_schemas["government_company"], "government_entity", "company_name", "start_date")
populate_relationships(df_part5, "company_project", bods_relationship_schemas["company_project"], "company_name", "project_name", "start_date")
populate_relationships(df_part5, "government_project", bods_relationship_schemas["government_project"], "government_entity", "project_name", "start_date")

# Print the number of items in each dictionary
for relationship_type, relationships in relationship_dicts.items():
    print(f"{relationship_type}: {len(relationships)} items")

country_government: 31826 items
government_soe: 2611 items
government_company: 28889 items
company_project: 12320 items
government_project: 11832 items


In [21]:
# Function to sample a random item from each dictionary
def sample_relationships(relationship_dicts):
    samples = {}
    for relationship_type, relationships in relationship_dicts.items():
            random_key = random.choice(list(relationships.keys()))
            samples[relationship_type] = (random_key, relationships[random_key])
    return samples  

# Sample a random item from each relationship dictionary
sampled_relationships = sample_relationships(relationship_dicts)

print(f"Number of relationship entities: {sum(len(relationships) for relationships in relationship_dicts.values())}")

# Print the sampled relationships
for relationship_type, (sample_key, sample_value) in sampled_relationships.items():
    print(f"Sample from {relationship_type}:")
    print(f"Key: {sample_key}")
    print(f"Value: {sample_value}\n{separator}")

Number of relationship entities: 87478
Sample from country_government:
Key: (8935, '0d9c43a6-f72e-398b-b1eb-771b510ea6eb')
Value: {
  "subject": "DIRECTION GÉNÉRALE DU TRÉSOR (DGT)",
  "interestedParty": "CG",
  "interests": [
    {
      "type": "controlByLegalFramework",
      "directOrIndirect": "direct",
      "beneficialOwnershipOrControl": false,
      "startDate": "2017-01-01"
    }
  ],
  "isComponent": false
}
----------------------------------------
Sample from government_soe:
Key: (32396, 'fef32215-a021-3118-bdc3-a44079a72bdd')
Value: {
  "subject": "STATE TAX SERVICE OF UKRAINE",
  "interestedParty": "UKRNAFTA PJSC",
  "interests": [
    {
      "type": "controlByLegalFramework",
      "directOrIndirect": "direct",
      "beneficialOwnershipOrControl": false,
      "startDate": "2020-01-01"
    },
    {
      "type": "rightsToProfitOrIncome",
      "directOrIndirect": "direct",
      "beneficialOwnershipOrControl": false,
      "details": [
        {
          "revenue_stre

In [28]:
# Assuming relationship_dicts and statement_dict are already defined

combined_relationships_dict = {}

for relationship_type, relationships in relationship_dicts.items():
    for (index, eiti_id_declaration), relationship in relationships.items():
        if eiti_id_declaration in statement_dict:
            statement = json.loads(statement_dict[eiti_id_declaration])
            relationship_data = json.loads(relationship)
            
            # Add relationship data to the statement
            statement["recordDetails"] = relationship_data
            
            # Set recordId and recordType in statement_dict
            statement["recordId"] = generate_uuid3(relationship_data["subject"], relationship_data["interestedParty"])  # Updated line
            statement["recordType"] = 'relationship'
            
            combined_relationships_dict[(relationship_type, index)] = json.dumps(statement, indent=2, ensure_ascii=False)

# Print the length of the combined dictionary
print(f"Number of combined relationship entries: {len(combined_relationships_dict)}")



Number of combined relationship entries: 87478


In [29]:
# Function to sample a random item from a flat dictionary
def sample_relationships(flat_dict):
    random_key = random.choice(list(flat_dict.keys()))
    return {random_key: flat_dict[random_key]}

# Sample a random item from the combined relationship dictionary
sampled_relationship = sample_relationships(combined_relationships_dict)

# Separator for clarity
separator = "-" * 40

# Print the sampled relationships
for (relationship_type, index), sample in sampled_relationship.items():
    print(f"Sample from {relationship_type} (index {index}):")
    print(f"Value: {sample}\n{separator}")

Sample from government_company (index 17241):
Value: {
  "statementId": "",
  "statementDate": "2020-03-30",
  "annotations": [],
  "publicationDetails": {
    "publicationDate": "2018-03-31",
    "bodsVersion": "0.4",
    "license": "http://opendatacommons.org/licenses/pddl/1.0/",
    "publisher": {
      "name": "Extractive Industries Transparency Initiative",
      "url": "https://eiti.org/open-data"
    }
  },
  "source": {
    "type": [
      "officialRegister",
      "verified"
    ],
    "description": "",
    "url": "https://eiti.portaljs.com",
    "retrievedAt": "2024-05-27",
    "assertedBy": [
      {
        "name": "Sami Sakka",
        "uri": "sami.sakka@bdo-ifi.com"
      }
    ]
  },
  "declaration": "MM-20170401-20180331",
  "declarationSubject": "MM",
  "recordId": "d78ddf9f-a808-3da1-b8f2-f0c7d217b343",
  "recordType": "relationship",
  "recordDetails": {
    "subject": "DEPARTMENT OF MINES (DOM)",
    "interestedParty": "SHWE PIN LE MINING & INDUSTRY COMPANY LIMITED

In [32]:
# Create unified dictionary
unified_dict = {}

# Update statementId and add to unified dictionary using original index
for key, value in combined_dict.items():
    statement = json.loads(value)
    statement["statementId"] = generate_uuid4()
    unified_dict[key] = statement

# Prepare to sort relationships
relationships_list = []
for relationship_type, relationships in relationship_dicts.items():
    for (index, eiti_id_declaration), relationship in relationships.items():
        relationship_data = json.loads(relationship)
        statement = json.loads(statement_dict[eiti_id_declaration])
        statement["recordDetails"] = relationship_data
        statement["statementId"] = generate_uuid4()
        statement["recordType"] = 'relationship'
        relationships_list.append((relationship_type, statement, relationship_data["interests"][0]["startDate"], eiti_id_declaration, index))

# Sort relationships by start_date and eiti_id_relationship
relationships_list.sort(key=lambda x: (x[2], x[3]))

# Create grouped relationships dictionary
grouped_relationships = {}
for relationship_type, statement, _, eiti_id_declaration, index in relationships_list:
    if eiti_id_declaration not in grouped_relationships:
        grouped_relationships[eiti_id_declaration] = []
    grouped_relationships[eiti_id_declaration].append((relationship_type, statement, index))

# Order and update componentRecords for government_project items
for eiti_id_declaration, relations in grouped_relationships.items():
    sorted_relations = sorted(relations, key=lambda x: ['country_government', 'government_company', 'government_soe', 'company_project', 'government_project'].index(x[0]))
    for relationship_type, statement, index in sorted_relations:
        unified_dict[index] = statement
        if relationship_type == 'government_project':
            component_records = [s for t, s, idx in sorted_relations if t in ['government_company', 'government_soe', 'company_project']]
            if component_records:
                unified_dict[index]["recordDetails"]["componentRecords"] = [r["recordDetails"] for r in component_records]

# Print the number of combined entries
print(f"Number of combined entries: {len(unified_dict)}")

Number of combined entries: 32715


In [38]:
# Function to sample a random item from each type
def sample_random_items(unified_dict):
    samples = {
        "companies": [],
        "soe": [],
        "gov_agency": [],
        "relationship": []
    }

    for key, value in unified_dict.items():
        record_details = value.get("recordDetails", {})
        entity_type = record_details.get("entityType", {}).get("type", "")
        record_type = value.get("recordType", "")

        if entity_type == "registeredEntity" and record_type == "entity":
            samples["companies"].append((key, value))
        elif entity_type == "stateOwnedEntity" and record_type == "entity":
            samples["soe"].append((key, value))
        elif entity_type == "stateBody" and record_type == "entity":
            samples["gov_agency"].append((key, value))
        elif record_type == "relationship":
            samples["relationship"].append((key, value))

    return {type_: random.choice(items) if items else None for type_, items in samples.items()}

# Get random samples
random_samples = sample_random_items(unified_dict)

# Print the samples with separators
for entity_type, sample in random_samples.items():
    if sample:
        key, value = sample
        print(f"Sample from {entity_type}:")
        print(f"Key: {key}")
        print(f"Value: {json.dumps(value, indent=2, ensure_ascii=False)}")
        print(separator)

Sample from companies:
Key: 4002
Value: {
  "statementId": "0fa1ce01-94f0-45da-9369-ce22108d02fb",
  "statementDate": NaN,
  "annotations": [],
  "publicationDetails": {
    "publicationDate": "2018-12-31",
    "bodsVersion": "0.4",
    "license": "http://opendatacommons.org/licenses/pddl/1.0/",
    "publisher": {
      "name": "Extractive Industries Transparency Initiative",
      "url": "https://eiti.org/open-data"
    }
  },
  "source": {
    "type": [
      "officialRegister",
      "verified"
    ],
    "description": "",
    "url": "https://eiti.portaljs.com",
    "retrievedAt": "2024-05-27",
    "assertedBy": [
      {
        "name": "Elyes Kooli",
        "uri": "e.kooli@bdo.tn"
      }
    ]
  },
  "declaration": "ZM-20180101-20181231",
  "declarationSubject": "ZM",
  "recordId": "e1732945-555e-4e95-9e8f-fefc890e4a9b",
  "recordType": "entity",
  "recordDetails": {
    "isComponent": false,
    "entityType": {
      "type": "registeredEntity",
      "subtype": "",
      "deta

In [46]:
# Function to ensure proper UTF-8 encoding
def ensure_utf8(value):
    if isinstance(value, str):
        return value.encode('utf-8', errors='replace').decode('utf-8')
    return value

# Select a random eiti_id_declaration from unified_dict
random_declaration = random.choice([value['declaration'] for value in unified_dict.values()])

# Filter the unified_dict for entries matching the selected eiti_id_declaration
filtered_entries = [ensure_utf8(value) for value in unified_dict.values() if value.get('declaration') == random_declaration]

# Print the number of filtered entries
print(f"Number of entries for eiti_id_declaration '{random_declaration}': {len(filtered_entries)}")

# Output the filtered entries as a single JSON array
output_file = f"filtered_entries_{random_declaration}.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(filtered_entries, f, ensure_ascii=False, indent=2)

# Print a message confirming the file creation
print(f"Filtered entries saved to {output_file}")

Number of entries for eiti_id_declaration 'GH-20180101-20181231': 291
Filtered entries saved to filtered_entries_GH-20180101-20181231.json
