In [1]:
import pandas as pd
import json
import random



In [2]:

url_part1=('https://raw.githubusercontent.com/civicliteracies/EITI_SDT_data_verification_and_validation/sqlite/4_clean/2_data_editing/output/eiti-data_part1_1.3.csv')
url_part5=('https://raw.githubusercontent.com/civicliteracies/EITI_SDT_data_verification_and_validation/sqlite/4_clean/2_data_editing/output/eiti-data_part5-0.11.8.csv')

df_part1 = pd.read_csv(url_part1)
df_part5 = pd.read_csv(url_part5)

  df_part5 = pd.read_csv(url_part5)


# Part 1 - Generating statements

1. Starting from the consolidated part 1 of the summary data file, we perform an initial mapping of the dataset columns to the relevant fields of the [BODS 0.4 schema](https://github.com/openownership/data-standard/tree/main/schema, using 
    * a [mapping reference](https://docs.google.com/spreadsheets/d/1CPeZ_5FiqIRCmHGHh7Gz1McpxmwN1EoBwkMYtRqFWFo/edit?pli=1#gid=134387124) made possible by [flattening the BODS json schema](https://github.com/civicliteracies/EITI_SDT_data_verification_and_validation/blob/sqlite/4_clean/3_bods_mapping/02_schema_flattening.ipynb) files
    * a dictionary modelled after the BODS statemement schema.
    * a loop that processes the Part 1's data using the instructions in the mapping reference.
2. We store the created JSON statements in a dictionary using the statemeent IDs as keys to facilitate matching with the future JSON files containing recordDetails info.


In [24]:
# BODS statement structure template
bods_statement_schema = {
    "statementId": "",
    "statementDate": "",
    "annotations": [],
    "publicationDetails": {
        "publicationDate": "",
        "bodsVersion": "",
        "license": "",
        "publisher": {
            "name": "",
            "url": ""
        }
    },
    "source": {
        "type": [],
        "description": "",
        "url": "",
        "retrievedAt": "",
        "assertedBy": [
            {
                "name": "",
                "uri": ""
            }
        ]
    },
    "declaration": "",
    "declarationSubject": "",
    "recordId": "",
    "recordType": "",
    "recordDetails": {}
}

# Dictionary to hold the JSON strings
statement_dict = {}

# Iterate over each row in df_part1
for index, row in df_part1.iterrows():
    bods_statement = bods_statement_schema.copy()

    # Fill the bods_statement with data from the row
    bods_statement["statementId"] = ''
    bods_statement["statementDate"] = row['eiti_data_publication_date']
    bods_statement["publicationDetails"]["publicationDate"] = row['end_date']
    bods_statement["publicationDetails"]["bodsVersion"] = '0.4'
    bods_statement["publicationDetails"]["license"] = 'http://opendatacommons.org/licenses/pddl/1.0/'
    bods_statement["publicationDetails"]["publisher"]["name"] = 'Extractive Industries Transparency Initiative'
    bods_statement["publicationDetails"]["publisher"]["url"] = 'https://eiti.org/open-data'
    bods_statement["source"]["type"] = ['officialRegister', 'verified']
    bods_statement["source"]["url"] = 'https://eiti.portaljs.com'
    bods_statement["source"]["retrievedAt"] = pd.Timestamp('today').strftime('%Y-%m-%d')
    bods_statement["source"]["assertedBy"][0]["name"] = row['submitter_name']
    bods_statement["source"]["assertedBy"][0]["uri"] = row['submitter_email']
    bods_statement["declaration"] = f"{row['iso_alpha2_code']}-{row['start_date'].replace('-', '')}-{row['end_date'].replace('-', '')}"
    bods_statement["declarationSubject"] = row['iso_alpha2_code']
    bods_statement["recordId"] = ''
    bods_statement["recordType"] = ''
    
    # Create a variable name based on the statement identifier
    variable_name = row['eiti_id_declaration']
    
    # Save the JSON string in the dictionary
    statement_dict[variable_name] = json.dumps(bods_statement, indent=2, ensure_ascii=False)

# Print a sample of 2 random iterms froom the dictionary containing JSON strings
separator = "-" * 40
random_keys = random.sample(list(statement_dict.keys()), 2)

print(f"The dictionnary has {len(statement_dict.keys())} items\n")

for random_key in random_keys:
    print(f"{random_key}: {statement_dict[random_key]}\n{separator}\n")

The dictionnary has 73 items

0d9c43a6-f72e-398b-b1eb-771b510ea6eb: {
  "statementId": "",
  "statementDate": "n/v",
  "annotations": [],
  "publicationDetails": {
    "publicationDate": "2017-12-31",
    "bodsVersion": "0.4",
    "license": "http://opendatacommons.org/licenses/pddl/1.0/",
    "publisher": {
      "name": "Extractive Industries Transparency Initiative",
      "url": "https://eiti.org/open-data"
    }
  },
  "source": {
    "type": [
      "officialRegister",
      "verified"
    ],
    "description": "",
    "url": "https://eiti.portaljs.com",
    "retrievedAt": "2024-05-21",
    "assertedBy": [
      {
        "name": "Maher Kabsi",
        "uri": "Maher.Kabsi@bdo-ifi.com"
      }
    ]
  },
  "declaration": "CG-20170101-20171231",
  "declarationSubject": "CG",
  "recordId": "",
  "recordType": "",
  "recordDetails": {}
}
----------------------------------------

4a8b4137-6af1-306b-a321-7c9cab4ad6a7: {
  "statementId": "",
  "statementDate": "2019-12-31",
  "annotatio

# Part 2. Generating Entities

TODO: 
1. need to change back the dictionary construction to generate one json per recordDetails
2. There is one statement per record details. Does it mean that I have to match then duplicate the statements for each recordDetails?


In [32]:
# Extract unique entities and add entity type
unique_companies = df_part5[['company_name', 'eiti_id_company', 'iso_alpha2_code', 'country', 'company_public_listing_or_website', 'start_date', 'end_date', 'eiti_id_declaration']].dropna(subset=['eiti_id_company']).drop_duplicates().assign(entity_type='registeredEntity')
unique_projects = df_part5[['project_name', 'eiti_id_project', 'iso_alpha2_code', 'country', 'start_date', 'end_date', 'eiti_id_declaration']].dropna(subset=['eiti_id_project']).drop_duplicates().assign(entity_type='arrangement')
unique_government = df_part5[['government_entity', 'eiti_id_government', 'iso_alpha2_code', 'country', 'start_date', 'end_date', 'eiti_id_declaration']].dropna(subset=['eiti_id_government']).drop_duplicates().assign(entity_type='stateBody')


# Combine into a single DataFrame
df_combined = pd.concat([unique_companies, unique_projects, unique_government], ignore_index=True)

print(f"The dataframe has {len(df_combined.index)} rows\n")

The dataframe has 8242 rows



In [33]:
# Define the BODS entity structure based on the JSON schema
bods_entity_schema = {
    "isComponent": False,
    "entityType": {
        "type": "",
        "subtype": "",
        "details": ""
    },
    "name": "",
    "jurisdiction": {
        "name": "",
        "code": ""
    },
    "identifiers": [],
    "addresses": [],
    "uri": "",
    "publicListing": None,
    "formedByStatute": None
}

# Create a dictionary to hold the JSON files using declaration_reference as the key
entity_dict = {}

# Iterate over each row in df_combined to create JSON files
for index, row in df_combined.iterrows():
    if index % 100 == 0:  # Update progress every 100 rows
        print(f"\rProcessing row {index+1}/{len(df_combined)}", end='')

    bods_entity = bods_entity_schema.copy()

    bods_entity["isComponent"] = False
    bods_entity["entityType"]["type"] = row['entity_type']
    bods_entity["entityType"]["subtype"] = (
        'governmentDepartment' if row['entity_type'] == 'stateBody' and 'minist' in str(row['government_entity']).lower() else
        'stateAgency' if row['entity_type'] == 'stateBody' else ''
    )
    bods_entity["name"] = (
        row['company_name'] if row['entity_type'] == 'registeredEntity' else
        row['project_name'] if row['entity_type'] == 'arrangement' else
        row['government_entity']
    )
    bods_entity["jurisdiction"]["name"] = row['country']
    bods_entity["jurisdiction"]["code"] = row['iso_alpha2_code']
    bods_entity["identifiers"] = [{
        "id": (
            row['eiti_id_company'] if row['entity_type'] == 'registeredEntity' else
            row['eiti_id_project'] if row['entity_type'] == 'arrangement' else
            row['eiti_id_government']
        ),
        "scheme": "XI-EITI",
        "schemeName": "Extractive Industries Transparency Initiative",
        "uri": f"/entity_statement/{row['eiti_id_company'] if row['entity_type'] == 'registeredEntity' else row['eiti_id_project'] if row['entity_type'] == 'arrangement' else row['eiti_id_government']}"
    }]
    bods_entity["uri"] = row['company_public_listing_or_website']
    
    # Create a variable name based on the statement identifier
    entity_dict_key = (index, row['eiti_id_declaration'])

    # Add the JSON string to the dictionary with the declaration_reference as the key
    entity_dict[entity_dict_key] = json.dumps(bods_entity, indent=2, ensure_ascii=False)

# Ensure the progress line is cleared after completion
print(f"\rProcessing completed. {len(df_combined)} rows processed.\n")
print(f"The dictionnary has {len(entity_dict.keys())} items")


Processing completed. 8242 rows processed.

The dictionnary has 8242 items


In [40]:
# Display 2 random items for quality check

random_entity = random.sample(list(entity_dict.keys()), 2)

for random_key in random_entity:
    print(f"{random_key}: {entity_dict[random_key]}\n{separator}\n")

(5155, '1fec5e66-7e78-3028-81d7-8d97e3348ded'): {
  "isComponent": false,
  "entityType": {
    "type": "arrangement",
    "subtype": "",
    "details": ""
  },
  "name": "P2489",
  "jurisdiction": {
    "name": "United Kingdom",
    "code": "GB"
  },
  "identifiers": [
    {
      "id": "8c51be43-5696-4e10-b35d-41dd59fcdec8",
      "scheme": "XI-EITI",
      "schemeName": "Extractive Industries Transparency Initiative",
      "uri": "/entity_statement/8c51be43-5696-4e10-b35d-41dd59fcdec8"
    }
  ],
  "addresses": [],
  "uri": NaN,
  "publicListing": null,
  "formedByStatute": null
}
----------------------------------------

(1111, 'ccfb08b7-f235-3638-9b1e-b2627cc5d1d9'): {
  "isComponent": false,
  "entityType": {
    "type": "registeredEntity",
    "subtype": "",
    "details": ""
  },
  "name": "TENKE FUNGURUME MINING SA (TFM)",
  "jurisdiction": {
    "name": "Democratic Republic of Congo",
    "code": "CD"
  },
  "identifiers": [
    {
      "id": "754f47b2-32b2-457b-b59c-bbc6889

# Part 3 - Matching entities with statements

In [42]:
combined_dict = {}

for (index, eiti_id_declaration) in entity_dict.keys():
    if eiti_id_declaration in statement_dict:
        statement = json.loads(statement_dict[eiti_id_declaration])
        entity = json.loads(entity_dict[(index, eiti_id_declaration)])
        statement["recordDetails"] = entity

        # Set recordId and recordType in statement_dict
        statement["recordId"] = entity["identifiers"][0]["id"]
        statement["recordType"] = 'entity'
        
        combined_dict[index] = json.dumps(statement, indent=2, ensure_ascii=False)

# Print the length of the combined dictionary
print(f"Number of combined entries: {len(combined_dict)}")

Number of combined entries: 8242


In [43]:
# Display 2 random items for quality check

random_combined = random.sample(list(combined_dict.keys()), 2)

for random_key in random_combined:
    print(f"{random_key}: {combined_dict[random_key]}\n{separator}\n")


626: {
  "statementId": "",
  "statementDate": NaN,
  "annotations": [],
  "publicationDetails": {
    "publicationDate": "2018-12-31",
    "bodsVersion": "0.4",
    "license": "http://opendatacommons.org/licenses/pddl/1.0/",
    "publisher": {
      "name": "Extractive Industries Transparency Initiative",
      "url": "https://eiti.org/open-data"
    }
  },
  "source": {
    "type": [
      "officialRegister",
      "verified"
    ],
    "description": "",
    "url": "https://eiti.portaljs.com",
    "retrievedAt": "2024-05-21",
    "assertedBy": [
      {
        "name": "EITI Albania Secretariat",
        "uri": "sekretariati@albeiti.gov.al"
      }
    ]
  },
  "declaration": "AL-20180101-20181231",
  "declarationSubject": "AL",
  "recordId": "255a4a53-d287-4a8a-b740-d84f9e9a309d",
  "recordType": "entity",
  "recordDetails": {
    "isComponent": false,
    "entityType": {
      "type": "registeredEntity",
      "subtype": "",
      "details": ""
    },
    "name": "BABASI COO SHPK"

# Part 4 - Relationships

TODO 
1. Generate the relationship record details
2. Need to add a payment details sub-structure in relationships

We defined 5 types of relationships an assigned the following attributes

| InterestedParty | Subject | directOrIndirect | descriptor |
| ---- | ---- | ---- | ---- |
| Country | Government Agency | direct | controlByLegalFramework |
| Government Agency | Company (SOE) | direct | controlByLegalFramework, rightsToProfitOrIncome |
| Government Agency | Company (Private) | direct | rightsToProfitOrIncome |
| Company | Project | direct | rightsGrantedByContract |
| Government Agency | Project | indirect | controlByLegalFramework |

In [67]:
bods_relationship_schemas = {
    "country_government": {
        "subject": "",
        "interestedParty": "",
        "interests": [{
            "type": "controlByLegalFramework",
            "directOrIndirect": "direct",
            "beneficialOwnershipOrControl": False,
            "startDate": ""
        }],
        "isComponent": False
    },
    "government_soe": {
        "subject": "",
        "interestedParty": "",
        "interests": [
            {
                "type": "controlByLegalFramework",
                "directOrIndirect": "direct",
                "beneficialOwnershipOrControl": False,
                "startDate": ""
            },
            {
                "type": "rightsToProfitOrIncome",
                "directOrIndirect": "direct",
                "beneficialOwnershipOrControl": False,
                "startDate": ""
            }
        ],
        "isComponent": True
    },
    "government_company": {
        "subject": "",
        "interestedParty": "",
        "interests": [{
            "type": "rightsToProfitOrIncome",
            "directOrIndirect": "direct",
            "beneficialOwnershipOrControl": False,
            "startDate": ""
        }],
        "isComponent": True
    },
    "company_project": {
        "subject": "",
        "interestedParty": "",
        "interests": [{
            "type": "rightsGrantedByContract",
            "directOrIndirect": "direct",
            "beneficialOwnershipOrControl": False,
            "startDate": ""
        }],
        "isComponent": True
    },
    "government_project": {
        "subject": "",
        "interestedParty": "",
        "interests": [{
            "type": "controlByLegalFramework",
            "directOrIndirect": "indirect",
            "beneficialOwnershipOrControl": False,
            "startDate": ""
        }],
        "isComponent": False,
        "componentRecords": []
    }
}

In [68]:
relationship_dicts = {
    "country_government": {},
    "government_soe": {},
    "government_company": {},
    "company_project": {},
    "government_project": {},
}

def populate_relationships(
        df, 
        relationship_type, 
        schema, 
        subject_col, 
        interested_party_col, 
        start_date_col
    ):

    for index, row in df.iterrows():
        if pd.notna(row[subject_col]) and pd.notna(row[interested_party_col]):
            relationship = schema.copy()
            relationship["subject"] = row[subject_col]
            relationship["interestedParty"] = row[interested_party_col]
            for interest in relationship["interests"]:
                interest["startDate"] = row[start_date_col]
            relationship_dicts[relationship_type][(index, row['eiti_id_declaration'])] = json.dumps(relationship, indent=2, ensure_ascii=False)

populate_relationships(
    df_part5, 
    "country_government", 
    bods_relationship_schemas["country_government"], 
    "government_entity", 
    "iso_alpha2_code", 
    "start_date"
)

# Check for SOEs and apply the appropriate schema
populate_relationships(
    df_part5[df_part5['company_type'] == "State-owned enterprises & public corporations"], 
    "government_soe", 
    bods_relationship_schemas["government_soe"],
    "company_name", 
    "government_entity",  
    "start_date"
)

# Apply the standard government-company schema to non-SOEs
populate_relationships(
    df_part5[df_part5['company_type'] == "Private"], 
    "government_company", 
    bods_relationship_schemas["government_company"],
    "company_name", 
    "government_entity",  
    "start_date"
)

populate_relationships(
    df_part5, 
    "company_project", 
    bods_relationship_schemas["company_project"], 
    "project_name",
    "company_name", 
    "start_date"
)

populate_relationships(
    df_part5, 
    "government_project", 
    bods_relationship_schemas["government_project"],
    "project_name",
    "government_entity",  
    "start_date"
)

# Print the number of items in each dictionary
for relationship_type, relationships in relationship_dicts.items():
    print(f"{relationship_type}: {len(relationships)} items")

country_government: 31826 items
government_soe: 2611 items
government_company: 28889 items
company_project: 12320 items
government_project: 11832 items


In [66]:
# Function to sample a random item from each dictionary
def sample_relationships(relationship_dicts):
    samples = {}
    for relationship_type, relationships in relationship_dicts.items():
            random_key = random.choice(list(relationships.keys()))
            samples[relationship_type] = relationships[random_key]
    return samples  

# Sample a random item from each relationship dictionary
sampled_relationships = sample_relationships(relationship_dicts)

print(f"Number of combined entries: {len(combined_dict)}")

# Print the sampled relationships
for relationship_type, sample in sampled_relationships.items():
    print(f"Sample from {relationship_type}:")
    print(f"{sample}\n{separator}")

Sample from country_government:
{
  "subject": "KISHAPU DISTRICT COUNCIL",
  "interestedParty": "TZ",
  "interests": [
    {
      "type": "controlByLegalFramework",
      "directOrIndirect": "direct",
      "beneficialOwnershipOrControl": false,
      "startDate": "2017-07-01"
    }
  ],
  "isComponent": false
}
----------------------------------------
Sample from government_soe:
{
  "subject": "LYSYCHANSKVUHILLYA PJSC",
  "interestedParty": "STATE TAX SERVICE OF UKRAINE",
  "interests": [
    {
      "type": "controlByLegalFramework",
      "directOrIndirect": "direct",
      "beneficialOwnershipOrControl": false,
      "startDate": "2018-01-01"
    },
    {
      "type": "rightsToProfitOrIncome",
      "directOrIndirect": "direct",
      "beneficialOwnershipOrControl": false,
      "startDate": "2018-01-01"
    }
  ],
  "isComponent": false
}
----------------------------------------
Sample from government_company:
{
  "subject": "COMERCIR",
  "interestedParty": "ALBANIAN CUSTOMS ADM