We first import the libraries that we need.

In [1]:
import sqlite3
import pandas as pd
from faker import Faker
import random
import string
import uuid
from collections import defaultdict

We then set the parameters needed to generate the fake data.

In [2]:
# Seed every pseudo-random source used by the generators so that a fresh
# "Restart & Run All" produces the same data.
# Faker.seed() seeds the generator shared by Faker instances; the standard
# library `random` module keeps its own independent state and must be seeded
# separately, otherwise every random.choice()/randint() call below differs
# from run to run.
fake = Faker()
Faker.seed(4321)
random.seed(4321)

# ---------------------------------------------------------------------------
# Table sizes: how many rows each generator produces
# ---------------------------------------------------------------------------
num_companies = 9_000
num_agencies = 1_500
num_projects = 11_000
num_revenues = 50_000

# ---------------------------------------------------------------------------
# EITI implementing countries.
# `country_labels` and `countries` are parallel lists: position i of one
# corresponds to position i of the other (display name <-> ISO 3166-1 alpha-2).
# ---------------------------------------------------------------------------
country_labels = [
    "Afghanistan", "Albania", "Angola", "Argentina", "Armenia",
    "Burkina Faso", "Cameroon", "Central African Republic", "Chad", "Colombia",
    "Côte d'Ivoire", "Democratic Republic of the Congo", "Dominican Republic", "Ecuador", "Ethiopia",
    "Gabon", "Germany", "Ghana", "Guatemala", "Guinea",
    "Guyana", "Honduras", "Indonesia", "Iraq", "Kazakhstan",
    "Kyrgyz Republic", "Liberia", "Madagascar", "Malawi", "Mali",
    "Mauritania", "Mexico", "Mongolia", "Mozambique", "Myanmar",
    "Netherlands", "Niger", "Nigeria", "Norway", "Papua New Guinea",
    "Peru", "Philippines", "Republic of the Congo", "Senegal", "Seychelles",
    "Sierra Leone", "Suriname", "São Tomé and Príncipe", "Tajikistan", "Tanzania",
    "Timor-Leste", "Togo", "Trinidad and Tobago", "Uganda", "Ukraine",
    "United Kingdom", "Zambia",
]

countries = [
    "AF", "AL", "AO", "AR", "AM",
    "BF", "CM", "CF", "TD", "CO",
    "CI", "CD", "DO", "EC", "ET",
    "GA", "DE", "GH", "GT", "GN",
    "GY", "HN", "ID", "IQ", "KZ",
    "KG", "LR", "MG", "MW", "ML",
    "MR", "MX", "MN", "MZ", "MM",
    "NL", "NE", "NG", "NO", "PG",
    "PE", "PH", "CG", "SN", "SC",
    "SL", "SR", "ST", "TJ", "TZ",
    "TL", "TG", "TT", "UG", "UA",
    "GB", "ZM",
]

# ---------------------------------------------------------------------------
# Categorical value pools
# ---------------------------------------------------------------------------
# Ownership categories for companies and for government agencies
company_types = [
    "Private",
    "Listed",
    "Public corporations & state-owned entreprises",
    "Other",
]
agency_types = [
    "Local government",
    "Central government",
    "State government",
    "Public corporations & state-owned entreprises",
    "other",
]

# Lifecycle stage of a project and the unit its production is expressed in
project_statuses = ["Production", "Development", "Exploration", "Other", "Not reported"]
project_units = ["tonnes", "barrels", "grams"]

# Company name pools: EITI supporting companies and a handful of small operators
EITI_contrib_companies = [
    "Africa Oil Corp", "Alcoa", "AMG", "Anglo American", "AngloGold Ashanti",
    "Arcelor Mittal", "Barrick Gold", "Base Titanium", "BHP", "BP",
    "Capricorn Energy PLC", "Centerra Gold", "Chevron", "Eni", "Equinor",
    "Eramet", "ExxonMobil", "Freeport-McMoRan", "Glencore", "Gold Fields",
    "Gunvor Group", "Hess Corporation", "Inpex Corporation", "Ivanhoe Mines", "JX Nippon Mining & Metals",
    "Kinross Gold", "Kosmos Energy", "Lundin Foundation", "Minsur", "MMG",
    "Neptune Energy", "Newcrest Mining", "Newmont", "NNPC", "Orano Mining",
    "Petronor E&P", "Poderosa", "Pt Pertamina", "Qatar Energy", "Repsol Group",
    "Rio Tinto", "Royal Dutch Shell", "Santos Limited", "Savannah Energy", "Shell plc",
    "Sibanye Stillwater", "South 32", "Southern Copper", "St Barbara Limited", "Staatsolie Maatschappij Suriname N.V.",
    "Sumitomo Metal Mining", "Teck", "TotalEnergies SE", "Trafigura", "Tullow oil",
    "Vale", "Woodside",
]
small_companies = [
    "ORYX ENERGIES",
    "Petra Quarries Ltd",
    "Platinum Asphalt & Crushing Company Limited",
    "Mothercat Limited Nig. Ltd.",
    "Moulds NIG. Ltd",
    "MTP SA",
    "Mufkad mines & Invest Ltd",
]

# Commodities a project may extract
commodities = [
    "gold", "silver", "diamonds", "oil", "coal",
    "iron", "aluminium", "copper", "zinc", "lead",
]

# Exchanges a 'Listed' company may be quoted on
stock_exchanges = [
    "NYSE", "NASDAQ", "LSE", "Euronext", "SSE", "HKEX", "TSX",
    "SZSE", "Deutsche Börse", "ASX", "JSE", "BSE", "NSE",
]

# Industry sectors
sectors = ["Oil and gas", "Mining", "Energy", "Metals", "Chemicals"]

# Labels assigned to government agencies
agency_names = [
    "Oil Resource Management Agency",
    "Extractives Industry Regulation Directorate",
    "Land Management Office",
    "Ministry of Finances",
    "National Mining Agency",
]

# Revenue stream names and the currencies payments are reported in
revenue_streams = ["PPT", "ROYALTY (OIL)", "EDUCATIONAL TAX"]
currencies = ["USD", "EUR", "NGN", "GHS"]

The following is a function that generates synthetic data for the countries table

In [3]:
def generate_countries():
    """Build the synthetic `countries` table.

    One row is produced per (label, ISO code) pair taken from the parallel
    module-level lists `country_labels` and `countries`; row ids start at 1.

    Returns:
        pandas.DataFrame with columns id, label, self, type, iso2, join_date,
        leave_date and local_website.
    """
    column_names = ['id', 'label', 'self', 'type', 'iso2', 'join_date', 'leave_date', 'local_website']

    rows = [
        [
            row_id,
            name,
            f'https://www.eiti.org/{code}',        # self link
            'country',                              # record type
            code,                                   # iso2
            '2015-01-01',                           # join_date (same placeholder for every row)
            None,                                   # leave_date
            f'https://eiti.org/countries/{code}',   # local_website
        ]
        for row_id, (name, code) in enumerate(zip(country_labels, countries), start=1)
    ]

    return pd.DataFrame(rows, columns=column_names)

# Materialise the countries table; later generators read its iso2 column
df_countries = generate_countries()

The following is a function that generates synthetic data for the companies table

In [4]:
def generate_companies(num_companies):
    """Build the synthetic `companies` table.

    Each row is randomly either a "small" company (label from
    `small_companies`, id embedding a country code) or an EITI supporting
    company (label from `EITI_contrib_companies`, id from a Faker pattern).
    Ids are regenerated until they are unique within the returned frame.

    Args:
        num_companies: number of rows to generate.

    Returns:
        pandas.DataFrame with columns id, label, self, type, identification,
        sector, company_type, stock_exchange_listing, audited_financial_state,
        summary_data_iso2 and summary_data_year.
    """
    # random.choice() picks by integer position. A pandas Series is indexed by
    # label, so choosing from it only works while the index happens to be the
    # default 0..n-1 range (and is much slower). Draw from a plain list instead.
    iso2_codes = df_countries['iso2'].tolist()

    def _new_company_id(country_code):
        """Return a fresh candidate id; `country_code` is None for non-small companies."""
        if country_code is None:
            return fake.bothify(text='????????????????????-##-######-#')
        return '00000000000000000000-' + country_code + '-' + str(random.randint(100000, 999999)) + '-1'

    data = []
    generated_ids = set()

    for _ in range(num_companies):
        is_small_company = random.choice([True, False])

        if is_small_company:
            label = random.choice(small_companies)
            country_code = random.choice(iso2_codes)
        else:
            label = random.choice(EITI_contrib_companies)
            country_code = None

        # Re-draw on collision so the id can serve as the table's primary key
        company_id = _new_company_id(country_code)
        while company_id in generated_ids:
            company_id = _new_company_id(country_code)
        generated_ids.add(company_id)

        self_link = 'https://www.eiti.org/' + company_id
        company_type = random.choice(company_types)
        sector = random.choice(sectors)
        # Only listed companies carry a stock exchange
        stock_exchange_listing = random.choice(stock_exchanges) if company_type == 'Listed' else None
        audited_financial_state = 'https://www.eiti.org/financial_report/' + company_id
        summary_iso2 = random.choice(iso2_codes)
        summary_year = random.randint(2017, 2021)

        # The id is stored twice: once as `id` and once as `identification`
        data.append([company_id, label, self_link, 'company', company_id, sector, company_type, stock_exchange_listing, audited_financial_state, summary_iso2, summary_year])

    return pd.DataFrame(data, columns=['id', 'label', 'self', 'type', 'identification', 'sector', 'company_type', 'stock_exchange_listing', 'audited_financial_state', 'summary_data_iso2', 'summary_data_year'])

# Generate companies data
df_companies = generate_companies(num_companies)

The following is a function that generates synthetic data for the agencies table

In [5]:
def generate_agencies(num_agencies):
    """Build the synthetic `agencies` table.

    Every agency gets a random country and a random level ('National' or
    'Local'). Its id is "<iso2>-<first letter of level>-<6-digit counter>",
    where the counter runs separately per (country, level) pair, which keeps
    ids unique without any retry loop.

    Args:
        num_agencies: number of rows to generate.

    Returns:
        pandas.DataFrame with columns id, label, self, type, identification,
        agency_type, summary_data_iso2 and summary_data_year.
    """
    # random.choice() picks by integer position; a pandas Series is indexed by
    # label, so draw from a plain list rather than from the Series itself.
    iso2_codes = df_countries['iso2'].tolist()

    data = []
    counter_by_country = defaultdict(int)

    for _ in range(num_agencies):
        country_code = random.choice(iso2_codes)
        agency_type = random.choice(['National', 'Local'])

        # Next sequence number for this (country, level) combination
        counter_by_country[(country_code, agency_type)] += 1
        counter = counter_by_country[(country_code, agency_type)]

        self_link = f"https://eiti.org/agency/{country_code}/{counter}"
        label = random.choice(agency_names)
        # Drawn independently of `country_code`, so it may name a different country
        summary_iso2 = random.choice(countries)
        summary_year = random.randint(2017, 2021)

        agency_id = f"{country_code}-{agency_type[0]}-{str(counter).zfill(6)}"

        # The id is stored twice: once as `id` and once as `identification`
        data.append([agency_id, label, self_link, 'agency', agency_id, agency_type, summary_iso2, summary_year])

    return pd.DataFrame(data, columns=['id', 'label', 'self', 'type', 'identification', 'agency_type', 'summary_data_iso2', 'summary_data_year'])

# Generate agencies data
df_agencies = generate_agencies(num_agencies)

The following is a function that generates synthetic data for the projects table

In [6]:
def generate_projects(num_projects):
    """Build the synthetic `projects` table.

    Project ids are consecutive integers starting at 100000, stored as
    strings. All other attributes are drawn at random from the module-level
    pools (company names, commodities, statuses, units, country codes).

    Args:
        num_projects: number of rows to generate.

    Returns:
        pandas.DataFrame with columns id, label, self, type,
        project_legal_agreement, project_affiliated_companies_start,
        project_commodities, project_status, project_production_volume,
        project_unit, project_production, summary_data_iso2 and
        summary_data_year.
    """
    first_project_id = 100000
    column_names = ['id', 'label', 'self', 'type', 'project_legal_agreement', 'project_affiliated_companies_start', 'project_commodities', 'project_status', 'project_production_volume', 'project_unit', 'project_production', 'summary_data_iso2', 'summary_data_year']
    rows = []

    for numeric_id in range(first_project_id, first_project_id + num_projects):
        project_key = str(numeric_id)

        # Two candidate labels are always generated; one of them is kept
        permit_label = 'Subsoil use special permit No.' + str(random.randint(1000, 9999))
        field_label = fake.bothify(text='???? Fields')
        label = random.choice([permit_label, field_label])

        self_link = 'https://www.eiti.org/' + project_key
        legal_agreement = 'https://www.eiti.org/legal_agreement/' + project_key

        # One to three distinct companies / commodities, joined into a single text field
        n_companies = random.randint(1, 3)
        affiliated_companies = ', '.join(random.sample(EITI_contrib_companies, k=n_companies))
        n_commodities = random.randint(1, 3)
        extracted_commodities = ', '.join(random.sample(commodities, k=n_commodities))

        status = random.choice(project_statuses)
        production_volume = str(random.randint(1000, 1000000))
        unit = random.choice(project_units)
        production = str(random.randint(1000, 1000000))
        summary_iso2 = random.choice(countries)
        summary_year = random.randint(2017, 2021)

        rows.append([
            project_key, label, self_link, 'project', legal_agreement,
            affiliated_companies, extracted_commodities, status,
            production_volume, unit, production, summary_iso2, summary_year,
        ])

    return pd.DataFrame(rows, columns=column_names)

# Materialise the projects table
df_projects = generate_projects(num_projects)


The following is a function that generates the values of the project_companies link table

In [7]:
def generate_project_companies(num_relations):
    """Build the `project_companies` link table.

    Randomly pairs ids from `df_projects` with ids from `df_companies`,
    re-drawing whenever a pair has already been used so that every
    (project_id, company_id) combination appears at most once.

    Args:
        num_relations: number of distinct pairs to generate.

    Returns:
        pandas.DataFrame with columns project_id and company_id.

    Raises:
        ValueError: if more distinct pairs are requested than can exist.
    """
    # random.choice() picks by integer position; a pandas Series is indexed by
    # label (and is slow to index), so extract plain lists once up front.
    project_pool = df_projects['id'].tolist()
    company_pool = df_companies['id'].tolist()

    # Without this guard the rejection loop below could never terminate.
    max_pairs = len(set(project_pool)) * len(set(company_pool))
    if num_relations > max_pairs:
        raise ValueError(
            f"Cannot generate {num_relations} unique project/company pairs; "
            f"only {max_pairs} distinct combinations exist."
        )

    data = []
    added_combinations = set()

    while len(data) < num_relations:
        pair = (random.choice(project_pool), random.choice(company_pool))

        # Reject duplicates: the pair is the table's composite primary key
        if pair in added_combinations:
            continue

        added_combinations.add(pair)
        data.append(list(pair))

    return pd.DataFrame(data, columns=['project_id', 'company_id'])

# Define the number of relations to generate
num_relations = 15000

# Generate project_companies data
df_project_companies = generate_project_companies(num_relations)


The following is a function that generates synthetic data for the revenues table

In [8]:
# Extract the unique identifiers from the relevant DataFrames
company_ids = df_companies['id'].tolist()
agency_ids = df_agencies['id'].tolist()
project_ids = df_projects['id'].tolist()

def generate_revenue(num_revenues, company_id, agency_id, project_id):
    """Build the synthetic `revenue` table.

    Args:
        num_revenues: number of rows to generate.
        company_id: pool of company ids to draw from (an iterable of ids, or
            a single id string).
        agency_id: pool of agency ids to draw from (same conventions).
        project_id: pool of project ids to draw from (same conventions).

    Returns:
        pandas.DataFrame with one row per revenue record; see the `columns`
        list at the end of the function for the exact layout.

    Note:
        The id arguments are the sampling pools. Earlier versions overwrote
        them inside the loop and silently sampled from module-level lists,
        so whatever the caller passed was ignored.
    """
    def _as_pool(candidates):
        # A bare string is one identifier, not a sequence of characters to pick from
        return [candidates] if isinstance(candidates, str) else list(candidates)

    company_pool = _as_pool(company_id)
    agency_pool = _as_pool(agency_id)
    project_pool = _as_pool(project_id)

    data = []

    for _ in range(num_revenues):
        # Take the 128 id bits from the `random` module rather than uuid4()
        # (which reads OS entropy) so that seeding `random` makes ids repeatable.
        # version=4 stamps the usual UUID4 version/variant bits onto the value.
        revenue_id = uuid.UUID(int=random.getrandbits(128), version=4).hex
        country_iso2 = random.choice(countries)
        year = random.randint(2017, 2021)
        chosen_company = random.choice(company_pool)
        revenue_stream = random.choice(revenue_streams)
        chosen_agency = random.choice(agency_pool)
        levied_by_project = random.choice(['Yes', 'No'])
        reported_by_project = random.choice(['Yes', 'No'])
        chosen_project = random.choice(project_pool)
        original_value = random.uniform(1000, 1000000)
        original_currency = random.choice(currencies)
        # Synthetic conversion: the USD figure is the original value scaled by a
        # random factor in [0.8, 1.2], independent of the currency drawn above
        revenues_USD = original_value * random.uniform(0.8, 1.2)
        payment_in_kind = random.choice(['Yes', 'No'])
        # In-kind volume and unit are only populated for in-kind payments
        in_kind_volume = None if payment_in_kind == 'No' else random.uniform(1, 100)
        in_kind_volume_unit = None if payment_in_kind == 'No' else 'Barrels'
        comments = 'N/A'
        gfs_code = str(random.randint(100, 999))
        gfs_code2 = str(random.randint(100, 999))

        # Placeholder GFS classification levels (identical for every row)
        gfs_level_1 = "Level 1"
        gfs_level_2 = "Level 2"
        gfs_level_3 = "Level 3"
        gfs_level_4 = "Level 4"
        gfs_level_5 = "Level 5"
        gfs_level_6 = "Level 6"

        data.append([revenue_id, country_iso2, year, chosen_company, revenue_stream, chosen_agency, levied_by_project, reported_by_project, chosen_project, original_value, original_currency, revenues_USD, payment_in_kind, in_kind_volume, in_kind_volume_unit, comments, gfs_code, gfs_code2, gfs_level_1, gfs_level_2, gfs_level_3, gfs_level_4, gfs_level_5, gfs_level_6])

    return pd.DataFrame(data, columns=['revenue_id', 'country_iso2', 'year', 'company_id', 'revenue_stream', 'agency_id', 'levied_by_project', 'reported_by_project', 'project_id', 'original_value', 'original_currency', 'revenues_USD', 'payment_in_kind', 'in_kind_volume', 'in_kind_volume_unit', 'comments', 'gfs_code', 'gfs_code2', 'gfs_level_1', 'gfs_level_2', 'gfs_level_3', 'gfs_level_4', 'gfs_level_5', 'gfs_level_6'])

# Generate revenue data
df_revenue = generate_revenue(num_revenues, company_ids, agency_ids, project_ids)


We now connect to the SQLite database, creating it if it does not already exist.

In [9]:
# Connect to the SQLite database file in the current working directory
# (it will be created if it doesn't exist)
conn = sqlite3.connect('EITI_Database.db')

# Create a cursor object; the CREATE TABLE statements below are run through it
cursor = conn.cursor()

# Turn on foreign-key enforcement for this connection so that the REFERENCES
# clauses in the table definitions are checked when rows are inserted
cursor.execute("PRAGMA foreign_keys = ON;")

<sqlite3.Cursor at 0x11e439e40>

We're now going to create and add the countries table as well as insert the data into it to ensure that other tables can respect the foreign key constraints. 

In [10]:
# Create the countries table. `iso2` is declared UNIQUE because the other
# tables reference it as a foreign key.
cursor.execute('''
CREATE TABLE IF NOT EXISTS countries (
    id INTEGER PRIMARY KEY,
    label TEXT,
    self TEXT,
    type TEXT,
    iso2 TEXT UNIQUE,
    join_date TEXT,
    leave_date TEXT,
    local_website TEXT
);
''')

# Persist the new table definition before loading any rows
conn.commit()

# Insert countries data into the countries table. This must happen before the
# dependent tables are filled so their country codes resolve to existing rows.
# NOTE(review): rows are appended with the same ids (1..N) on every run, so
# re-running against an existing database file will collide with the
# PRIMARY KEY on `id` — start from a fresh file when regenerating.
df_countries.to_sql('countries', con=conn, if_exists='append', index=False)

57

Next, we create the remaining tables.

In [11]:
# Create the companies table.
# `summary_data_iso2` points at countries(iso2).
cursor.execute('''
CREATE TABLE IF NOT EXISTS companies (
    id TEXT PRIMARY KEY,
    label TEXT,
    self TEXT,
    type TEXT,
    identification TEXT,
    sector TEXT,
    company_type TEXT,
    stock_exchange_listing TEXT,
    audited_financial_state TEXT,
    summary_data_iso2 TEXT REFERENCES countries(iso2),
    summary_data_year TEXT
)
''')

# Create the agencies table.
# `summary_data_iso2` points at countries(iso2).
cursor.execute('''
CREATE TABLE IF NOT EXISTS agencies (
    id TEXT PRIMARY KEY,
    label TEXT,
    self TEXT,
    type TEXT,
    identification TEXT,
    agency_type TEXT,
    summary_data_iso2 TEXT REFERENCES countries(iso2),
    summary_data_year TEXT
)
''')

# Create the projects table.
# Affiliated companies and commodities are stored as comma-separated text;
# `summary_data_iso2` points at countries(iso2).
cursor.execute('''
CREATE TABLE IF NOT EXISTS projects (
    id TEXT PRIMARY KEY,
    label TEXT,
    self TEXT,
    type TEXT,
    project_legal_agreement TEXT,
    project_affiliated_companies_start TEXT,
    project_commodities TEXT,
    project_status TEXT,
    project_production_volume TEXT,
    project_unit TEXT,
    project_production TEXT,
    summary_data_iso2 TEXT REFERENCES countries(iso2),
    summary_data_year TEXT
)
''')

# Create the project_companies link table (many-to-many between projects and
# companies); the composite primary key forbids duplicate pairs.
cursor.execute('''
CREATE TABLE IF NOT EXISTS project_companies (
    project_id TEXT REFERENCES projects(id),
    company_id TEXT REFERENCES companies(id),
    PRIMARY KEY (project_id, company_id)
)
''')

# Create the revenue table.
# Each row references a country, a company, an agency and a project.
cursor.execute('''
CREATE TABLE IF NOT EXISTS revenue (
    revenue_id TEXT PRIMARY KEY,
    country_iso2 TEXT REFERENCES countries(iso2),
    year TEXT,
    company_id TEXT REFERENCES companies(id),
    revenue_stream TEXT,
    agency_id TEXT REFERENCES agencies(id),
    levied_by_project TEXT,
    reported_by_project TEXT,
    project_id TEXT REFERENCES projects(id),
    original_value REAL,
    original_currency TEXT,
    revenues_USD REAL,
    payment_in_kind TEXT,
    in_kind_volume TEXT,
    in_kind_volume_unit TEXT,
    comments TEXT,
    gfs_code TEXT,
    gfs_code2 TEXT,
    gfs_level_1 TEXT,
    gfs_level_2 TEXT,
    gfs_level_3 TEXT,
    gfs_level_4 TEXT,
    gfs_level_5 TEXT,
    gfs_level_6 TEXT
);
''')

# Commit the table definitions to the database
conn.commit()

We then insert the fake data for all the other tables into the tables we created

In [12]:
# Load the generated frames into their tables. Order matters: companies,
# agencies and projects are referenced by the two tables that follow them.
for _frame, _table_name in (
    (df_companies, 'companies'),
    (df_agencies, 'agencies'),
    (df_projects, 'projects'),
    (df_project_companies, 'project_companies'),
):
    _frame.to_sql(_table_name, conn, if_exists='append', index=False)

# Revenue goes last because it references all of the tables above. It is kept
# as the cell's final expression so the number of inserted rows is displayed.
df_revenue.to_sql('revenue', conn, if_exists='append', index=False)

50000

Lastly, we close the connection to the database.

In [13]:
# Flush any transaction that may still be open, then release the connection.
# sqlite3 does not commit pending work on close(), so commit explicitly first.
conn.commit()
conn.close()