In [1]:
import sqlite3
import pandas as pd
from faker import Faker
import random
import string
from collections import defaultdict

In [2]:
# Seed every random source used by the generators so the data is reproducible.
# Faker.seed() only seeds Faker's own generator; the generate_* functions also
# draw from the stdlib `random` module, which has to be seeded separately.
SEED = 4321
fake = Faker()
Faker.seed(SEED)
random.seed(SEED)

# Number of records to generate
num_companies = 1000
num_agencies = 500
num_projects = 2000

# List of EITI participating countries (two-letter country codes)
countries = ['AF', 'AL', 'AZ', 'BF', 'BI', 'CD', 'CI', 'CM', 'CO', 'GH', 'GN', 'GQ', 'ID', 'IN', 'IQ', 'KG', 'LR', 'MA', 'MG', 'MN', 'MZ', 'NG', 'NP', 'PE', 'PG', 'PH', 'RS', 'SL', 'SN', 'TJ', 'TN', 'TZ', 'UA', 'UG', 'UZ', 'YE', 'ZM']

# Company and agency types
company_types = ['Private', 'Listed', 'Public corporations & state-owned entreprises', 'Other']
agency_types = ['Local government', 'Central government', 'State government', 'Public corporations & state-owned entreprises', 'other']

# Project statuses and units
project_statuses = ['Production', 'Development', 'Exploration', 'Other', 'Not reported']
project_units = ['tonnes', 'barrels', 'grams']

# EITI contributing companies and small companies (used as company labels)
EITI_contrib_companies = ['Africa Oil Corp', 'Alcoa', 'AMG', 'Anglo American', 'AngloGold Ashanti', 'Arcelor Mittal', 'Barrick Gold', 'Base Titanium', 'BHP', 'BP', 'Capricorn Energy PLC', 'Centerra Gold', 'Chevron', 'Eni', 'Equinor', 'Eramet', 'ExxonMobil', 'Freeport-McMoRan', 'Glencore', 'Gold Fields', 'Gunvor Group', 'Hess Corporation', 'Inpex Corporation', 'Ivanhoe Mines', 'JX Nippon Mining & Metals', 'Kinross Gold', 'Kosmos Energy', 'Lundin Foundation', 'Minsur', 'MMG', 'Neptune Energy', 'Newcrest Mining', 'Newmont', 'NNPC', 'Orano Mining', 'Petronor E&P', 'Poderosa', 'Pt Pertamina', 'Qatar Energy', 'Repsol Group', 'Rio Tinto', 'Royal Dutch Shell', 'Santos Limited', 'Savannah Energy', 'Shell plc', 'Sibanye Stillwater', 'South 32', 'Southern Copper', 'St Barbara Limited', 'Staatsolie Maatschappij Suriname N.V.', 'Sumitomo Metal Mining', 'Teck', 'TotalEnergies SE', 'Trafigura', 'Tullow oil', 'Vale', 'Woodside']
small_companies = ['ORYX ENERGIES', 'Petra Quarries Ltd', 'Platinum Asphalt & Crushing Company Limited', 'Mothercat Limited Nig. Ltd.', 'Moulds NIG. Ltd', 'MTP SA', 'Mufkad mines & Invest Ltd']

# Possible commodities
commodities = ['gold', 'silver', 'diamonds', 'oil', 'coal', 'iron', 'aluminium', 'copper', 'zinc', 'lead']

# Stock exchanges
stock_exchanges = ['NYSE', 'NASDAQ', 'LSE', 'Euronext', 'SSE', 'HKEX', 'TSX', 'SZSE', 'Deutsche Börse', 'ASX', 'JSE', 'BSE', 'NSE']

# Sectors
sectors = ['Oil and gas', 'Mining', 'Energy', 'Metals', 'Chemicals']

# Agency names
agency_names = ["Oil Resource Management Agency", "Extractives Industry Regulation Directorate", "Land Management Office", "Ministry of Finances", "National Mining Agency"]

In [3]:
def generate_companies(num_companies):
    """Build a DataFrame of synthetic company records.

    Each record is randomly either a "small" company (label drawn from
    ``small_companies``, id made of a zero prefix, a country code and a random
    six-digit number) or a contributing company (label drawn from
    ``EITI_contrib_companies``, id produced by ``fake.bothify``).

    Parameters
    ----------
    num_companies : int
        Number of rows to generate. ``0`` yields an empty frame that still
        carries the expected columns.

    Returns
    -------
    pandas.DataFrame
        One row per company with the columns ``id``, ``label``, ``self``,
        ``type``, ``identification``, ``sector``, ``company_type``,
        ``stock_exchange_listing``, ``audited_financial_state``,
        ``summary_data_label``, ``summary_data_iso2`` and
        ``summary_data_year``. Values in ``id`` are unique within the frame.
    """
    columns = ['id', 'label', 'self', 'type', 'identification', 'sector', 'company_type', 'stock_exchange_listing', 'audited_financial_state', 'summary_data_label', 'summary_data_iso2', 'summary_data_year']
    data = []
    # `id` becomes the primary key of the companies table, so a repeated random
    # draw must never reach the output.
    seen_ids = set()

    for _ in range(num_companies):
        is_small_company = random.choice([True, False])
        if is_small_company:
            label = random.choice(small_companies)
        else:
            label = random.choice(EITI_contrib_companies)

        # Redraw until the id is unused. Collisions are rare, so this loop
        # almost always runs exactly once.
        while True:
            if is_small_company:
                company_id = '00000000000000000000-' + random.choice(countries) + '-' + str(random.randint(100000,999999)) + '-1'
            else:
                company_id = fake.bothify(text='????????????????????-##-######-#')
            if company_id not in seen_ids:
                break
        seen_ids.add(company_id)

        self_url = 'https://www.eiti.org/' + company_id
        company_type = random.choice(company_types)
        sector = random.choice(sectors)
        # Only listed companies get a stock exchange; everything else stays None.
        stock_exchange_listing = random.choice(stock_exchanges) if company_type == 'Listed' else None
        audited_financial_state = 'https://www.eiti.org/financial_report/' + company_id
        summary_label = random.choice(countries)
        summary_iso2 = summary_label
        summary_year = random.randint(2017, 2021)

        # The id doubles as the `identification` value.
        data.append([company_id, label, self_url, 'company', company_id, sector, company_type, stock_exchange_listing, audited_financial_state, summary_label, summary_iso2, summary_year])

    return pd.DataFrame(data, columns=columns)

# Generate companies data: num_companies synthetic rows, kept in df_companies
# so a later cell can sample company ids and insert the rows into SQLite.
df_companies = generate_companies(num_companies)

In [4]:
def generate_agencies(num_agencies):
    """Build a DataFrame of synthetic government agency records.

    Every agency gets a random country code and a random type ('National' or
    'Local'). A running counter per (country, type) pair is embedded in the
    id, which makes ids unique within the returned frame.

    Parameters
    ----------
    num_agencies : int
        Number of rows to generate. ``0`` yields an empty frame that still
        carries the expected columns.

    Returns
    -------
    pandas.DataFrame
        One row per agency with the columns ``id``, ``label``, ``self``,
        ``type``, ``identification``, ``agency_type``, ``summary_data_label``,
        ``summary_data_iso2`` and ``summary_data_year``.
    """
    data = []
    counter_by_country = defaultdict(int)

    for _ in range(num_agencies):
        country_code = random.choice(countries)
        agency_type = random.choice(['National', 'Local'])

        counter_by_country[(country_code, agency_type)] += 1
        counter = counter_by_country[(country_code, agency_type)]

        # Unique by construction: country + first letter of the type + the
        # zero-padded per-(country, type) counter.
        agency_id = f"{country_code}-{agency_type[0]}-{str(counter).zfill(6)}"

        # The counter restarts for every agency type, so a URL built from the
        # country and counter alone would be shared by the n-th National and
        # the n-th Local agency of a country. Deriving it from the id keeps
        # the resource link unique as well.
        self_url = f"https://eiti.org/agency/{agency_id}"
        label = random.choice(agency_names)
        summary_label = random.choice(countries)
        summary_iso2 = summary_label
        summary_year = random.randint(2017, 2021)

        # The id doubles as the `identification` value.
        data.append([agency_id, label, self_url, 'agency', agency_id, agency_type, summary_label, summary_iso2, summary_year])

    return pd.DataFrame(data, columns=['id', 'label', 'self', 'type', 'identification', 'agency_type', 'summary_data_label', 'summary_data_iso2', 'summary_data_year'])

# Generate agencies data
df_agencies = generate_agencies(num_agencies)  # kept for the SQLite insert in a later cell

In [5]:
def generate_projects(num_projects):
    """Create a DataFrame of synthetic extractive-project records.

    Ids are consecutive integers rendered as strings, starting at "100000".
    All other fields are random picks from the module-level lists or random
    integers.

    Parameters
    ----------
    num_projects : int
        How many project rows to produce.

    Returns
    -------
    pandas.DataFrame
        One row per project, columns in the order given by ``column_names``
        below.
    """
    column_names = [
        'id', 'label', 'self', 'type',
        'project_legal_agreement', 'project_affiliated_companies_start',
        'project_commodities', 'project_status',
        'project_production_volume', 'project_unit', 'project_production',
        'summary_data_label', 'summary_data_iso2', 'summary_data_year',
    ]
    first_project_id = 100000
    rows = []

    for offset in range(num_projects):
        # Sequential ids: first_project_id, first_project_id + 1, ...
        project_id = str(first_project_id + offset)

        # Both label candidates are always built; one of them is then picked.
        permit_label = f"Subsoil use special permit No.{random.randint(1000,9999)}"
        field_label = fake.bothify(text='???? Fields')
        label = random.choice([permit_label, field_label])

        # One to three distinct companies, then one to three distinct commodities.
        company_count = random.randint(1,3)
        affiliated_companies = ', '.join(random.sample(EITI_contrib_companies, k=company_count))
        commodity_count = random.randint(1,3)
        commodity_list = ', '.join(random.sample(commodities, k=commodity_count))

        status = random.choice(project_statuses)
        production_volume = str(random.randint(1000, 1000000))
        unit = random.choice(project_units)
        production = str(random.randint(1000, 1000000))

        # The country code is stored as both the summary label and the ISO2 code.
        country = random.choice(countries)
        year = random.randint(2017, 2021)

        rows.append([
            project_id,
            label,
            f"https://www.eiti.org/{project_id}",
            'project',
            f"https://www.eiti.org/legal_agreement/{project_id}",
            affiliated_companies,
            commodity_list,
            status,
            production_volume,
            unit,
            production,
            country,
            country,
            year,
        ])

    return pd.DataFrame(rows, columns=column_names)

# Generate projects data
# NOTE: this rebinds num_projects, replacing the 2000 assigned in the
# configuration cell; only 50 projects are generated from here on.
num_projects = 50  # Or any number you like
df_projects = generate_projects(num_projects)


In [6]:
def generate_project_companies(num_relations):
    """Build a DataFrame linking random projects to random companies.

    Ids are sampled from the module-level ``df_projects`` and ``df_companies``
    frames. Each (project_id, company_id) pair appears at most once, because
    the project_companies table uses that pair as its primary key and a
    repeated pair would make the insert fail.

    Parameters
    ----------
    num_relations : int
        Number of distinct pairs to generate. Zero or a negative number
        yields an empty frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``project_id`` and ``company_id``, one row per relation.

    Raises
    ------
    ValueError
        If ``num_relations`` exceeds the number of distinct pairs that the
        available project and company ids can form.
    """
    # Plain lists keep random.choice purely positional, independent of the
    # frames' index labels.
    project_ids = df_projects['id'].tolist()
    company_ids = df_companies['id'].tolist()

    max_pairs = len(set(project_ids)) * len(set(company_ids))
    if num_relations > max_pairs:
        raise ValueError(
            f"Cannot generate {num_relations} unique project/company relations: "
            f"only {max_pairs} distinct pairs are possible"
        )

    data = []
    seen_pairs = set()
    # Rejection sampling: draw a pair and discard it if it was already used.
    while len(data) < num_relations:
        pair = (random.choice(project_ids), random.choice(company_ids))
        if pair in seen_pairs:
            continue
        seen_pairs.add(pair)
        data.append(list(pair))

    return pd.DataFrame(data, columns=['project_id', 'company_id'])

# Number of project/company links to generate
num_relations = 50

# Generate project_companies data. The rows are inserted further down into the
# project_companies table, whose primary key is (project_id, company_id).
df_project_companies = generate_project_companies(num_relations)

In [7]:
import sqlite3

# Connect to the SQLite database file (it will be created if it doesn't exist).
# The path is relative, so the file is looked up in the current working directory.
conn = sqlite3.connect('EITI_Database.db')

# Cursor used by the schema cell below to run the DDL statements
cursor = conn.cursor()

In [8]:
# Create the companies table (no-op when it already exists)
cursor.execute('''
CREATE TABLE IF NOT EXISTS companies (
    id TEXT PRIMARY KEY,
    label TEXT,
    self TEXT,
    type TEXT,
    identification TEXT,
    sector TEXT,
    company_type TEXT,
    stock_exchange_listing TEXT,
    audited_financial_state TEXT,
    summary_data_label TEXT,
    summary_data_iso2 TEXT,
    summary_data_year INTEGER
)
''')

# Create the agencies table
cursor.execute('''
CREATE TABLE IF NOT EXISTS agencies (
    id TEXT PRIMARY KEY,
    label TEXT,
    self TEXT,
    type TEXT,
    identification TEXT,
    agency_type TEXT,
    summary_data_label TEXT,
    summary_data_iso2 TEXT,
    summary_data_year INTEGER
)
''')

# Create the projects table
cursor.execute('''
CREATE TABLE IF NOT EXISTS projects (
    id TEXT PRIMARY KEY,
    label TEXT,
    self TEXT,
    type TEXT,
    project_legal_agreement TEXT,
    project_affiliated_companies_start TEXT,
    project_commodities TEXT,
    project_status TEXT,
    project_production_volume TEXT,
    project_unit TEXT,
    project_production TEXT,
    summary_data_label TEXT,
    summary_data_iso2 TEXT,
    summary_data_year INTEGER
)
''')

# Create the project_companies link table; a pair may appear only once
cursor.execute('''
CREATE TABLE IF NOT EXISTS project_companies (
    project_id TEXT,
    company_id TEXT,
    PRIMARY KEY (project_id, company_id)
)
''')

# Empty every table before the inserts below. The tables survive between runs
# (CREATE TABLE IF NOT EXISTS) and the inserts append, so rows left over from a
# previous run would collide with the regenerated primary keys. Project ids
# restart at 100000 and agency ids restart their counters on every run.
cursor.execute("DELETE FROM project_companies")
cursor.execute("DELETE FROM projects")
cursor.execute("DELETE FROM agencies")
cursor.execute("DELETE FROM companies")

# Commit the schema changes and the deletions
conn.commit()

In [9]:
# Load each generated frame into its table. Rows are appended to the existing
# tables and the DataFrame index is not written as a column.

# companies
df_companies.to_sql(
    name='companies',
    con=conn,
    if_exists='append',
    index=False,
)

# agencies
df_agencies.to_sql(
    name='agencies',
    con=conn,
    if_exists='append',
    index=False,
)

# projects
df_projects.to_sql(
    name='projects',
    con=conn,
    if_exists='append',
    index=False,
)

# project/company links; kept as the cell's last expression, so its return
# value is what the notebook displays
df_project_companies.to_sql(
    name='project_companies',
    con=conn,
    if_exists='append',
    index=False,
)


50

In [10]:
# Close the connection to the database. `conn` (and the `cursor` created from
# it) cannot be used after this; re-run the connection cell to reopen it.
conn.close()