# Ideal and Actual tables

In [1]:
# import libraries and data

import pandas as pd
import numpy as np
from os import path
from functools import reduce
from pprint import pprint
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from itertools import combinations

file_dir = "data/consolidated/"
file_dir_old = "data/consolidated/backup/old"

# File names of the consolidated tables, in the order used throughout the notebook.
csv_names = [
    "Part 1 - About.csv",
    "Part 3 - Reporting companies' list.csv",
    "Part 3 - Reporting government entities list.csv",
    "Part 3 - Reporting projects' list.csv",
    "Part 4 - Government revenues.csv",
    "Part 5 - Company data.csv",
]

# Load every csv into a data frame, keyed by its file name.
# (Part 5 alternative left by the original author: pd.read_csv(..., low_memory=False).)
df_dict = {csv_name: pd.read_csv(path.join(file_dir, csv_name)) for csv_name in csv_names}
df_list = list(df_dict.values())
df_part_1, df_part_3a, df_part_3b, df_part_3c, df_part_4, df_part_5 = df_list

# OPTIONAL COLUMNS
part_3a_opt = [
    "Stock exchange listing or company website",
    "Audited financial statement (or balance sheet, cash flows, profit/loss statement if unavailable)",
]
part_3b_opt = ["ID number (if applicable)"]
part_5_opt = ["In-kind volume (if applicable)", "Unit (if applicable)", "Comments"]

# Independent copies that keep only the non-optional fields.
# Parts 1, 3c and 4 have no optional columns listed above, so they are plain copies.
df_part_1_non_opt = df_part_1.copy()
df_part_3a_non_opt = df_part_3a.drop(columns=part_3a_opt).copy()
df_part_3b_non_opt = df_part_3b.drop(columns=part_3b_opt).copy()
df_part_3c_non_opt = df_part_3c.copy()
df_part_4_non_opt = df_part_4.copy()
df_part_5_non_opt = df_part_5.drop(columns=part_5_opt).copy()

df_list_non_opt = [
    df_part_1_non_opt,
    df_part_3a_non_opt,
    df_part_3b_non_opt,
    df_part_3c_non_opt,
    df_part_4_non_opt,
    df_part_5_non_opt,
]
df_dict_non_opt = dict(zip(csv_names, df_list_non_opt))

In [3]:
def compare_tables_drop_duplicates(df1, df2, common_columns_df1, common_columns_df2):
    """
    Compare two tables on a set of key columns, keeping one row per key.

    Rows are matched when the values in ``common_columns_df1`` (taken from
    ``df1``) equal the values in ``common_columns_df2`` (taken from ``df2``),
    position by position. Within each table only the first row of every key
    combination is kept.

    Parameters:
    - df1 (pandas.DataFrame): The first DataFrame ("table 1").
    - df2 (pandas.DataFrame): The second DataFrame ("table 2").
    - common_columns_df1 (list): Key columns of df1.
    - common_columns_df2 (list): Key columns of df2, in the same order as
      common_columns_df1.

    Returns:
    - dict with three DataFrames:
        - "in table 1 but not in table 2": rows of df1 (all df1 columns,
          original index labels) whose key does not occur in df2.
        - "in table 2 but not in table 1": rows of df2 (all df2 columns,
          original index labels) whose key does not occur in df1.
        - "in both tables": one merged row per shared key, built from the
          first matching row of each table. Non-key columns present in both
          tables get the default pandas suffixes ``_x`` / ``_y``. The rows
          carry a fresh 0..k-1 index.

    Example:
    >>> left = pd.DataFrame({'Full company name': ['A', 'B', 'B'], 'Year': [2020, 2021, 2021]})
    >>> right = pd.DataFrame({'Company': ['A', 'C'], 'Year': [2020, 2022]})
    >>> result = compare_tables_drop_duplicates(left, right, ['Full company name', 'Year'], ['Company', 'Year'])
    >>> {label: len(rows) for label, rows in result.items()}
    {'in table 1 but not in table 2': 1, 'in table 2 but not in table 1': 1, 'in both tables': 1}
    """

    # Keep the first row per key on each side *before* merging. The merge is
    # then one-to-one, so it never materialises every pairing of duplicated
    # keys only to discard them again.
    first_per_key_df1 = df1.drop_duplicates(subset=common_columns_df1)
    first_per_key_df2 = df2.drop_duplicates(subset=common_columns_df2)

    # Rows whose key occurs in both tables.
    common_rows = pd.merge(
        first_per_key_df1,
        first_per_key_df2,
        left_on=common_columns_df1,
        right_on=common_columns_df2,
        how='inner',
    )

    # Test key membership on the source tables rather than on the merged
    # frame: a key column that is suffixed by the merge (because the other
    # table has a non-key column of the same name) can no longer be looked up
    # by its original name there.
    keys_df1 = first_per_key_df1.set_index(common_columns_df1).index
    keys_df2 = first_per_key_df2.set_index(common_columns_df2).index

    unique_rows_df1 = first_per_key_df1[~keys_df1.isin(keys_df2)]
    unique_rows_df2 = first_per_key_df2[~keys_df2.isin(keys_df1)]

    return {"in table 1 but not in table 2": unique_rows_df1,
            "in table 2 but not in table 1": unique_rows_df2,
            "in both tables": common_rows}


## Part 3a - Reporting companies and Part 5 - Company data

Logic
- Both Part 3a and Part 5 should contain the same companies
- Compute how many rows are common to both tables and how many appear in only one of them
- Create a complete Part 5 by adding the missing companies from Part 3a
- Get the ideal Part 3a (list of companies) from the complete Part 5

Dataframes
- companies_3a_actual = list of companies in part 3a
- companies_5_actual = list of companies in part 5
- companies_3a_missing = list of companies in part 5 but not in part 3a
- companies_5_missing = list of companies in part 3a but not in part 5
- companies_5_actual_complete = companies_5_actual + companies_5_missing
- companies_3a_ideal = list of companies in companies_5_actual_complete

Outcomes
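- 1 company name is NaN: Part 5 contains rows with a missing Company value (Afghanistan, 2019), which show up as a single row after duplicates are dropped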

In [8]:
# Build the "complete" Part 5 and the "ideal" Part 3a.
#
# Upper-case the name columns so the comparison between Part 3a and Part 5 is
# not affected by differences in letter case.
df_part_5_allcaps = df_part_5.copy()
df_part_5_allcaps["Company"] = df_part_5_allcaps["Company"].str.upper()
df_part_5_allcaps["Government entity"] = df_part_5_allcaps["Government entity"].str.upper()
df_part_5_allcaps["Project name"] = df_part_5_allcaps["Project name"].str.upper()
df_part_3a_allcaps = df_part_3a.copy()
df_part_3a_allcaps["Full company name"] = df_part_3a_allcaps["Full company name"].str.upper()

# Key columns, position by position: Part 3a's name column corresponds to
# Part 5's "Company".
common_columns_3a5 = ["Full company name", "Country", "Year"]
common_columns_53a = ["Company", "Country", "Year"]


# Table 1 = Part 3a, table 2 = Part 5.
compare_3a5_allcaps = compare_tables_drop_duplicates(df_part_3a_allcaps, df_part_5_allcaps, common_columns_3a5, common_columns_53a)

print("Duplicate rows removed")
for key, data in compare_3a5_allcaps.items():
    print(f'{key}: {data.shape[0]} rows')

# Missing from Part 3a = present in Part 5 only; missing from Part 5 = present
# in Part 3a only. "Unformatted" because the rows still have their source
# table's columns.
companies_3a_missing_unformatted = compare_3a5_allcaps["in table 2 but not in table 1"]
companies_5_missing_unformatted = compare_3a5_allcaps["in table 1 but not in table 2"]

# display(df_part_5_allcaps)
# display(companies_5_missing_unformatted)

# c2k = "columns to keep": the Part 3a columns that are carried over into
# Part 5, renamed below to the matching Part 5 column names.
c2k = ["Full company name", "Payments to Governments Report", "Country", "ISO Code", "Year", "Start Date", "End Date"]
companies_5_missing = companies_5_missing_unformatted[c2k].copy()
companies_5_missing = companies_5_missing.rename(columns={"Full company name": "Company", "Payments to Governments Report": "Revenue value"})

# Complete Part 5 = actual Part 5 rows followed by the companies found only in
# Part 3a. Part 5 columns that are not in c2k are left empty for the added rows.
companies_5_actual_complete = pd.concat([df_part_5_allcaps, companies_5_missing], ignore_index=True)

# Row-count check: the last two numbers printed should be equal.
print(f"# part 5 rows: {df_part_5_allcaps.shape[0]} \n# missing companies in part 5: {companies_5_missing.shape[0]} \n# of ideal rows in part 5: {df_part_5_allcaps.shape[0] + companies_5_missing.shape[0]} \n# of rows in updated part 5: {companies_5_actual_complete.shape[0]}")

# display(companies_5_missing)
# display(companies_5_actual_complete)

# Values that cannot be parsed as numbers become NaN (errors="coerce").
companies_5_actual_complete["Revenue value"] = pd.to_numeric(companies_5_actual_complete["Revenue value"], errors="coerce")
companies_5_actual_complete.to_csv("data/outputs/companies_5_actual_complete.csv", index=False)
# Total revenue per company and reporting period (shown for inspection only).
pivot_table = companies_5_actual_complete.pivot_table(index=["Company", "Country", "Year", "ISO Code", "Start Date", "End Date"], aggfunc={"Revenue value": "sum"})

display(pivot_table)

# Group the data by Country and Year
grouped_data = companies_5_actual_complete.groupby(['Country', 'Year'])

pivot_tables = []

# Iterate over each group and create a pivot table based on Company
# (sum of "Revenue value" per company), tagging each with its Country and Year.
for (country, year), group_df in grouped_data:
    pivot_table = group_df.pivot_table(index='Company', aggfunc={"Revenue value": "sum"}) 
    pivot_table['Country'] = country
    pivot_table['Year'] = year
    pivot_tables.append(pivot_table)

# Stack the per-group tables; reset_index turns "Company" back into a column.
result_df = pd.concat(pivot_tables).reset_index()

# Rename back to the Part 3a column names.
result_df = result_df.rename(columns={"Company": "Full company name", "Revenue value": "Payments to Governments Report"})

result_df.to_csv("data/outputs/companies_3a_ideal.csv", index=False)

# Ideal Part 3a: one row per company, country and year with its summed payments.
companies_3a_ideal = result_df.copy()

display(companies_3a_ideal)

Duplicate rows removed
in table 1 but not in table 2: 317 rows
in table 2 but not in table 1: 140 rows
in both tables: 3469 rows
# part 5 rows: 31882 
# missing companies in part 5: 317 
# of ideal rows in part 5: 32199 
# of rows in updated part 5: 32199


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Revenue value
Company,Country,Year,ISO Code,Start Date,End Date,Unnamed: 6_level_1
1 1 1,Myanmar,2018,MMR,4/1/2017,3/31/2018,2.503175e+09
1/1999 NO. 35/4 – AUTHORIZED ENTITY HALS-K PRJSC (31566427),Ukraine,2019,UKR,2019-01-01,2019-12-31,1.391783e+08
11 HERONJTE BATER,Albania,2017,ALB,2017-01-01,2017-12-31,1.521589e+07
11 HERONJTE BATER,Albania,2018,ALB,2018-01-01,2018-12-31,1.375198e+07
24 HOUR MINING & INDUSTRY COMPANY LIMITED,Myanmar,2018,MMR,4/1/2017,3/31/2018,7.548560e+07
...,...,...,...,...,...,...
“VAYK GOLD” LLC,Armenia,2018,ARM,2018-01-01,2018-12-31,0.000000e+00
“VAYK GOLD” LLC,Armenia,2019,ARM,2019-01-01,2019-12-31,1.226506e+07
“ZANGEZUR COPPER-MOLYBDENUM COMBINE” CJSC,Armenia,2018,ARM,2018-01-01,2018-12-31,4.286911e+10
“ZANGEZUR COPPER-MOLYBDENUM COMBINE” CJSC,Armenia,2019,ARM,2019-01-01,2019-12-31,4.650406e+10


Unnamed: 0,Full company name,Payments to Governments Report,Country,Year
0,ABAAN RAYAN LIMITED,1.025688e+07,Afghanistan,2018
1,ABAS GHAZNAVI LIMITED,3.624451e+07,Afghanistan,2018
2,ABDUL FATAH,1.394220e+05,Afghanistan,2018
3,ABDUL RAOUF,5.092500e+04,Afghanistan,2018
4,ABDUL WAHAB ARIOBWAL LIMITED,4.588710e+05,Afghanistan,2018
...,...,...,...,...
3920,MAAMBA COLLIERIES LIMITED,4.703749e+08,Zambia,2019
3921,MOPANI COPPER MINES PLC,5.143412e+08,Zambia,2019
3922,NFC AFRICA MINING PLC,2.559570e+08,Zambia,2019
3923,SINO METALS,1.007475e+08,Zambia,2019


In [11]:
# Sanity check: the ideal Part 3a (table 1) and the complete Part 5 (table 2)
# should contain the same (company, country, year) keys.
compare_3aideal_5comp_allcaps = compare_tables_drop_duplicates(companies_3a_ideal, companies_5_actual_complete, common_columns_3a5, common_columns_53a)

print("Duplicate rows removed")
for key, data in compare_3aideal_5comp_allcaps.items():
    print(f'{key}: {data.shape[0]} rows')

# Show any Part 5 keys that did not make it into the ideal Part 3a.
compare_3aideal_5comp_allcaps["in table 2 but not in table 1"]

Duplicate rows removed
in table 1 but not in table 2: 0 rows
in table 2 but not in table 1: 1 rows
in both tables: 3925 rows


Unnamed: 0,Company,Government entity,Revenue stream name,Levied on project (Y/N),Reported by project (Y/N),Project name,Reporting currency,Revenue value,Payment made in-kind (Y/N),In-kind volume (if applicable),Unit (if applicable),Comments,Country,ISO Code,Year,Start Date,End Date
3193,,MINISTRY OF MINES AND PETROLEUM (REVENUE DEPAR...,Royalties,Yes,Yes,SSML-KABU 3/2016,AFN,28023.0,Not applicable,Not applicable,Not applicable,,Afghanistan,AFG,2019,2018-12-21,2019-12-20


In [21]:
# List the Part 5 rows that have no company name (null "Company").
# df_part_3a_allcaps["Full company name"].isna().sum()
df_part_5_allcaps[df_part_5_allcaps["Company"].isnull()]

Unnamed: 0,Company,Government entity,Revenue stream name,Levied on project (Y/N),Reported by project (Y/N),Project name,Reporting currency,Revenue value,Payment made in-kind (Y/N),In-kind volume (if applicable),Unit (if applicable),Comments,Country,ISO Code,Year,Start Date,End Date
3193,,MINISTRY OF MINES AND PETROLEUM (REVENUE DEPAR...,Royalties,Yes,Yes,SSML-KABU 3/2016,AFN,28023,Not applicable,Not applicable,Not applicable,,Afghanistan,AFG,2019,2018-12-21,2019-12-20
3194,,MINISTRY OF MINES AND PETROLEUM (REVENUE DEPAR...,Royalties,Yes,Yes,SSML-KABU 2/2014,AFN,15040,Not applicable,Not applicable,Not applicable,,Afghanistan,AFG,2019,2018-12-21,2019-12-20
3195,,MINISTRY OF MINES AND PETROLEUM (REVENUE DEPAR...,Penalty Fee,Yes,Yes,SSML-KABU 2/2014,AFN,14334,Not applicable,Not applicable,Not applicable,,Afghanistan,AFG,2019,2018-12-21,2019-12-20


## Part 3b - Reporting govt agencies and Part 4 - Government revenues

Logic
- Both Part 3b and Part 4 should have the same government agencies
- Create a complete Part 4 by adding the missing agencies from Part 3b
- Get the ideal Part 3b (list of agencies) from the complete Part 4

Dataframes
- agencies_3b_actual = list of agencies in part 3b
- agencies_4_actual = list of agencies in part 4
- agencies_3b_missing = list of agencies in part 4 but not in part 3b
- agencies_4_missing = list of agencies in part 3b but not in part 4
- agencies_4_actual_complete = agencies_4_actual + agencies_4_missing
- agencies_3b_ideal = list of agencies in agencies_4_actual_complete

In [50]:
# Build the "complete" Part 4 and the "ideal" Part 3b.
#
# Upper-case the agency names so the comparison between Part 3b and Part 4 is
# not affected by differences in letter case.
df_part_4_allcaps = df_part_4.copy()
df_part_4_allcaps["Government entity"] = df_part_4_allcaps["Government entity"].str.upper()
df_part_3b_allcaps = df_part_3b.copy()
df_part_3b_allcaps["Full name of agency"] = df_part_3b_allcaps["Full name of agency"].str.upper()

# Key columns, position by position: Part 3b's name column corresponds to
# Part 4's "Government entity".
common_columns_3b4 = ["Full name of agency", "Country", "Year"]
common_columns_43b = ["Government entity", "Country", "Year"]


# Table 1 = Part 3b, table 2 = Part 4.
compare_3b4_allcaps = compare_tables_drop_duplicates(df_part_3b_allcaps, df_part_4_allcaps, common_columns_3b4, common_columns_43b)

print("Duplicate rows removed")
for key, data in compare_3b4_allcaps.items():
    print(f'{key}: {data.shape[0]} rows')

# Missing from Part 3b = present in Part 4 only; missing from Part 4 = present
# in Part 3b only.
agencies_3b_missing_unformatted = compare_3b4_allcaps["in table 2 but not in table 1"]
agencies_4_missing_unformatted = compare_3b4_allcaps["in table 1 but not in table 2"]

# display(agencies_3b_missing_unformatted)
display(agencies_4_missing_unformatted)

# c2ka = "columns to keep" for agencies: the Part 3b columns that are carried
# over into Part 4, renamed below to the matching Part 4 column names.
c2ka = ["Full name of agency", "Total reported", "Country", "ISO Code", "Year", "Start Date", "End Date"]
agencies_4_missing = agencies_4_missing_unformatted[c2ka].copy()
agencies_4_missing = agencies_4_missing.rename(columns={"Full name of agency": "Government entity", "Total reported": "Revenue value"})

# Complete Part 4 = actual Part 4 rows followed by the agencies found only in
# Part 3b. Part 4 columns that are not in c2ka are left empty for the added rows.
agencies_4_actual_complete = pd.concat([df_part_4_allcaps, agencies_4_missing], ignore_index=True)

# display(df_part_4_allcaps)
# display(agencies_4_actual_complete)

# display(agencies_4_missing.sort_values(by="Revenue value"))
# display(agencies_4_actual_complete)

# Values that cannot be parsed as numbers become NaN (errors="coerce").
agencies_4_actual_complete["Revenue value"] = pd.to_numeric(agencies_4_actual_complete["Revenue value"], errors="coerce")
agencies_4_actual_complete.to_csv("data/outputs/agencies_4_actual_complete.csv", index=False)
# Total revenue per agency and reporting period (not displayed; overwritten in
# the loop below).
agencies_pivot_table = agencies_4_actual_complete.pivot_table(index=["Government entity", "Country", "Year", "ISO Code", "Start Date", "End Date"], aggfunc={"Revenue value": "sum"})

# display(agencies_pivot_table)

# Group the data by Country and Year
agencies_grouped_data = agencies_4_actual_complete.groupby(['Country', 'Year'])

agencies_pivot_tables = []

# Iterate over each group and create a pivot table based on Government entity
# (sum of "Revenue value" per agency), tagging each with its Country and Year.
for (country, year), agencies_group_df in agencies_grouped_data:
    agencies_pivot_table = agencies_group_df.pivot_table(index='Government entity', aggfunc={"Revenue value": "sum"}) 
    agencies_pivot_table['Country'] = country
    agencies_pivot_table['Year'] = year
    agencies_pivot_tables.append(agencies_pivot_table)

# Stack the per-group tables; reset_index turns "Government entity" back into
# a column.
agencies_result_df = pd.concat(agencies_pivot_tables).reset_index()

# Rename back to the Part 3b column names.
agencies_result_df = agencies_result_df.rename(columns={"Government entity": "Full name of agency", "Revenue value": "Total reported"})

agencies_result_df.to_csv("data/outputs/agencies_3b_ideal.csv", index=False)

# Ideal Part 3b: one row per agency, country and year with its summed revenue.
agencies_3b_ideal = agencies_result_df.copy()

display(agencies_3b_ideal)

Duplicate rows removed
in table 1 but not in table 2: 80 rows
in table 2 but not in table 1: 8 rows
in both tables: 463 rows


Unnamed: 0,Full name of agency,Agency type,ID number (if applicable),Total reported,Country,ISO Code,Year,Start Date,End Date
4,MINISTRY OF INDUSTRY AND COMMERCE,Central goverment,Not applicable,,Afghanistan,AFG,2018,2017/12/21,2018/12/20
9,MINISTRY OF INDUSTRY AND COMMERCE,Central goverment,Not applicable,,Afghanistan,AFG,2019,2018/12/21,2019/12/20
14,DIRECTION DES PARTICIPATIONS ET DE LA PRIVATIS...,Central goverment,No applicable,,Cote d'Ivoire,CIV,2017,2017/01/01,2017/12/31
18,OTHERS,Central goverment,No applicable,,Cote d'Ivoire,CIV,2017,2017/01/01,2017/12/31
23,DIRECTION DES PARTICIPATIONS ET DE LA PRIVATIS...,Central goverment,Non applicable,,Cote d'Ivoire,CIV,2018,2018/01/01,2018/12/31
...,...,...,...,...,...,...,...,...,...
535,COMMUNE DE KOUDALWA,Local government,Non applicable,,Chad,TCD,2017,2017/01/01,2017/12/31
540,DIRECTION GÉNÉRALE DES SERVICES DE DOUANES ET ...,Central goverment,Non applicable,,Chad,TCD,2018,2018/01/01,2018/12/31
544,MINISTÈRE DES FINANCES,Central goverment,Non applicable,,Chad,TCD,2018,2018/01/01,2018/12/31
545,COMMUNE DE DOBA,Local government,Non applicable,,Chad,TCD,2018,2018/01/01,2018/12/31


Unnamed: 0,Full name of agency,Total reported,Country,Year
0,MINISTRY OF FINANCE (CUSTOMS DEPARTMENT),1.459912e+09,Afghanistan,2018
1,MINISTRY OF FINANCE (REVENUE DEPARTMENT),1.089699e+09,Afghanistan,2018
2,MINISTRY OF INDUSTRY AND COMMERCE,0.000000e+00,Afghanistan,2018
3,MINISTRY OF MINES AND PETROLEUM (REVENUE DEPAR...,2.285251e+09,Afghanistan,2018
4,NATIONAL ENVIRONMENTAL PROTECTION AGENCY,4.700000e+04,Afghanistan,2018
...,...,...,...,...
546,MINISTRY OF LANDS,9.827895e+05,Zambia,2019
547,MINISTRY OF MINES AND MINERALS DEVELOPMENT,4.128300e+07,Zambia,2019
548,MINISTRY OF MINES AND MINERALS DEVELOPMENT - P...,3.051347e+06,Zambia,2019
549,ZAMBIAN REVENUE AUTHORITY (ZRA),1.386263e+10,Zambia,2019


In [53]:
# Sanity check: the ideal Part 3b (table 1) and the complete Part 4 (table 2)
# should contain the same (agency, country, year) keys.
compare_3bideal_4comp_allcaps = compare_tables_drop_duplicates(agencies_3b_ideal, agencies_4_actual_complete, common_columns_3b4, common_columns_43b)

print("Duplicate rows removed")
for key, data in compare_3bideal_4comp_allcaps.items():
    print(f'{key}: {data.shape[0]} rows')

# Show any Part 4 keys that did not make it into the ideal Part 3b.
compare_3bideal_4comp_allcaps["in table 2 but not in table 1"]

Duplicate rows removed
in table 1 but not in table 2: 0 rows
in table 2 but not in table 1: 0 rows
in both tables: 551 rows


Unnamed: 0,GFS Classification,Sector,Revenue stream name,Government entity,Revenue value,Currency,Country,ISO Code,Year,Start Date,End Date


## Part 3c - Reporting projects and Part 5 - Company data

- projects_3c_actual = list of projects in part 3c
- projects_5_actual = list of projects in part 5
- projects_3c_missing = list of projects in part 5 but not in part 3c
- projects_5_missing = list of projects in part 3c but not in part 5
- projects_5_actual_complete = projects_5_actual + projects_5_missing
- projects_3c_ideal = list of projects in projects_5_actual_complete

In [None]:
# Build the "complete" Part 5 (by project) and the "ideal" Part 3c.
#
# This cell uses project-specific names and output files so that it no longer
# overwrites the company results (companies_5_missing, companies_5_actual_complete,
# companies_5_actual_complete.csv, companies_3a_ideal.csv) produced by the
# Part 3a comparison.
#
# df_part_5_allcaps (Part 5 with upper-cased Company / Government entity /
# Project name) is reused from the Part 3a comparison cell, so only Part 3c is
# normalised here.
df_part_3c_allcaps = df_part_3c.copy()
df_part_3c_allcaps["Full project name"] = df_part_3c_allcaps["Full project name"].str.upper()

# Key columns, position by position: Part 3c's name column corresponds to
# Part 5's "Project name".
common_columns_3c5 = ["Full project name", "Country", "Year"]
common_columns_53c = ["Project name", "Country", "Year"]

# Table 1 = Part 3c, table 2 = Part 5.
compare_3c5_allcaps = compare_tables_drop_duplicates(df_part_3c_allcaps, df_part_5_allcaps, common_columns_3c5, common_columns_53c)

print("Duplicate rows removed")
for key, data in compare_3c5_allcaps.items():
    print(f'{key}: {data.shape[0]} rows')

# Missing from Part 3c = present in Part 5 only; missing from Part 5 = present
# in Part 3c only.
projects_3c_missing_unformatted = compare_3c5_allcaps["in table 2 but not in table 1"]
projects_5_missing_unformatted = compare_3c5_allcaps["in table 1 but not in table 2"]

# display(df_part_5_allcaps)
# display(projects_5_missing_unformatted)

# c2kc = "columns to keep" for projects, renamed below to the matching Part 5
# column names.
# NOTE(review): this list was carried over from the Part 3a cell; confirm that
# Part 3c really has a "Payments to Governments Report" column, otherwise the
# selection below raises KeyError and the value column must be adjusted.
c2kc = ["Full project name", "Payments to Governments Report", "Country", "ISO Code", "Year", "Start Date", "End Date"]
projects_5_missing = projects_5_missing_unformatted[c2kc].copy()
projects_5_missing = projects_5_missing.rename(columns={"Full project name": "Project name", "Payments to Governments Report": "Revenue value"})

# Complete Part 5 (by project) = actual Part 5 rows followed by the projects
# found only in Part 3c.
projects_5_actual_complete = pd.concat([df_part_5_allcaps, projects_5_missing], ignore_index=True)

# Row-count check: the last two numbers printed should be equal.
print(f"# part 5 rows: {df_part_5_allcaps.shape[0]} \n# missing projects in part 5: {projects_5_missing.shape[0]} \n# of ideal rows in part 5: {df_part_5_allcaps.shape[0] + projects_5_missing.shape[0]} \n# of rows in updated part 5: {projects_5_actual_complete.shape[0]}")

# display(projects_5_missing)
# display(projects_5_actual_complete)

# Values that cannot be parsed as numbers become NaN (errors="coerce").
projects_5_actual_complete["Revenue value"] = pd.to_numeric(projects_5_actual_complete["Revenue value"], errors="coerce")
projects_5_actual_complete.to_csv("data/outputs/projects_5_actual_complete.csv", index=False)
# Total revenue per project and reporting period (shown for inspection only).
projects_pivot_table = projects_5_actual_complete.pivot_table(index=["Project name", "Country", "Year", "ISO Code", "Start Date", "End Date"], aggfunc={"Revenue value": "sum"})

display(projects_pivot_table)

# Group the data by Country and Year
projects_grouped_data = projects_5_actual_complete.groupby(['Country', 'Year'])

projects_pivot_tables = []

# Iterate over each group and create a pivot table based on Project name
# (sum of "Revenue value" per project), tagging each with its Country and Year.
for (country, year), projects_group_df in projects_grouped_data:
    projects_pivot_table = projects_group_df.pivot_table(index='Project name', aggfunc={"Revenue value": "sum"})
    projects_pivot_table['Country'] = country
    projects_pivot_table['Year'] = year
    projects_pivot_tables.append(projects_pivot_table)

# Stack the per-group tables; reset_index turns "Project name" back into a column.
projects_result_df = pd.concat(projects_pivot_tables).reset_index()

# Rename back to the Part 3c column names used in c2kc above.
projects_result_df = projects_result_df.rename(columns={"Project name": "Full project name", "Revenue value": "Payments to Governments Report"})

projects_result_df.to_csv("data/outputs/projects_3c_ideal.csv", index=False)

# Ideal Part 3c: one row per project, country and year with its summed payments.
projects_3c_ideal = projects_result_df.copy()

display(projects_3c_ideal)