# Ideal tables

In [1]:
# import libraries and data

import pandas as pd
import numpy as np
from os import path
from functools import reduce
from pprint import pprint
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from itertools import combinations

# Input locations. file_dir_old is declared here but not read by any cell visible in this notebook.
file_dir = "data/cleaned/"
file_dir_old = "data/consolidated/backup/old"

# Read the cleaned companies CSV from file_dir into a DataFrame.
companies_5_csv_path = path.join(file_dir, "companies-5-complete-cleaned.csv")
companies_5_complete_cleaned = pd.read_csv(companies_5_csv_path)

In [None]:
# Render the frame loaded from companies-5-complete-cleaned.csv for a visual check.
display(companies_5_complete_cleaned)

In [4]:
# Build the "3a ideal" company table: one row per Company within each
# (Country, Year) group, carrying the summed "Revenue value".
companies_grouped_data = companies_5_complete_cleaned.groupby(['Country', 'Year'])

# One pivot per (Country, Year) group, indexed by Company. The group keys are
# attached back as ordinary columns so the pieces can be stacked afterwards.
companies_pivot_tables = [
    country_year_df
    .pivot_table(index='Company', aggfunc={"Revenue value": "sum"})
    .assign(Country=country_name, Year=report_year)
    for (country_name, report_year), country_year_df in companies_grouped_data
]

# Stack the per-group pivots, move Company from the index into a column and
# switch to the output column names.
companies_result_df = (
    pd.concat(companies_pivot_tables)
    .reset_index()
    .rename(columns={"Company": "Full company name", "Revenue value": "Payments to Governments Report"})
)

companies_result_df.to_csv("data/cleaned/companies_3a_ideal.csv", index=False)

# Independent copy under the table's own name, so later edits to one do not
# touch the other.
companies_3a_ideal = companies_result_df.copy()

display(companies_3a_ideal)

Unnamed: 0,Full company name,Payments to Governments Report,Country,Year
0,ABAAN RAYAN LIMITED,1.025688e+07,Afghanistan,2018
1,ABBAS GHAZNAVI LIMITED,3.624451e+07,Afghanistan,2018
2,ABDUL FATAH,1.394220e+05,Afghanistan,2018
3,ABDUL RAOUF,5.092500e+04,Afghanistan,2018
4,ABDUL WAHAB ARIOBWAL LIMITED,4.588710e+05,Afghanistan,2018
...,...,...,...,...
3848,MAAMBA COLLIERIES LIMITED,4.703749e+08,Zambia,2019
3849,MOPANI COPPER MINES PLC,5.143412e+08,Zambia,2019
3850,NFC AFRICA MINING PLC,2.559570e+08,Zambia,2019
3851,SINO METALS,1.007475e+08,Zambia,2019


In [None]:
# Build the completed part-4 agency table and, from it, the "3b ideal" table.
#
# NOTE(review): df_part_4, df_part_3b and compare_tables_drop_duplicates are not
# defined in this cell; they must already exist in the kernel. Confirm where they
# are created before relying on Restart & Run All.

# Upper-case the name columns on copies of both inputs before comparing them
# (presumably so that the name match ignores letter case).
df_part_4_allcaps = df_part_4.copy()
df_part_4_allcaps["Government entity"] = df_part_4_allcaps["Government entity"].str.upper()
df_part_3b_allcaps = df_part_3b.copy()
df_part_3b_allcaps["Full name of agency"] = df_part_3b_allcaps["Full name of agency"].str.upper()

# Key columns handed to the comparison helper: the first list names the part-3b
# columns, the second the part-4 columns.
common_columns_3b4 = ["Full name of agency", "Country", "Year"]
common_columns_43b = ["Government entity", "Country", "Year"]

# The helper's result is used as a mapping from a description string to a frame;
# table 1 is part 3b and table 2 is part 4 (argument order below).
compare_3b4_allcaps = compare_tables_drop_duplicates(df_part_3b_allcaps, df_part_4_allcaps, common_columns_3b4, common_columns_43b)

print("Duplicate rows removed")
for key, data in compare_3b4_allcaps.items():
    print(f'{key}: {data.shape[0]} rows')

agencies_3b_missing_unformatted = compare_3b4_allcaps["in table 2 but not in table 1"]
agencies_4_missing_unformatted = compare_3b4_allcaps["in table 1 but not in table 2"]

display(agencies_4_missing_unformatted)

# Keep only the columns needed from the rows reported as "in table 1 but not in
# table 2" and rename them to the part-4 column names so they can be appended.
c2ka = ["Full name of agency", "Total reported", "Country", "ISO Code", "Year", "Start Date", "End Date"]
agencies_4_missing = agencies_4_missing_unformatted[c2ka].copy()
agencies_4_missing = agencies_4_missing.rename(columns={"Full name of agency": "Government entity", "Total reported": "Revenue value"})

agencies_4_actual_complete = pd.concat([df_part_4_allcaps, agencies_4_missing], ignore_index=True)

# errors="coerce" turns any value that cannot be parsed as a number into NaN.
agencies_4_actual_complete["Revenue value"] = pd.to_numeric(agencies_4_actual_complete["Revenue value"], errors="coerce")
agencies_4_actual_complete.to_csv("data/outputs/agencies_4_actual_complete.csv", index=False)

# Group the data by Country and Year
agencies_grouped_data = agencies_4_actual_complete.groupby(['Country', 'Year'])

agencies_pivot_tables = []

# For each (Country, Year) group, sum "Revenue value" per Government entity and
# attach the group keys back as ordinary columns.
for (country, year), agencies_group_df in agencies_grouped_data:
    agencies_pivot_table = agencies_group_df.pivot_table(index='Government entity', aggfunc={"Revenue value": "sum"})
    agencies_pivot_table['Country'] = country
    agencies_pivot_table['Year'] = year
    agencies_pivot_tables.append(agencies_pivot_table)

# Stack the per-group pivots and move Government entity from the index into a column.
agencies_result_df = pd.concat(agencies_pivot_tables).reset_index()

# Switch to the part-3b column names.
agencies_result_df = agencies_result_df.rename(columns={"Government entity": "Full name of agency", "Revenue value": "Total reported"})

agencies_result_df.to_csv("data/outputs/agencies_3b_ideal.csv", index=False)

agencies_3b_ideal = agencies_result_df.copy()

display(agencies_3b_ideal)

# Second copy of the completed part-4 table, written to the for_cleaning folder.
agencies_4_actual_complete.to_csv("data/outputs/for_cleaning/agencies_4_actual_complete.csv", index=False)

In [None]:
# Re-run the comparison helper, this time with the ideal 3b table as table 1 and
# the completed part-4 table as table 2, using the same key-column lists.
# NOTE(review): compare_tables_drop_duplicates is defined outside this cell.
compare_3bideal_4comp_allcaps = compare_tables_drop_duplicates(
    agencies_3b_ideal,
    agencies_4_actual_complete,
    common_columns_3b4,
    common_columns_43b,
)

# Report the row count of every frame in the comparison result.
print("Duplicate rows removed")
for bucket_label, bucket_df in compare_3bideal_4comp_allcaps.items():
    bucket_rows = bucket_df.shape[0]
    print(f'{bucket_label}: {bucket_rows} rows')

# Cell output: the frame stored under "in table 2 but not in table 1".
compare_3bideal_4comp_allcaps["in table 2 but not in table 1"]

In [None]:
# Build the completed part-5 project table and, from it, the "3c ideal" table.
#
# NOTE(review): df_part_5_allcaps is consumed below but is not created in this
# cell: the lines that would derive it from df_part_5 are commented out, so it
# has to exist in the kernel already. df_part_3c and
# compare_tables_drop_duplicates are likewise defined outside this cell.
# Confirm where they come from before relying on Restart & Run All.
# df_part_5_allcaps = df_part_5.copy()
# df_part_5_allcaps["Company"] = df_part_5_allcaps["Company"].str.upper()
# df_part_5_allcaps["Government entity"] = df_part_5_allcaps["Government entity"].str.upper()
# df_part_5_allcaps["Project name"] = df_part_5_allcaps["Project name"].str.upper()

# Upper-case the two text columns on a copy of part 3c.
project_name_col = "Full project name"
affiliated_col = "Affiliated companies, start with Operator"
df_part_3c_allcaps = df_part_3c.copy()
for text_col in (project_name_col, affiliated_col):
    df_part_3c_allcaps[text_col] = df_part_3c_allcaps[text_col].str.upper()

# Key columns handed to the comparison helper: the first list names the part-3c
# columns, the second the part-5 columns.
common_columns_3c5 = ["Full project name", "Country", "Year"]
common_columns_53c = ["Project name", "Country", "Year"]

# Table 1 is part 3c and table 2 is part 5 (argument order below).
compare_3c5_allcaps = compare_tables_drop_duplicates(
    df_part_3c_allcaps,
    df_part_5_allcaps,
    common_columns_3c5,
    common_columns_53c,
)

print("Duplicate rows removed")
for bucket_label, bucket_df in compare_3c5_allcaps.items():
    bucket_rows = bucket_df.shape[0]
    print(f'{bucket_label}: {bucket_rows} rows')

projects_3c_missing_unformatted = compare_3c5_allcaps["in table 2 but not in table 1"]
projects_5_missing_unformatted = compare_3c5_allcaps["in table 1 but not in table 2"]

# Keep only the columns needed from the rows reported as "in table 1 but not in
# table 2" and rename the project column to its part-5 name so they can be appended.
c2kc = ["Full project name", "Country", "ISO Code", "Year", "Start Date", "End Date"]
projects_5_missing = (
    projects_5_missing_unformatted[c2kc]
    .copy()
    .rename(columns={"Full project name": "Project name"})
)

projects_5_actual_complete = pd.concat([df_part_5_allcaps, projects_5_missing], ignore_index=True)

# errors="coerce" turns any value that cannot be parsed as a number into NaN.
projects_5_actual_complete["Revenue value"] = pd.to_numeric(
    projects_5_actual_complete["Revenue value"], errors="coerce"
)
projects_5_actual_complete.to_csv("data/outputs/projects_5_actual_complete.csv", index=False)

# Group the data by Country and Year
projects_grouped_data = projects_5_actual_complete.groupby(['Country', 'Year'])

# One pivot per (Country, Year) group, indexed by Project name and summing
# "Revenue value"; the group keys are attached back as ordinary columns.
projects_pivot_tables = [
    country_year_df
    .pivot_table(index='Project name', aggfunc={"Revenue value": "sum"})
    .assign(Country=country_name, Year=report_year)
    for (country_name, report_year), country_year_df in projects_grouped_data
]

# Stack the per-group pivots, move Project name from the index into a column and
# switch to the part-3c column name.
projects_result_df = (
    pd.concat(projects_pivot_tables)
    .reset_index()
    .rename(columns={"Project name": "Full project name"})
)

projects_result_df.to_csv("data/outputs/projects_3c_ideal.csv", index=False)

display(projects_result_df)

# Second copy of the completed part-5 table, written to the for_cleaning folder.
projects_5_actual_complete.to_csv("data/outputs/for_cleaning/projects_5_actual_complete_v1_simple.csv", index=False)

In [None]:
# Inspect the rows the 3c-vs-5 comparison returned under "in table 1 but not in table 2".
# The opposite direction is left switched off:
# display(projects_3c_missing_unformatted)
display(projects_5_missing_unformatted)

In [None]:
# Flatten the affiliated-companies column: one output row per listed company.
affiliated_col = 'Affiliated companies, start with Operator'

# Work on a copy so projects_5_missing_unformatted itself is left untouched.
projects_5_missing_flat = projects_5_missing_unformatted.copy()
display(projects_5_missing_flat)
projects_5_missing_flat.to_csv("data/outputs/projects_5_missing_unformatted.csv", index=False)

# Normalise both ", " and "," to "/" and then split on "/", so each cell becomes
# a list of company names.
slash_separated = (
    projects_5_missing_flat[affiliated_col]
    .str.replace(', ', '/')
    .str.replace(',', '/')
)
projects_5_missing_flat[affiliated_col] = slash_separated.str.split('/')
display(projects_5_missing_flat)

# Expand each list into one row per element, then trim surrounding whitespace
# from every name.
exploded_projects = projects_5_missing_flat.explode(affiliated_col)
projects_5_missing_flat_2 = exploded_projects.assign(
    **{affiliated_col: exploded_projects[affiliated_col].str.strip()}
)
display(projects_5_missing_flat_2)

# The flattened table is written to both output locations.
for flattened_csv_path in (
    "data/outputs/projects_3c_missing_flattened.csv",
    "data/outputs/for_cleaning/projects_3c_missing_flattened.csv",
):
    projects_5_missing_flat_2.to_csv(flattened_csv_path, index=False)