# Ideal and Actual tables

In [1]:
# import libraries and data

import pandas as pd
import numpy as np
from os import path
from functools import reduce
from pprint import pprint
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from itertools import combinations

# Locations of the consolidated CSV exports.
file_dir = "data/consolidated/"
file_dir_old = "data/consolidated/backup/old"

# Read each consolidated CSV exactly once, keyed by its file name
# (dict order == part order, so the list/unpacking below line up).
df_dict = {
    csv_name: pd.read_csv(path.join(file_dir, csv_name))
    for csv_name in (
        "Part 1 - About.csv",
        "Part 3 - Reporting companies' list.csv",
        "Part 3 - Reporting government entities list.csv",
        "Part 3 - Reporting projects' list.csv",
        "Part 4 - Government revenues.csv",
        "Part 5 - Company data.csv",
    )
}
# Alternative read for Part 5, kept from the original cell:
# df_part_5 = pd.read_csv(path.join(file_dir, "Part 5 - Company data.csv"), low_memory=False)

df_list = list(df_dict.values())
df_part_1, df_part_3a, df_part_3b, df_part_3c, df_part_4, df_part_5 = df_list

# OPTIONAL COLUMNS (per table; tables not listed here have none)
part_3a_opt = [
    "Stock exchange listing or company website",
    "Audited financial statement (or balance sheet, cash flows, profit/loss statement if unavailable)",
]
part_3b_opt = ["ID number (if applicable)"]
part_5_opt = [
    "In-kind volume (if applicable)",
    "Unit (if applicable)",
    "Comments",
]

# Independent copies of every table with the optional columns removed.
df_part_1_non_opt = df_part_1.copy()
df_part_3a_non_opt = df_part_3a.copy().drop(columns=part_3a_opt)
df_part_3b_non_opt = df_part_3b.copy().drop(columns=part_3b_opt)
df_part_3c_non_opt = df_part_3c.copy()
df_part_4_non_opt = df_part_4.copy()
df_part_5_non_opt = df_part_5.copy().drop(columns=part_5_opt)

df_list_non_opt = [
    df_part_1_non_opt,
    df_part_3a_non_opt,
    df_part_3b_non_opt,
    df_part_3c_non_opt,
    df_part_4_non_opt,
    df_part_5_non_opt,
]
# Same file-name keys (and order) as df_dict, pointing at the trimmed copies.
df_dict_non_opt = dict(zip(df_dict, df_list_non_opt))

In [12]:
def compare_tables_drop_duplicates(df1, df2, common_columns_df1, common_columns_df2):
    '''
    Compare two tables on the given key columns, keeping one row per key.

    A row of df1 matches a row of df2 when the values in common_columns_df1
    equal the values in common_columns_df2, compared position by position
    (the key columns may be named differently in the two tables). Within
    each table only the first row of every distinct key combination is kept.

    Parameters:
    - df1 (pandas.DataFrame): The first DataFrame ("table 1").
    - df2 (pandas.DataFrame): The second DataFrame ("table 2").
    - common_columns_df1 (list): Key columns of df1.
    - common_columns_df2 (list): Key columns of df2, in the same order as
      common_columns_df1.

    Returns:
    - dict of pandas.DataFrame with exactly these keys:
        - "in table 1 but not in table 2": first row of df1 for every key
          that has no match in df2 (all df1 columns, original df1 index labels).
        - "in table 2 but not in table 1": first row of df2 for every key
          that has no match in df1 (all df2 columns, original df2 index labels).
        - "in both tables": inner merge of the first df1 row and the first
          df2 row of every shared key (all columns of both tables; non-key
          columns present in both get pandas' "_x"/"_y" suffixes). Its index
          is a fresh 0..n-1 range.

    Example:
    >>> df1 = pd.DataFrame({'Company': ['A', 'B', 'C'], 'Project name': ['P1', 'P2', 'P3'], 'Country': ['X', 'Y', 'Z'], 'Year': [2020, 2021, 2022]})
    >>> df2 = pd.DataFrame({'Full company name': ['A Corp', 'B Corp', 'D Corp'], 'Company type': ['Type1', 'Type2', 'Type3'], 'Company ID number': [101, 102, 103], 'Country': ['X', 'Y', 'Z'], 'Year': [2020, 2021, 2023]})
    >>> result = compare_tables_drop_duplicates(df1, df2, ['Country', 'Year'], ['Country', 'Year'])
    >>> result['in both tables'].shape[0]
    2
    >>> result['in table 1 but not in table 2']['Company'].tolist()
    ['C']
    >>> result['in table 2 but not in table 1']['Full company name'].tolist()
    ['D Corp']
    '''

    # Accept a single column name as well as a list of names.
    keys_df1 = [common_columns_df1] if isinstance(common_columns_df1, str) else list(common_columns_df1)
    keys_df2 = [common_columns_df2] if isinstance(common_columns_df2, str) else list(common_columns_df2)

    # De-duplicate BEFORE merging: the kept rows are the same as when
    # de-duplicating afterwards (first df1 row x first df2 row per key), but
    # repeated keys no longer blow up into a many-to-many product.
    first_rows_df1 = df1.drop_duplicates(subset=keys_df1)
    first_rows_df2 = df2.drop_duplicates(subset=keys_df2)

    # Find common rows
    common_rows = pd.merge(first_rows_df1, first_rows_df2,
                           left_on=keys_df1, right_on=keys_df2, how='inner')

    # Key membership is taken from the source tables rather than from the
    # merged frame: merge renames a key column to "<name>_x"/"<name>_y" when
    # the other table has a non-key column of the same name, which would make
    # a lookup by the original key name fail.
    key_index_df1 = first_rows_df1.set_index(keys_df1).index
    key_index_df2 = first_rows_df2.set_index(keys_df2).index

    # Rows whose key appears in one table only
    unique_rows_df1 = first_rows_df1[~key_index_df1.isin(key_index_df2)]
    unique_rows_df2 = first_rows_df2[~key_index_df2.isin(key_index_df1)]

    return {"in table 1 but not in table 2": unique_rows_df1,
            "in table 2 but not in table 1": unique_rows_df2,
            "in both tables": common_rows}


## Part 3a - Reporting companies and Part 5 - Company data

- companies_3a_actual = list of companies in part 3a
- companies_5_actual = list of companies in part 5
- companies_3a_missing = list of companies in part 5 but not in part 3a
- companies_5_missing = list of companies in part 3a but not in part 5
- companies_5_actual_complete = companies_5_actual + companies_5_missing
- companies_3a_ideal = list of companies in companies_5_actual_complete

In [5]:
# Work on copies whose company-name column is upper-cased, so that the
# Part 3a / Part 5 comparison below does not depend on letter case.
df_part_5_allcaps = df_part_5.assign(Company=df_part_5["Company"].str.upper())
# Left disabled, as in the original cell:
# df_part_5_allcaps["Government entity"] = df_part_5_allcaps["Government entity"].str.upper()
# df_part_5_allcaps["Project name"] = df_part_5_allcaps["Project name"].str.upper()
df_part_3a_allcaps = df_part_3a.assign(
    **{"Full company name": df_part_3a["Full company name"].str.upper()}
)

In [13]:
# Key columns used to match Part 3a rows to Part 5 rows; the lists are
# paired position by position (company name, country, year).
common_columns_3a5 = ["Full company name", "Country", "Year"]
common_columns_53a = ["Company", "Country", "Year"]

# for key, data in compare_tables(df_part_3a, df_part_5, common_columns_3a5, common_columns_53a).items():
#     print(key)
#     display(data)

# Table 1 is Part 3a and table 2 is Part 5 (both with upper-cased names).
compare_3a5_allcaps = compare_tables_drop_duplicates(
    df1=df_part_3a_allcaps,
    df2=df_part_5_allcaps,
    common_columns_df1=common_columns_3a5,
    common_columns_df2=common_columns_53a,
)

# Report how many rows fall into each of the three result buckets.
print("Duplicate rows removed")
for key, data in compare_3a5_allcaps.items():
    print(f'{key}: {data.shape[0]} rows')

Duplicate rows removed
in table 1 but not in table 2: 317 rows
in table 2 but not in table 1: 140 rows
in both tables: 3469 rows


In [23]:
# In the comparison, table 1 is Part 3a and table 2 is Part 5, so:
#   rows only in table 1 -> companies that Part 5 is missing
#   rows only in table 2 -> companies that Part 3a is missing
companies_5_missing_unformatted = compare_3a5_allcaps["in table 1 but not in table 2"]
companies_3a_missing_unformatted = compare_3a5_allcaps["in table 2 but not in table 1"]

In [24]:
# Show the upper-cased Part 5 table, then the Part 3a rows that have no
# counterpart in Part 5 (display renders its arguments in order).
display(df_part_5_allcaps, companies_5_missing_unformatted)

Unnamed: 0,Company,Government entity,Revenue stream name,Levied on project (Y/N),Reported by project (Y/N),Project name,Reporting currency,Revenue value,Payment made in-kind (Y/N),In-kind volume (if applicable),Unit (if applicable),Comments,Country,ISO Code,Year,Start Date,End Date
0,NORTH COAL ENTERPRISE (NCE),Ministry of Mines and Petroleum (Revenue Depar...,Royalties,Yes,Yes,EXP 1/2014,AFN,442801100,No,Not applicable,Not applicable,2018-09-22,Afghanistan,AFG,2018,2017-12-21,2018-12-20
1,NORTH COAL ENTERPRISE (NCE),Ministry of Mines and Petroleum (Revenue Depar...,Royalties,Yes,Yes,EXP 1/2014,AFN,386169944,No,Not applicable,Not applicable,2018-06-24,Afghanistan,AFG,2018,2017-12-21,2018-12-20
2,NORTH COAL ENTERPRISE (NCE),Ministry of Mines and Petroleum (Revenue Depar...,Royalties,Yes,Yes,EXP 1/2014,AFN,336623658,No,Not applicable,Not applicable,2018-04-18,Afghanistan,AFG,2018,2017-12-21,2018-12-20
3,NORTH COAL ENTERPRISE (NCE),Ministry of Mines and Petroleum (Revenue Depar...,Royalties,Yes,Yes,EXP 1/2014,AFN,300000000,No,Not applicable,Not applicable,2018-11-19,Afghanistan,AFG,2018,2017-12-21,2018-12-20
4,HABIB SHAHAB TALC AND MARBLE EXPLOITATION AND ...,Ministry of Mines and Petroleum (Revenue Depar...,Penalties of Late Payment,,,,AFN,18,,,,,Afghanistan,AFG,2018,2017-12-21,2018-12-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31877,CHINA NATIONAL PETROLEUM CORPORATION INTERNATI...,Société des Hydrocarbures du Tchad (SHT),Redevance sur production collecté par la SHT,No,No,Non applicable,USD,-,Yes,3543915,Barrels,Quotes-parts de l'Etat (Redevance sur producti...,Chad,TCD,2018,2018-01-01,2018-12-31
31878,CHINA NATIONAL PETROLEUM CORPORATION INTERNATI...,Société des Hydrocarbures du Tchad (SHT),Profit Oil collecté par la SHT,No,No,Non applicable,USD,-,Yes,2541955,Barrels,Quotes-parts de l'Etat (Profit Oil SHT- 10%) d...,Chad,TCD,2018,2018-01-01,2018-12-31
31879,PETROCHAD MANGARA,Société des Hydrocarbures du Tchad (SHT),Redevance sur production collecté par la SHT,No,No,Non applicable,USD,-,Yes,545318,Barrels,Quotes-parts de l'Etat (Redevance sur producti...,Chad,TCD,2018,2018-01-01,2018-12-31
31880,PETROCHAD MANGARA,Société des Hydrocarbures du Tchad (SHT),Tax Oil collecté par la SHT,No,No,Non applicable,USD,-,Yes,393777,Barrels,Quotes-parts de l'Etat (Tax Oil SHT) dans le c...,Chad,TCD,2018,2018-01-01,2018-12-31


Unnamed: 0,Full company name,Company type,Company ID number,Sector,Commodities (comma-seperated),Stock exchange listing or company website,"Audited financial statement (or balance sheet, cash flows, profit/loss statement if unavailable)",Payments to Governments Report,Country,ISO Code,Year,Start Date,End Date
7,ABED HASAN ZADRAN LIMITED,Private,9005801197,Other,Coal,,Not available,,Afghanistan,AFG,2018,2017-12-21,2018-12-24
9,AFGHAN SHININK MINES EXTRACTION AND PROCESSING,Private,9002202316,Other,Talc,,,,Afghanistan,AFG,2018,2017-12-21,2018-12-24
95,"احمد علی ولد خداداد, احمدعلی AHAMD ALI SON OF ...",Private,9001263814,Other,Construction stone,Not applicable,Not available,-,Afghanistan,AFG,2018,2017-12-21,2018-12-24
131,شرکت استخراج معادن افغان اکتیف لمیتد AFGHAN AC...,Private,9001353375,Other,Chromite,Not applicable,Not available,-,Afghanistan,AFG,2018,2017-12-21,2018-12-24
138,شرکت استخراج معادن ذغال سنک افراسیاب AFRASYAB ...,Private,9001505461,Other,Coal,Not applicable,Not available,-,Afghanistan,AFG,2018,2017-12-21,2018-12-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3776,ETEP,Private,Not available,Mining,BTP,Not available,Not available,,Chad,TCD,2018,2018-01-01,2018-12-31
3778,ARAB CONTRACTORS,Private,600008358,Mining,BTP,https://www.arabcont.com/english/,Not available,,Chad,TCD,2018,2018-01-01,2018-12-31
3781,CHAD CONSTRUCTION MATERIALS S.A,Private,Not available,Mining,BTP,Not available,Not available,,Chad,TCD,2018,2018-01-01,2018-12-31
3787,SOCIÉTÉ NATIONALE DES MINES ET DE LA GÉOLOGIE ...,State-owned enterprises & public corporations,Not available,Mining,,Not available,Not available,,Chad,TCD,2018,2018-01-01,2018-12-31


In [37]:
# Columns to keep from the Part 3a rows that are missing in Part 5.
c2k = [
    "Full company name",
    "Payments to Governments Report",
    "Country",
    "ISO Code",
    "Year",
    "Start Date",
    "End Date",
]

# Reshape those rows to Part 5's column names so they can be appended to it.
companies_5_missing = (
    companies_5_missing_unformatted[c2k]
    .copy()
    .rename(columns={
        "Full company name": "Company",
        "Payments to Governments Report": "Revenue value",
    })
)

# Part 5 plus one row per missing company; columns the appended rows do not
# have are filled with NaN by concat.
companies_5_actual_complete = pd.concat(
    [df_part_5_allcaps, companies_5_missing],
    ignore_index=True,
)

# Sanity check: the expected row count should equal the actual one.
print(
    f"# part 5 rows: {df_part_5_allcaps.shape[0]} \n"
    f"# missing companies in part 5: {companies_5_missing.shape[0]} \n"
    f"# of ideal rows in part 5: {df_part_5_allcaps.shape[0] + companies_5_missing.shape[0]} \n"
    f"# of rows in updated part 5: {companies_5_actual_complete.shape[0]}"
)

display(companies_5_missing)
display(companies_5_actual_complete)

# part 5 rows: 31882 
# missing companies in part 5: 317 
# of ideal rows in part 5: 32199 
# of rows in updated part 5: 32199


Unnamed: 0,Company,Revenue value,Country,ISO Code,Year,Start Date,End Date
7,ABED HASAN ZADRAN LIMITED,,Afghanistan,AFG,2018,2017-12-21,2018-12-24
9,AFGHAN SHININK MINES EXTRACTION AND PROCESSING,,Afghanistan,AFG,2018,2017-12-21,2018-12-24
95,"احمد علی ولد خداداد, احمدعلی AHAMD ALI SON OF ...",-,Afghanistan,AFG,2018,2017-12-21,2018-12-24
131,شرکت استخراج معادن افغان اکتیف لمیتد AFGHAN AC...,-,Afghanistan,AFG,2018,2017-12-21,2018-12-24
138,شرکت استخراج معادن ذغال سنک افراسیاب AFRASYAB ...,-,Afghanistan,AFG,2018,2017-12-21,2018-12-24
...,...,...,...,...,...,...,...
3776,ETEP,,Chad,TCD,2018,2018-01-01,2018-12-31
3778,ARAB CONTRACTORS,,Chad,TCD,2018,2018-01-01,2018-12-31
3781,CHAD CONSTRUCTION MATERIALS S.A,,Chad,TCD,2018,2018-01-01,2018-12-31
3787,SOCIÉTÉ NATIONALE DES MINES ET DE LA GÉOLOGIE ...,,Chad,TCD,2018,2018-01-01,2018-12-31


Unnamed: 0,Company,Government entity,Revenue stream name,Levied on project (Y/N),Reported by project (Y/N),Project name,Reporting currency,Revenue value,Payment made in-kind (Y/N),In-kind volume (if applicable),Unit (if applicable),Comments,Country,ISO Code,Year,Start Date,End Date
0,NORTH COAL ENTERPRISE (NCE),Ministry of Mines and Petroleum (Revenue Depar...,Royalties,Yes,Yes,EXP 1/2014,AFN,442801100,No,Not applicable,Not applicable,2018-09-22,Afghanistan,AFG,2018,2017-12-21,2018-12-20
1,NORTH COAL ENTERPRISE (NCE),Ministry of Mines and Petroleum (Revenue Depar...,Royalties,Yes,Yes,EXP 1/2014,AFN,386169944,No,Not applicable,Not applicable,2018-06-24,Afghanistan,AFG,2018,2017-12-21,2018-12-20
2,NORTH COAL ENTERPRISE (NCE),Ministry of Mines and Petroleum (Revenue Depar...,Royalties,Yes,Yes,EXP 1/2014,AFN,336623658,No,Not applicable,Not applicable,2018-04-18,Afghanistan,AFG,2018,2017-12-21,2018-12-20
3,NORTH COAL ENTERPRISE (NCE),Ministry of Mines and Petroleum (Revenue Depar...,Royalties,Yes,Yes,EXP 1/2014,AFN,300000000,No,Not applicable,Not applicable,2018-11-19,Afghanistan,AFG,2018,2017-12-21,2018-12-20
4,HABIB SHAHAB TALC AND MARBLE EXPLOITATION AND ...,Ministry of Mines and Petroleum (Revenue Depar...,Penalties of Late Payment,,,,AFN,18,,,,,Afghanistan,AFG,2018,2017-12-21,2018-12-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32194,ETEP,,,,,,,,,,,,Chad,TCD,2018,2018-01-01,2018-12-31
32195,ARAB CONTRACTORS,,,,,,,,,,,,Chad,TCD,2018,2018-01-01,2018-12-31
32196,CHAD CONSTRUCTION MATERIALS S.A,,,,,,,,,,,,Chad,TCD,2018,2018-01-01,2018-12-31
32197,SOCIÉTÉ NATIONALE DES MINES ET DE LA GÉOLOGIE ...,,,,,,,,,,,,Chad,TCD,2018,2018-01-01,2018-12-31


In [67]:
# Make "Revenue value" numeric in place; values that cannot be parsed
# become NaN (errors="coerce").
companies_5_actual_complete["Revenue value"] = pd.to_numeric(
    companies_5_actual_complete["Revenue value"],
    errors="coerce",
)

# One group per (Country, Year) pair
grouped_data = companies_5_actual_complete.groupby(['Country', 'Year'])

pivot_tables = []

# For every group, total the revenue per Company and tag the result with
# the group's country and year.
for (country, year), group_df in grouped_data:
    pivot_table = group_df.pivot_table(
        index='Company',
        aggfunc={"Revenue value": "sum"},
    ).assign(Country=country, Year=year)
    pivot_tables.append(pivot_table)

# Stack the per-group tables, turn the Company index back into a column and
# switch to Part 3a's column names.
result_df = (
    pd.concat(pivot_tables)
    .reset_index()
    .rename(columns={
        "Company": "Full company name",
        "Revenue value": "Payments to Governments Report",
    })
)

result_df.to_csv("data/outputs/companies_3a_ideal.csv", index=False)

display(result_df)

Unnamed: 0,Full company name,Payments to Governments Report,Country,Year
0,ABAAN RAYAN LIMITED,1.025688e+07,Afghanistan,2018
1,ABAS GHAZNAVI LIMITED,3.624451e+07,Afghanistan,2018
2,ABDUL FATAH,1.394220e+05,Afghanistan,2018
3,ABDUL RAOUF,5.092500e+04,Afghanistan,2018
4,ABDUL WAHAB ARIOBWAL LIMITED,4.588710e+05,Afghanistan,2018
...,...,...,...,...
3920,MAAMBA COLLIERIES LIMITED,4.703749e+08,Zambia,2019
3921,MOPANI COPPER MINES PLC,5.143412e+08,Zambia,2019
3922,NFC AFRICA MINING PLC,2.559570e+08,Zambia,2019
3923,SINO METALS,1.007475e+08,Zambia,2019


In [56]:
# Make "Revenue value" numeric in place; values that cannot be parsed
# become NaN (errors="coerce").
companies_5_actual_complete["Revenue value"] = pd.to_numeric(
    companies_5_actual_complete["Revenue value"],
    errors="coerce",
)

# Persist the completed Part 5 table (without the row index).
companies_5_actual_complete.to_csv(
    "data/outputs/companies_5_actual_complete.csv",
    index=False,
)

# Total revenue per company / country / year / reporting period.
pivot_table = companies_5_actual_complete.pivot_table(
    index=["Company", "Country", "Year", "ISO Code", "Start Date", "End Date"],
    aggfunc={"Revenue value": "sum"},
)

display(pivot_table)

Unnamed: 0_level_0,Revenue value
Company,Unnamed: 1_level_1
1 1 1,2.503175e+09
1/1999 NO. 35/4 – AUTHORIZED ENTITY HALS-K PRJSC (31566427),1.391783e+08
11 HERONJTE BATER,2.896787e+07
24 HOUR MINING & INDUSTRY COMPANY LIMITED,7.548560e+07
3A-160 SH.P.K,1.493788e+07
...,...
“TEGHOUT” CJSC,5.751841e+09
“VARDANI ZARTONK” LLC,1.259139e+07
“VAYK GOLD” LLC,1.226506e+07
“ZANGEZUR COPPER-MOLYBDENUM COMBINE” CJSC,8.937317e+10
