# Ideal tables

In [2]:
# import libraries and data

import pandas as pd
import numpy as np
from os import path
from functools import reduce
from pprint import pprint
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from itertools import combinations

# Directory the cells below read the cleaned CSV inputs from.
file_dir = "data/cleaned/"
# Not referenced in the visible cells — presumably a backup location for older
# consolidated data; TODO confirm it is still needed.
file_dir_old = "data/consolidated/backup/old"

## Ideal Companies (Part 3A)

From the complete cleaned companies data:
1. Per report (country, year), create a pivot table indexed by company name
2. Collect the pivot tables in a list and concatenate them into a single table
3. The result is a list of companies per report, with each company's revenue value summed (the "Payments to Governments Report" column)

From this data, we can get:
1. a consistent list of companies per report (country, year) in relation to Part 5
2. a list of all companies that have ever appeared in a country report

In [3]:
# Read the cleaned companies data from `file_dir`.
companies_5_complete_cleaned = pd.read_csv(path.join(file_dir, "companies-5-complete-cleaned.csv"))

# One group per report, i.e. per (Country, Year) pair.
companies_grouped_data = companies_5_complete_cleaned.groupby(['Country', 'Year'])

companies_pivot_tables = []

# For each report, sum "Revenue value" per Company and tag the rows with the report's keys.
for (country, year), group_df in companies_grouped_data:
    companies_pivot_table = group_df.pivot_table(index='Company', aggfunc={"Revenue value": "sum"})
    companies_pivot_table['Country'] = country
    companies_pivot_table['Year'] = year
    companies_pivot_tables.append(companies_pivot_table)

# Stack the per-report tables; reset_index turns the Company index back into a regular column.
companies_result_df = pd.concat(companies_pivot_tables).reset_index()

companies_result_df = companies_result_df.rename(columns={"Company": "Full company name", "Revenue value": "Payments to Governments Report"})

# Build the output path from `file_dir`, like the read above, instead of repeating the
# directory literal, so changing `file_dir` moves reads and writes together.
companies_result_df.to_csv(path.join(file_dir, "companies_3a_ideal.csv"), index=False)

# Keep an independent copy of the result under the Part 3A name.
companies_3a_ideal = companies_result_df.copy()

display(companies_3a_ideal)

Unnamed: 0,Full company name,Payments to Governments Report,Country,Year
0,ABAAN RAYAN LIMITED,1.025688e+07,Afghanistan,2018
1,ABBAS GHAZNAVI LIMITED,3.624451e+07,Afghanistan,2018
2,ABDUL FATAH,1.394220e+05,Afghanistan,2018
3,ABDUL RAOUF,5.092500e+04,Afghanistan,2018
4,ABDUL WAHAB ARIOBWAL LIMITED,4.588710e+05,Afghanistan,2018
...,...,...,...,...
3848,MAAMBA COLLIERIES LIMITED,4.703749e+08,Zambia,2019
3849,MOPANI COPPER MINES PLC,5.143412e+08,Zambia,2019
3850,NFC AFRICA MINING PLC,2.559570e+08,Zambia,2019
3851,SINO METALS,1.007475e+08,Zambia,2019


## Ideal Agencies (Part 3B)

From the complete cleaned agencies data:
1. Per report (country, year), create a pivot table indexed by agency name
2. Collect the pivot tables in a list and concatenate them into a single table
3. The result is a list of government agencies per report, with each agency's revenue value summed (the "Total reported" column)

From this data, we can get:
1. a consistent list of agencies per report (country, year) in relation to Part 5
2. a list of all agencies that have ever appeared in a country report

In [5]:
# Read the cleaned agencies data from `file_dir`.
agencies_4_complete_indexed_cleaned = pd.read_csv(path.join(file_dir, "agencies-4-actual-complete-indexed-cleaned.csv"))

# One group per report, i.e. per (Country, Year) pair.
agencies_grouped_data = agencies_4_complete_indexed_cleaned.groupby(['Country', 'Year'])

agencies_pivot_tables = []

# For each report, sum "Revenue value" per Government entity and tag the rows with the report's keys.
for (country, year), group_df in agencies_grouped_data:
    agencies_pivot_table = group_df.pivot_table(index='Government entity', aggfunc={"Revenue value": "sum"})
    agencies_pivot_table['Country'] = country
    agencies_pivot_table['Year'] = year
    agencies_pivot_tables.append(agencies_pivot_table)

# Stack the per-report tables; reset_index turns the Government entity index back into a regular column.
agencies_result_df = pd.concat(agencies_pivot_tables).reset_index()

agencies_result_df = agencies_result_df.rename(columns={"Government entity": "Full name of agency", "Revenue value": "Total reported"})

# Build the output path from `file_dir`, like the read above, instead of repeating the
# directory literal, so changing `file_dir` moves reads and writes together.
agencies_result_df.to_csv(path.join(file_dir, "agencies_3b_ideal.csv"), index=False)

# Keep an independent copy of the result under the Part 3B name.
agencies_3b_ideal = agencies_result_df.copy()

display(agencies_3b_ideal)

Unnamed: 0,Full name of agency,Total reported,Country,Year
0,MINISTRY OF FINANCE (CUSTOMS DEPARTMENT),1.459912e+09,Afghanistan,2018
1,MINISTRY OF FINANCE (REVENUE DEPARTMENT),1.089699e+09,Afghanistan,2018
2,MINISTRY OF INDUSTRY AND COMMERCE,0.000000e+00,Afghanistan,2018
3,MINISTRY OF MINES AND PETROLEUM (REVENUE DEPAR...,2.285251e+09,Afghanistan,2018
4,NATIONAL ENVIRONMENTAL PROTECTION AGENCY,4.700000e+04,Afghanistan,2018
...,...,...,...,...
537,MINISTRY OF LANDS,9.827895e+05,Zambia,2019
538,MINISTRY OF MINES AND MINERALS DEVELOPMENT,4.128300e+07,Zambia,2019
539,MINISTRY OF MINES AND MINERALS DEVELOPMENT - P...,3.051347e+06,Zambia,2019
540,ZAMBIA CONSOLIDATED COPPER MINES – INVESTMENT ...,0.000000e+00,Zambia,2019
