# Findings 
1. Most of the issues are in the following tables:
    - Part 3 - Reporting companies' list.csv
    - Part 3 - Reporting projects' list.csv, and
    - Part 5 - Company data.csv (TODO: add values later; exclude the "if applicable" fields)
2. The top 5 countries with the most issues are listed below (columns: Nulls, #ERROR!, White-space, Total):
    - Ukraine 	20964.0 	170.0 	0.0 	21134.0
    - Mongolia 	14216.0 	71.0 	0.0 	14287.0
    - Afghanistan 	9927.0 	480.0 	0.0 	10407.0
    - United Kingdom 	9334.0 	126.0 	0.0 	9460.0
    - Nigeria 	9395.0 	0.0 	0.0 	9395.0
3. 

In [88]:
# import libraries

import pandas as pd
from os import path
from functools import reduce

file_dir = "data/consolidated/"

# Consolidated CSV exports, listed in the order the notebook refers to them.
csv_names = [
    "Part 1 - About.csv",
    "Part 3 - Reporting companies' list.csv",
    "Part 3 - Reporting government entities list.csv",
    "Part 3 - Reporting projects' list.csv",
    "Part 4 - Government revenues.csv",
    "Part 5 - Company data.csv",
]

# Read each CSV once, keyed by its file name.
# NOTE(review): an earlier draft read "Part 5 - Company data.csv" with
# low_memory=False; restore that argument if pandas warns about mixed dtypes.
df_dict = {name: pd.read_csv(path.join(file_dir, name)) for name in csv_names}

# The same frames as a list, in the order given above.
df_list = list(df_dict.values())

# Short per-table aliases used throughout the notebook.
df_part_1, df_part_3a, df_part_3b, df_part_3c, df_part_4, df_part_5 = df_list

## Table completeness: number of issues per field of a table

### Countries per table

- 37 countries each for Part 1, 3a, 3b, 4
- 31 countries in Part 3c
- 36 countries in Part 5

In [2]:
def table_completeness(df):
    """Count data-quality issues in every column of ``df``.

    Three kinds of issue are counted per column:

    * nulls -- cells for which ``DataFrame.isnull`` is true;
    * ``#ERROR!`` -- cells equal to the literal string ``'#ERROR!'``;
    * blanks -- string cells that are empty or contain only white-space.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (in column order) with the columns
        'Field Name', 'Number of Nulls', 'Number of #ERROR!' and
        'Number of Blanks'.
    """
    def _is_blank(value):
        # Only strings can be blank; NaN/None and numbers never count here.
        return isinstance(value, str) and value.strip() == ''

    null_counts = df.isnull().sum()
    error_counts = (df == '#ERROR!').sum()
    # Comparing against '' alone misses white-space-only cells, and
    # pd.read_csv parses zero-length fields as NaN by default, so inspect the
    # stripped text of every string cell instead.
    blank_counts = df.apply(lambda column: column.map(_is_blank)).sum()

    # Combine the three per-column counts into one report frame.
    return pd.DataFrame({
        'Field Name': null_counts.index,
        'Number of Nulls': null_counts.values,
        'Number of #ERROR!': error_counts.values,
        'Number of Blanks': blank_counts.values
    })

In [3]:
# Per-field issue counts for "Part 3 - Reporting companies' list.csv".
table_completeness(df_part_3a)

Unnamed: 0,Field Name,Number of Nulls,Number of #ERROR!,Number of Blanks
0,Full company name,0,0,0
1,Company type,355,0,0
2,Company ID number,187,0,0
3,Sector,0,0,0
4,Commodities (comma-seperated),461,0,0
5,Stock exchange listing or company website,2019,0,0
6,"Audited financial statement (or balance sheet,...",2196,0,0
7,Payments to Governments Report,260,993,0
8,Country,0,0,0
9,ISO Code,0,0,0


In [128]:
# Per-field issue counts for "Part 3 - Reporting projects' list.csv".
table_completeness(df_part_3c)

Unnamed: 0,Field Name,Number of Nulls,Number of #ERROR!,Number of Blanks
0,Full project name,6,0,0
1,"Legal agreement reference number(s): contract,...",240,0,0
2,"Affiliated companies, start with Operator",152,0,0
3,Commodities (one commodity/row),397,0,0
4,Status,404,0,0
5,Production (volume),1767,0,0
6,Unit,3391,0,0
7,Production (value),5455,117,0
8,Currency,4576,0,0
9,Country,2,0,0


In [124]:
# Per-field issue counts for Part 5, leaving out the "if applicable" fields
# and the free-text comments column.
part_5_optional_columns = [
    'In-kind volume (if applicable)',
    'Unit (if applicable)',
    'Comments',
]
table_completeness(df_part_5.drop(columns=part_5_optional_columns))

Unnamed: 0,Field Name,Number of Nulls,Number of #ERROR!,Number of Blanks
0,Company,28,0,0
1,Government entity,34,0,0
2,Revenue stream name,35,0,0
3,Levied on project (Y/N),203,0,0
4,Reported by project (Y/N),185,0,0
5,Project name,21034,0,0
6,Reporting currency,31,0,0
7,Revenue value,6610,0,0
8,Payment made in-kind (Y/N),7042,0,0
9,Country,71,0,0


In [127]:
# Distinct 'Company' values on the Part 5 rows that have no 'Country'.
rows_without_country = df_part_5['Country'].isnull()
pd.DataFrame(df_part_5.loc[rows_without_country, 'Company'].unique())

Unnamed: 0,0
0,
1,Additional information
2,Any additional information that is not eligibl...
3,Unilateral Disclosure for Other Minerals sector
4,Unilateral Disclosure for Gems and Jade sector
5,Social contributions - not received by governm...
6,24 HOUR MINING & INDUSTRY COMPANY LIMITED
7,"Annawar Pearl Co., Ltd (Marine Pearl Company)"
8,"Aquagold Myanmar Co., Ltd"
9,Aung Hlaing Kyawlwa


## Count issues: counts the issues in each row and sums them per group

In [4]:
def count_issues(df, group):
    """Count data-quality issues in ``df`` and total them per ``group`` value.

    For every row the function counts null cells, cells equal to the literal
    string ``'#ERROR!'`` and blank cells (strings that are empty or contain
    only white-space), then sums those per-row counts within each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect. It is not modified.
    group : column label or list of column labels
        Column(s) of ``df`` to group by, e.g. ``'Country'``.

    Returns
    -------
    pandas.DataFrame
        One row per group with the group column(s) followed by 'Nulls',
        'Errors', 'Blanks' and 'Total' (the sum of the three counts).
        Rows whose group value is null are not included, as with pandas'
        default ``groupby`` behaviour.
    """
    def _is_blank(value):
        # Only strings can be blank; NaN/None and numbers never count here.
        return isinstance(value, str) and value.strip() == ''

    keys = group if isinstance(group, list) else [group]

    # Count on the untouched input so the helper columns added below are never
    # themselves scanned for issues.
    per_row = df[keys].assign(
        Nulls=df.isnull().sum(axis=1),
        Errors=(df == '#ERROR!').sum(axis=1),
        # '' alone would miss white-space-only cells (and read_csv already
        # parses zero-length fields as NaN), so test the stripped text.
        Blanks=df.apply(lambda column: column.map(_is_blank)).sum(axis=1),
    )

    # One grouped sum replaces three separate groupbys merged back together.
    merged = per_row.groupby(keys)[['Nulls', 'Errors', 'Blanks']].sum().reset_index()

    merged['Total'] = merged['Nulls'] + merged['Errors'] + merged['Blanks']

    return merged

## Checking the consolidated tables (# of issues), per country

- Some countries have no rows in some tables, so they have no issues recorded for those tables.

In [109]:
# Part 1 stores the country under 'Country or area name'; expose it as
# 'Country' too so every table can be grouped on the same key.
df_part_1_renamed = df_part_1.copy()
df_part_1_renamed['Country'] = df_part_1['Country or area name']

new_df_list = [df_part_1_renamed] + df_list[1:]

issue_columns = ['Nulls', 'Errors', 'Blanks', 'Total']

# Column suffix -> table, in merge order (Part 1 first).
tables_by_suffix = {
    '_1': df_part_1_renamed,
    '_3a': df_part_3a,
    '_3b': df_part_3b,
    '_3c': df_part_3c,
    '_4': df_part_4,
    '_5': df_part_5,
}

# Suffix each table's counts explicitly before merging. Relying on pd.merge's
# `suffixes` only renames columns that collide, which made the final names
# depend on which merge happened next.
counts_per_table = [
    count_issues(table, 'Country').rename(
        columns={column: column + suffix for column in issue_columns}
    )
    for suffix, table in tables_by_suffix.items()
]

# Left joins keep exactly the countries found in Part 1; a country that is
# absent from a later table gets NaN for that table's counts.
merged_issues = reduce(
    lambda left, right: pd.merge(left, right, on='Country', how='left'),
    counts_per_table,
)

# Overall counts per country; skipna treats a missing table as zero issues.
for column in issue_columns:
    suffixed = [column + suffix for suffix in tables_by_suffix]
    merged_issues[column] = merged_issues[suffixed].sum(axis=1, skipna=True)

merged_issues[['Country', 'Nulls', 'Errors', 'Blanks', 'Total']].sort_values(by='Total', ascending=False)

Unnamed: 0,Country,Nulls,Errors,Blanks,Total
34,Ukraine,20964.0,170.0,0.0,21134.0
21,Mongolia,14216.0,71.0,0.0,14287.0
0,Afghanistan,9927.0,480.0,0.0,10407.0
35,United Kingdom,9334.0,126.0,0.0,9460.0
24,Nigeria,9395.0,0.0,0.0,9395.0
23,Myanmar,7848.0,0.0,0.0,7848.0
1,Albania,6476.0,0.0,0.0,6476.0
26,Philippines,4713.0,0.0,0.0,4713.0
31,Tanzania,4545.0,0.0,0.0,4545.0
12,Ghana,4112.0,244.0,0.0,4356.0


## Issues per table

In [None]:
# NOTE(review): table_completeness is also defined earlier in this notebook;
# keep a single definition so a later cell cannot silently shadow the first.
def table_completeness(df):
    """Count data-quality issues in every column of ``df``.

    Three kinds of issue are counted per column:

    * nulls -- cells for which ``DataFrame.isnull`` is true;
    * ``#ERROR!`` -- cells equal to the literal string ``'#ERROR!'``;
    * blanks -- string cells that are empty or contain only white-space.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (in column order) with the columns
        'Field Name', 'Number of Nulls', 'Number of #ERROR!' and
        'Number of Blanks'.
    """
    def _is_blank(value):
        # Only strings can be blank; NaN/None and numbers never count here.
        return isinstance(value, str) and value.strip() == ''

    null_counts = df.isnull().sum()
    error_counts = (df == '#ERROR!').sum()
    # Comparing against '' alone misses white-space-only cells, and
    # pd.read_csv parses zero-length fields as NaN by default, so inspect the
    # stripped text of every string cell instead.
    blank_counts = df.apply(lambda column: column.map(_is_blank)).sum()

    # Combine the three per-column counts into one report frame.
    return pd.DataFrame({
        'Field Name': null_counts.index,
        'Number of Nulls': null_counts.values,
        'Number of #ERROR!': error_counts.values,
        'Number of Blanks': blank_counts.values
    })

In [122]:
print("Nulls per table (all)")
# Total number of null cells in each loaded table, all columns included.
for name, table in df_dict.items():
    null_total = table.isnull().sum().sum()
    print(f'{name}: {null_total}')

Nulls per table (all)
Part 1 - About.csv: 268
Part 3 - Reporting companies' list.csv: 5478
Part 3 - Reporting government entities list.csv: 420
Part 3 - Reporting projects' list.csv: 16398
Part 4 - Government revenues.csv: 358
Part 5 - Company data.csv: 105562


In [121]:
print("Nulls per table")
# Null cells in Part 5 once the "if applicable" fields and the comments
# column are left out.
part_5_optional_columns = ['In-kind volume (if applicable)', 'Unit (if applicable)', 'Comments']
part_5_required = df_part_5.drop(columns=part_5_optional_columns)
part_5_required.isnull().sum().sum()

Nulls per table


35586

In [116]:
print("Blanks per table")
# A blank is a string cell that is empty or holds only white-space. Comparing
# with '' alone never matches here: pd.read_csv parses zero-length fields as
# NaN by default, and white-space-only cells are not equal to ''.
for name, table in df_dict.items():
    blank_total = table.apply(
        lambda column: column.map(
            lambda value: isinstance(value, str) and value.strip() == ''
        )
    ).sum().sum()
    # Double-quoted f-string: reusing the outer quote character inside the
    # braces is a SyntaxError before Python 3.12.
    print(f"{name}: {blank_total}")

Blanks per table
Part 1 - About.csv: 0
Part 3 - Reporting companies' list.csv: 0
Part 3 - Reporting government entities list.csv: 0
Part 3 - Reporting projects' list.csv: 0
Part 4 - Government revenues.csv: 0
Part 5 - Company data.csv: 0


In [117]:
print("#ERROR! per table")
# Total number of cells equal to the literal string '#ERROR!' in each table.
for name, table in df_dict.items():
    error_total = (table == '#ERROR!').sum().sum()
    # Double-quoted f-string: reusing the outer quote character inside the
    # braces is a SyntaxError before Python 3.12.
    print(f"{name}: {error_total}")

#ERROR! per table
Part 1 - About.csv: 59
Part 3 - Reporting companies' list.csv: 993
Part 3 - Reporting government entities list.csv: 137
Part 3 - Reporting projects' list.csv: 117
Part 4 - Government revenues.csv: 0
Part 5 - Company data.csv: 44
