# This notebook is used to add missing headquarters for modern slavery statements extracted by me using modern slavery statements from modern slavery research package

In [1]:
# use if autocompletion is not working
%config Completer.use_jedi = False

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import pandas as pd
import json
import numpy as np
from tqdm import tqdm

from fuzzywuzzy import fuzz

from typing import List, Callable, Tuple
from copy import deepcopy
from sys import getsizeof
from time import sleep
from IPython.display import clear_output

import re

In [4]:
from modern_slavery_registry import get_root_path

In [5]:
PROJECT_PATH = get_root_path()
DATA_PATH = os.path.join(PROJECT_PATH, "data")

# COMPANIES_JSON = "companies_060421.json"

In [6]:
os.listdir(os.path.join(DATA_PATH, "sheets"))

['.ipynb_checkpoints',
 'fuzzy_matches.xlsx',
 'modern_slavery_dataset.csv',
 'subset_data.xlsx',
 'ext_shawn_creds.csv']

## Loading shared dataset from modern-slavery-research package

In [7]:
shared_statements = pd.read_csv(os.path.join(DATA_PATH, "sheets", "modern_slavery_dataset.csv"))

In [8]:
shared_statements.head()

Unnamed: 0,Company ID,Company,Is Publisher,Statement ID,URL,Override URL,Companies House Number,Industry,HQ,Is Also Covered,UK Modern Slavery Act,California Transparency in Supply Chains Act,Australia Modern Slavery Act,Period Covered,Text
0,7676,"""K"" Line Holding Europe Limited",True,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5005018.0,Marine,United Kingdom,False,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
1,28660,"""K"" Line Bulk Shipping (UK) Limited",False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,4830352.0,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
2,28659,"""K"" Line (Europe) Limited",False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5639474.0,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
3,28661,"""K"" Line LNG Shipping Limited",False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
4,28658,Polar LNG Shipping (UK) Limited,False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,2205323.0,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...


In [9]:
shared_statements.isna().sum(axis=0)

Company ID                                          0
Company                                           889
Is Publisher                                      889
Statement ID                                      889
URL                                                 0
Override URL                                    28399
Companies House Number                           6530
Industry                                          889
HQ                                                889
Is Also Covered                                   889
UK Modern Slavery Act                             889
California Transparency in Supply Chains Act      889
Australia Modern Slavery Act                      889
Period Covered                                   1552
Text                                             9795
dtype: int64

In [10]:
shared_statements["Period Covered"].value_counts()

2018         5743
2016         3683
2017-2018    3420
2017         3139
2018-2019    3089
2015-2016    2958
2019         1992
2016-2017    1988
2019-2020     695
2015           64
2020           44
2014           24
2020-2021      17
2015-2017       3
2016-2018       2
2014-2015       2
2018-2020       1
2015-2020       1
Name: Period Covered, dtype: int64

## Loading my statements

In [11]:
my_statements = pd.read_json(os.path.join(DATA_PATH, "subset_data_with_additional_info.json"))

In [12]:
my_statements["additional_info"].isna().sum()

950

## Companies with no additional information in my statements

In [13]:
comp_with_na_add_info = my_statements[my_statements["additional_info"].isna()]["Company"]
len(comp_with_na_add_info), len(comp_with_na_add_info.drop_duplicates())

(950, 895)

## Getting different headquarter values from my statements

In [14]:
comp_from_my_statements = set()

for i, add_info in tqdm(enumerate(my_statements["additional_info"].values), leave=False):
    if add_info is not None:
        for HQ in add_info["headquarters"]: 
            comp_from_my_statements.add(HQ)

comp_from_my_statements

                  

{'',
 'AE',
 'AF',
 'AT',
 'AU',
 'BE',
 'BH',
 'BM',
 'BR',
 'CA',
 'CH',
 'CL',
 'CN',
 'CY',
 'DE',
 'DK',
 'ES',
 'FI',
 'FR',
 'GB',
 'GE',
 'GG',
 'GL',
 'HK',
 'HN',
 'IE',
 'IL',
 'IN',
 'IO',
 'IT',
 'JE',
 'JP',
 'KR',
 'KW',
 'KZ',
 'LU',
 'MX',
 'MY',
 'NL',
 'NO',
 'NZ',
 None,
 'OM',
 'PH',
 'PL',
 'QA',
 'RU',
 'SA',
 'SE',
 'SG',
 'TH',
 'TR',
 'TW',
 'UG',
 'US',
 'VN',
 'ZA'}

## Checking additional info values where headquarter is either None or empty ("")

In [15]:
stop = False
for i, add_info in tqdm(enumerate(my_statements["additional_info"].values), leave=False):
    if add_info is not None:  
        for HQ in add_info["headquarters"]: 
            if HQ==None:
                print(i, add_info)
                stop=True
    if stop: break

stop = False
for i, add_info in tqdm(enumerate(my_statements["additional_info"].values), leave=False):
    if add_info is not None:
        for HQ in add_info["headquarters"]: 
            if HQ=="":
                print(i, add_info)
                stop=True
            
    if stop: break


                  

243 {'id': ['895747'], 'headquarters': [None], 'sectors': {'Professional Services': ['Law firms']}, 'matched_company': 'Akin Gump Strauss Hauer & Feld LLP'}
21 {'id': ['906211'], 'headquarters': [''], 'sectors': {'Military/weapons/security equipment': ['Military/weapons/security equipment: General'], 'Services': ['Security companies']}, 'matched_company': 'A J Walter Aviation Limited'}




## Finding companies with missing headquarters in my statements

Missing headquarters can be where there is no additional information or where `None` or `""` are present as values in headquarters

In [16]:
comp_with_na_hq = set()

for comp, add_info in tqdm(my_statements[["Company", "additional_info"]].values, leave=False):
    if add_info is None:
        comp_with_na_hq.add(comp)
    else:
        for hq in add_info["headquarters"]:
            if hq in [None, "", np.NaN]:
                comp_with_na_hq.add(comp)
                
                
print(f"Found {len(comp_with_na_hq)} with missing hq.")

comp_with_na_hq = pd.DataFrame(comp_with_na_hq, columns=["Company"])

                                        

Found 1214 with missing hq.




## Checking headquarters in shared statements

In [17]:
shared_statements["HQ"].value_counts()

United Kingdom                    20981
Country unknown                    2690
United States                      2182
Germany                             199
Ireland                             132
                                  ...  
Iraq                                  1
British Indian Ocean Territory        1
Uruguay                               1
Bahamas                               1
Sri Lanka                             1
Name: HQ, Length: 80, dtype: int64

## Extracting HQ from shared statements wherever available

In [18]:
comp_with_na_hq_fixed = pd.merge(
    comp_with_na_hq,
    shared_statements[shared_statements["HQ"]!="Country unknown"][["Company", "HQ"]].drop_duplicates().dropna(),
    on="Company",
    how="left")

comp_with_na_hq_fixed = comp_with_na_hq_fixed.set_index("Company").to_dict()["HQ"]

In [19]:
add_info_structure = {'id': [None], 'headquarters': [None], 'sectors': {}, 'matched_company': None}

## Adding missing HQ in my statements

In [20]:
fixed_rows = []
for row in tqdm(my_statements.iterrows(), leave=False):
    row = deepcopy(row[1].to_dict())
    if row["additional_info"] is None:
        row["additional_info"] = add_info_structure
    
    if len(set(["", None]).intersection(row["additional_info"]["headquarters"])) > 0:
        row["additional_info"]["headquarters"] = [comp_with_na_hq_fixed[row["Company"]]]
    
    fixed_rows.append(row)

                           

In [21]:
comp_with_na_hq = set()
comp_with_hq = set()

for row in tqdm(fixed_rows, leave=False):
    for hq in row["additional_info"]["headquarters"]:
        if hq in [None, "", np.NaN]:
            comp_with_na_hq.add(row["Company"])
        else:
            comp_with_hq.add(row["Company"])
                      
print(f"Found {len(comp_with_na_hq)} companies with missing hq.")
print(f"Found {len(comp_with_hq)} companies with hq.")

                                        

Found 75 companies with missing hq.
Found 8102 companies with hq.




In [22]:
print(f"Total {len(my_statements['Company'].unique())} unique companies")

Total 8177 unique companies


In [23]:
with open(os.path.join(DATA_PATH, 'subset-data-with-additional-info-v2.json'), 'w') as f:
    json.dump(fixed_rows, f)

In [24]:
my_statements = pd.read_json(os.path.join(DATA_PATH, "subset-data-with-additional-info-v2.json"))

In [25]:
headquarters = set()
for row in my_statements["additional_info"].values:
    for hq in row["headquarters"]:
        headquarters.add(hq)

In [26]:
headquarters

{'AE',
 'AF',
 'AT',
 'AU',
 'Australia',
 'Austria',
 'BE',
 'BH',
 'BM',
 'BR',
 'Belgium',
 'CA',
 'CH',
 'CL',
 'CN',
 'CY',
 'Canada',
 'DE',
 'DK',
 'ES',
 'FI',
 'FR',
 'Finland',
 'GB',
 'GE',
 'GG',
 'GL',
 'Germany',
 'HK',
 'HN',
 'Hong Kong',
 'IE',
 'IL',
 'IN',
 'IO',
 'IT',
 'India',
 'Ireland',
 'JE',
 'JP',
 'Japan',
 'KR',
 'KW',
 'KZ',
 'LU',
 'Luxembourg',
 'MX',
 'MY',
 'NL',
 'NO',
 'NZ',
 'Netherlands',
 None,
 'OM',
 'PH',
 'PL',
 'QA',
 'RU',
 'SA',
 'SE',
 'SG',
 'Spain',
 'Sweden',
 'TH',
 'TR',
 'TW',
 'Taiwan',
 'UG',
 'US',
 'United Arab Emirates',
 'United Kingdom',
 'United States',
 'VN',
 'ZA'}

In [27]:
len(my_statements)

9993