# This notebook is used to add missing sectors for modern slavery statements extracted by me using modern slavery statements from modern slavery research package

In [1]:
# use if autocompletion is not working
%config Completer.use_jedi = False

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import pandas as pd
import json
import numpy as np
from tqdm import tqdm

from fuzzywuzzy import fuzz

from typing import List, Callable, Tuple
from copy import deepcopy
from sys import getsizeof
from time import sleep
from IPython.display import clear_output

import re

In [4]:
from modern_slavery_registry import get_root_path

In [5]:
PROJECT_PATH = get_root_path()
DATA_PATH = os.path.join(PROJECT_PATH, "data")

# COMPANIES_JSON = "companies_060421.json"

In [6]:
os.listdir(os.path.join(DATA_PATH, "sheets"))

['.ipynb_checkpoints',
 'fuzzy_matches.xlsx',
 'modern_slavery_dataset.csv',
 'subset_data.xlsx',
 'ext_shawn_creds.csv']

## Loading shared dataset from modern-slavery-research package

In [7]:
shared_statements = pd.read_csv(os.path.join(DATA_PATH, "sheets", "modern_slavery_dataset.csv"))

In [8]:
shared_statements.head()

Unnamed: 0,Company ID,Company,Is Publisher,Statement ID,URL,Override URL,Companies House Number,Industry,HQ,Is Also Covered,UK Modern Slavery Act,California Transparency in Supply Chains Act,Australia Modern Slavery Act,Period Covered,Text
0,7676,"""K"" Line Holding Europe Limited",True,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5005018.0,Marine,United Kingdom,False,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
1,28660,"""K"" Line Bulk Shipping (UK) Limited",False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,4830352.0,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
2,28659,"""K"" Line (Europe) Limited",False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5639474.0,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
3,28661,"""K"" Line LNG Shipping Limited",False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
4,28658,Polar LNG Shipping (UK) Limited,False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,2205323.0,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...


In [9]:
shared_statements.isna().sum(axis=0)

Company ID                                          0
Company                                           889
Is Publisher                                      889
Statement ID                                      889
URL                                                 0
Override URL                                    28399
Companies House Number                           6530
Industry                                          889
HQ                                                889
Is Also Covered                                   889
UK Modern Slavery Act                             889
California Transparency in Supply Chains Act      889
Australia Modern Slavery Act                      889
Period Covered                                   1552
Text                                             9795
dtype: int64

## Sectors available in AWS shared data

In [10]:
shared_statements["Industry"].value_counts()

Industry unknown                                  6218
Professional Services                             1436
Commercial Services & Supplies                    1162
Specialty Retail                                  1028
Construction & Engineering                         909
                                                  ... 
Biotechnology                                       36
Wireless Telecommunication Services                 35
Thrifts & Mortgage Finance                          16
Health Care Technology                              16
Mortgage Real Estate Investment Trusts (REITs)       2
Name: Industry, Length: 71, dtype: int64

Checking again because of Industry unknown flag

In [11]:
np.unique(shared_statements["Industry"].astype(str))

array(['Aerospace & Defense', 'Air Freight & Logistics', 'Airlines',
       'Auto Components', 'Automobiles', 'Banks', 'Beverages',
       'Biotechnology', 'Building Products', 'Capital Markets',
       'Charity/Non-Profit', 'Chemicals',
       'Commercial Services & Supplies', 'Communications Equipment',
       'Construction & Engineering', 'Construction Materials',
       'Consumer Finance', 'Containers & Packaging', 'Distributors',
       'Diversified Consumer Services', 'Diversified Financial Services',
       'Diversified Telecommunication Services', 'Electric Utilities',
       'Electrical Equipment',
       'Electronic Equipment, Instruments & Components',
       'Energy Equipment & Services',
       'Equity Real Estate Investment Trusts (REITs)',
       'Food & Staples Retailing', 'Food Products', 'Gas Utilities',
       'Health Care Equipment & Supplies',
       'Health Care Providers & Services', 'Health Care Technology',
       'Hotels, Restaurants & Leisure', 'Household Dur

In [12]:
shared_statements_sectors = shared_statements[shared_statements["Industry"]!="Industry unknown"][["Company", "Industry"]].dropna().drop_duplicates()
shared_statements_sectors

Unnamed: 0,Company,Industry
0,"""K"" Line Holding Europe Limited",Marine
1,"""K"" Line Bulk Shipping (UK) Limited",Marine
2,"""K"" Line (Europe) Limited",Marine
3,"""K"" Line LNG Shipping Limited",Marine
4,Polar LNG Shipping (UK) Limited,Marine
...,...,...
27519,telent Technology Services Limited,IT Services
27521,voestalpine High Performance Metals UK Limited,Metals & Mining
27522,voestalpine Metsec plc,Metals & Mining
27524,wnDirect Limited,Air Freight & Logistics


In [13]:
shared_statements_sectors["Company"].nunique()

12884

## Loading my statements

In [14]:
my_statements = pd.read_json(os.path.join(DATA_PATH, "subset-data-with-additional-info-v2.json"))

In [15]:
my_statements.head()

Unnamed: 0,URL,Company,final_statement,years_with_and_without_act,years_with_act,final_statement_cleaned,len(final_statement_cleaned),additional_info
0,https://1spatial.com/who-we-are/legal/modern-s...,1Spatial Plc,1Spatial Modern Slavery Act Policy Statement H...,[2015],[2015],home solution government boundary law enforcem...,2128,"{'id': ['903081'], 'headquarters': ['GB'], 'se..."
1,https://www.shazans.com/slavery-and-human-traf...,1Stop Halal Limited,Slavery and Human Trafficking Statement – Shaz...,"[2015, 2018]","[2015, 2018]",shazans shazan food continue monitor covid 19 ...,1840,"{'id': ['903810'], 'headquarters': ['GB'], 'se..."
2,https://www.business-humanrights.org/sites/def...,1st Step Solutions Limited,7/28/2019 Modern Slavery Statement 2018 - 1st ...,"[2015, 2018]","[2015, 2018]",28 2019 2018 statement make pursuant sec 54 20...,1843,"{'id': ['900634'], 'headquarters': ['GB'], 'se..."
3,https://www.2agriculture.com/wp-content/upload...,2 Agriculture Limited,fh Modern Slavery Act 2015: slavery and human ...,"[2015, 2018]","[2015, 2018]",fh 2015 introduction uk act require business s...,1372,"{'id': ['901400'], 'headquarters': ['GB'], 'se..."
4,https://www.2agriculture.com/wp-content/upload...,2 Agriculture Limited,1 Modern Slavery Act 2015: slavery and human t...,"[2015, 2017]","[2015, 2017]",2015 introduction uk act require business stat...,1457,"{'id': ['901400'], 'headquarters': ['GB'], 'se..."


In [16]:
my_statements["additional_info"].iloc[0]

{'id': ['903081'],
 'headquarters': ['GB'],
 'sectors': {'Technology': ['Technology: General']},
 'matched_company': '1Spatial Plc'}

## Is there any company with multiple sectors?

In [17]:
for add_info in my_statements["additional_info"].values:
    if len(add_info["sectors"]) > 1:
        print(add_info)
        break

{'id': ['906211'], 'headquarters': [None], 'sectors': {'Military/weapons/security equipment': ['Military/weapons/security equipment: General'], 'Services': ['Security companies']}, 'matched_company': 'A J Walter Aviation Limited'}


Note: sectors are stored as key-value pairs. Need to fix sectors where values contains extact key keyword. 

For ex: {'id': ['906211'], 'headquarters': [None], 'sectors': {'Military/weapons/security equipment': ['Military/weapons/security equipment: General'], 'Services': ['Security companies']}, 'matched_company': 'A J Walter Aviation Limited'}

### Quick fix for above problem

In [18]:
updated_add_infos = []
for add_info in tqdm(my_statements["additional_info"].values, leave=False):
    
    updated_add_info = deepcopy(add_info)
    if updated_add_info["sectors"] is not None:
        for k, v in updated_add_info["sectors"].items():
            new_v = []
            for v_i in v: 
                new_v.append(" ".join(v_i.replace(f"{k}:","").split()))
            updated_add_info["sectors"][k] = list(set(new_v))

    updated_add_infos.append(updated_add_info)

                                                      

In [19]:
for add_info in updated_add_infos:
    if len(add_info["sectors"]) > 1:
        print(add_info)
        break

{'id': ['906211'], 'headquarters': [None], 'sectors': {'Military/weapons/security equipment': ['General'], 'Services': ['Security companies']}, 'matched_company': 'A J Walter Aviation Limited'}


In [20]:
if "additional_info" in my_statements.columns: my_statements.drop("additional_info", axis=1, inplace=True)

In [21]:
my_statements["additional_info"] = updated_add_infos
del updated_add_infos, updated_add_info

In [22]:
my_statements["additional_info"].iloc[0]["sectors"]

{'Technology': ['General']}

## Sectors available from my statements

In [23]:
sectors = dict()
for add_info in tqdm(my_statements["additional_info"].values, leave=False):
    for k,v in add_info["sectors"].items():
        if k in sectors:
            sectors[k]+=deepcopy(v)
        else:
            sectors[k]=deepcopy(v)
            
for k, v in tqdm(sectors.items(), leave=False):
    sectors[k] = np.unique(v)

                                        

In [24]:
sectors

{'Technology': array(['General', 'Internet & social media', 'Software & Services',
        'Technology, telecom & electronics'], dtype='<U33'),
 'Agriculture/food/beverage/tobacco/fishing': array(['Agricultural machinery', 'Agriculture & livestock',
        'Chocolate & cocoa', 'Coffee', 'Fishing', 'Food & beverage',
        'General', 'Tea', 'Tobacco'], dtype='<U23'),
 'Services': array(['Call centre', 'Catering & food services',
        'Cleaning & maintenance', 'Education companies', 'General',
        'Printing & copying', 'Recruitment agencies', 'Security companies'],
       dtype='<U24'),
 'Real estate': array(['General', 'Property development', 'Property management',
        'Real estate sales'], dtype='<U20'),
 'Consumer products/retail': array(['Cosmetics', 'Department stores', 'Electrical appliance',
        'General', 'Household products', 'Jewellery',
        'Luggage, backpacks & bags', 'Office equipment', 'Perfume',
        'Photographic', 'Retail', 'Supermarkets & grocer

In [25]:
my_statements["additional_info"].iloc[0]["sectors"]

{'Technology': ['General']}

## Companies with no sector information from my statements

In [26]:
my_statements["additional_info"].isna().sum()

0

A dictionary with null values is inserted by default where no actual additional information is available.

In [27]:
comp_with_na_sectors = []
for row in my_statements.iterrows():
    row = row[1].to_dict()
    if len(row["additional_info"]["sectors"]) == 0:
        comp_with_na_sectors.append(row["Company"])

print(f"Found {len(comp_with_na_sectors)} companies with no sectors.")
print(f"Found {len(set(comp_with_na_sectors))} unique companies with no sectors.")

Found 2548 companies with no sectors.
Found 2371 unique companies with no sectors.


In [28]:
comp_with_na_sectors = pd.DataFrame(set(comp_with_na_sectors), columns=["Company"])
comp_with_na_sectors

Unnamed: 0,Company
0,Hendy Automotive Limited
1,Celsus Group Limited
2,Can (Offshore) Limited
3,The Education Alliance
4,"Bemis Company, Inc."
...,...
2366,W R Ferris Limited
2367,Scot J C B Limited
2368,Chelmsford Star Co-operative Society Ltd
2369,Travelers Management Limited


## Merging sector information from AWS shared data into my statements for companies with no sector information

In [29]:
comp_with_na_sectors_fixed = pd.merge(
    comp_with_na_sectors,
    shared_statements_sectors,
    on="Company",
    how="inner").drop_duplicates()

comp_with_na_sectors_fixed["sectors"] = comp_with_na_sectors_fixed["Industry"].apply(lambda x: {x:[]})
comp_with_na_sectors_fixed.drop("Industry", axis=1, inplace=True)
print(f"Found sectors for {len(comp_with_na_sectors_fixed)} companies.")

comp_with_na_sectors_fixed.set_index("Company", inplace=True)
comp_with_na_sectors_fixed.head()

Found sectors for 606 companies.


Unnamed: 0_level_0,sectors
Company,Unnamed: 1_level_1
"Bemis Company, Inc.",{'Containers & Packaging': []}
Oxford Health NHS Foundation Trust,{'Public Entities': []}
Virgin Active Limited,"{'Hotels, Restaurants & Leisure': []}"
Asda Stores Limited,{'Food & Staples Retailing': []}
Lincolnshire Co-operative Limited,{'Commercial Services & Supplies': []}


## Adding additional sectors found to my statement

In [30]:
updated_add_infos = []

for row in tqdm(my_statements.iterrows(), leave=False):
    
    row = deepcopy(row[1].to_dict())
    
    
    if len(row["additional_info"]["sectors"])==0 and row["Company"] in comp_with_na_sectors_fixed.index:
        row["additional_info"]["sectors"] = comp_with_na_sectors_fixed.loc[row["Company"]].values[0]
    
    updated_add_infos.append(row["additional_info"])

                           

In [31]:
if "additional_info" in my_statements.columns: my_statements.drop("additional_info", axis=1, inplace=True)

In [32]:
my_statements["additional_info"] = updated_add_infos
del updated_add_infos

In [33]:
comp_with_na_sectors = []
for row in my_statements.iterrows():
    row = row[1].to_dict()
    if len(row["additional_info"]["sectors"]) == 0:
        comp_with_na_sectors.append(row["Company"])

print(f"Found {len(comp_with_na_sectors)} companies with no sectors.")
print(f"Found {len(set(comp_with_na_sectors))} unique companies with no sectors.")

Found 1765 companies with no sectors.
Found 1765 unique companies with no sectors.


In [36]:
rows = [row[1].to_dict() for row in my_statements.iterrows()]

In [38]:
with open(os.path.join(DATA_PATH, 'subset-data-with-additional-info-v3.json'), 'w') as f:
    json.dump(rows, f)

In [39]:
len(rows)

9993