In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from os import getcwd
import numpy as np
from tqdm import tqdm
import re
from typing import Tuple, Dict, Sequence, List, Union

In [3]:
from modern_slavery_registry.text_parser import clean_text

In [4]:
DATA_PATH = getcwd()
DATA_PATH = DATA_PATH.replace("notebooks", "")
DATA_PATH += "data"
SHEETS_PATH = DATA_PATH + "\\sheets"

In [5]:
prof_data = pd.read_json(f"{SHEETS_PATH}\\modern_slavery_dataset_prof.json")
cols_to_keep = ["Company ID", "Company", "Statement ID", "URL", "Text"]
prof_data.drop_duplicates(inplace=True)
prof_data.dropna(subset=["Company ID"], inplace=True)
prof_data.reset_index(drop=True, inplace=True)
prof_data = prof_data[cols_to_keep]
cols_type = ["int32", str, float, str, str]
for i, col in enumerate(prof_data.columns):
    prof_data[col] = prof_data[col].astype(cols_type[i])




my_data = pd.read_excel(f"{SHEETS_PATH}\\modern_slavery_dataset_nitin.xlsx")
cols_to_keep = ["Company ID", "Company", "Statement ID", "URL", "statement"]
my_data.drop_duplicates(inplace=True)
my_data.dropna(subset=["Company ID"], inplace=True)
my_data.reset_index(drop=True, inplace=True)
my_data = my_data[cols_to_keep]
for i, col in enumerate(my_data.columns):
    my_data[col] = my_data[col].astype(cols_type[i])

In [6]:
my_data.head(1)

Unnamed: 0,Company ID,Company,Statement ID,URL,statement
0,7676,"""K"" Line Holding Europe Limited",35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,66 99 “K” Line Holding (Europe) Limited kM K L...


In [7]:
prof_data.head(1)

Unnamed: 0,Company ID,Company,Statement ID,URL,Text
0,7676,"""K"" Line Holding Europe Limited",35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...


In [8]:
print(f"Length, prof. dataset : {len(prof_data)}, own dataset : {len(my_data)}")

Length, prof. dataset : 28361, own dataset : 27527


<font color="blue" size="5"> Combining both datasets 

In [9]:
combine_data = pd.merge(prof_data,
                        my_data,
                        on = ["Company ID", "Company", "Statement ID", "URL"], 
                        how = "outer")
combine_data.fillna("#NA", inplace=True)
combine_data["Text"] = [text if text != "nan" else "#NA" for text in combine_data["Text"]]
combine_data["statement"] = [text if text != "nan" else "#NA" for text in combine_data["statement"]]
combine_data["final_text"] = [
    combine_data["Text"].iloc[i] if combine_data["Text"].iloc[i] != "#NA" else combine_data["statement"].iloc[i] for i in range(
        len(combine_data))]
cols_to_drop = ["Text", "statement"]
for col in cols_to_drop:
    combine_data.drop(col, axis=1, inplace=True)

In [10]:
combine_data.head(1)

Unnamed: 0,Company ID,Company,Statement ID,URL,final_text
0,7676,"""K"" Line Holding Europe Limited",35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...


In [11]:
clean_text(combine_data["final_text"].iloc[0], 
           remove_urls=True, 
           remove_special_chars=True, 
           remove_digits=True,
           to_lower=True, 
           remove_stopwords=True)

'k line holdinc europe ltd modern slavery act transparency statement published march uk modern slavery act requires large entities carrying business uk publish statement detailing efforts combat human trafficking modern day slavery statement relates actions activities financial year april march part shipping industry k line group recognizes responsibility take robust approach slavery human trafficking k line group absolutely committed preventing slavery human trafficking corporate activities ensuring supply chains free slavery human trafficking organizational structure activities k line group global shipping entity headquartered tokyo japan network offices around globe including united kingdom uk group comprises k holding europe ltd k line europe ltd k line bulk shipping uk ltd k line lng shipping ltd polar lng shipping uk ltd k line group uk business units activities include car carrier dry bulk lng shipping management training relevant policies k line group companies adhere charter c

In [12]:
combine_data["final_text"].iloc[0]

'K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery Act Transparency Statement\nPublished: 22 March 2019\nThe UK Modern Slavery Act 2015 requires large entities carrying on a business in the UK to publish a\nstatement detailing their efforts (if any) to combat human trafficking and modern-day slavery. This statement\nrelates to actions and activities during the financial year 1 April 2018 to 31 March 2019.\nAs part of the shipping industry, "K" Line Group recognizes that it has a responsibility to take a robust\napproach to slavery and human trafficking.\n"K" Line Group is absolutely committed to preventing slavery and human trafficking in its corporate activities,\nand to ensuring that its supply chains are free from slavery and human trafficking.\nOrganizational structure and activities\nThe "K" Line Group is a global shipping entity headquartered in Tokyo, Japan but with a network of offices\naround the globe including the United Kingdom. The UK Group comprises "K" Holding (Europe) Ltd, 

In [39]:
def find_company_name_occurance(company_name: str,
                                text: str) -> Dict:
    """
    Find number of instances of company name in text.
    
    Company name is broken down into sequence of words.
    
    Parameters
    ----------
    company_name: str
    
    text: str
    
    Returns
    -------
    mapping: Dict, {str:int}
    
    Examples
    --------
    >>> find_company_name_occurance("General motors", 
        "General Motors Company(GM) is an American "
        "multinational corporation headquartered in Detroit. " 
        "General Motors manufactures vehicles in several countries.")
    {"general": 2, "general motors": 2}
    """
    company_name = clean_text(company_name)
    text = clean_text(text)
    company_name = company_name.split()
    mapping = {}
    for i in range(1, len(company_name)+1):
        name = company_name[:i]
        mapping[" ".join(name)] = len(re.findall(r"\b" + " ".join(name) + r"\b", text))
    return mapping

In [43]:
from typing import List, Tuple, Dict
def match_company(company : str, 
                  companies : List[str]) -> Tuple[bool, Dict, float, str]:

    best_mapping = None
    best_match_score = 0
    best_match_company = None
    is_present = False
    for i, _ in enumerate(companies):
        mapping = find_company_name_occurance(company, _)
        for k, v in mapping.items():
            if v > 0:
                is_present = True
                if len(k) > best_match_score:
                    best_match_score = len(k)
                    best_match_company = _
                    best_mapping = mapping
                    
    company = clean_text(company)
    return is_present, best_mapping, best_match_score/ len(company), best_match_company 

In [20]:
imp_comp = pd.read_excel(f"{DATA_PATH}\\sheets\\Listofcompanies.xlsx")

In [21]:
imp_comp.head()

Unnamed: 0,LegalNameoftheEntity,Has A Statement?
0,AAC Technologies Holdings Inc,
1,Acer Inc,
2,Advanced Micro Devices Inc (AMD),
3,AirTouch Communications Inc,
4,Astro Pyrotechnics Inc,


In [34]:
results = []
for company in tqdm(imp_comp["LegalNameoftheEntity"]):
    results.append(match_company(company, 
                                 companies=combine_data["Company"]))

100%|████████████████████████████████████████████████████████████████████████████████| 105/105 [03:44<00:00,  2.14s/it]


In [51]:
results_df = pd.DataFrame(results, 
                          columns=["match_found", 
                                   "mapping", 
                                   "best_match_score", 
                                   "best_match_company"])
results_df.head(10)
results_df = pd.concat([imp_comp["LegalNameoftheEntity"], results_df], axis = 1)
results_df.to_excel(f"{DATA_PATH}\\sheets\\Listofcompanies_filled.xlsx",
                    index=False)

In [58]:
combine_data["final_text"].iloc[7157]

'None'

In [59]:
for i in range(len(combine_data)):
    if combine_data["final_text"].iloc[i] == "None":
        print(i)
        break

5


In [60]:
find_company_name_occurance(combine_data["Company"].iloc[7157], combine_data["final_text"].iloc[7157])

{'construction': 0,
 'construction partnership': 0,
 'construction partnership uk': 0,
 'construction partnership uk limited': 0}