# Check whether extracted statements are Modern Slavery Statements

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
from os import getcwd, path
import urllib
import pickle
from time import sleep
import functools
import glob
from func_timeout import func_timeout, FunctionTimedOut
import re
from langdetect import detect
from langdetect import lang_detect_exception

In [4]:
# Project layout. Every path below is assembled with literal backslashes.
PROJECT_NAME = 'modern_slavery_registry'
# Keep the part of the working directory that precedes the project folder
# name, then append the name itself.
# NOTE(review): str.find returns -1 when the name is absent, in which case the
# slice just drops the last character of the cwd -- confirm the notebook is
# always started from inside the project tree.
_cwd = getcwd()
PROJECT_PATH = _cwd[:_cwd.find(PROJECT_NAME)] + PROJECT_NAME
DATA_PATH = PROJECT_PATH + "\\data"
PICKLE_PATH = DATA_PATH + "\\pickles"
PDF_PATH = DATA_PATH + "\\pdfs"
TEXT_PATH = DATA_PATH + "\\texts"
SLAVERY_TEXT_PATH = DATA_PATH + "\\slavery_texts"
MAX_DOWNLOAD_TIME = 5

In [5]:
def save_pickle(obj, file_name):
    """Serialize ``obj`` with pickle into ``<file_name>.pickle``.

    Args:
        obj: Any picklable object.
        file_name: Destination path without the ``.pickle`` extension; the
            extension is appended here. An existing file is overwritten.
    """
    # The context manager flushes and closes the handle even if dump() raises.
    with open(f"{file_name}.pickle", "wb") as pickle_file:
        pickle.dump(obj, file=pickle_file)
    
def load_pickle(file_name):
    """Load and return the object stored in ``<file_name>.pickle``.

    Args:
        file_name: Source path without the ``.pickle`` extension; the
            extension is appended here.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code -- only load files this
    # project wrote itself.
    # The context manager closes the handle even if load() raises.
    with open(f"{file_name}.pickle", "rb") as pickle_file:
        return pickle.load(file=pickle_file)

In [6]:
def _load_or_create_sid_list(pickle_name):
    """Return the SID list stored at ``pickle_name``, creating an empty one if absent."""
    if not path.exists(f"{pickle_name}.pickle"):
        # First run: persist an empty list so the pickle exists next time.
        sid_list = []
        save_pickle(sid_list, file_name=pickle_name)
        print(f"{pickle_name} created")
        return sid_list
    sid_list = load_pickle(pickle_name)
    print(f"{pickle_name} already exists")
    return sid_list


# SIDs whose text was classified as a modern slavery statement.
pickle_name = f"{PICKLE_PATH}\\slavery_text_SIDs"
slavery_text_SIDs = _load_or_create_sid_list(pickle_name)

# SIDs whose text was classified as something else.
pickle_name = f"{PICKLE_PATH}\\non_slavery_text_SIDs"
non_slavery_text_SIDs = _load_or_create_sid_list(pickle_name)

E:\Projects\modern_slavery_registry\data\pickles\slavery_text_SIDs already exists
E:\Projects\modern_slavery_registry\data\pickles\non_slavery_text_SIDs already exists


In [7]:
# Load the registry export into `df`.
# The backslash before the file name is escaped explicitly: "\m" is not a
# valid escape sequence, so the unescaped form only worked because Python
# keeps unknown escapes verbatim, and it triggers an invalid-escape warning
# on recent Python versions. The resulting path string is unchanged.
df = pd.read_csv(f"{DATA_PATH}\\modernslaveryregistry-2020-09-14.csv")
df.head()

Unnamed: 0,Company ID,Company,Is Publisher,Statement ID,URL,Override URL,Companies House Number,Industry,HQ,Is Also Covered,UK Modern Slavery Act,California Transparency in Supply Chains Act,Australia Modern Slavery Act,Period Covered
0,7676,"""K"" Line Holding Europe Limited",True,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5005018.0,Marine,United Kingdom,False,True,False,False,2018-2019
1,28660,"""K"" Line Bulk Shipping (UK) Limited",False,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,4830352.0,Marine,United Kingdom,True,True,False,False,2018-2019
2,28659,"""K"" Line (Europe) Limited",False,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5639474.0,Marine,United Kingdom,True,True,False,False,2018-2019
3,28661,"""K"" Line LNG Shipping Limited",False,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,,Marine,United Kingdom,True,True,False,False,2018-2019
4,28658,Polar LNG Shipping (UK) Limited,False,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,2205323.0,Marine,United Kingdom,True,True,False,False,2018-2019


In [8]:
# Distinct statement IDs in the registry (np.unique returns them sorted as a
# 1-D array) and how many there are.
UNIQUE_SIDs = np.unique(df['Statement ID'].values)
TOTAL_UNIQUE_SIDs = UNIQUE_SIDs.size
print("Total enteries : {}, total unique statements in main DB : {}".format(
    df.shape[0], TOTAL_UNIQUE_SIDs))

Total enteries : 27531, total unique statements in main DB : 17799


In [9]:
def get_pdf_url_from_sid(sid):
    """Look up the 'URL' value recorded for statement ``sid`` in the global ``df``.

    If several rows share the Statement ID, the first row's URL is returned.
    Raises IndexError when no row matches.
    """
    matching_urls = df.loc[df['Statement ID'] == sid, 'URL']
    return matching_urls.values[0]

In [10]:
def get_sid_from_filename(file_name):
    """Return the first run of digits in the base name of ``file_name`` as an int.

    Directory components are ignored. Raises IndexError when the base name
    contains no digits.
    """
    base_name = path.basename(file_name)
    digit_runs = re.findall(r"[\d]+", base_name)
    return int(digit_runs[0])

In [11]:
def clean_text(text):
    """Return ``text`` with noise characters blanked and stray single characters removed.

    Newlines, tabs, backslashes and the two-character sequence ``b'`` are each
    replaced by a space. The result is split on whitespace and re-joined with
    single spaces, keeping a token only if it is longer than one character or
    consists of digits.
    """
    for noise in ("\n", "\t", "\\", "b'"):
        text = text.replace(noise, " ")
    return " ".join(
        token for token in text.split()
        if len(token) > 1 or token.isdigit()
    )

In [12]:
def save_text(text, sid):
    """Write ``text`` to ``SID-<sid>.txt`` under ``TEXT_PATH`` as UTF-8.

    Args:
        text: String to write; an existing file for the same SID is overwritten.
        sid: Statement ID used to build the file name.
    """
    file_name = f"SID-{sid}"
    # ``with`` closes the handle even when write() raises, so a failed write
    # no longer leaks an open file.
    with open(f"{TEXT_PATH}//{file_name}.txt", 'w', encoding='utf-8') as file_obj:
        file_obj.write(text)

In [13]:
def save_slavery_text(text, sid):
    """Write ``text`` to ``SID-<sid>.txt`` under ``SLAVERY_TEXT_PATH`` as UTF-8.

    Args:
        text: String to write; an existing file for the same SID is overwritten.
        sid: Statement ID used to build the file name.
    """
    file_name = f"SID-{sid}"
    # ``with`` closes the handle even when write() raises, so a failed write
    # no longer leaks an open file.
    with open(f"{SLAVERY_TEXT_PATH}//{file_name}.txt", 'w', encoding='utf-8') as file_obj:
        file_obj.write(text)

In [14]:
def print_loop_no(total_sids, sid, itr, type_, type_itr):
    """Print a one-line progress banner for the statement being handled.

    ``type_`` picks the status label: "Success" and "Failed" are shown as
    such, any other value is shown as "Processed". ``type_itr`` is the counter
    printed after the label.
    """
    status = (
        "Success" if type_ == "Success"
        else "Failed" if type_ == "Failed"
        else "Processed"
    )
    rule = '-' * 8
    print(f"{rule} Running - {itr}/{total_sids} , SID - {sid} - {status} - {type_itr} {rule}")

In [15]:
# Collect every .txt file directly under TEXT_PATH and report how many exist.
text_files = glob.glob(TEXT_PATH + "\\*.txt")
TOTAL_TEXT_FILES = len(text_files)
print("Found {} text files".format(TOTAL_TEXT_FILES))

Found 14610 text files


In [16]:
def open_text_file(text_file_name):
    """Return a read-only, UTF-8 text handle for ``text_file_name``.

    The handle is returned open; closing it is left to the caller.
    """
    handle = open(text_file_name, mode="r", encoding="utf8")
    return handle

In [67]:
def is_modern_slavery_text(text):
    """Decide whether ``text`` reads like a modern slavery statement.

    The text is lower-cased and every run of non-alphanumeric characters is
    collapsed to a single space. The result is True when at least one wanted
    keyword occurs and none of the error-page phrases do.
    """
    wanted_keywords = ("slavery", "human trafficking", "child labour")
    unwanted_keywords = ("page not found", "403 forbidden", "404 not found")

    # Stripping special characters lets e.g. "human-trafficking" match
    # "human trafficking".
    normalised = re.sub('[^A-Za-z0-9]+', ' ', text.lower())
    normalised = " ".join(normalised.split())

    mentions_topic = any(keyword in normalised for keyword in wanted_keywords)
    is_error_page = any(keyword in normalised for keyword in unwanted_keywords)
    return mentions_topic and not is_error_page

In [60]:
# Start from two fresh, independent lists. Rebinding the names here drops any
# SIDs they held before this cell ran.
slavery_text_SIDs, non_slavery_text_SIDs = [], []

In [None]:
# Classify every extracted text file and record its SID in one of the two
# result lists. Texts judged to be modern slavery statements are also copied
# to SLAVERY_TEXT_PATH via save_slavery_text.
#
# SIDs already present in either list are tracked in a set, so the
# "seen before?" check is a constant-time lookup instead of concatenating and
# scanning both lists on every iteration.
processed_SIDs = set(slavery_text_SIDs) | set(non_slavery_text_SIDs)
for i, text_file_name in enumerate(text_files):
    sid = get_sid_from_filename(file_name=text_file_name)
    if sid in processed_SIDs:
        # Already classified: only report progress.
        print_loop_no(total_sids=TOTAL_TEXT_FILES,
                      sid=sid,
                      itr=i+1,
                      type_="Processed",
                      type_itr=len(slavery_text_SIDs) + len(non_slavery_text_SIDs))
        continue

    # Read inside a context manager so each handle is closed immediately
    # rather than being left for the garbage collector.
    with open_text_file(text_file_name) as text_file:
        text = text_file.read()

    if is_modern_slavery_text(text):
        save_slavery_text(text=text, sid=sid)
        slavery_text_SIDs.append(sid)
        outcome, outcome_count = "Success", len(slavery_text_SIDs)
    else:
        non_slavery_text_SIDs.append(sid)
        outcome, outcome_count = "Failed", len(non_slavery_text_SIDs)
    processed_SIDs.add(sid)
    print_loop_no(total_sids=TOTAL_TEXT_FILES,
                  sid=sid,
                  itr=i+1,
                  type_=outcome,
                  type_itr=outcome_count)

In [62]:
# Report how many statements ended up in each class, and the overall total.
n_slavery = len(slavery_text_SIDs)
n_non_slavery = len(non_slavery_text_SIDs)
print(f"Total slavery statements : {n_slavery}")
print(f"Total non-slavery statements : {n_non_slavery}")
print(f"Total statements : {n_slavery + n_non_slavery}")

Total slavery statements : 11660
Total non-slavery statements : 2950
Total statements : 14610


In [98]:
# Persist both result lists under PICKLE_PATH, one pickle per list, slavery
# list first.
for pickle_name, sid_list in (
    ("slavery_text_SIDs", slavery_text_SIDs),
    ("non_slavery_text_SIDs", non_slavery_text_SIDs),
):
    save_pickle(obj=sid_list, file_name=f"{PICKLE_PATH}//{pickle_name}")

In [97]:
# Spot check: show the first 1000 characters of a randomly chosen text that
# was classified as a slavery statement.
sid = np.random.choice(slavery_text_SIDs)
print(f"SID - {sid}")
# Read through a context manager so the file handle is closed right away;
# the bare last expression keeps the preview as the cell output.
with open_text_file(f"{TEXT_PATH}\\SID-{sid}.txt") as text_file:
    preview = text_file.read()[:1000]
preview

SID - 40676


'JML Social audit policy SLAVERY AND HUMAN TRAFFICKING STATEMENT 2019/2020 JML (John Mills Ltd) is a UK based company who specialise in sourcing and supplying TV products to retailers and consumers. We operate a global supply chain, managed by our Product Development team in the UK. Most of our products are sourced from China which represents the area of greatest risk of exposure to, and association with slavery and human trafficking. Procedures in place include polices covering corporate social responsibility, ethical trading, working conditions, child labour, Health & Safety and training for relevant people within the business. JML are committed to ensure the wellbeing of all involved in the manufacture and supply of our products. We recognize the ETI base code as the minimum level of compliance and require all our factories to sign up to our code of conduct to ensure procedures are in place to cover the following e Employment is freely chosen e Freedom of association and the right t

In [94]:
# Spot check: show the first 1000 characters of a randomly chosen text that
# was classified as NOT being a slavery statement.
sid = np.random.choice(non_slavery_text_SIDs)
print(f"SID - {sid}")
# Read through a context manager so the file handle is closed right away;
# the bare last expression keeps the preview as the cell output.
with open_text_file(f"{TEXT_PATH}\\SID-{sid}.txt") as text_file:
    preview = text_file.read()[:1000]
preview

SID - 41596


'403 Forbidden 403 Forbidden nginx'

In [99]:
# ## Second batch (disabled; `pdf_not_found_SIDs` is not defined in the cells above, so this cannot run as-is)
# second_batch_SIDs = [sid for sid in UNIQUE_SIDs if sid not in slavery_text_SIDs and sid not in pdf_not_found_SIDs]
# print(f"Total SIDs left : {len(second_batch_SIDs)}")

# pickle_name = "leftover_pdf_to_downloads_SIDs"
# save_pickle(obj=second_batch_SIDs,file_name= f"{PICKLE_PATH}//{pickle_name}")