# Used to check if extracted statements are Modern Slavery Statement

In [25]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [56]:
import pandas as pd
import numpy as np
from os import getcwd, path, remove
import urllib
import pickle
from time import sleep
import functools
import glob
from func_timeout import func_timeout, FunctionTimedOut
import re
from langdetect import detect
from langdetect import lang_detect_exception

In [117]:
PROJECT_NAME = 'modern_slavery_registry'
PROJECT_PATH = f"{getcwd()[:getcwd().find(PROJECT_NAME)]}{PROJECT_NAME}"
DATA_PATH = f"{PROJECT_PATH}\\data"
PICKLE_PATH = f"{DATA_PATH}\\pickles"
PDF_PATH = f"{DATA_PATH}\\pdfs"
TEXT_PATH = f"{DATA_PATH}\\texts"
SLAVERY_TEXT_PATH = f"{DATA_PATH}\\slavery_texts"
SHEETS_PATH = f"{DATA_PATH}\\sheets"
MAX_DOWNLOAD_TIME = 5

In [28]:
def save_pickle(obj, file_name):
    pickle.dump(obj, file=open(f"{file_name}.pickle","wb"))
    
def load_pickle(file_name):
    return pickle.load(file=open(f"{file_name}.pickle","rb"))

In [29]:
pickle_name = f"{PICKLE_PATH}\\slavery_text_SIDs"
if path.exists(f"{pickle_name}.pickle"):
    slavery_text_SIDs = load_pickle(pickle_name)
    print(f"{pickle_name} already exists")
else:
    slavery_text_SIDs = []
    save_pickle(slavery_text_SIDs, file_name=pickle_name)
    print(f"{pickle_name} created")
    
    
pickle_name = f"{PICKLE_PATH}\\non_slavery_text_SIDs"
if path.exists(f"{pickle_name}.pickle"):
    non_slavery_text_SIDs = load_pickle(pickle_name)
    print(f"{pickle_name} already exists")
else:
    non_slavery_text_SIDs = []
    save_pickle(non_slavery_text_SIDs, file_name=pickle_name)
    print(f"{pickle_name} created")

E:\Projects\modern_slavery_registry\data\pickles\slavery_text_SIDs already exists
E:\Projects\modern_slavery_registry\data\pickles\non_slavery_text_SIDs already exists


In [30]:
df = pd.read_csv(f"{DATA_PATH}\modernslaveryregistry-2020-09-14.csv")
df.head()

Unnamed: 0,Company ID,Company,Is Publisher,Statement ID,URL,Override URL,Companies House Number,Industry,HQ,Is Also Covered,UK Modern Slavery Act,California Transparency in Supply Chains Act,Australia Modern Slavery Act,Period Covered
0,7676,"""K"" Line Holding Europe Limited",True,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5005018.0,Marine,United Kingdom,False,True,False,False,2018-2019
1,28660,"""K"" Line Bulk Shipping (UK) Limited",False,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,4830352.0,Marine,United Kingdom,True,True,False,False,2018-2019
2,28659,"""K"" Line (Europe) Limited",False,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5639474.0,Marine,United Kingdom,True,True,False,False,2018-2019
3,28661,"""K"" Line LNG Shipping Limited",False,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,,Marine,United Kingdom,True,True,False,False,2018-2019
4,28658,Polar LNG Shipping (UK) Limited,False,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,2205323.0,Marine,United Kingdom,True,True,False,False,2018-2019


In [31]:
UNIQUE_SIDs = np.unique(df['Statement ID'].values)
TOTAL_UNIQUE_SIDs = len(UNIQUE_SIDs)
print(f"Total enteries : {len(df)}, "
      f"total unique statements in main DB : {TOTAL_UNIQUE_SIDs}")

Total enteries : 27531, total unique statements in main DB : 17799


In [32]:
def get_pdf_url_from_sid(sid):
    """Returns PDF url for input sid."""
    return df[df['Statement ID']==sid]['URL'].values[0]

In [33]:
def get_sid_from_filename(file_name):
    """Extract and return SID from file_name."""
    return int(re.findall(pattern=r"[\d]+",string=path.basename(file_name))[0])

In [34]:
def clean_text(text):
    """Clean and return input text."""
    text = text.replace("\n", " ")
    text = text.replace("\t", " ")
    special_char = ["\\", "b'"]
    for char_ in special_char:
        text = text.replace(char_, " ")
    cleaned_text = []
    for word in text.split():
        if len(word) > 1 or word.isdigit():
            cleaned_text.append(word)
    cleaned_text = " ".join(cleaned_text) 
    return cleaned_text

In [35]:
def save_text(text, sid):
    """Save text in text file."""
    file_name = f"SID-{sid}" 
    file_obj = open(f"{TEXT_PATH}//{file_name}.txt", 'w', encoding='utf-8')
    file_obj.write(text)
    file_obj.close()

In [36]:
def save_slavery_text(text, sid):
    """Save text in text file."""
    file_name = f"SID-{sid}" 
    file_obj = open(f"{SLAVERY_TEXT_PATH}//{file_name}.txt", 'w', encoding='utf-8')
    file_obj.write(text)
    file_obj.close()

In [75]:
def print_loop_no(total_sids, sid, itr, type_, type_itr):
    """Print loop statements."""
    primary_text = f"Running - {itr}/{total_sids} , SID - {sid}"
    if type_ == "Success":
        primary_text = f"{primary_text} - Success - {type_itr}"
    elif type_ == "Failed":
        primary_text = f"{primary_text} - Failed - {type_itr}"
    else:
        primary_text = f"{primary_text} - Processed - {type_itr}"
    if (itr + 1) % 1000 == 0: print(f"{'-'*8} {primary_text} {'-'*8}")        

In [38]:
text_files = glob.glob(pathname=f"{TEXT_PATH}\\*.txt")
TOTAL_TEXT_FILES = len(text_files)
print(f"Found {TOTAL_TEXT_FILES} text files")

Found 14640 text files


In [39]:
def open_text_file(text_file_name):
    """Open text file."""
    return open(text_file_name, "r", encoding="utf8")

In [40]:
def is_modern_slavery_text(text):
    """Return True if text fulfils modern slavery statement conditions."""
    text = text.lower()
    wanted_keywords = ["slavery", "human trafficking", "child labour"]
    unwanted_keywords = ["page not found", "403 forbidden", "404 not found"]
    text = re.sub('[^A-Za-z0-9]+', ' ', text) # removing special characters, ex. helps to identify human-trafficking
    text = " ".join(text.split())
    if any([word in text for word in wanted_keywords]) and all([word not in text for word in unwanted_keywords]):
        return True 
    return False

In [74]:
slavery_text_SIDs = []
non_slavery_text_SIDs = []

In [76]:
for i, text_file_name in enumerate(text_files):
    sid = get_sid_from_filename(file_name=text_file_name)
    if sid not in slavery_text_SIDs +  non_slavery_text_SIDs:
        text = open_text_file(text_file_name).read()
        if is_modern_slavery_text(text):
            save_slavery_text(text=text, sid=sid)
            slavery_text_SIDs.append(sid)
            print_loop_no(total_sids=TOTAL_TEXT_FILES,
                                sid=sid, 
                                itr=i+1, 
                                type_="Success",
                                type_itr= len(slavery_text_SIDs))
        else:
            non_slavery_text_SIDs.append(sid)
            print_loop_no(total_sids=TOTAL_TEXT_FILES,
                                sid=sid, 
                                itr=i+1,
                                type_="Failed",
                                type_itr= len(non_slavery_text_SIDs))
    else:
        print_loop_no(total_sids=TOTAL_TEXT_FILES,
                            sid=sid,
                            itr=i+1,
                            type_="Processed", 
                            type_itr= len(slavery_text_SIDs) + len(non_slavery_text_SIDs))

-------- Running - 999/14640 , SID - 11526 - Success - 746 --------
-------- Running - 1999/14640 , SID - 13092 - Success - 1518 --------
-------- Running - 2999/14640 , SID - 26834 - Success - 2314 --------
-------- Running - 3999/14640 , SID - 28522 - Failed - 911 --------
-------- Running - 4999/14640 , SID - 30485 - Failed - 1163 --------
-------- Running - 5999/14640 , SID - 32527 - Success - 4651 --------
-------- Running - 6999/14640 , SID - 34314 - Success - 5486 --------
-------- Running - 7999/14640 , SID - 37626 - Failed - 1663 --------
-------- Running - 8999/14640 , SID - 39608 - Success - 7140 --------
-------- Running - 9999/14640 , SID - 40721 - Success - 8004 --------
-------- Running - 10999/14640 , SID - 41837 - Success - 8852 --------
-------- Running - 11999/14640 , SID - 42942 - Success - 9703 --------
-------- Running - 12999/14640 , SID - 44060 - Success - 10549 --------
-------- Running - 13999/14640 , SID - 9023 - Success - 11381 --------


In [77]:
print(f"Total slavery statements : {len(slavery_text_SIDs)}")
print(f"Total non-slavery statements : {len(non_slavery_text_SIDs)}")
print(f"Total statements : {len(slavery_text_SIDs)+len(non_slavery_text_SIDs)}")

Total slavery statements : 11885
Total non-slavery statements : 2755
Total statements : 14640


In [78]:
pickle_name = "slavery_text_SIDs"
save_pickle(obj=slavery_text_SIDs,file_name= f"{PICKLE_PATH}//{pickle_name}")

pickle_name = "non_slavery_text_SIDs"
save_pickle(obj=non_slavery_text_SIDs,file_name= f"{PICKLE_PATH}//{pickle_name}")

In [47]:
# Checking random slavery statement
sid = np.random.choice(slavery_text_SIDs)
print(f"SID - {sid}")
open_text_file(f"{SLAVERY_TEXT_PATH}\\SID-{sid}.txt").read()[:1000]

SID - 12902


"Modern Slavery Act JavaScript seems to be disabled in your browser. You must have JavaScript enabled in your browser to utilize the functionality of this website. Login/Register Pharmacy Finder Search: Search My Account My Cart Checkout Flu Vaccinations My Pharmacy My Pharmacy Services Careers My Prescription My Medicine Re-order Pad Paperless Prescriptions My Health Key Services Flu Vaccinations Travel Clinic Health A-Z Health News About Day Lewis About Day Lewis Other Business Activities Careers Executive Directors' Review Contact us Contact Day Lewis Contact My Pharmacy Covid-19 Day Lewis Slavery and Human Trafficking Statement Organization Structure and Business Activities. Day Lewis PLC is a family company based in Croydon, Surrey. It is the holding company of a number of subsidiaries and is owned and managed by three members of the Patel family, each of whom is an Executive Director. Day Lewis PLC operates two principle businesses and several small subsidiary businesses, all of 

In [46]:
# Checking random non slavery statement
sid = np.random.choice(non_slavery_text_SIDs)
print(f"SID - {sid}")
open_text_file(f"{TEXT_PATH}\\SID-{sid}.txt").read()[:1000]

SID - 42226


'KM_C227-20190404093859'

In [None]:
# ## Second batch
# second_batch_SIDs = [sid for sid in UNIQUE_SIDs if sid not in slavery_text_SIDs and sid not in pdf_not_found_SIDs]
# print(f"Total SIDs left : {len(second_batch_SIDs)}")

# pickle_name = "leftover_pdf_to_downloads_SIDs"
# save_pickle(obj=second_batch_SIDs,file_name= f"{PICKLE_PATH}//{pickle_name}")

In [24]:
sid = 24940
print(f"SID - {sid}- {get_pdf_url_from_sid(sid)}")
open_text_file(f"{SLAVERY_TEXT_PATH}\\SID-{sid}.txt").read()

SID - 24940- http://www.writtle.com/wp-content/uploads/2017/11/Modern-Slavery-Act-Statement.pdf


'Modern Slavery Act Statement Section 54(1) of the Modern Slavery Act 2015 requires any organisation operating in any sector, which supplies goods and services, and carries on a business or part of a business in the UK, with an annual turnover of £36 million or more, to publish an annual slavery and human trafficking statement. This statement constitutes Writtle group’s slavery and human trafficking statement for the group’s financial year ending 31 December 2016. Writtle is a UK-centred marketing services group with an international client base. The group directly employs over 600 people across 12 operating companies in the UK and Hong Kong. We do not tolerate any form of slavery or human trafficking in any part of our business or in our supply chain and we will never knowingly deal with any organisation which is connected to slavery or human trafficking. Given the location of our business and supply chain, and the nature of the goods and services that we provide, we consider that we 

# To get idea about how many words on average in slavery statement

In [48]:
sum_lengths = 0
for sid in slavery_text_SIDs:
    text = open_text_file(f"{SLAVERY_TEXT_PATH}\\SID-{sid}.txt").read()
    sum_lengths += len(text.split())

In [51]:
print(f"Average words in slavery statements : {int(sum_lengths/len(slavery_text_SIDs))}")

Average words in slavery statements : 1227


In [79]:
# removing statements with words less than threshold number of words
threshold_num_words = 100
filtered_slavery_text_SIDs = []
for i, sid in enumerate(slavery_text_SIDs):
    if (i+1)%1000==0: print(f"{i+1}/{len(slavery_text_SIDs)}")
    text = open_text_file(f"{SLAVERY_TEXT_PATH}\\SID-{sid}.txt").read()
    if len(text.split()) > 100:
        filtered_slavery_text_SIDs.append(sid)

1000/11885
2000/11885
3000/11885
4000/11885
5000/11885
6000/11885
7000/11885
8000/11885
9000/11885
10000/11885
11000/11885


In [80]:
# removing slavery statements with words less than threshold number of words
remove_slavery_texts_SIDs = [sid for sid in slavery_text_SIDs if sid not in filtered_slavery_text_SIDs]
print(f"Remove {len(remove_slavery_texts_SIDs)} SIDs")

Remove 112 SIDs


In [81]:
for sid in remove_slavery_texts_SIDs:
    remove(f"{SLAVERY_TEXT_PATH}//SID-{sid}.txt")

In [82]:
t = len(glob.glob(f"{SLAVERY_TEXT_PATH}//*.txt"))
print(f"Final number of slavery statements : {t}, {len(filtered_slavery_text_SIDs)}")

Final number of slavery statements : 11773, 11773


# Summary Statistics

In [92]:
pdf_not_found_SIDs = load_pickle(f"{PICKLE_PATH}//pdf_not_found_SIDs")
failed_SIDs = [sid for sid in df['Statement ID'].drop_duplicates().values if sid not in pdf_not_found_SIDs + filtered_slavery_text_SIDs]
len_pdf_not_found_SIDs = len(pdf_not_found_SIDs)
len_failed_SIDs = len(failed_SIDs)

In [93]:
print(f"Successful slavery statements : {len(filtered_slavery_text_SIDs)}")
print(f"PDF not found                 : {len(pdf_not_found_SIDs)}")
print(f"PDF failed extraction         : {len(failed_SIDs)}")

Successful slavery statements : 11773
PDF not found                 : 2471
PDF failed extraction         : 3555


In [94]:
11773 + 2471 + 3555

17799

In [95]:
len(df['Statement ID'].drop_duplicates().values)

17799

# Compiling into CSV

In [99]:
filtered_slavery_text = {}
for sid in filtered_slavery_text_SIDs:
    filtered_slavery_text[sid] = open_text_file(f"{SLAVERY_TEXT_PATH}\\SID-{sid}.txt").read()

In [120]:
filtered_slavery_text_df = pd.Series(filtered_slavery_text).reset_index()
filtered_slavery_text_df.columns = ['Statement ID', 'statement']
final_df = pd.merge(df, filtered_slavery_text_df, on="Statement ID", how="left")
final_df['statement'].fillna("#N/A", inplace=True)
final_df.to_excel(f"{SHEETS_PATH}//modern_slavery_statements.xlsx", index=False)

In [121]:
len(filtered_slavery_text_df)

11773

In [122]:
len([sid for sid in df['Statement ID'].drop_duplicates().values if sid in filtered_slavery_text_SIDs])

11773

In [123]:
df.dtypes

Company ID                                       int64
Company                                         object
Is Publisher                                      bool
Statement ID                                     int64
URL                                             object
Override URL                                    object
Companies House Number                          object
Industry                                        object
HQ                                              object
Is Also Covered                                   bool
UK Modern Slavery Act                             bool
California Transparency in Supply Chains Act      bool
Australia Modern Slavery Act                      bool
Period Covered                                  object
dtype: object

In [135]:
filtered_slavery_text_df[filtered_slavery_text_df['statement'] == filtered_slavery_text_df['statement'].iloc[10166]]

Unnamed: 0,Statement ID,statement
5337,34139,Safeguarding against modern slavery | Seetec A...
10166,43687,Safeguarding against modern slavery | Seetec A...


In [139]:
get_pdf_url_from_sid(43687)

'https://www.seetec.co.uk/modern-slavery-statement'

In [138]:
final_df[final_df['Statement ID']==34139]['statement'].values

      dtype=object)

In [None]:
final_df