In [None]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from os import getcwd
import numpy as np
from tqdm import tqdm
import re
from typing import Tuple, Dict, Sequence, List, Union
from stopit import threading_timeoutable
from copy import deepcopy
import hashlib

In [3]:
from modern_slavery_registry.text_parser import (find_ngrams_in_text, 
                                                 replace_urls,
                                                 replace_unicode, 
                                                 replace_special_chars, 
                                                 remove_stopwords)
from modern_slavery_registry.utils import fix_unicode

In [4]:
DATA_PATH = getcwd()
DATA_PATH = DATA_PATH.replace("notebooks", "")
DATA_PATH += "data"
SHEETS_PATH = DATA_PATH + "\\sheets"

In [5]:
prof_data = pd.read_json(f"{SHEETS_PATH}\\modern_slavery_dataset_prof.json")
cols_to_keep = ["Company ID", "Company", "Statement ID", "URL", "Text"]
prof_data.drop_duplicates(inplace=True)
prof_data.dropna(subset=["Company ID"], inplace=True)
prof_data.reset_index(drop=True, inplace=True)
prof_data = prof_data[cols_to_keep]
cols_type = ["int32", str, float, str, str]
for i, col in enumerate(prof_data.columns):
    prof_data[col] = prof_data[col].astype(cols_type[i])




my_data = pd.read_excel(f"{SHEETS_PATH}\\modern_slavery_dataset_nitin.xlsx")
cols_to_keep = ["Company ID", "Company", "Statement ID", "URL", "statement"]
my_data.drop_duplicates(inplace=True)
my_data.dropna(subset=["Company ID"], inplace=True)
my_data.reset_index(drop=True, inplace=True)
my_data = my_data[cols_to_keep]
for i, col in enumerate(my_data.columns):
    my_data[col] = my_data[col].astype(cols_type[i])

In [6]:
my_data.head(1)

Unnamed: 0,Company ID,Company,Statement ID,URL,statement
0,7676,"""K"" Line Holding Europe Limited",35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,66 99 “K” Line Holding (Europe) Limited kM K L...


In [7]:
prof_data.head(1)

Unnamed: 0,Company ID,Company,Statement ID,URL,Text
0,7676,"""K"" Line Holding Europe Limited",35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...


In [8]:
print(f"Length, prof. dataset : {len(prof_data)}, my dataset : {len(my_data)}")

Length, prof. dataset : 28361, my dataset : 27527


<font color="blue" size="5"> Combining both datasets 

In [9]:
combine_data = pd.merge(prof_data,
                        my_data,
                        on = ["Company ID", "Company", "Statement ID", "URL"], 
                        how = "outer")
combine_data.drop_duplicates(inplace=True)
combine_data.reset_index(drop=True, inplace=True)
combine_data.fillna("#NA", inplace=True)
combine_data["Text"] = ["#NA" if text in ("nan", "None") or text == "" else text for text in combine_data["Text"]]
combine_data["statement"] = ["#NA" if text in ("nan", "None") or text == "" else text for text in combine_data["Text"]]
statements = [
    ("E", combine_data["Text"].iloc[i]) if combine_data["Text"].iloc[i] != "#NA" else ("N", combine_data["statement"].iloc[i]) for i in range(len(combine_data))
]
choice = [_[0] for _ in statements]
statements = [_[1] for _ in statements]
combine_data["final_statement"] = statements
combine_data["final_statement(E/N)"] = choice
combine_data.rename(columns = {"Text": "statement(Edgar)", 
                               "statement": "statement(Nitin)"},  
                    inplace = True) 
# cols_to_drop = ["Text", "statement"]
# for col in cols_to_drop:
#     combine_data.drop(col, axis=1, inplace=True)

In [10]:
combine_data.head()

Unnamed: 0,Company ID,Company,Statement ID,URL,statement(Edgar),statement(Nitin),final_statement,final_statement(E/N)
0,7676,"""K"" Line Holding Europe Limited",35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,E
1,28660,"""K"" Line Bulk Shipping (UK) Limited",35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,E
2,28659,"""K"" Line (Europe) Limited",35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,E
3,28661,"""K"" Line LNG Shipping Limited",35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,E
4,28658,Polar LNG Shipping (UK) Limited,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,E


In [11]:
def clean_text(text):
    text = replace_unicode(text)
    text = text.replace("\n"," ")
    text = text.replace("\t"," ")
    text = replace_special_chars(text, replace_digits=True)
    text = remove_stopwords(text)
    return text

In [12]:
statements = [clean_text(statement) if statement != "#NA" else statement for statement in tqdm(combine_data["final_statement"])]
statements = [statement if statement != "" else "#NA" for statement in statements]

100%|███████████████████████████████████████████████████████████████████████████| 28364/28364 [01:15<00:00, 375.47it/s]


In [13]:
companies = [clean_text(company) if statements[i] != "#NA" else company for i, company in tqdm(enumerate(combine_data["Company"]))]

28364it [00:00, 56632.94it/s]


In [14]:
def remove_company_name(company_name, statement):
    mappings = find_ngrams_in_text(sentence=company_name,
                                   text=statement)
    # to remove big text first
    mappings = {k:v for k,v in sorted(mappings.items(), key=lambda x:x[1])} 
    for ngram, cnt in mappings.items():
        if cnt > 0:
            ngram_splitted = ngram.split()
            if not (len(ngram_splitted) == 1 and len(ngram_splitted[0]) == 1):
                statement = statement.replace(ngram," ")

    statement = " ".join(statement.split())
    return statement

In [15]:
cleaned_statements = []
for company, statement in tqdm(zip(companies, statements)):
    if statement != "#NA":
        statement = remove_company_name(company_name=company, 
                                     statement=statement)
    cleaned_statements.append(statement)

28364it [00:15, 1841.78it/s]


In [16]:
combine_data["final_statement_cleaned"] = cleaned_statements

In [17]:
combine_data = fix_unicode(combine_data)
combine_data.to_excel(f"{SHEETS_PATH}\\combined_modern_slavery_statements.xlsx", index = False)

In [18]:
combine_data.head()

Unnamed: 0,Company ID,Company,Statement ID,URL,statement(Edgar),statement(Nitin),final_statement,final_statement(E/N),final_statement_cleaned
0,7676,"""K"" Line Holding Europe Limited",35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,E,holdinc europe ltd modern slavery act transpar...
1,28660,"""K"" Line Bulk Shipping (UK) Limited",35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,E,holdinc europe ltd modern slavery act transpar...
2,28659,"""K"" Line (Europe) Limited",35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,E,holdinc europe ltd modern slavery act transpar...
3,28661,"""K"" Line LNG Shipping Limited",35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,E,holdinc europe ltd modern slavery act transpar...
4,28658,Polar LNG Shipping (UK) Limited,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,E,k line holdinc europe ltd modern slavery act t...
