In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from collections import Counter
from os import getcwd

import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import (CountVectorizer,
                                             TfidfVectorizer)
from tqdm import tqdm

In [3]:
from modern_slavery_registry.utils import (sort_dict)

In [4]:
# Seed handed to LatentDirichletAllocation(random_state=...) in the cells below,
# so repeated runs start from the same random initialisation.
RANDOM_STATE = 40

In [5]:
# Locate the project's data directory relative to the working directory.
# When the notebook is launched from "<project>/notebooks" the data lives one
# level up; otherwise the working directory itself is treated as the project
# root. Checking the last path component (rather than str.replace) avoids
# mangling any other directory whose name merely contains "notebooks".
_cwd = getcwd()
_project_root = os.path.dirname(_cwd) if os.path.basename(_cwd) == "notebooks" else _cwd
# os.path.join inserts the host OS separator instead of a hard-coded "\\",
# so the same cell works on Windows, Linux and macOS.
DATA_PATH = os.path.join(_project_root, "data")
SHEETS_PATH = os.path.join(DATA_PATH, "sheets")

In [6]:
# Load the statement subset and replace every missing cell with the "#NA"
# sentinel. The path is built with os.path.join so it does not depend on the
# Windows-only "\\" separator, and fillna is chained (no inplace mutation) so
# re-running the cell always starts from the file contents.
data = pd.read_excel(os.path.join(SHEETS_PATH, "subset_data.xlsx")).fillna("#NA")
# Give the columns the names used by the rest of the notebook.
data.columns = ["URL", "final_statement_cleaned", "final_statement_cleaned_2", "duplicate_99per"]

In [7]:
# Preview the first rows to sanity-check the load and the renamed columns.
data.head()

Unnamed: 0,URL,final_statement_cleaned,final_statement_cleaned_2,duplicate_99per
0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,holdinc europe ltd modern slavery act transpar...,holdinc europe ltd modern slavery act transpar...,False
1,https://1spatial.com/who-we-are/legal/modern-s...,independent research edison investments modern...,independent research edison investments modern...,False
2,https://www.shazans.com/slavery-and-human-traf...,slavery human trafficking statement slavery hu...,slavery human traffic statement slavery human ...,False
3,https://www.business-humanrights.org/sites/def...,modern slavery atement modern slavery atement ...,modern slavery atement modern slavery atement ...,False
4,https://www.2agriculture.com/wp-content/upload...,modern slavery act slavery human trafficking s...,modern slavery act slavery human traffic state...,False


In [8]:
# Bag-of-words baseline: unigram counts fitted on every statement in `data`
# (duplicates included, no document-frequency filtering).
N_GRAMS = (1, 1)
count_vect = CountVectorizer(ngram_range=N_GRAMS)
X = count_vect.fit_transform(
    data["final_statement_cleaned_2"].values
)

In [9]:
N_TOPICS = 10
# Fit the baseline topic model on the unfiltered count matrix; verbose=1 logs
# each iteration and the fixed seed keeps the initialisation the same per run.
LDA = LatentDirichletAllocation(
    n_components=N_TOPICS,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=1,
)
LDA.fit(X)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


LatentDirichletAllocation(n_jobs=-1, random_state=40, verbose=1)

In [10]:
N_TOP_WORDS = 15
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when it exists and fall back to
# the old accessor on older installations.
if hasattr(count_vect, "get_feature_names_out"):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()
topic_components = LDA.components_
# Note: topic words are words with high LDA component values.
# Iterate over the fitted components themselves so the loop cannot drift out of
# sync with the model if N_TOPICS is edited without refitting.
for i, topic_weights in enumerate(topic_components):
    top_indices = np.argsort(topic_weights)[::-1][:N_TOP_WORDS]
    # str(...) keeps the printed list as plain quoted words whichever accessor was used.
    print(f"Topic {i}: {[str(feature_names[j]) for j in top_indices]}")
    print()

Topic 0: ['service', 'us', 'contact', 'support', 'care', 'news', 'new', 'solutions', 'home', 'people', 'career', 'products', 'energy', 'water', 'help']

Topic 1: ['slavery', 'modern', 'traffic', 'human', 'supply', 'chain', 'business', 'ensure', 'statement', 'policy', 'act', 'risk', 'suppliers', 'take', 'within']

Topic 2: ['use', 'term', 'may', 'site', 'information', 'shall', 'website', 'service', 'content', 'condition', 'include', 'right', 'order', 'time', 'without']

Topic 3: ['human', 'suppliers', 'supply', 'conduct', 'chain', 'traffic', 'code', 'supplier', 'labor', 'slavery', 'business', 'compliance', 'include', 'audit', 'employees']

Topic 4: ['www', 'https', 'com', 'uk', 'en', 'cking', 'class', 'store', 'div', 'co', 'tra', 'http', 'span', 'brand', 'shop']

Topic 5: ['information', 'use', 'data', 'us', 'personal', 'cookies', 'may', 'website', 'site', 'service', 'privacy', 'please', 'provide', 'contact', 'process']

Topic 6: ['service', 'firm', 'financial', 'include', 'risk', 'coun

# Removing near-duplicate statements (99% duplication) and topic modelling

In [11]:
# Keep only the rows not flagged as 99% duplicates, restricted to the two
# columns used for modelling. The comparison with False is deliberate: the
# column may contain the "#NA" filler, so a boolean `~` is not equivalent.
unique_statements = data.loc[
    data["duplicate_99per"] == False,  # noqa: E712
    ["URL", "final_statement_cleaned_2"],
].reset_index()

In [12]:
# Report the corpus size before and after dropping the flagged duplicates.
print(
    f"Number of statements before removing duplicate: {len(data)}",
    f"Number of statements after removing duplicate(99%): {len(unique_statements)}",
    sep="\n",
)

Number of statements before removing duplicate: 11967
Number of statements after removing duplicate(99%): 10780


### <font color="blue">Getting an idea of the vocabulary</font>

In [13]:
# Vocabulary statistics over the de-duplicated statements:
#   term_freq     -> total number of occurrences of each whitespace-separated token
#   document_freq -> fraction of statements in which the token occurs at least once
term_freq = Counter()
document_freq = Counter()
# Iterate the Series directly: wrapping enumerate(...) hides the length from
# tqdm, so the bar could only show a running count instead of progress and ETA.
for statement in tqdm(unique_statements["final_statement_cleaned_2"],
                      position=0,
                      leave=True):
    tokens = statement.split()
    term_freq.update(tokens)
    # dict.fromkeys de-duplicates while keeping first-seen order, so each
    # statement adds at most 1 to a token's document count. Pass the keys view:
    # handing Counter.update a mapping would add its values instead.
    document_freq.update(dict.fromkeys(tokens).keys())

total_docs = len(unique_statements)
# Expose plain dicts, and turn raw document counts into fractions of the corpus.
term_freq = dict(term_freq)
document_freq = {k: v / total_docs for k, v in document_freq.items()}

10780it [00:05, 1951.02it/s]


In [14]:
# Number of distinct tokens seen across the de-duplicated statements.
print(f"Vocab size: {len(term_freq)}")

Vocab size: 52259


In [15]:
# Document-frequency bounds, expressed as a fraction of all statements.
MAX_DF = .7 # maximum document frequency: terms above this are dropped
MIN_DF = 2e-4 # minimum document frequency: terms below this are dropped

In [16]:
# Terms whose document frequency exceeds MAX_DF; these are the terms the
# MAX_DF cut-off excludes from the vocabulary in the following cells.
# NOTE(review): sort_dict is a project helper; by=1 / reverse=True appear to
# sort by value in descending order (consistent with the displayed output).
sort_dict({k:v for k,v in document_freq.items() if v > MAX_DF}, by=1, reverse=True)

{'slavery': 0.885899814471243,
 'supply': 0.8664192949907236,
 'human': 0.8648423005565863,
 'business': 0.8611317254174397,
 'chain': 0.8564935064935065,
 'act': 0.8551948051948052,
 'traffic': 0.8440630797773655,
 'statement': 0.8154916512059369,
 'suppliers': 0.8085343228200371,
 'modern': 0.8077922077922078,
 'ensure': 0.7928571428571428,
 'include': 0.7880333951762524,
 'risk': 0.7687384044526901,
 'take': 0.7603896103896104,
 'policy': 0.7221706864564007,
 'make': 0.7188311688311688,
 'company': 0.7157699443413729,
 'commit': 0.7151205936920223,
 'employees': 0.7086270871985157}

In [17]:
# Keep only the terms whose document frequency lies inside [MIN_DF, MAX_DF]
# and report how much the vocabulary shrinks.
filtered_document_freq = {
    term: freq
    for term, freq in document_freq.items()
    if MIN_DF <= freq <= MAX_DF
}
print(f"vocab size before filtering based on document frequency  : {len(document_freq)}")
print(f"vocab size after filtering based on document frequency [{MIN_DF: .5f},{MAX_DF: .5f}]: {len(filtered_document_freq)}")

vocab size before filtering based on document frequency  : 52259
vocab size after filtering based on document frequency [ 0.00020, 0.70000]: 16480


In [18]:
# Unigram counts over the de-duplicated statements, with the vocabulary
# restricted to terms whose document frequency lies within [MIN_DF, MAX_DF].
N_GRAMS = (1, 1)
count_vect = CountVectorizer(
    ngram_range=N_GRAMS,
    min_df=MIN_DF,
    max_df=MAX_DF,
)
X = count_vect.fit_transform(
    unique_statements["final_statement_cleaned_2"].values
)
print(f"Count vectorizer matrix shape: {X.shape}")

Count vectorizer matrix shape: (10780, 16480)


In [19]:
N_TOPICS = 10
# Refit the topic model on the filtered, de-duplicated count matrix with the
# same settings and seed as the baseline run.
LDA = LatentDirichletAllocation(
    n_components=N_TOPICS, random_state=RANDOM_STATE, n_jobs=-1, verbose=1
)
LDA.fit(X)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


LatentDirichletAllocation(n_jobs=-1, random_state=40, verbose=1)

In [20]:
N_TOP_WORDS = 15
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when it exists and fall back to
# the old accessor on older installations.
if hasattr(count_vect, "get_feature_names_out"):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()
topic_components = LDA.components_
# Note: topic words are words with high LDA component values.
# Iterate over the fitted components themselves so the loop cannot drift out of
# sync with the model if N_TOPICS is edited without refitting.
for i, topic_weights in enumerate(topic_components):
    top_indices = np.argsort(topic_weights)[::-1][:N_TOP_WORDS]
    # str(...) keeps the printed list as plain quoted words whichever accessor was used.
    print(f"Topic {i}: {[str(feature_names[j]) for j in top_indices]}")
    print()

Topic 0: ['group', 'limit', 'within', 'uk', 'service', 'place', 'policies', 'part', 'process', 'staff', 'year', 'financial', 'train', 'review', 'work']

Topic 1: ['service', 'work', 'support', 'staff', 'council', 'trust', 'safeguard', 'people', 'care', 'uk', 'home', 'report', 'procurement', 'provide', 'new']

Topic 2: ['use', 'information', 'data', 'us', 'may', 'website', 'service', 'personal', 'site', 'cookies', 'term', 'privacy', 'provide', 'contact', 'right']

Topic 3: ['conduct', 'code', 'supplier', 'labor', 'compliance', 'audit', 'require', 'standards', 'laws', 'train', 'force', 'comply', 'applicable', 'right', 'provide']

Topic 4: ['right', 'work', 'global', 'value', 'management', 'principles', 'people', 'support', 'conduct', 'service', 'respect', 'report', 'world', 'corporate', 'policies']

Topic 5: ['group', 'right', 'conduct', 'code', 'labour', 'supplier', 'global', 'principles', 'uk', 'compliance', 'corporate', 'respect', 'sustainability', 'force', 'products']

Topic 6: ['org