In [None]:
# use if autocompletion is not working
%config Completer.use_jedi = False

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import numpy as np
import re
from copy import deepcopy
from typing import List, Tuple, Dict, Union
from collections import OrderedDict
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud, STOPWORDS
from matplotlib import pyplot as plt

In [None]:
from modern_slavery_registry import get_root_path
# from modern_slavery_registry.modern_slavery_statement_parser import clean_text, clean_corpus
from modern_slavery_registry import text_parser

In [None]:
# Directory layout, resolved from the path returned by get_root_path().
PROJECT_PATH = get_root_path()
DATA_PATH = os.path.join(PROJECT_PATH, "data")
SHEETS_PATH = os.path.join(DATA_PATH, "sheets")  # i.e. <project>/data/sheets
PLOTS_PATH = os.path.join(PROJECT_PATH, "plots")

In [None]:
# Show the entries of the sheets directory in sorted order.
np.sort(os.listdir(SHEETS_PATH))

## Mapping continents to country codes

In [None]:
# Load the working spreadsheet (subset_data_v7.xlsx) from the sheets directory.
df = pd.read_excel(os.path.join(SHEETS_PATH, "subset_data_v7.xlsx"))

In [None]:
# Count missing values in every column whose name contains "statement".
df[[col for col in df.columns if "statement" in col]].isna().sum(axis=0)

In [None]:
# Replace every missing value in the frame with the "#NA" placeholder.
df = df.fillna("#NA")

In [None]:
# %%time
# cleaned_corpus = clean_corpus(corpus=df["statement"].values)

# cleaned_corpus_as_list = []
# for i in range(len(cleaned_corpus)):
#     cleaned_corpus_as_list.append(deepcopy(cleaned_corpus[i]))
    
# del cleaned_corpus

In [None]:
def compute_and_process_freqs(
    corpus: Union[np.ndarray, List[str]],
    ngram: int) -> Dict[str, int]:
    """Compute punctuation-normalised n-gram frequencies for a corpus.

    Raw counts come from ``text_parser.compute_ngram_freqs``. Each n-gram key
    is joined with single spaces, stripped of the characters ``. , : ; ?`` and
    whitespace-normalised. Keys that end up empty are dropped. Keys that
    collapse to the same text (e.g. ``("slavery.",)`` and ``("slavery",)``)
    have their counts summed rather than overwriting one another.

    Args:
        corpus: Documents to analyse; passed unchanged to
            ``text_parser.compute_ngram_freqs``.
        ngram: Size ``n`` of the n-grams to count.

    Returns:
        Mapping from normalised n-gram text to its total count, ordered by
        descending count.
    """
    raw_freqs = text_parser.compute_ngram_freqs(
        corpus=corpus,
        n=ngram,
        verbose=True)

    merged: Dict[str, int] = {}
    for tokens, count in raw_freqs.items():
        text = re.sub(r"[.,:;?]+", "", " ".join(tokens))
        text = " ".join(text.split())
        if text:
            # Distinct raw n-grams can normalise to the same text once the
            # punctuation is removed; accumulate so no occurrences are lost.
            merged[text] = merged.get(text, 0) + count

    # sorted() is stable, so equal counts keep their first-seen order.
    return dict(sorted(merged.items(), key=lambda item: item[1], reverse=True))

## N-gram analysis by continent

In [None]:
# Collect the names of all columns that begin with the "HQ_Continent" prefix.
continents = list(filter(lambda name: name.startswith("HQ_Continent"), df.columns))
continents

In [None]:
# plotting wordclouds per continent
# For every continent column, build unigram frequencies from the statements
# flagged with 1 in that column, render a wordcloud and save it to PLOTS_PATH.
MAX_WORDS = 200 
for continent in tqdm(continents, leave=False, position=0):
    freqs = compute_and_process_freqs(
        df[df[continent]==1]["statement_cleaned_v2"].values, ngram=1)
    if not freqs:
        # WordCloud cannot render an empty frequency table, so skip groups
        # that produced no tokens instead of aborting the whole loop.
        print(f"No tokens found for {continent}; skipping wordcloud")
        continue
    wd = WordCloud(width=1800, height=1200, background_color="white", max_words=MAX_WORDS)
    wd.generate_from_frequencies(frequencies=freqs)
    fig, ax = plt.subplots(figsize=(12,12))
    title = f"Top {MAX_WORDS} words wordcloud for {continent}"
    ax.set_title(title)
    ax.imshow(wd)
    ax.set_axis_off()
    plt.tight_layout()
    # File name is the title with whitespace replaced by underscores.
    plt.savefig(os.path.join(PLOTS_PATH, "_".join(title.split())))
    plt.show()
    # Release the figure so figures do not pile up across iterations.
    plt.close(fig)

In [None]:
# One boxplot panel per continent column showing the distribution of word
# counts in the cleaned statements. The panel count follows `continents`, and
# squeeze=False keeps `axes` an array even when there is only one column.
fig, axes = plt.subplots(
    figsize=(16,10), ncols=len(continents), sharey=True, squeeze=False)
for continent, ax in tqdm(
        zip(continents, axes.flatten()),
        total=len(continents), leave=False, position=0):
    values = df[df[continent]==1]["statement_cleaned_v2"].values
    # Word count = number of whitespace-separated tokens in each statement.
    lens = [len(statement.split()) for statement in values]
    ax.boxplot(lens)
    ax.set_title(f"{continent.replace('_',' ').split()[-1]}: {len(values)} statements")
    sns.despine(ax=ax)
fig.text(x=.5, y=.98, s="Distribution of number of words in statements by continents", ha="center")
plt.tight_layout(pad=2)
plt.savefig(os.path.join(
    PLOTS_PATH,
    "Distribution_Number_of_Words_HQ_Continents"),
            dpi=100)
plt.show()

In [None]:
# # computing ngrams on basis of continents
# cols = [col for col in df.columns if col.startswith("HQ_Continent")]
# ngrams = [1,2,3]

# freqs = {}
# for col in cols:
#     col_freqs = {}
#     for ngram in ngrams:
#         col_freqs[ngram] = compute_and_process_freqs(
#             df[df[col]==1]["statement_cleaned_v2"].values, 
#             ngram=ngram)
#     freqs[col] = deepcopy(col_freqs)

In [None]:
# # plotting ngrams over all available continents
# top_n = 25
# fig, axes = plt.subplots(
#     nrows=2,
#     ncols=3, 
#     squeeze=False,
#     figsize=(30, 12))
# ngram = 3
# for col, ax in zip(cols, axes.flatten()):
#     pd.DataFrame.from_dict(
#         freqs[col][ngram], 
#         orient="index").head(n).sort_values(0).plot.barh(ax=ax)
#     title = f"{col}"
#     ax.set_title(title)
#     ax.get_legend().remove()
#     sns.despine(ax=ax)

# title = f"Top-{top_n} {ngram}-gram"
# fig.text(x=.5, y=.92, s=title, ha="center")
# plt.savefig(os.path.join(PLOTS_PATH, f"{title}_by_HQ_Continent.png"), dpi=200)
# plt.show()

In [None]:
# # saving top_n ngrams for each continent
# save_top_n = 10000
# for col in cols:
#     for ngram in ngrams:
#         name = (
#             f"{col}_{ngram}-gram_"
#             f"top_{save_top_n}_freq_dist.csv")
#         temp_df = pd.DataFrame.from_dict(
#             freqs[col][ngram], 
#             orient="index").head(
#             save_top_n)
        
#         temp_df.reset_index(inplace=True)
#         temp_df.columns = ["token", f"{ngram}-gram_freq"]
#         temp_df.to_csv(
#             os.path.join(SHEETS_PATH, name))

## N-gram analysis by year

In [None]:
# Coerce the year column to numbers; values that cannot be parsed become NaN.
df["year"] = pd.to_numeric(df["year"], errors="coerce")

In [None]:
# Number of rows per year, ordered by year. value_counts() leaves out rows
# whose year is NaN, so only parsable years appear in the index.
years = df["year"].value_counts().sort_index(na_position="first")
ax = years.plot.bar()
# Label the chart so it can be read on its own.
ax.set_title("Number of statements per year")
ax.set_xlabel("Year")
ax.set_ylabel("Number of statements")
plt.show()

In [None]:
# computing freqs by year and ngrams
# Result layout: all_freqs[ngram][year] -> (number of statements, DataFrame of
# frequencies sorted in descending order, counts in column 0).
# NOTE(review): `ngrams` rebinds the name imported from nltk in the imports cell.
all_freqs = {}
ngrams = np.arange(1,4)
for ngram in tqdm(ngrams, leave=False, position=0):
    year_freqs = {}
    for year in years.index:
        is_year = df["year"] == year
        values = df.loc[is_year, "statement_cleaned_v2"].values
        num_statements = len(values)
        freqs = pd.DataFrame.from_dict(
            compute_and_process_freqs(values, ngram=ngram),
            orient="index")
        freqs = freqs.sort_values(0, ascending=False)
        year_freqs[year] = (num_statements, freqs)
    all_freqs[ngram] = year_freqs

In [None]:
# Horizontal bar charts of the `n` most frequent `ngram`-grams, one panel per
# year, saved as a single figure in PLOTS_PATH.
ngram = 3
n = 25
ncols = 3
# Ceiling division: enough rows for every year to get a panel, so no year is
# silently dropped when there are more than 12 of them.
nrows = max(1, (len(years) + ncols - 1) // ncols)
fig, axes = plt.subplots(
        nrows=nrows, ncols=ncols,
        figsize=(16, 5 * nrows),
        squeeze=False)
title = f"Top {n} {ngram}-ngrams by year"
fig.text(x=.6, y=.992, s=title, ha="center")
panels = axes.flatten()
for year, ax in zip(years.index, panels):
    num_statements, top_freqs = all_freqs[ngram][year]
    # Take the n largest, then sort ascending so the largest bar is on top.
    top_freqs.head(n).sort_values(0).plot.barh(ax=ax)
    ax.set_title(f"{int(year)} - {num_statements} statements", y=.98, fontdict={"fontsize": 10})
    ax.get_legend().remove()
    sns.despine(ax=ax)
# Hide any panels left over when the grid has more cells than years.
for ax in panels[len(years):]:
    ax.set_axis_off()
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_PATH, f"{title}.png"))
plt.show()

In [None]:
# plotting wordclouds per year
# Uses the unigram frequencies stored in all_freqs[1][year] and saves one
# wordcloud image per year to PLOTS_PATH.
MAX_WORDS = 200 
for year in tqdm(years.index, leave=False, position=0):
    num_statements, unigram_freqs = all_freqs[1][year]
    wd = WordCloud(
        width=1800, 
        height=1200,
        background_color="white", 
        max_words=MAX_WORDS)
    # Column 0 holds the counts; convert it to a {token: count} mapping.
    wd.generate_from_frequencies(frequencies=unigram_freqs[0].to_dict())
    fig, ax = plt.subplots(figsize=(12,12))
    title = f"Top {MAX_WORDS} words wordcloud from {int(year)} - {num_statements} statements"
    ax.set_title(title)
    ax.imshow(wd)
    ax.set_axis_off()
    plt.tight_layout()
    # The file name omits the statement count; keep it separate from `title`
    # so the displayed title is not overwritten.
    fp = f"Top {MAX_WORDS} words wordcloud from {int(year)}"
    plt.savefig(os.path.join(PLOTS_PATH, "_".join(fp.split())))
    plt.show()
    # Release the figure so figures do not pile up across iterations.
    plt.close(fig)

In [None]:
# Display the per-year counts (index: year, value: number of rows).
years

In [None]:
# One boxplot panel per year showing the distribution of word counts in the
# cleaned statements. squeeze=False keeps `axes` an array even when there is
# only a single year, so axes.flatten() always works.
fig, axes = plt.subplots(
    figsize=(20,10), ncols=len(years), sharey=True, squeeze=False)
for year, ax in tqdm(
        zip(years.index, axes.flatten()),
        total=len(years), leave=False, position=0):
    values = df[df["year"]==year]["statement_cleaned_v2"].values
    # Word count = number of whitespace-separated tokens in each statement.
    lens = [len(statement.split()) for statement in values]
    ax.boxplot(lens)
    ax.set_title(f"{int(year)}\n {len(values)} statement/s")
    sns.despine(ax=ax)
fig.text(x=.5, y=.98, s="Distribution of number of words in statements by year", ha="center")
plt.tight_layout(pad=2)
plt.savefig(os.path.join(
    PLOTS_PATH,
    "Distribution_Number_of_Words_By_Years"),
            dpi=100)
plt.show()