# Evaluating top2vec hyperparameters

## Setup

Designed to run on Google Colab. Note that the dataloader has to be patched in one place first, because Google Colab still uses Python 3.7 together with a similarly outdated pandas version.

In [None]:
!pip install wget > /dev/null
!pip install top2vec[sentence_encoders] > /dev/null
!pip install top2vec[sentence_transformers] > /dev/null
!pip install fasttext-langdetect > /dev/null

In [3]:
import os
import sys

# Put the sibling ``04_dataset_access`` folder on the import path (presumably
# where ``dataloader.py`` lives -- confirm). ``os`` has to be imported here:
# this is the first cell that uses it, and without the import a fresh kernel
# fails with a NameError.
module_path = os.path.abspath(os.path.join("../04_dataset_access/"))
if module_path not in sys.path:
    # Guarded so that re-running the cell does not add duplicate entries.
    sys.path.append(module_path)

%load_ext autoreload
%autoreload 2
from dataloader import Dataloader

In [4]:
import sys
import pandas as pd

In [5]:
import multiprocessing
from top2vec import Top2Vec

In [6]:
# Page-level frame: read the patched feedback table, then hand it to
# Dataloader.from_folder together with the attachments folder
# (mode "page", limit_english=False, tokenize=False).
page_feedbacks = pd.read_csv(
    "../24212003_requirements_for_artificial_intelligence/patched_feedbacks.csv"
)
df = Dataloader("page", limit_english=False, tokenize=False).from_folder(
    "../24212003_requirements_for_artificial_intelligence/attachments/",
    page_feedbacks,
)
df.head()

As n_jobs=-1 <= 0, enabling multiprocessing with 2 cores!


Unnamed: 0,id,text,language,country,user_type,organization,surname,feedback,status,company_size,...,date_feedback,publication,publication_id,publication_status,tr_number,scope,governance_level,full_name,source,language_detected
0,2665651,Equinet welcomes the opportunity to provide co...,en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,...,2021-08-06 23:57:37,anonymous,24212003,closed,,,,,attachment,en
1,2665651,Equinet’s feedback to the European Commission'...,en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,...,2021-08-06 23:57:37,anonymous,24212003,closed,,,,,attachment,en
2,2665651,"equality bodies, alongside with other sectoral...",en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,...,2021-08-06 23:57:37,anonymous,24212003,closed,,,,,attachment,en
3,2665651,investment in 1) digital literacy for those wh...,en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,...,2021-08-06 23:57:37,anonymous,24212003,closed,,,,,attachment,en
4,2665651,4. Require equality and human rights impact as...,en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,...,2021-08-06 23:57:37,anonymous,24212003,closed,,,,,attachment,en


In [7]:
# Sanity check on the page-level frame: its row count must match the value
# below. NOTE(review): the meaning of the three terms (2282, 299, 13) is not
# recorded in this notebook -- TODO document where they come from.
expected_page_rows = 2282 + 299 - 13
assert len(df) == expected_page_rows

In [8]:
# Document-level frame: read the patched feedback table, then hand it to
# Dataloader.from_folder together with the attachments folder
# (mode "document", limit_english=False, tokenize=False).
doc_feedbacks = pd.read_csv(
    "../24212003_requirements_for_artificial_intelligence/patched_feedbacks.csv"
)
doc_df = Dataloader("document", limit_english=False, tokenize=False).from_folder(
    "../24212003_requirements_for_artificial_intelligence/attachments/",
    doc_feedbacks,
)
doc_df.head()

As n_jobs=-1 <= 0, enabling multiprocessing with 2 cores!


Unnamed: 0,id,text,language,country,user_type,organization,surname,feedback,status,company_size,...,date_feedback,publication,publication_id,publication_status,tr_number,scope,governance_level,full_name,source,language_detected
0,2665651,Equinet welcomes the opportunity to provide co...,en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,...,2021-08-06 23:57:37,anonymous,24212003,closed,,,,,attachment,en
1,2665650,AI Austria welcomes the opportunity to comment...,en,AUT,ngo,AI Austria,Gorzala,AI Austria welcomes the opportunity to comment...,PUBLISHED,small,...,2021-08-06 23:55:26,withinfo,24212003,closed,,,,Jeannette Gorzala,attachment,en
2,2665649,This submission to the AIA consultation is sen...,en,DEU,ngo,Digitalcourage e.V.,,This submission to the AIA consultation is sen...,PUBLISHED,small,...,2021-08-06 23:53:39,anonymous,24212003,closed,,,,,attachment,en
3,2665648,The EU AI Act is an important step in the righ...,en,USA,academic_research_institution,UC Berkeley Center for Human-Compatible AI,,The EU AI Act is an important step in the righ...,PUBLISHED,small,...,2021-08-06 23:53:31,anonymous,24212003,closed,,,,,attachment,en
4,2665647,In response to the European Commission’s reque...,en,USA,company,CrowdStrike,,In response to the European Commission’s reque...,PUBLISHED,large,...,2021-08-06 23:50:16,anonymous,24212003,closed,,,,,attachment,en


In [9]:
# Line-level frame: read the patched feedback table, then hand it to
# Dataloader.from_folder together with the attachments folder
# (mode "line", limit_english=False, tokenize=False).
line_feedbacks = pd.read_csv(
    "../24212003_requirements_for_artificial_intelligence/patched_feedbacks.csv"
)
line_df = Dataloader("line", limit_english=False, tokenize=False).from_folder(
    "../24212003_requirements_for_artificial_intelligence/attachments/",
    line_feedbacks,
)
line_df.head()

As n_jobs=-1 <= 0, enabling multiprocessing with 2 cores!


Unnamed: 0,id,text,language,country,user_type,organization,surname,feedback,status,company_size,...,date_feedback,publication,publication_id,publication_status,tr_number,scope,governance_level,full_name,source,language_detected
0,2665651,Equinet welcomes the opportunity to provide co...,en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,...,2021-08-06 23:57:37,anonymous,24212003,closed,,,,,attachment,en
1,2665651,Equality is explicitly and prominently address...,en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,...,2021-08-06 23:57:37,anonymous,24212003,closed,,,,,attachment,en
2,2665651,Please find as an attachment the following key...,en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,...,2021-08-06 23:57:37,anonymous,24212003,closed,,,,,attachment,en
3,2665651,Equinet’s feedback to the European Commission'...,en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,...,2021-08-06 23:57:37,anonymous,24212003,closed,,,,,attachment,en
4,2665651,on Artificial Intelligence (AI) Systems,en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,...,2021-08-06 23:57:37,anonymous,24212003,closed,,,,,attachment,en


## Evaluating different hyperparameter combinations

In [10]:
import spacy

# Load the small English spaCy pipeline without the attribute_ruler, lemmatizer
# and ner components (presumably because only sentence boundaries are needed
# here -- confirm).
nlp = spacy.load(
    "en_core_web_sm",
    exclude=["attribute_ruler", "lemmatizer", "ner"],
)


def sentencize(s):
    """Return the sentences of the text ``s`` as a list of strings.

    The text is run through the module-level ``nlp`` pipeline and the text of
    every sentence span in ``doc.sents`` is collected in order.
    """
    doc = nlp(s)
    return [sentence.text for sentence in doc.sents]

In [12]:
# Corpora to evaluate: each granularity (page / document / line), once with all
# rows and once restricted to rows whose ``language_detected`` is 'en'.
granularities = [("page", df), ("doc", doc_df), ("line", line_df)]
documents = [
    (f"all_{label}", frame["text"].to_list()) for label, frame in granularities
] + [
    (f"en_{label}", frame.query("language_detected == 'en'")["text"].to_list())
    for label, frame in granularities
]

# Chunking strategies, expressed as extra keyword arguments for Top2Vec.
chunking = [
    ("none", {}),
    ("sentencize", {"split_documents": True, "sentencizer": sentencize}),
    ("built-in_defaults", {"split_documents": True, "document_chunker": "sequential"}),
]

# Results are checkpointed to this CSV so that combinations which already have
# a row are skipped on the next run.
filename = "/content/drive/MyDrive/uni/thesis/top2vec/results_df.csv"
if not os.path.isfile(filename):
    old_df = pd.DataFrame(
        data=[], columns=["doc_mode", "chunk_mode", "model", "n_topics"]
    )
else:
    old_df = pd.read_csv(filename)
    print(f"Found existing results for {len(old_df)} combinations!")

results = []
for doc_mode, docs in documents:
    # The candidate models depend only on the corpus: the multilingual
    # universal-sentence-encoder is added for the "all_*" corpora only.
    models = [
        "doc2vec",
        "universal-sentence-encoder",
        "distiluse-base-multilingual-cased",
    ]
    if doc_mode.startswith("all"):
        models.append("universal-sentence-encoder-multilingual")

    for chunk_mode, chunk_params in chunking:
        for model in models:
            current = {"doc_mode": doc_mode, "chunk_mode": chunk_mode, "model": model}
            print(f"\n\n{current}")

            # Skip combinations that already have a row in the checkpoint file.
            previous = old_df.query(
                "doc_mode == @doc_mode and chunk_mode == @chunk_mode and model == @model"
            )
            if not previous.empty:
                print("Skipping combination, as results already exist!")
                continue
            # Flush so the combination header is visible before the long fit.
            sys.stdout.flush()

            result = Top2Vec(
                documents=docs,
                embedding_model=model,
                workers=multiprocessing.cpu_count(),
                **chunk_params,
            )

            current["n_topics"] = result.get_num_topics()
            results.append(current)
            # Rewrite the checkpoint after every fit: previously stored rows
            # followed by everything computed in this run so far.
            new_df = pd.DataFrame(results)
            pd.concat((old_df, new_df)).to_csv(filename, index=False)

Found existing results for 63 combinations!


{'doc_mode': 'all_page', 'chunk_mode': 'none', 'model': 'doc2vec'}
Skipping combination, as results already exist!


{'doc_mode': 'all_page', 'chunk_mode': 'none', 'model': 'universal-sentence-encoder'}
Skipping combination, as results already exist!


{'doc_mode': 'all_page', 'chunk_mode': 'none', 'model': 'distiluse-base-multilingual-cased'}
Skipping combination, as results already exist!


{'doc_mode': 'all_page', 'chunk_mode': 'none', 'model': 'universal-sentence-encoder-multilingual'}
Skipping combination, as results already exist!


{'doc_mode': 'all_page', 'chunk_mode': 'sentencize', 'model': 'doc2vec'}
Skipping combination, as results already exist!


{'doc_mode': 'all_page', 'chunk_mode': 'sentencize', 'model': 'universal-sentence-encoder'}
Skipping combination, as results already exist!


{'doc_mode': 'all_page', 'chunk_mode': 'sentencize', 'model': 'distiluse-base-multilingual-cased'}
Skipping combination, as results already exis