In [None]:
%load_ext blackcellmagic

In [1]:
import os
import json

# NOTE: relative chdir. Every relative path used in later cells (CSV, JSON and
# embeddings files) is resolved inside data/ because of this call.
# Re-running this cell in the same kernel tries to enter data/ again from
# inside data/, so run it only once per kernel session.
os.chdir("data/")
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime

In [2]:
# Characters that cleanse_string deletes outright (one list entry per
# character). Characters not listed here, for example "<", "^" or the
# typographic apostrophe "’", are left untouched.
special_chars = list(
    "\n\r"  # line breaks
    "!\"#$%&'()*+,-./"  # ASCII punctuation
    ":;=>?@"
    "[]_`|~"
    "\x81\x8e\x92\xa0"  # C1 control characters and the no-break space
    "¡©«¬®°¶»÷"  # Latin-1 symbols
)

# Folding table used by cleanse_string: superscript digits and accented
# lowercase Latin letters are mapped to a plain ASCII character.
# The two argument strings are aligned group by group: the n-th character of
# the first is replaced by the n-th character of the second.
# Letters that are not listed (for example "ä", "ö", "ü", "ß") have no entry.
replace_chars = dict(
    zip(
        "¹²³" "àáâãåæ" "ç" "èéêë" "ìíîï" "ñ" "òóôõø" "ùúû" "ýÿ",
        "123" "aaaaaa" "c" "eeee" "iiii" "n" "ooooo" "uuu" "yy",
    )
)

In [3]:
def cleanse_string(string_: str):
    """Normalise free text into lowercase tokens joined by single spaces.

    The text is split on whitespace and each token is processed in three
    steps: it is lowercased, every character listed in the module-level
    ``special_chars`` is deleted, and every key of the module-level
    ``replace_chars`` is replaced by its mapped value. Tokens that end up
    empty (for example a lone "/") are dropped.

    Special characters are deleted, not turned into spaces, so "a-b"
    becomes "ab".

    Args:
        string_: The text to normalise.

    Returns:
        The cleaned tokens joined with a single space; an empty string when
        no token survives.
    """
    cleaned_tokens = []
    for raw_token in string_.split():
        token = raw_token.lower()
        for unwanted in special_chars:
            token = token.replace(unwanted, "")
        for accented, plain in replace_chars.items():
            token = token.replace(accented, plain)
        if token:
            cleaned_tokens.append(token)
    return " ".join(cleaned_tokens)

In [4]:
# Smoke test: print a sample label before and after cleansing.
# The typographic apostrophe in "Children’s" is expected to survive, because
# only the ASCII apostrophe is listed in special_chars.
test_string = "Children’s / Teenage fiction: Fantasy"
print(test_string, cleanse_string(test_string), sep="\n")

Children’s / Teenage fiction: Fantasy
children’s teenage fiction fantasy


In [3]:
# Load the pipe-separated input tables.
# The separator is written as a regular expression (an escaped "|"), which is
# why engine="python" is requested. The pattern is a raw string: in a plain
# literal "\|" is an invalid escape sequence, which triggers a
# DeprecationWarning on older Pythons and a SyntaxWarning from Python 3.12.
# The runtime value of the pattern is unchanged.
items = pd.read_csv("items.csv", delimiter=r"\|", engine="python")
items_c = pd.read_csv("items_stopc.csv", delimiter=r"\|", engine="python")
items_dedup_c = pd.read_csv("items_dedup_stopc.csv", delimiter=r"\|", engine="python")
eval_ = pd.read_csv("evaluation.csv", delimiter=r"\|", engine="python")

In [4]:
# Load the code -> description mapping from JSON.
# The encoding is pinned to UTF-8 instead of the platform default, so that
# non-ASCII text such as the typographic apostrophe in "Children’s" decodes
# the same way on every operating system.
with open("publisher_dict.json", encoding="utf-8") as f:
    data = f.read()
publisher_dict = json.loads(data)

In [5]:
# Spot check: description stored under the raw code "YFH".
publisher_dict["YFH"]

'Children’s / Teenage fiction: Fantasy'

In [8]:
# Reset the name; the following cell rebuilds the mapping from scratch.
publisher_dict_2 = None

In [9]:
# Cleansed copy of publisher_dict: both the code (key) and its description
# (value) are passed through cleanse_string, so lookups must use the cleansed
# key, e.g. "yfh" instead of "YFH". If two codes cleanse to the same key, the
# later entry wins.
publisher_dict_2 = {
    cleanse_string(str(code)): cleanse_string(str(label))
    for code, label in publisher_dict.items()
}

In [10]:
# Spot check: the same entry after cleansing, looked up by its lowercase key.
publisher_dict_2["yfh"]

'children’s teenage fiction fantasy'

In [14]:
# Build one text per row of items_c by joining title, author, publisher and
# main-topic code with single spaces. Every field goes through str(), so a
# NaN entry contributes the literal text "nan".
# The topic is kept as its raw code. A disabled earlier variant looked the
# code up in publisher_dict_2 and used "" when the topic was missing.
books = [
    " ".join(
        str(items_c.at[row, column])
        for column in ("title", "author", "publisher", "main topic")
    )
    for row in items_c.index
]

In [16]:
# Spot check one combined book string.
books[333]

'frost maiden michelle m pillow the raven books llc fl'

In [20]:
# Number of combined book strings (one was appended per row of items_c).
len(books)

78334

In [49]:
# Main-topic codes as plain strings, one per row of items_c. str() is applied
# to every value, so a NaN entry becomes the text "nan".
# A disabled variant mapped each code to its cleansed description through
# publisher_dict_2 and used "" for missing topics.
topics = list(map(str, items_c["main topic"].values.tolist()))
print(topics[4444])
len(topics)

yft


78334

In [None]:
from collections import Counter

# Five most frequent topic strings together with their counts.
Counter(topics).most_common(5)

In [None]:
# Titles as plain strings, one per row of items_c. str() is applied to every
# value, so a NaN entry becomes the text "nan".
titles = list(map(str, items_c["title"].values.tolist()))
print(titles[1111])
len(titles)

In [None]:
# Authors as plain strings, one per row of items_c. str() is applied to every
# value, so a NaN entry becomes the text "nan".
authors = list(map(str, items_c["author"].values.tolist()))
print(authors[1111])
len(authors)

In [None]:
# Publishers as plain strings, one per row of items_c. str() is applied to
# every value, so a NaN entry becomes the text "nan".
publishers = list(map(str, items_c["publisher"].values.tolist()))
print(publishers[1111])
len(publishers)

In [50]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to encode the topic strings in the next cell.
# The commented-out line is an alternative checkpoint kept for reference.
#model = SentenceTransformer("bert-base-nli-mean-tokens")
model = SentenceTransformer("distiluse-base-multilingual-cased")

In [51]:
# Encode every entry of `topics` with the model loaded above. The saved run
# processed 2448 batches in just under four minutes.
# NOTE(review): `topics` presumably contains many repeated codes; encoding only
# the unique strings and indexing back could cut the runtime. Confirm that the
# resulting vectors are acceptable before changing this.
sentence_embeddings = model.encode(topics, show_progress_bar=True)

Batches: 100%|██████████| 2448/2448 [03:46<00:00, 10.81it/s]


In [52]:
# Persist the embeddings as a compressed .npz archive. The array is passed
# positionally, so NumPy stores it under the default key "arr_0".
# np.savez_compressed does not create missing parent directories, so the
# target folder is created first. Otherwise a missing "embeddings/" folder
# would raise only after the multi-minute encoding step has finished.
os.makedirs("embeddings", exist_ok=True)
np.savez_compressed(
    "embeddings/topic_embeddings_old_multilingual.npz", sentence_embeddings
)