## 1. Preprocess Data

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from src.data.preprocessing import preprocess_books

# Register tqdm's progress_apply / progress_map helpers on pandas objects.
tqdm.pandas()

In [None]:
# Input CSV locations, given relative to the notebook's working directory.
books_path = "../data/external/items.csv"
evaluation_books_path = "../data/external/evaluation.csv"
items_path = "../data/processed/20210526_items_df.csv"

In [None]:
# preprocess_books receives the three CSV paths and returns a pair that is
# unpacked here; both parts are used as DataFrames in the cells below.
query, document = preprocess_books(
    items_path,
    books_path,
    evaluation_books_path,
)

## 2. Filter on language

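Recommendations should be in the same language as the query book, so the queries and the candidate books are split by language below (the English and German candidate sets are left unfiltered).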

In [None]:
# Candidate ("document") books per language.
# NOTE(review): English and German keep the full, unfiltered candidate set,
# unlike every other language below — confirm this is intentional.
document_en = document.copy()
document_de = document.copy()
# Take explicit copies of the filtered frames: the next cell drops the
# "language" column in place, and doing that on a bare boolean-mask selection
# can raise pandas' SettingWithCopyWarning.
document_es = document[document.language == "Spanisch"].copy()
document_it = document[document.language == "Italienisch"].copy()
document_fr = document[document.language == "Französisch"].copy()
document_pt = document[document.language == "Portugiesisch"].copy()
document_sw = document[document.language == "Schwedisch"].copy()
document_ba = document[document.language == "Baltisch"].copy()
document_fi = document[document.language == "Finnisch"].copy()
document_hi = document[document.language == "Hindi"].copy()
document_un = document[document.language == "Ungarisch"].copy()

In [None]:
# The frames are already split per language, so the "language" column is no
# longer needed. Rebind to the result instead of dropping in place: an
# in-place drop on a frame obtained by boolean-mask selection can raise
# pandas' SettingWithCopyWarning.
document_en = document_en.drop(columns=["language"])
document_de = document_de.drop(columns=["language"])
document_es = document_es.drop(columns=["language"])
document_it = document_it.drop(columns=["language"])
document_fr = document_fr.drop(columns=["language"])
document_pt = document_pt.drop(columns=["language"])
document_sw = document_sw.drop(columns=["language"])
document_ba = document_ba.drop(columns=["language"])
document_fi = document_fi.drop(columns=["language"])
document_hi = document_hi.drop(columns=["language"])
document_un = document_un.drop(columns=["language"])

In [None]:
# Query books per language. Explicit copies are taken because the next cell
# drops the "language" column in place, which on a bare boolean-mask
# selection can raise pandas' SettingWithCopyWarning.
query_en = query[query.language == "Englisch"].copy()
query_de = query[query.language == "Deutsch"].copy()
query_es = query[query.language == "Spanisch"].copy()
query_it = query[query.language == "Italienisch"].copy()
query_fr = query[query.language == "Französisch"].copy()
query_pt = query[query.language == "Portugiesisch"].copy()
query_sw = query[query.language == "Schwedisch"].copy()
query_ba = query[query.language == "Baltisch"].copy()
query_fi = query[query.language == "Finnisch"].copy()
query_hi = query[query.language == "Hindi"].copy()
query_un = query[query.language == "Ungarisch"].copy()

In [None]:
# Each frame now holds a single language, so the "language" column is
# redundant. Rebind to the result instead of dropping in place: an in-place
# drop on a frame obtained by boolean-mask selection can raise pandas'
# SettingWithCopyWarning.
query_en = query_en.drop(columns=["language"])
query_de = query_de.drop(columns=["language"])
query_es = query_es.drop(columns=["language"])
query_it = query_it.drop(columns=["language"])
query_fr = query_fr.drop(columns=["language"])
query_pt = query_pt.drop(columns=["language"])
query_sw = query_sw.drop(columns=["language"])
query_ba = query_ba.drop(columns=["language"])
query_fi = query_fi.drop(columns=["language"])
query_hi = query_hi.drop(columns=["language"])
query_un = query_un.drop(columns=["language"])

In [None]:
# One dict per language mapping every query id to its own, initially empty,
# list of recommendations. Comprehensions replace the copy-pasted loops whose
# loop variable `id` shadowed the builtin at notebook scope; each key still
# gets a distinct list object.
recommendation_en = {query_id: [] for query_id in query_en.id_query}
recommendation_de = {query_id: [] for query_id in query_de.id_query}
recommendation_es = {query_id: [] for query_id in query_es.id_query}
recommendation_it = {query_id: [] for query_id in query_it.id_query}
recommendation_fr = {query_id: [] for query_id in query_fr.id_query}
recommendation_pt = {query_id: [] for query_id in query_pt.id_query}
recommendation_sw = {query_id: [] for query_id in query_sw.id_query}
recommendation_ba = {query_id: [] for query_id in query_ba.id_query}
recommendation_fi = {query_id: [] for query_id in query_fi.id_query}
recommendation_hi = {query_id: [] for query_id in query_hi.id_query}
recommendation_un = {query_id: [] for query_id in query_un.id_query}

## 3. Englisch

### 3.1 Preprocess text

In [None]:
# numpy and pandas are already imported in the first cell; importing them
# again here is harmless.
import numpy as np
import pandas as pd
import spacy
import en_core_web_sm
from src.data.preprocessing import preprocess_language
# Load the English spaCy pipeline that is passed to preprocess_language below.
nlp_en = en_core_web_sm.load()
# Last expression of the cell, so the lemmatizer component is displayed; the
# value is not used otherwise.
nlp_en.get_pipe("lemmatizer")

In [None]:
# Combine the English queries and candidates into one frame (it carries
# id_query and id_document columns, which become the index further down).
cross_en = preprocess_language(query_en, document_en, nlp_en)

In [None]:
from src.models.rule_based import search_recommendation

In [None]:
# Return value is ignored; recommendation_en is read afterwards, so the call
# presumably fills it in place — verify in src.models.rule_based.
search_recommendation(cross_en, recommendation_en)

In [None]:
# Flatten {query id: [document ids]} into (query id, document id) pairs.
index_en = [
    (query_id, document_id)
    for query_id, document_ids in recommendation_en.items()
    for document_id in document_ids
]

In [None]:
# Index by (id_query, id_document) so the recommended pairs can be looked up
# with .loc; done after search_recommendation has seen the ids as columns.
cross_en.set_index(["id_query", "id_document"], inplace = True)

In [None]:
# Show the rows of the recommended English pairs for inspection (not stored).
cross_en.loc[index_en]

## 4. Deutsch

In [None]:
import de_core_news_sm
# Load the German spaCy pipeline that is passed to preprocess_language below.
nlp_de = de_core_news_sm.load()
# Last expression of the cell, so the lemmatizer component is displayed; the
# value is not used otherwise.
nlp_de.get_pipe("lemmatizer")

In [None]:
# Combine the German queries and candidates into one frame (it carries
# id_query and id_document columns, which become the index further down).
cross_de = preprocess_language(query_de, document_de, nlp_de)

In [None]:
# Return value is ignored; recommendation_de is read afterwards, so the call
# presumably fills it in place — verify in src.models.rule_based.
search_recommendation(cross_de, recommendation_de)

In [None]:
# Index by (id_query, id_document) so the recommended pairs can be looked up
# with .loc; done after search_recommendation has seen the ids as columns.
cross_de.set_index(["id_query", "id_document"], inplace = True)

In [None]:
# Flatten {query id: [document ids]} into (query id, document id) pairs.
index_de = [
    (query_id, document_id)
    for query_id, document_ids in recommendation_de.items()
    for document_id in document_ids
]

In [None]:
# Show the rows of the recommended German pairs for inspection (not stored).
cross_de.loc[index_de]

## 5. Spanisch

In [None]:
import es_core_news_sm
# Load the Spanish spaCy pipeline that is passed to preprocess_language below.
nlp_es = es_core_news_sm.load()
# Last expression of the cell, so the lemmatizer component is displayed; the
# value is not used otherwise.
nlp_es.get_pipe("lemmatizer")

In [None]:
# Combine the Spanish queries and candidates into one frame (it carries
# id_query and id_document columns, which become the index further down).
cross_es = preprocess_language(query_es, document_es, nlp_es)

In [None]:
# Return value is ignored; recommendation_es is read afterwards, so the call
# presumably fills it in place — verify in src.models.rule_based.
search_recommendation(cross_es, recommendation_es)

In [None]:
# Index by (id_query, id_document) so the recommended pairs can be looked up
# with .loc; done after search_recommendation has seen the ids as columns.
cross_es.set_index(["id_query", "id_document"], inplace = True)

In [None]:
# Flatten {query id: [document ids]} into (query id, document id) pairs.
index_es = [
    (query_id, document_id)
    for query_id, document_ids in recommendation_es.items()
    for document_id in document_ids
]

In [None]:
# Show the rows of the recommended Spanish pairs for inspection (not stored).
cross_es.loc[index_es]

## 6. Italienisch

In [None]:
import it_core_news_sm
# Load the Italian spaCy pipeline that is passed to preprocess_language below.
nlp_it = it_core_news_sm.load()
# Last expression of the cell, so the lemmatizer component is displayed; the
# value is not used otherwise.
nlp_it.get_pipe("lemmatizer")

In [None]:
# Combine the Italian queries and candidates into one frame (it carries
# id_query and id_document columns, which become the index further down).
cross_it = preprocess_language(query_it, document_it, nlp_it)

In [None]:
# Return value is ignored; recommendation_it is read afterwards, so the call
# presumably fills it in place — verify in src.models.rule_based.
search_recommendation(cross_it, recommendation_it)

In [None]:
# Index by (id_query, id_document) so the recommended pairs can be looked up
# with .loc; done after search_recommendation has seen the ids as columns.
cross_it.set_index(["id_query", "id_document"], inplace = True)

In [None]:
# Flatten {query id: [document ids]} into (query id, document id) pairs.
index_it = [
    (query_id, document_id)
    for query_id, document_ids in recommendation_it.items()
    for document_id in document_ids
]

In [None]:
# Show the rows of the recommended Italian pairs for inspection (not stored).
cross_it.loc[index_it]

## 6. Französisch

In [None]:
import fr_core_news_sm
# Load the French spaCy pipeline that is passed to preprocess_language below.
nlp_fr = fr_core_news_sm.load()
# Last expression of the cell, so the lemmatizer component is displayed; the
# value is not used otherwise.
nlp_fr.get_pipe("lemmatizer")

In [None]:
# Combine the French queries and candidates into one frame (it carries
# id_query and id_document columns, which become the index further down).
cross_fr = preprocess_language(query_fr, document_fr, nlp_fr)

In [None]:
# Return value is ignored; recommendation_fr is read afterwards, so the call
# presumably fills it in place — verify in src.models.rule_based.
search_recommendation(cross_fr, recommendation_fr)

In [None]:
# Index by (id_query, id_document) so the recommended pairs can be looked up
# with .loc; done after search_recommendation has seen the ids as columns.
cross_fr.set_index(["id_query", "id_document"], inplace = True)

In [None]:
# Flatten {query id: [document ids]} into (query id, document id) pairs.
index_fr = [
    (query_id, document_id)
    for query_id, document_ids in recommendation_fr.items()
    for document_id in document_ids
]

In [None]:
# Show the rows of the recommended French pairs for inspection (not stored).
cross_fr.loc[index_fr]

## 7. Portugiesisch

In [None]:
import pt_core_news_sm
# Load the Portuguese spaCy pipeline that is passed to preprocess_language
# below.
nlp_pt = pt_core_news_sm.load()
# Last expression of the cell, so the lemmatizer component is displayed; the
# value is not used otherwise.
nlp_pt.get_pipe("lemmatizer")

In [None]:
# Combine the Portuguese queries and candidates into one frame (it carries
# id_query and id_document columns, which become the index further down).
cross_pt = preprocess_language(query_pt, document_pt, nlp_pt)

In [None]:
# Return value is ignored; recommendation_pt is read afterwards, so the call
# presumably fills it in place — verify in src.models.rule_based.
search_recommendation(cross_pt, recommendation_pt)

In [None]:
# Index by (id_query, id_document) so the recommended pairs can be looked up
# with .loc; done after search_recommendation has seen the ids as columns.
cross_pt.set_index(["id_query", "id_document"], inplace = True)

In [None]:
# Flatten {query id: [document ids]} into (query id, document id) pairs.
index_pt = [
    (query_id, document_id)
    for query_id, document_ids in recommendation_pt.items()
    for document_id in document_ids
]

In [None]:
# Show the rows of the recommended Portuguese pairs for inspection (not
# stored).
cross_pt.loc[index_pt]

## 8. Schwedisch

In [None]:
# spaCy's multi-language pipeline is loaded here instead of a Swedish-specific
# model; unlike the sections above, no lemmatizer component is looked up.
import xx_ent_wiki_sm
nlp_sw = xx_ent_wiki_sm.load()

In [None]:
# Combine the Swedish queries and candidates into one frame (it carries
# id_query and id_document columns, which become the index further down).
cross_sw = preprocess_language(query_sw, document_sw, nlp_sw)

In [None]:
# Return value is ignored; recommendation_sw is read afterwards, so the call
# presumably fills it in place — verify in src.models.rule_based.
search_recommendation(cross_sw, recommendation_sw)

In [None]:
# Index by (id_query, id_document) so the recommended pairs can be looked up
# with .loc; done after search_recommendation has seen the ids as columns.
cross_sw.set_index(["id_query", "id_document"], inplace = True)

In [None]:
# Flatten {query id: [document ids]} into (query id, document id) pairs.
index_sw = [
    (query_id, document_id)
    for query_id, document_ids in recommendation_sw.items()
    for document_id in document_ids
]

In [None]:
# Show the rows of the recommended Swedish pairs for inspection (not stored).
cross_sw.loc[index_sw]

## 9. Baltisch

In [None]:
# Same multi-language spaCy pipeline as in the Swedish section; a separate
# instance is loaded for the Baltic books.
import xx_ent_wiki_sm
nlp_ba = xx_ent_wiki_sm.load()

In [None]:
# Combine the Baltic queries and candidates into one frame (it carries
# id_query and id_document columns, which become the index further down).
cross_ba = preprocess_language(query_ba, document_ba, nlp_ba)

In [None]:
# Return value is ignored; recommendation_ba is read afterwards, so the call
# presumably fills it in place — verify in src.models.rule_based.
search_recommendation(cross_ba, recommendation_ba)

In [None]:
# Index by (id_query, id_document) so the recommended pairs can be looked up
# with .loc; done after search_recommendation has seen the ids as columns.
cross_ba.set_index(["id_query", "id_document"], inplace = True)

In [None]:
# Flatten {query id: [document ids]} into (query id, document id) pairs.
index_ba = [
    (query_id, document_id)
    for query_id, document_ids in recommendation_ba.items()
    for document_id in document_ids
]

In [None]:
# Show the rows of the recommended Baltic pairs for inspection (not stored).
cross_ba.loc[index_ba]

## 10. Finnisch

In [None]:
# Same multi-language spaCy pipeline as in the Swedish section; a separate
# instance is loaded for the Finnish books.
import xx_ent_wiki_sm
nlp_fi = xx_ent_wiki_sm.load()

In [None]:
# Combine the Finnish queries and candidates into one frame (it carries
# id_query and id_document columns, which become the index further down).
cross_fi = preprocess_language(query_fi, document_fi, nlp_fi)

In [None]:
# Return value is ignored; recommendation_fi is read afterwards, so the call
# presumably fills it in place — verify in src.models.rule_based.
search_recommendation(cross_fi, recommendation_fi)

In [None]:
# Index by (id_query, id_document) so the recommended pairs can be looked up
# with .loc; done after search_recommendation has seen the ids as columns.
cross_fi.set_index(["id_query", "id_document"], inplace = True)

In [None]:
# Flatten {query id: [document ids]} into (query id, document id) pairs.
index_fi = [
    (query_id, document_id)
    for query_id, document_ids in recommendation_fi.items()
    for document_id in document_ids
]

In [None]:
# Show the rows of the recommended Finnish pairs for inspection (not stored).
cross_fi.loc[index_fi]

## 11. Hindi

In [None]:
# Same multi-language spaCy pipeline as in the Swedish section; a separate
# instance is loaded for the Hindi books.
import xx_ent_wiki_sm
nlp_hi = xx_ent_wiki_sm.load()

In [None]:
# Combine the Hindi queries and candidates into one frame (it carries
# id_query and id_document columns, which become the index further down).
cross_hi = preprocess_language(query_hi, document_hi, nlp_hi)

In [None]:
# Return value is ignored; recommendation_hi is read afterwards, so the call
# presumably fills it in place — verify in src.models.rule_based.
search_recommendation(cross_hi, recommendation_hi)

In [None]:
# Index by (id_query, id_document) so the recommended pairs can be looked up
# with .loc; done after search_recommendation has seen the ids as columns.
cross_hi.set_index(["id_query", "id_document"], inplace = True)

In [None]:
# Flatten {query id: [document ids]} into (query id, document id) pairs.
index_hi = [
    (query_id, document_id)
    for query_id, document_ids in recommendation_hi.items()
    for document_id in document_ids
]

In [None]:
# Show the rows of the recommended Hindi pairs for inspection (not stored).
cross_hi.loc[index_hi]

## 12. Ungarisch

In [None]:
# Same multi-language spaCy pipeline as in the Swedish section; a separate
# instance is loaded for the Hungarian books.
import xx_ent_wiki_sm
nlp_un = xx_ent_wiki_sm.load()

In [None]:
# Combine the Hungarian queries and candidates into one frame (it carries
# id_query and id_document columns, which become the index further down).
cross_un = preprocess_language(query_un, document_un, nlp_un)

In [None]:
# Display the Hungarian candidate frame for inspection; nothing is assigned.
document_un

In [None]:
# Return value is ignored; recommendation_un is read afterwards, so the call
# presumably fills it in place — verify in src.models.rule_based.
search_recommendation(cross_un, recommendation_un)

In [None]:
# Index by (id_query, id_document) so the recommended pairs can be looked up
# with .loc; done after search_recommendation has seen the ids as columns.
cross_un.set_index(["id_query", "id_document"], inplace = True)

In [None]:
# Flatten {query id: [document ids]} into (query id, document id) pairs.
index_un = [
    (query_id, document_id)
    for query_id, document_ids in recommendation_un.items()
    for document_id in document_ids
]

In [None]:
# Show the rows of the recommended Hungarian pairs for inspection (not
# stored).
cross_un.loc[index_un]

## 13. All together

In [None]:
# Merge the per-language dicts into one; on a duplicate query id the entry of
# the language listed later wins, as with successive dict.update calls.
recommendation = {
    **recommendation_en,
    **recommendation_de,
    **recommendation_es,
    **recommendation_it,
    **recommendation_fr,
    **recommendation_pt,
    **recommendation_sw,
    **recommendation_ba,
    **recommendation_fi,
    **recommendation_hi,
    **recommendation_un,
}

In [None]:
# Cross join of every query with every document, regardless of language.
# NOTE(review): the result has len(query) * len(document) rows and is only
# used below to look up the recommended pairs — watch memory on large inputs.
cross = query.merge(document, how='cross')

In [None]:
# Index by (id_query, id_document) so the recommended pairs can be selected
# with .loc below.
cross.set_index(["id_query", "id_document"], inplace = True)

In [None]:
# Flatten {query id: [document ids]} into (query id, document id) pairs
# across all languages.
index = [
    (query_id, document_id)
    for query_id, document_ids in recommendation.items()
    for document_id in document_ids
]

In [None]:
# Rows of the full cross join that correspond to the recommended pairs; kept
# in `test` and not used again in this notebook section.
test = cross.loc[index]

In [None]:
# One row per query book, one column per recommendation slot.
# DataFrame.from_dict(..., orient="index") builds the rows directly from the
# dict values and pads shorter lists with missing values, whereas
# pd.DataFrame(recommendation).transpose() raises ValueError as soon as two
# books have a different number of recommendations.
result = (
    pd.DataFrame.from_dict(recommendation, orient="index")
    .reset_index()
    .rename(
        columns={
            "index": "book_id",
            0: "recommendation_1",
            1: "recommendation_2",
            2: "recommendation_3",
            3: "recommendation_4",
            4: "recommendation_5",
        }
    )
)

In [None]:
# Tag every row with the constant team and model identifiers.
result = result.assign(
    team_id="dataminerz",
    model_id="rule_based",
)

In [None]:
# Write the result table to disk.
# NOTE(review): to_csv is called with its defaults, so the frame's index is
# written as an unnamed first column — pass index=False if the expected file
# format must not contain it.
result.to_csv("../data/processed/rule_based_dataminerz.csv")