# Reco-Modell erstellen

In [None]:
import pandas as pd
import re
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
import joblib

# Daten laden

In [None]:
# Load the board-game data set from a pickled pandas object.
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
bgdata = pd.read_pickle("../data/bg_data20240412.pkl")

In [None]:
# Preview the first rows of the loaded data.
bgdata.head()

In [None]:
# Rename the duplicated "name" columns: the first occurrence keeps "name",
# the second becomes "other_names"; any further occurrence is left untouched.
colnames = bgdata.columns.tolist()
name_positions = [pos for pos, label in enumerate(bgdata.columns) if label == "name"]
for pos, replacement in zip(name_positions, ["name", "other_names"]):
    colnames[pos] = replacement
bgdata.columns = colnames

In [None]:
# Drop the remaining duplicated columns, keeping the first occurrence of each label.
is_first_occurrence = ~bgdata.columns.duplicated()
bgdata = bgdata.loc[:, is_first_occurrence].copy()

In [None]:
# Inspect the column labels after the renaming and de-duplication steps.
bgdata.columns

# Kategorien codieren

In [None]:
# Turn each "boardgamecategory" entry into a Python list of category names.
# The character class removes single quotes, square brackets, whitespace and
# the literal characters "^" and "+"; the remainder is split on commas.
# The pattern is a raw string because "\[", "\]" and "\s" are invalid escape
# sequences in a normal string literal (deprecated, SyntaxWarning on 3.12+).
# NOTE(review): assumes every entry is a str (stringified list) - confirm.
bgdata["boardgamecategory"] = bgdata.boardgamecategory.apply(lambda x: re.sub(r"['\[\]^\s+]", '', x).split(","))

In [None]:
# Spread each game's category list over positional columns (0, 1, 2, ...),
# then one-hot encode those columns with the shared prefix "cat".
category_slots = bgdata.boardgamecategory.apply(pd.Series)
categories = pd.get_dummies(category_slots, prefix="cat")

In [None]:
# Columns that share a label are merged into one by taking the maximum, so
# every category label appears exactly once.
categories_by_label = categories.T.groupby(categories.columns)
categories = categories_by_label.max().T

In [None]:
# Rank the categories by their column sums, largest first.
category_totals = categories.sum()
topcat = category_totals.sort_values(ascending=False).index.to_list()

In [None]:
# Write the ranked category names to a text file; every name is followed by
# ", ", so the file ends with a trailing separator.
with open("../data/topcat.txt", 'w') as out:
    out.writelines(item + ', ' for item in topcat)

In [None]:
# Reorder the dummy columns to follow the ranking in `topcat`.
categories = categories.loc[:, topcat].copy()

In [None]:
# Extract 20 principal components from the category dummies and persist the
# fitted model.
ncomp = 20
# A fixed random_state keeps the fit reproducible when scikit-learn selects a
# randomized SVD solver; it has no effect on the exact solvers.
pca = PCA(n_components=ncomp, random_state=0)
pca.fit(categories)
joblib.dump(pca, "../data/pca_cat.pkl")

In [None]:
# Plot the cumulative explained-variance ratio over the components.
cumulative_variance = pd.DataFrame(pca.explained_variance_ratio_).cumsum()
cumulative_variance.plot()

In [None]:
# Project the category dummies onto the fitted components ("catp_0" ...).
# The row index of `categories` is reused so that the column-wise concat with
# `bgdata` aligns rows by label; a default RangeIndex would only line up if
# bgdata's index happened to be exactly 0..n-1.
categories_pca = pd.DataFrame(
    pca.transform(categories),
    index=categories.index,
    columns=["catp_" + str(x) for x in range(ncomp)],
)

In [None]:
# Append the category components to the game data as additional columns.
bgdata = pd.concat([bgdata, categories_pca], axis="columns")

# Mechanik

In [None]:
# Turn each "boardgamemechanic" entry into a Python list of mechanic names.
# The character class removes single quotes, square brackets, whitespace and
# the literal characters "^" and "+"; the remainder is split on commas.
# The pattern is a raw string because "\[", "\]" and "\s" are invalid escape
# sequences in a normal string literal (deprecated, SyntaxWarning on 3.12+).
# NOTE(review): assumes every entry is a str (stringified list) - confirm.
bgdata["boardgamemechanic"] = bgdata.boardgamemechanic.apply(lambda x: re.sub(r"['\[\]^\s+]", '', x).split(","))

In [None]:
# Spread each game's mechanic list over positional columns (0, 1, 2, ...),
# then one-hot encode those columns with the shared prefix "mec".
mechanic_slots = bgdata.boardgamemechanic.apply(pd.Series)
mechanic = pd.get_dummies(mechanic_slots, prefix="mec")

In [None]:
# Columns that share a label are merged into one by taking the maximum, so
# every mechanic label appears exactly once.
mechanic_by_label = mechanic.T.groupby(mechanic.columns)
mechanic = mechanic_by_label.max().T

In [None]:
# Rank the mechanics by their column sums, largest first (used later for the
# reco data set).
mechanic_totals = mechanic.sum()
topmec = mechanic_totals.sort_values(ascending=False).index.to_list()

In [None]:
# Write the ranked mechanic names to a text file; every name is followed by
# ", ", so the file ends with a trailing separator.
with open("../data/topmec.txt", 'w') as out:
    out.writelines(item + ', ' for item in topmec)

In [None]:
# Reorder the dummy columns to follow the ranking in `topmec`.
mechanic = mechanic.loc[:, topmec].copy()

In [None]:
# Extract 20 principal components from the mechanic dummies.
ncomp = 20
# A fixed random_state keeps the fit reproducible when scikit-learn selects a
# randomized SVD solver; it has no effect on the exact solvers.
pca = PCA(n_components=ncomp, random_state=0)
pca.fit(mechanic)

In [None]:
# Persist the current `pca` object (fitted on the mechanic dummies above).
joblib.dump(pca, "../data/pca_mec.pkl")

In [None]:
# Plot the cumulative explained-variance ratio over the components.
cumulative_variance = pd.DataFrame(pca.explained_variance_ratio_).cumsum()
cumulative_variance.plot()

In [None]:
# Project the mechanic dummies onto the fitted components ("mecp_0" ...).
# The row index of `mechanic` is reused so that the column-wise concat with
# `bgdata` aligns rows by label; a default RangeIndex would only line up if
# bgdata's index happened to be exactly 0..n-1.
mechanic_pca = pd.DataFrame(
    pca.transform(mechanic),
    index=mechanic.index,
    columns=["mecp_" + str(x) for x in range(ncomp)],
)

In [None]:
# Append the mechanic components to the game data as additional columns.
bgdata = pd.concat([bgdata, mechanic_pca], axis="columns")

# Subdomain

In [None]:
# Turn each "boardgamesubdomain" entry into a Python list of subdomain names.
# The character class removes single quotes, square brackets, whitespace and
# the literal characters "^" and "+"; the remainder is split on commas.
# The pattern is a raw string because "\[", "\]" and "\s" are invalid escape
# sequences in a normal string literal (deprecated, SyntaxWarning on 3.12+).
# NOTE(review): assumes every entry is a str (stringified list) - confirm.
bgdata["boardgamesubdomain"] = bgdata.boardgamesubdomain.apply(lambda x: re.sub(r"['\[\]^\s+]", '', x).split(","))

In [None]:
# Spread each game's subdomain list over positional columns (0, 1, 2, ...),
# then one-hot encode those columns with the shared prefix "sub".
subdomain_slots = bgdata["boardgamesubdomain"].apply(pd.Series)
subdomain = pd.get_dummies(subdomain_slots, prefix="sub")

In [None]:
# Columns that share a label are merged into one by taking the maximum, so
# every subdomain label appears exactly once.
subdomain_by_label = subdomain.T.groupby(subdomain.columns)
subdomain = subdomain_by_label.max().T

In [None]:
# Append the subdomain dummies to the game data as additional columns.
bgdata = pd.concat([bgdata, subdomain], axis="columns")

In [None]:
# Inspect the column labels after appending the PCA components and subdomain dummies.
bgdata.columns

# X_Train

In [None]:
# Base features: three game attributes followed by the subdomain dummy columns.
game_attributes = ['yearpublished', 'playingtime', 'age']
subdomain_features = [
    'sub_AbstractGames',
    'sub_CustomizableGames',
    'sub_FamilyGames',
    'sub_PartyGames',
    'sub_StrategyGames',
    'sub_ThematicGames',
    'sub_Wargames',
]
allfeat = game_attributes + subdomain_features

In [None]:
# Full feature list: base features, then the category components, then the
# mechanic components (both numbered 0 .. ncomp-1).
category_components = [f"catp_{i}" for i in range(ncomp)]
mechanic_components = [f"mecp_{i}" for i in range(ncomp)]
feat = allfeat + category_components + mechanic_components

In [None]:
# Take an independent copy of the feature columns as the training matrix.
x_train = bgdata.loc[:, feat].copy()

In [None]:
# z-transformation of every base feature. The single scaler instance is refit
# on each pass, and each dump stores its state for the feature just fitted.
scaler = StandardScaler()
for x in allfeat:
    raw_column = x_train[x].values.reshape(-1, 1)
    x_train[x] = scaler.fit_transform(raw_column)
    joblib.dump(scaler, f"../data/scaler_{x}.pkl")

In [None]:
# Display the scaled feature matrix.
x_train

In [None]:
# Persist the feature matrix as a pickle file.
x_train.to_pickle("../data/x_train.pkl")

# Reco-Modell

In [None]:
# Pairwise Euclidean distances between all rows of the feature matrix; rows
# and columns of the result are labelled 0 .. n-1.
distance_matrix = euclidean_distances(x_train, x_train)
recos = pd.DataFrame(distance_matrix)

In [None]:
# Persist the distance matrix as a pickle file (before the "name" column is added).
recos.to_pickle("../data/recos.pkl")

In [None]:
# Attach the game names to the distance matrix as an extra column.
recos = pd.concat([recos, bgdata["name"]], axis="columns")

In [None]:
# Spot check: the 50 rows with the smallest distance in column 51, shown with
# their names.
recos[["name", 51]].sort_values(51).head(50)

# DataFrame für neue Abfragen erzeugen

In [None]:
# Save an empty frame whose columns are the base features followed by all
# category and mechanic dummy columns.
reco_columns = allfeat + topcat + topmec
pd.DataFrame(columns=reco_columns).to_pickle("../data/recodata.pkl")

In [None]:
# Join the base features with the category and mechanic dummies on the row index.
testdata = (
    bgdata[allfeat]
    .merge(categories[topcat], left_index=True, right_index=True)
    .merge(mechanic[topmec], left_index=True, right_index=True)
)

In [None]:
# Persist the joined test data as a pickle file.
testdata.to_pickle("../data/recotestdata.pkl")