# Data Understanding

## Settings

In [30]:
########################################################################################################################
# Imports & Settings
########################################################################################################################

import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import re
import time
import pycountry
from pandas.core.common import flatten
from functools import reduce

In [31]:
# allow display of all rows (with scrollbar)
pd.set_option("display.max_rows", 10) #pd.set_option("display.max_rows", None)

# determine whether to use calculated language flags or recompute them
recompute_lg_flg = False

In [267]:
########################################################################################################################
# User Input
########################################################################################################################

# source data file paths
transactions_path = '../data/external/transactions.csv'
evaluation_path = '../data/external/evaluation.csv'
items_path = '../data/external/items.csv'
subject_cats_0_path = '../data/external/subject_cats_0.csv'

# pre-processed data file paths (incl. language flags)
items_path_pp = '../data/processed/items_pp.csv'
header_items_path_pp = '../data/processed/header_items_pp.csv'

# seaborn color palette
palette_blue = "Blues_d"
dark_blue = "#011f4b"
middle_blue = "#005b96"
light_blue = "#b3cde0"

## Functions

In [208]:
########################################################################################################################
# Functions
########################################################################################################################

def clean_alt_list(list_):
#     list_ = list_.replace(', ', ',')
    list_ = list_.replace('[', '')
    list_ = list_.replace(']', '')
    return list_

## Data load & initial pre-processing

In [255]:
########################################################################################################################
# Load Data
########################################################################################################################

# Load the dmc source data
# - clicks/baskets/order over a period of 3M
# - rows: one transaction for single item
transactions_df = pd.read_csv(transactions_path, delimiter='|', sep='.', encoding='utf-8')
# - list of product ids (subset of products from items_df) to be used for prediction
evaluation_df = pd.read_csv(evaluation_path, sep='.', encoding='utf-8')
items_df = pd.read_csv(items_path, delimiter='|', sep='.', encoding='utf-8')

# load category lookup table (manually created)
subject_cats_0 = pd.read_csv(subject_cats_0_path, delimiter=';', encoding='utf-8')

# Load pre-processed df (incl. language flags)
items_df_pp = pd.read_csv(items_path_pp, delimiter=',', encoding='utf-8')

########################################################################################################################
# Preprocessing for further inspection
########################################################################################################################

# extract list of base cols
initial_cols= list(items_df.columns)

# normalization: author col
# items_df.loc[items_df['author'] == 'ohne Autor', 'author'] = None

# add col: get len of mt string
items_df['mt_len'] = items_df['main topic'].str.len()

# add col: get first element (top level category) of mt string
items_df['mt_0'] = items_df['main topic'].str[0]

# add col: main topic as set (and converted back to list)
items_df['mt_cl'] = items_df['main topic'].astype(str).apply(lambda x: list(set(clean_alt_list(x).split(','))))

# adjust subtopics: set to None if subtopics list is empty
items_df['st_cl'] = items_df['subtopics'].astype(str).apply(lambda x: list(set(clean_alt_list(x).split(','))))
items_df.loc[items_df['st_cl']=={''}, 'st_cl'] = None

# add col: unique combination of main and subtopic
items_df['mt_st_cl'] = (items_df['st_cl'] + items_df['mt_cl']) #.apply(set)

# add col: get click / basket / order flag
transactions_df['click_flg'] = np.where(transactions_df['click'] > 0, 1, 0)
transactions_df['basket_flg'] = np.where(transactions_df['basket'] > 0, 1, 0)
transactions_df['order_flg'] = np.where(transactions_df['order'] > 0, 1, 0)

########################################################################################################################
# Inspection of dfs after initial pre-processing
########################################################################################################################

# show dfs after initial pre-processing
print(f'items_df after first pre-processing:')
display(items_df.head(2))

print(f'transactions_df after first pre-processing:')
display(transactions_df.head(2))

items_df after first pre-processing:


Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,mt_len,mt_0,mt_cl,st_cl,mt_st_cl
0,21310,Princess Poppy: The Big Mix Up,Janey Louise Jones,Penguin Random House Children's UK,YFB,[5AH],3.0,Y,[YFB],[5AH],"[5AH, YFB]"
1,73018,Einfach zeichnen! Step by Step,Wiebke Krabbe,Schwager und Steinlein,AGZ,"[5AJ,AGZ,WFA,YBG,YBL,YNA,YPA]",3.0,A,[AGZ],"[YBL, 5AJ, WFA, YBG, YNA, YPA, AGZ]","[YBL, 5AJ, WFA, YBG, YNA, YPA, AGZ, AGZ]"


transactions_df after first pre-processing:


Unnamed: 0,sessionID,itemID,click,basket,order,click_flg,basket_flg,order_flg
0,0,21310,1,0,0,1,0,0
1,1,73018,1,0,0,1,0,0


In [36]:
########################################################################################################################
# Print first summary statistics
########################################################################################################################

# transactions
print('\n****** transactions ******')
print(f'shape transactions_df: {transactions_df.shape}')
print(f'cols transactions_df: \n{transactions_df.dtypes}\n')
print(f'desc transactions_df:')
display(transactions_df.describe())

# - Get cnt of unique sessions / items
print(f'cnt unqiue sessions: {transactions_df["sessionID"].nunique()}') #271,983
print(f'cnt unqiue items: {transactions_df["itemID"].nunique()}') #24,909

# items
print('\n****** items ******')
print(f'shape items_df: {items_df.shape}\n')
print(f'cols items_df: \n{items_df.dtypes}\n')
print(f'desc items_df:')
display(items_df.describe())


****** transactions ******
shape transactions_df: (365143, 8)
cols transactions_df: 
sessionID     int64
itemID        int64
click         int64
basket        int64
order         int64
click_flg     int32
basket_flg    int32
order_flg     int32
dtype: object

desc transactions_df:


Unnamed: 0,sessionID,itemID,click,basket,order,click_flg,basket_flg,order_flg
count,365143.0,365143.0,365143.0,365143.0,365143.0,365143.0,365143.0,365143.0
mean,139586.939175,40051.292307,1.23318,0.141202,0.048403,0.93551,0.123207,0.0463
std,80795.207871,22493.347334,1.069996,1.107574,0.268717,0.245624,0.328675,0.210134
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,69459.5,20713.0,1.0,0.0,0.0,1.0,0.0,0.0
50%,139608.0,40692.0,1.0,0.0,0.0,1.0,0.0,0.0
75%,209750.5,58916.0,1.0,0.0,0.0,1.0,0.0,0.0
max,279354.0,79066.0,118.0,293.0,28.0,1.0,1.0,1.0


cnt unqiue sessions: 271983
cnt unqiue items: 24909

****** items ******
shape items_df: (78030, 9)

cols items_df: 
itemID             int64
title             object
author            object
publisher         object
main topic        object
subtopics         object
mt_len           float64
mt_0              object
subtopics_str     object
dtype: object

desc items_df:


Unnamed: 0,itemID,mt_len
count,78030.0,77772.0
mean,39545.062553,2.994355
std,22825.650252,0.746807
min,0.0,1.0
25%,19775.25,3.0
50%,39561.5,3.0
75%,59306.75,3.0
max,79067.0,10.0


## Overview statistics per relation / attribute

### items

#### author

In [None]:
# count of books per author
books_per_author = pd.DataFrame.from_dict(Counter(items_df.loc[:,'author']),
                                    orient='index',
                                    columns=['book_cnt']).sort_values(by='book_cnt', ascending=False)
books_per_author['frac[%]'] = books_per_author['book_cnt'] * 100 / books_per_author['book_cnt'].sum()

print(f'# books per author:')
display(books_per_author.head(10))

print(f'summary statistics of books per author:')
display(books_per_author.describe())

# distribution of cnt af books among authors
books_per_author_cnts = pd.DataFrame(books_per_author['book_cnt'].value_counts().reset_index()).rename(columns={'index': 'book_cnt',
                                                                                                                'book_cnt': 'author_cnt'})
books_per_author_cnts['author_cnt.cum'] = books_per_author_cnts['author_cnt'].cumsum()
books_per_author_cnts['frac[%]'] = books_per_author_cnts['author_cnt'] * 100 / books_per_author_cnts['author_cnt'].sum()
books_per_author_cnts['frac.cum[%]'] = books_per_author_cnts['frac[%]'].cumsum()

print(f'distribution of books per author:')
display(books_per_author_cnts.head(10))
sns.set_theme()
sns.histplot(books_per_author[books_per_author['book_cnt']<50]['book_cnt'], binwidth=1)
plt.show()

#### publisher

In [None]:
# count of books per publisher
books_per_publisher = pd.DataFrame.from_dict(Counter(items_df.loc[:,'publisher']),
                                    orient='index',
                                    columns=['book_cnt']).sort_values(by='book_cnt', ascending=False)
books_per_publisher['frac[%]'] = books_per_publisher['book_cnt'] * 100 / books_per_publisher['book_cnt'].sum()

print(f'# books per publisher:')
display(books_per_publisher.head(10))

print(f'summary statistics of books per publisher:')
display(books_per_publisher.describe())

# distribution of cnt af books among publishers
books_per_publisher_cnts = pd.DataFrame(books_per_publisher['book_cnt'].value_counts().reset_index()).rename(columns={'index': 'book_cnt',
                                                                                                                'book_cnt': 'publisher_cnt'})
books_per_publisher_cnts['publisher_cnt.cum'] = books_per_publisher_cnts['publisher_cnt'].cumsum()
books_per_publisher_cnts['frac[%]'] = books_per_publisher_cnts['publisher_cnt'] * 100 / books_per_publisher_cnts['publisher_cnt'].sum()
books_per_publisher_cnts['frac.cum[%]'] = books_per_publisher_cnts['frac[%]'].cumsum()

print(f'distribution of books per publisher:')
display(books_per_publisher_cnts.head(10))
sns.set_theme()
sns.histplot(books_per_publisher[books_per_publisher['book_cnt']<50]['book_cnt'], binwidth=1)
plt.show()

#### main topics

In [None]:
# get depth of main topic tree
print(f'str len main topics:')
display(pd.DataFrame(items_df["mt_len"].describe()))
mt_len_hist = sns.histplot(items_df['mt_len']).set_title(f'distribution of len of main topics')

# count of books per main topic (=mt) combo
books_per_mt = pd.DataFrame.from_dict(Counter(items_df.loc[:,'main topic']),
                                    orient='index',
                                    columns=['book_cnt']).sort_values(by='book_cnt', ascending=False)
books_per_mt['frac[%]'] = books_per_mt['book_cnt'] * 100 / books_per_mt['book_cnt'].sum()

# plot mt_0 distribution
sns.set_theme()
sns.histplot(items_df['mt_0'].astype(str).sort_values())

# count of books per first element of mt
books_per_mt_0 = pd.DataFrame.from_dict(Counter(items_df.loc[:,'mt_0']),
                                    orient='index',
                                    columns=['book_cnt']).sort_values(by='book_cnt', ascending=False).reset_index()
books_per_mt_0 = books_per_mt_0.rename(columns={'index': 'Notation'})
books_per_mt_0['frac[%]'] = books_per_mt_0['book_cnt'] * 100 / books_per_mt_0['book_cnt'].sum()

# join with category heading
books_per_mt_0 = books_per_mt_0.merge(subject_cats_0, on='Notation', how='left')
print(f'top 5 high level cats:')
display(books_per_mt_0.head(5))

### transactions

- basket: items that were added to basket but not necessarily bought
- order: items that where finally bought

In [None]:
# merge transactions with items to get title
transactions_df = transactions_df.merge(items_df[['itemID','title']], left_on='itemID', right_on='itemID', how='left')
transactions_df.head(5)

#### cnts per sessionID

In [None]:
# # get cnt of distinct item clicks per session
# unique_clicks_per_session = transactions_df[['sessionID', 'click_flg']].groupby('sessionID')['click_flg'].sum().reset_index().\
#     sort_values(by='click_flg', ascending=False)
# unique_clicks_per_session['frac[%]'] = unique_clicks_per_session['click_flg'] * 100 / unique_clicks_per_session['click_flg'].sum()
# unique_clicks_per_session = unique_clicks_per_session.rename(columns={'click_flg': '#clicked items unique'})

# print(f'clicked items per session:')
# display(round(unique_clicks_per_session.head(10),2))

# print(f'clicks per item summary stats:')
# display(round(unique_clicks_per_session.describe(),2))

# sns.boxplot(x=unique_clicks_per_session["#clicked items unique"])
# plt.show()

# # get cnt of distinctly ordered items per session
# orders_per_session = transactions_df[['sessionID', 'order_flg']].groupby('sessionID')['order_flg'].sum().reset_index().\
#     sort_values(by='order_flg', ascending=False).rename(columns={'order_flg': 'order_cnt'})
# orders_per_session['frac[%]'] = orders_per_session['order_cnt'] * 100 / orders_per_session['order_cnt'].sum()

# print(f'distinct orders per session (binary, w/o qty):')
# display(orders_per_session.head(10))

# print(f'distinct orders per session summary stats:')
# display(orders_per_session.describe())

# sns.boxplot(x=orders_per_session["order_cnt"])
# plt.show()

# # get cnt of distinct order sessions per item
# orders_per_item = transactions_df[['itemID', 'order_flg']].groupby('itemID')['order_flg'].sum().reset_index().\
#     sort_values(by='order_flg', ascending=False).rename(columns={'order_flg': 'order_cnt'})
# orders_per_item['frac[%]'] = orders_per_item['order_cnt'] * 100 / orders_per_item['order_cnt'].sum()

# # print(f'distinct orders per item (binary, w/o qty):')
# # display(orders_per_item.head(10))

# print(f'distinct orders per item summary stats:')
# display(orders_per_item.describe())

# get cnt of distinct orders / basket /orders per session
interaction_per_session = transactions_df[['sessionID',
                                           'click_flg',
                                           'basket_flg',
                                           'order_flg']].groupby('sessionID').sum().reset_index()
print(f'distribution of unique items clicked, added to basket, ordered:')
display(round(interaction_per_session[['click_flg','basket_flg','order_flg']].describe(),1).loc[['count','mean','std','25%','50%','75%','max']])

# get click to basket to order conversion
items_per_basket_order = transactions_df[['itemID',
                                          'click_flg',
                                          'basket_flg',
                                          'order_flg']].groupby(['click_flg',
                                                                 'basket_flg',
                                                                 'order_flg'])['itemID'].count().reset_index().rename(columns={'itemID': 'item_cnt'})
items_per_basket_order['frac[%]'] = items_per_basket_order['item_cnt'] * 100 / items_per_basket_order['item_cnt'].sum()
print(f'click to basket to order conversion:')
display(round(items_per_basket_order.sort_values(by=['click_flg','basket_flg','order_flg'],ascending=False),2))

#### top interaction items

In [None]:
# get top sellers
top_interaction_items = transactions_df[['itemID', 'title',
                                         'click',
                                         'basket',
                                         'order']].groupby(['itemID','title']).sum().reset_index().sort_values(by='click')
top_clicked_items = top_interaction_items.sort_values(by='click',ascending=False).head(5)
top_basket_items = top_interaction_items.sort_values(by='basket',ascending=False).head(5)
top_order_items = top_interaction_items.sort_values(by='order',ascending=False).head(5)
# display(top_interaction_items.head(10))

# generate barplot
sns.set_theme()
fig, ax = plt.subplots(3,1)
plt.tight_layout()
sns.barplot(data=top_clicked_items,x='click',y='title',palette=palette_blue, ax=ax[0]).set(xlabel="# clicks",ylabel="")
sns.barplot(data=top_basket_items,x='basket',y='title',palette=palette_blue, ax=ax[1]).set(xlabel="# added to basket",ylabel="")
sns.barplot(data=top_order_items,x='order',y='title',palette=palette_blue, ax=ax[2]).set(xlabel="# orders",ylabel="")
plt.show()

# # get cnt of clicks per item
# clicks_per_item = transactions_df[['itemID', 'click']].groupby('itemID')['click'].sum().reset_index().\
#     sort_values(by='click', ascending=False).rename(columns={'click': 'click_cnt'})
# clicks_per_item['frac[%]'] = clicks_per_item['click_cnt'] * 100 / clicks_per_item['click_cnt'].sum()

# print(f'clicks per item:')
# display(clicks_per_item.head(10))

# print(f'clicks per item summary stats:')
# display(clicks_per_item.describe())

## Anomaly Detection

### Missing Values
- 9 items w/o publisher:
    - could be anything, cannot be imputed
    - not such a crucial information to be missing
    - thus: no handling
- 3240 items w/o author:
    - correct author might not be uniquely determinable or there might not even be a senseful author
    - thus: no handling
- 258 items w/o main topic:
    - at least subtopic is given
    - only 32 of these also have the author missing
- 36,904 items w/o subtopic:
    - in all of the cases, a main topic is given
    - thus: still enough information available

In [None]:
# get cnt of missing values per column
missing_values = pd.DataFrame(items_df.isnull().sum()).rename(columns={0: 'cnt'})
missing_values['frac[%]'] = missing_values['cnt'] * 100 / len(items_df)
print(f'null values per column:')
display(round(missing_values.loc[initial_cols + ["subtopics_str"]],2))

# get cnt of combined null values: sum null values per row and cnt rows with #null > 1
print(f'\n# rows with null values in more than one col: {(items_df[initial_cols + ["subtopics_str"]].isnull().sum(axis=1) > 1).sum()}')
print(f'\ndistribution of null values over cols (1=null, 0=not null):')
display(pd.DataFrame((items_df[initial_cols + ['subtopics_str']].isnull() * 1).value_counts().reset_index()).rename(columns={0: '#items'}))

#### Missing publisher

In [None]:
# check all items with missing publisher
print('all items with missing publisher:')
display(items_df[items_df['publisher'].isnull()])

# check whether there might be other entries with publisher given
missing_publisher_title = items_df[items_df['publisher'].isnull()]['title']
print(f'books with same title that appear twice: {(items_df[items_df["title"].isin(missing_publisher_title)].groupby("title")["itemID"].count() > 1).sum()}\n')

# inspect sample with missing publisher
# > missing publisher is most likely to be 'TEKTIME' > however: could also be different
print('entries for title "Back to Earth" with missing publisher for some editions:')
display(items_df[items_df['title'].str.contains('Back to Earth')])
print('entries for author "Danilo Clementoni" with missing publisher for some items:')
display(items_df[items_df['author'] == 'Danilo Clementoni'])

#### Missing author
- __problem__:
    - there is a lot of items with very generalistic titles like 'Dinosaurier' or 'Die Weihnachtsgeschichte' that do not allow to uniquely determine the correct author
    - there might not even be a unique author, like for 'Freundebuch - Einhorn-Paradies - Meine Freunde' or 'Kritzkratz-Spaß Glitzer'
    - there might be the same item but several different authors, like for 'Goldilocks and the Three Bears'

- __approach__:
    - try to not impute author, use other attributes instead, e.g. topic or publisher

In [None]:
# check all items with missing author
print('first 10 items with missing author:')
display(items_df[items_df['author'].isnull()].head(10))

# check whether there might be other entries with author given
missing_author_title = items_df[items_df['author'].isnull()]['title']
missing_author_cnt_dups = pd.DataFrame(items_df[items_df["title"].isin(missing_author_title)].groupby("title")["itemID"].count())
print(f'\nbooks with same title that appear twice (see df below): {(missing_author_cnt_dups["itemID"] > 1).sum()}')

# check whether author can be retried
missing_author_dups = missing_author_cnt_dups[missing_author_cnt_dups["itemID"] > 1].reset_index()['title']
display(items_df[items_df['title'].isin(missing_author_dups)].sort_values(by='title'))


#### Missing topic

In [None]:
# check whether there are items with no topic at all
print(f'cnt of items with both, main topic and subtopic == null: {((items_df["subtopics_str"].isnull()) & (items_df["main topic"].isnull())).sum()}')

##### main

In [None]:
# check all items with missing topic
print('first 10 items with missing topic:')
display(items_df[items_df['main topic'].isnull()].head(10))

# check whether there might be other entries with topic given
missing_topic_title = items_df[items_df['main topic'].isnull()]['title']
missing_topic_cnt_dups = pd.DataFrame(items_df[items_df["title"].isin(missing_topic_title)].groupby("title")["itemID"].count())
print(f'\nbooks with same title that appear twice (see df below): {(missing_topic_cnt_dups["itemID"] > 1).sum()}')

# check whether topic can be retried
missing_topic_dups = missing_topic_cnt_dups[missing_topic_cnt_dups["itemID"] > 1].reset_index()['title']
display(items_df[items_df['title'].isin(missing_topic_dups)].sort_values(by='title'))

# check cnt of items with main topic and subtopic missing


##### sub
- no scalable solution for imputing subtopics
- out of the 36,904 missing subtopics, only 2,668 items appear multiple times
    - out of these, only 1,574 actually have a duplicate with a subtopic given

In [None]:
# check all items with missing topic
print('first 10 items with missing topic:')
display(items_df[items_df['subtopics_str'].isnull()])

# check whether there might be other entries with topic given
missing_topic_title = items_df[items_df['subtopics_str'].isnull()]['title']
missing_topic_cnt_dups = pd.DataFrame(items_df[items_df["title"].isin(missing_topic_title)].groupby("title")["itemID"].count())
print(f'\nbooks with same title that appear twice (see df below): {(missing_topic_cnt_dups["itemID"] > 1).sum()}')

# check whether topic can be retried
missing_topic_dups = missing_topic_cnt_dups[(missing_topic_cnt_dups["itemID"] > 1)].reset_index()['title']
display(items_df[(items_df['title'].isin(missing_topic_dups)) & (~items_df['subtopics_str'].isnull())].sort_values(by='title'))

### Duplicates

__To keep in mind:__
1. other relevant attributes are not given, e.g.:
    - actual __language__ might not be that of title
    - __publication date__ might differ between itemIDs (=Neuauflage)
    - title might not be complete (e.g. __subtitle__ missing)
        - e.g. '[Ära der Lichtwächter](https://www.amazon.com/s?k=%C3%84ra+der+Lichtw%C3%A4chter&ref=nb_sb_noss)' from 'Klaus Pfrommer' (itemID = (40200,18242)) is collection with differing subtitles "Die Täuschung", "Das Vermächtnis", "Die Unschuld"
    - thus: itemID would be unique identifier for actually different items
2. __transactions__ might help to differentiate between items and __rank their relevance__

#### duplicate entries per column

In [38]:
# cnt column-wise duplication
sc_cnt = 1
total = len(items_df)
for col in initial_cols:
    dup_cnts = (items_df[col].value_counts() > 1).sum()
    print(f'cnt of duplicate {col}: {dup_cnts} ({round(dup_cnts*100/total,2)}%)')

# inspect title duplicates
title_cnts = (items_df["title"].value_counts().reset_index())
title_dups_lst = title_cnts[title_cnts["title"]>1]["index"]
items_df[(items_df["title"].isin(title_dups_lst))].sort_values(by="title")

cnt of duplicate itemID: 0 (0.0%)
cnt of duplicate title: 4193 (5.37%)
cnt of duplicate author: 10120 (12.97%)
cnt of duplicate publisher: 3426 (4.39%)
cnt of duplicate main topic: 478 (0.61%)
cnt of duplicate subtopics: 2599 (3.33%)


Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,mt_len,mt_0,subtopics_str
33470,44003,(Heli-)opolis - Der verhängnisvolle Plan des Weltkoordinators,Michael Häusler,Books on Demand,FM,[],2.0,F,
34511,12623,(Heli-)opolis - Der verhängnisvolle Plan des Weltkoordinators,Michael Häusler,Books on Demand,FL,[],2.0,F,
30784,69287,13 Kings,V. S. Nesby,Xlibris,FL,[],2.0,F,
55425,55553,13 Kings,Vs Nesby,Xlibris US,FL,[],2.0,F,
7605,46714,19 Love Songs,David Levithan,Random House LCC US,YNMD,"[5HC,5PS,5PT,YFB,YFM,YFU]",4.0,Y,"5HC,5PS,5PT,YFB,YFM,YFU"
...,...,...,...,...,...,...,...,...,...
7000,70797,Äon,Andreas Brandhorst,Heyne,FHQ,[1DST],3.0,F,1DST
53131,40200,Ära der Lichtwächter,Klaus Pfrommer,swb media publishing,FMR,"[FMR,FMT,FMX]",3.0,F,"FMR,FMT,FMX"
46735,18242,Ära der Lichtwächter,Klaus Pfrommer,swb media publishing,FMB,"[FMB,FMR,FMX]",3.0,F,"FMB,FMR,FMX"
52956,6755,Éveil,Aurora Clerc,Books on Demand,FMB,"[5AX,FMH,FT,3KLF]",3.0,F,"5AX,FMH,FT,3KLF"


#### everything identical except of single column
- only cases for duplicated items with same attributes but different itemID

In [None]:
col_list = initial_cols #['itemID']
for col in col_list:

    # check all cols except of current one
    col_list_lim = [c for c in items_df.columns if c != col]
    #print(f'{col}: {col_list_lim}')

    # compute duplicate cnt
    dup = pd.DataFrame(items_df.groupby(col_list_lim)[col].count().reset_index())
    print(f'everything identical except of {col} = {(dup[col] > 1).sum()}')
    #display(dup[dup[col] > 1].sort_values(by=col))
    #display(dup.sort_values(by=col))

In [None]:
# deep dive: everything identical except of ID
print(f'sample entry for sc1: everything identical except of itemID')
display(items_df[items_df['title']=='Reisestipendien'])

## Pre-Processing

### [DEV] Outlier Detection
- only for __transactions__: remove transactions with suspiciously high #of clicks/basket/order

### [DEV] String normalization

__Applied:__
1. conversion to lowercase, e.g. publisher = 'TEKTIME' or 'Tektime' to 'tektime'
2. removal of leading special characters, e.g. ",william shakespeare"
3. conversion of unicode characters (ä,ö,ü)

__No fix yet:__
1. author = 'V. S. Nesby' and 'Vs Nesby' -> approach: no test for equality but similarity / remove dots?
2. weird entries
    - author: der Authhhhor
    - diverse Autoren, Autoren
3. unicode characters like (à,é,è,°o)

In [None]:
# generate copy of original df
items_df_cl = items_df.copy()
display(items_df_cl.head(5))

#### title

In [None]:
# Functions -> move to beginning of script

def remove_special_characters(list_):
    list_ = re.sub(r'®','',list_)
    list_ = re.sub(r'^\W+', r'', list_) #removes leading non-alphanumerics, e.g. ",william shakespeare"
    return list_

def remove_nontitle_substrings(list_):

    # type of book
    for book_type in ['taschenbuch','hardcover']:
        list_ = re.sub(f'\(.*{book_type}.*\)?','',list_) #remove all content within brackets
        list_ = re.sub(f'-\s*(\w*\s*){book_type}.*','',list_)
        list_ = re.sub(f':.*{book_type}.*','',list_)
        list_ = re.sub(f'(.*{book_type}[\w\d\s]*):','',list_)
        list_ = re.sub(f'[(special)(book)(edition)\s*]*{book_type}\s*[(special)(book)(edition)\s*]*','',list_)
        list_ = re.sub(f'{book_type}','',list_)

#     list_ = re.sub(r'\(.*hardcover.*\)?','',list_)
#     list_ = re.sub(r':.*hardcover.*','',list_)

    return list_

def convert_umlaute(list_):
    """
    converts ä > ae, ö > oe, etc.
    """
    # convert umlaute
#     chars = {'ö':'oe','ä':'ae','ü':'ue'} # usw.
#     for char in chars:
#         items_df_cl["author_cl"] = items_df_cl["author_cl"].apply(lambda s: s.replace(char, chars[char]) if type(s) == str else s)

#     # test sample after normalization
#     items_df_cl[items_df_cl["author_cl"].str.contains('schlueter')].head(10)
    return list_
    
def insert_dot_after_single_chars(list_):
    list_ = re.sub(r'([A-Z])\.?(?![a-z])\s*', r'\g<1>. ', list_)
    return list_
    

# generate titles df (with comparison column for original and cleaned title)
titles_df = pd.DataFrame(items_df_cl["title"].unique()).rename(columns={0: "title"})
titles_df['title_cl'] = titles_df['title']

print(f'#unique titles (before preprocessing): {titles_df["title"].nunique()} / {len(titles_df)}')

# convert all strings to lowercase
titles_df = titles_df.applymap(lambda s:s.lower() if type(s) == str else s)
display(titles_df.head(10))

In [None]:
# print cnt of items including special terms
print(f'#items with title including:')
for entry in ['hardcover','taschenbuch']:
    cnt = titles_df["title"].str.contains(f'{entry}').sum()
    print(f'\t{entry}: {cnt}')

In [None]:
# inspect matches for specific terms/patterns
pd.set_option("display.max_rows", None)
p = re.compile('\(.*\)')
matches = titles_df["title"].apply(lambda s: p.findall(s))
matches = pd.DataFrame(set(flatten([x for x in matches if x])))
matches.head()

# (1) -> elfengeist (1)
# (dt. ausgabe)
# the dark artifices box set (3 bände im schuber)
# star wars(tm) - schülerin der dunklen seite
# (sammelband) / (filmausgabe)
# (neuauflage) / (sonderausgabe)
# (roman) / (light novel)
# (großdruck)
# (gift edition) / (signed limited edition)
# (manga)
# (1-3 jahre)
# (greek edition) / (german edition) / (greek book for kids) -> additional column with language tag extracted?
# (spanish language edition of the things m -> check if error during reading in
# (hardback)

In [None]:
# search for specific entry
pd.set_option("display.max_rows", None)
pd.set_option('display.max_colwidth', None)

search_entry = '\(.*\)'
display(titles_df.loc[titles_df['title'].str.contains(f'{search_entry}'), :])

In [None]:
# Testing of removal
book_type = 'hardcover'
for book in ["no trail behind me, special edition hardcover", "no trail behind me, hardcover special edition"]:
    print(re.sub(f'[(special)(book)(edition)\s*]*{book_type}\s*[(special)(book)(edition)\s*]*','',book))

In [None]:
# apply pre-processing
pd.set_option("display.max_rows", 5)

# clean strings
titles_df['title_cl'] = titles_df['title_cl'].astype(str).apply(remove_special_characters)
titles_df['title_cl'] = titles_df['title_cl'].apply(remove_nontitle_substrings)

# test: remove_special_characters
# display(titles_df.loc[titles_df['title_cl'].str.contains('ninjago'), :])

In [None]:
# test effect of normalization

# inspect overall df
items_df_cl[~items_df_cl['author'].isna()].sort_values(by='author').head(100)[['itemID','title', 'author','author_cl']]

# check items affected by normalization
author_cl_unique_author = items_df_cl.groupby("author_cl")["author"].nunique()
print(f'cnt of authors that could be matched due to normalization: {(author_cl_unique_author > 1).sum()}')
items_df_cl[items_df_cl['author_cl'].isin(author_cl_unique_author[author_cl_unique_author > 1].reset_index()['author_cl'])].sort_values(by='author_cl')

#### publisher

### [DEV] Unify main and subtopic

### [DEV] Header-Set Generation

__Approach:__
1. __[done]__ Generate new header-set with new IDs to unify same books that appear multiple times in the items and transactions table
    a. generate new IDs
    b. unify information
2. Pull data on header level from external sources (e.g. google doc incl. publication date and language flag)
3. __[done]__ Replace the subset IDs in transactions table by superset IDs

In [265]:
def generate_header_set(items_df):
    """
    generates header set of items that combines attributes of several items with same title that e.g. only differ in itemID
    or other attributes
    > headerID can be used to replace itemID in transactions_df
    """
    # generate header attribute sets from sub-items -> important: generate sets to prevent duplication 
    header_items_author_df = items_df['author'].groupby([items_df.title]).apply(set).reset_index()
    header_items_publisher_df = items_df['publisher'].groupby([items_df.title]).apply(set).reset_index()
    header_items_mtst_df = items_df['mt_st_cl'].groupby([items_df.title]).apply(sum).apply(set).reset_index() # get unique list of topics

    # compile the list of dataframes you want to merge
    header_items_df_lst = [header_items_author_df, header_items_publisher_df, header_items_mtst_df]

    # merge all attributes
    header_items_df = reduce(lambda left,right: pd.merge(left,right,on=['title'],
                                                how='outer'), header_items_df_lst)

    # generate new header index
    header_items_df = header_items_df.reset_index().rename(columns={'index':'headerID'})

#     # result inspection
#     print(f'shape of items_df: {items_df.shape}')
#     print(f'shape of header_items_df: {header_items_df.shape}')

#     print(f'\ncnt of duplicate "title" in header_df: {(header_items_df["title"].value_counts() > 1).sum()} ({round(dup_cnts*100/len(header_items_df),2)}%)')

#     print(f'\nconverted df:')
#     display(header_items_df[header_items_df['title'].isin(['(Heli-)opolis - Der verhängnisvolle Plan des Weltkoordinators',
#                                                    '13 Kings',
#                                                    'Ära der Lichtwächter'])].head(5))

#     print(f'\noriginal df:')
#     display(items_df[items_df['title'].isin(['(Heli-)opolis - Der verhängnisvolle Plan des Weltkoordinators',
#                                                    '13 Kings',
#                                                    'Ära der Lichtwächter'])].head(5))

    return header_items_df

In [266]:
# generate header set with unique ids for "super-items"
header_items_df = generate_header_set(items_df)

# add headerID to items_df (drop before join if already existent)
if 'headerID' in items_df.columns:
    items_df = items_df.drop(columns=['headerID'])
items_df = items_df.merge(header_items_df[['title','headerID']], left_on='title', right_on='title',how='left') 
display(items_df.head())
print(f'missing headerIDs in items_df: {items_df["headerID"].isnull().sum()}')

shape of items_df: (78030, 11)
shape of header_items_df: (72128, 5)

cnt of duplicate "title" in header_df: 0 (0.0%)

original df:


Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,mt_len,mt_0,mt_cl,st_cl,mt_st_cl
30784,69287,13 Kings,V. S. Nesby,Xlibris,FL,[],2.0,F,[FL],[],"[, FL]"
33470,44003,(Heli-)opolis - Der verhängnisvolle Plan des Weltkoordinators,Michael Häusler,Books on Demand,FM,[],2.0,F,[FM],[],"[, FM]"
34511,12623,(Heli-)opolis - Der verhängnisvolle Plan des Weltkoordinators,Michael Häusler,Books on Demand,FL,[],2.0,F,[FL],[],"[, FL]"
46735,18242,Ära der Lichtwächter,Klaus Pfrommer,swb media publishing,FMB,"[FMB,FMR,FMX]",3.0,F,[FMB],"[FMB, FMX, FMR]","[FMB, FMX, FMR, FMB]"
53131,40200,Ära der Lichtwächter,Klaus Pfrommer,swb media publishing,FMR,"[FMR,FMT,FMX]",3.0,F,[FMR],"[FMT, FMX, FMR]","[FMT, FMX, FMR, FMR]"



converted df:


Unnamed: 0,headerID,title,author,publisher,mt_st_cl
68,68,(Heli-)opolis - Der verhängnisvolle Plan des Weltkoordinators,{Michael Häusler},{Books on Demand},"{, FM, FL}"
195,195,13 Kings,"{Vs Nesby, V. S. Nesby}","{Xlibris, Xlibris US}","{, FL}"
72083,72083,Ära der Lichtwächter,{Klaus Pfrommer},{swb media publishing},"{FMT, FMB, FMX, FMR}"


Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,mt_len,mt_0,mt_cl,st_cl,mt_st_cl,headerID
0,21310,Princess Poppy: The Big Mix Up,Janey Louise Jones,Penguin Random House Children's UK,YFB,[5AH],3.0,Y,[YFB],[5AH],"[5AH, YFB]",45233
1,73018,Einfach zeichnen! Step by Step,Wiebke Krabbe,Schwager und Steinlein,AGZ,"[5AJ,AGZ,WFA,YBG,YBL,YNA,YPA]",3.0,A,[AGZ],"[YBL, 5AJ, WFA, YBG, YNA, YPA, AGZ]","[YBL, 5AJ, WFA, YBG, YNA, YPA, AGZ, AGZ]",18841
2,19194,Red Queen 1,Victoria Aveyard,Orion Publishing Group,YFH,"[5AP,FBA]",3.0,Y,[YFH],"[FBA, 5AP]","[FBA, 5AP, YFH]",46439
3,40250,Meine Kindergarten-Freunde (Pirat),,Ars Edition GmbH,YB,"[5AC,5AD,YBG,YBL,YF]",2.0,Y,[YB],"[YBL, 5AD, 5AC, YBG, YF]","[YBL, 5AD, 5AC, YBG, YF, YB]",38287
4,46107,Mein großes Schablonen-Buch - Wilde Tiere,Elizabeth Golding,Edition Michael Fischer,WFTM,"[WD,WFTM,YBG,YBL,YBLD,YBLN1]",4.0,W,[WFTM],"[YBL, YBLN1, YBG, WFTM, WD, YBLD]","[YBL, YBLN1, YBG, WFTM, WD, YBLD, WFTM]",38114


missing headerIDs in items_df: 0


## Feature Engineering

### Language flag

__Idea:__
Flag Language of title in order to improve same language recommendations

__Lookup Links:__
1. [stackoverflow:](https://stackoverflow.com/questions/39142778/python-how-to-determine-the-language) comparison of different language detection modules
2. [tds](https://towardsdatascience.com/benchmarking-language-detection-for-nlp-8250ea8b67c) performance evaluation -> recommends __fasttext__

In [None]:
# define test strings
str_en = "romeo and juliet: the graphic novel"
str_de = "sternenschweif. zauberhafter schulanfang"

# define whether to use existing flags and df
if not recompute_lg_flg:
    items_df = items_df_pp

#### module testing

In [None]:
# module detector dict
lan_detector = {'ld': 'langdetect', 'gl': 'guess_language', 'lg': 'langid'}

##### langdetect (=title_ld)
[langdetect](https://pypi.org/project/langdetect/)
- important: use try-catch block to handle e.g. numerics, urls etc
- non-deterministic approach: remember to set seed for reproducible results

In [None]:
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

In [None]:
# test detector on sample strings
print(detect(str_en))
print(detect(str_de))

In [None]:
if recompute_lg_flg:
    # get start time for performance evaluation
    start_time_ld = time.time()

    # set seed for reproducability
    DetectorFactory.seed = 0

    # option 1: pre-calculate list of languages
    title_ld = []
    for title in items_df['title']:
        try:
            title_ld.append(detect(title))
    #         print(f'{title}: {detect(title)}')
        except LangDetectException:
            title_ld.append(None)
    #         print(f'{title}: "undefined"')

    # compute execution time
    end_time_ld = time.time()
    print(f'exection time langdetect: {end_time_ld - start_time_ld} seconds')

    items_df['title_ld'] = title_ld

    # option 2: use apply and title col
    # items_df['title_ld'] = items_df['title'].apply(lambda x: detect(x) if not x.isnumeric() else None)

In [None]:
# inspect items w/o language specification -> only numeric !
print(f'cnt of items without language flag: {items_df["title_ld"].isnull().sum()}')
display(items_df[items_df["title_ld"].isnull()].head(10))

# inspect results
ld_vc = pd.DataFrame(items_df['title_ld'].value_counts().reset_index())
display(ld_vc.transpose())

# show barplot with # items with title in given language
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(x='index', y='title_ld', ax=ax, data=ld_vc, palette=palette_blue).set(
    xlabel='languages determined by "langdetect"',
    ylabel='# items with title in given language'
)
plt.xticks(rotation=90)
plt.show()

##### guess_language (=title_gl)

- Can detect very short samples

In [None]:
from guess_language import guess_language

In [None]:
print(guess_language(str_en))
print(guess_language(str_de))

In [None]:
if recompute_lg_flg:

    # get start time for performance evaluation
    start_time_gl = time.time()

    # detect langauge of titles
    items_df['title_gl'] = items_df['title'].apply(lambda x: guess_language(x) if not x.isnumeric() else None)

    # set 'UNKNOWN' to None
    items_df.loc[items_df['title_gl']=='UNKNOWN','title_gl'] = None

    # compute execution time
    end_time_gl = time.time()
    print(f'exection time guess_language: {end_time_gl - start_time_gl} seconds')

In [None]:
# inspect results
gl_vc = pd.DataFrame(items_df['title_gl'].value_counts().reset_index())
display(gl_vc.transpose())

# show barplot with # items with title in given language
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(x='index', y='title_gl', ax=ax, data=gl_vc, palette=palette_blue).set(
    xlabel='languages determined by "guess_language"',
    ylabel='# items with title in given language'
)
plt.xticks(rotation=90)
plt.show()

##### textblob
Requires NLTK package, uses Google -> API blocked with "HTTP Error 429: Too Many Requests"

##### spacy
- [spacy doku](https://spacy.io/universe/project/spacy-langdetect): did not get it working

##### langid (=title_lg)

In [None]:
import langid

In [None]:
langid.classify(str_en)
langid.classify(str_de)

In [None]:
if recompute_lg_flg:

    # get start time for performance evaluation
    start_time_lg = time.time()

    # option 1: pre-calculate list of languages
    title_lg = []

    for title in items_df['title']:
        title_lg.append(langid.classify(title))
        print(f'{title}: {langid.classify(title)}')

    # compute execution time
    end_time_lg = time.time()
    print(f'exection time langid: {end_time_lg - start_time_lg} seconds')

    # add col to df
    items_df['title_lg'] = [t[0] for t in title_lg]

    # option 2: use apply
    # items_df['title_lg'] = items_df['title'].apply(lambda x: TextBlob(x).detect_language() if not x.isnumeric() or  else None)

In [None]:
# inspect items w/o language specification -> only numeric !
print(f'cnt of items without language flag: {items_df["title_lg"].isnull().sum()}')
#display(items_df[items_df["title_lg"].isnull()].head(10))

# inspect results
lg_vc = pd.DataFrame(items_df['title_lg'].value_counts().reset_index())
display(lg_vc.transpose())

# show barplot with # items with title in given language
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(x='index', y='title_lg', ax=ax, data=lg_vc, palette=palette_blue).set(
    xlabel='languages determined by "langid"',
    ylabel='# items with title in given language'
)
plt.xticks(rotation=90)
plt.show()

##### fasttext
- official Python binding module by Facebook
- problems with installation on windows

#### module performance evaluation

In [None]:
# compare execution time and items w/o flag
if recompute_lg_flg:
    lan_detector_eval_df = pd.DataFrame({'execution time [s]': [eval('end_time_'+det.split("_")[1]) - eval('start_time_'+det.split("_")[1]) for det in ['title_ld','title_gl','title_lg']],
                                        '#items w/o language flg':[items_df[det].isnull().sum() for det in ['title_ld','title_gl','title_lg']]},
                                       index=[det for det in lan_detector.values()])
    display(lan_detector_eval_df)

# merge results dfs
ld_gl_vc = ld_vc.merge(gl_vc, left_on='index', right_on='index', how='outer')
ld_gl_lg_vc = ld_gl_vc.merge(lg_vc, left_on='index', right_on='index', how='outer')
display(ld_gl_lg_vc.transpose())
ld_gl_lg_vc = ld_gl_lg_vc.head(10)

# rename columns
ld_gl_lg_vc.columns = ['index', 'langdetect','guess_language','langid']

# add language name
ld_gl_lg_vc['language_name'] = ld_gl_lg_vc['index'].apply(lambda l: pycountry.countries.get(alpha_2=l).name if l != 'en' else 'English')

# transform model cols into identifier column for plotting
ld_gl_lg_vc = pd.melt(ld_gl_lg_vc, id_vars=["index", "language_name"],
                  var_name="flag_m", value_name="idCnt")
#display(ld_gl_lg_vc)

# Draw a nested barplot by language detector
sns.set_theme()
fig, ax = plt.subplots(figsize=(5,4))
g = sns.barplot(y="language_name", x="idCnt", hue="flag_m", data=ld_gl_lg_vc, palette=palette_blue, orient='h')
g.set(xlabel="# itemID", ylabel = "")
g.legend(loc='lower right')
plt.show()

### [DEV] Topic Similarity
__TODO: add scraping results of Estelle__

## Export of final pre-processed dfs

In [268]:
# export items_df
# items_df.to_csv(items_path_pp)

# export header_items_df
# header_items_df.to_csv(header_items_path_pp)