# Data Understanding

## Settings

In [24]:
########################################################################################################################
# Imports & Settings
########################################################################################################################

import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import re

In [25]:
# allow display of all rows (with scrollbar)
pd.set_option("display.max_rows", 10) #pd.set_option("display.max_rows", None)

In [26]:
########################################################################################################################
# Global Variables
########################################################################################################################

# change working dir to access tempData folder
if os.getcwd().endswith('Data Understanding'):
    os.chdir('..')
    print(f'went one level up --> current workdir: {os.getcwd()}')

# source data file paths
transactions_path = 'tempData/sourceData/transactions.csv'
evaluation_path = 'tempData/sourceData/evaluation.csv'
items_path = 'tempData/sourceData/items.csv'
subject_cats_0_path = 'tempData/sourceData/subject_cats_0.csv'

In [27]:
########################################################################################################################
# Functions
########################################################################################################################

def clean_alt_list(list_):
    list_ = list_.replace(', ', ',')
    list_ = list_.replace(',', ',')
    list_ = list_.replace('[', '')
    list_ = list_.replace(']', '')
    return list_

## Data load & initial pre-processing

In [134]:
########################################################################################################################
# Load Data & Print first summary statistics
########################################################################################################################

# Load the dmc source data
# - clicks/baskets/order over a period of 3M
# - rows: one transaction for single item
transactions_df = pd.read_csv(transactions_path, delimiter='|', sep='.', encoding='utf-8')
# - list of product ids (subset of products from items_df) to be used for prediction
evaluation_df = pd.read_csv(evaluation_path, sep='.', encoding='utf-8')
items_df = pd.read_csv(items_path, delimiter='|', sep='.', encoding='utf-8')

# load category lookup table (manually created)
subject_cats_0 = pd.read_csv(subject_cats_0_path, delimiter=';', encoding='utf-8')

# Print description of dfs

# Get shape of dfs
print(f'shape transactions_df: {transactions_df.shape}')
#print(f'shape evaluation_df: {evaluation_df.shape}')
print(f'shape items_df: {items_df.shape}\n')

# Get col names + datatype
print(f'cols transactions_df: \n{transactions_df.dtypes}\n')
#print(f'cols evaluation_df: \n{evaluation_df.dtypes}\n')
print(f'cols items_df: \n{items_df.dtypes}\n')

# Get description of dfs
print(f'desc transactions_df: \n{transactions_df.describe()}\n')
#print(f'desc evaluation_df: \n{evaluation_df.describe()}\n')
print(f'desc items_df: \n{items_df.describe()}\n')

# Get cnt of unique sessions / items
print(f'cnt unqiue sessions: {transactions_df["sessionID"].nunique()}') #271,983
print(f'cnt unqiue items: {transactions_df["itemID"].nunique()}') #24,909

shape transactions_df: (365143, 5)
shape items_df: (78030, 6)

cols transactions_df: 
sessionID    int64
itemID       int64
click        int64
basket       int64
order        int64
dtype: object

cols items_df: 
itemID         int64
title         object
author        object
publisher     object
main topic    object
subtopics     object
dtype: object

desc transactions_df: 
           sessionID         itemID          click         basket  \
count  365143.000000  365143.000000  365143.000000  365143.000000   
mean   139586.939175   40051.292307       1.233180       0.141202   
std     80795.207871   22493.347334       1.069996       1.107574   
min         0.000000       1.000000       0.000000       0.000000   
25%     69459.500000   20713.000000       1.000000       0.000000   
50%    139608.000000   40692.000000       1.000000       0.000000   
75%    209750.500000   58916.000000       1.000000       0.000000   
max    279354.000000   79066.000000     118.000000     293.000000   

  

In [149]:
########################################################################################################################
# Preprocessing for further inspection
########################################################################################################################

# extract list of base cols 
initial_cols= list(items_df.columns)

# normalization: author col
items_df.loc[items_df['author'] == 'ohne Autor', 'author'] = None

# add col: get len of mt string
items_df['mt_len'] = items_df['main topic'].str.len()

# add col: get first element (top level category) of mt string
items_df['mt_0'] = items_df['main topic'].str[0]

# adjust subtopics: set to None if subtopics list is empty
items_df['subtopics_str'] = items_df['subtopics'].astype(str).apply(clean_alt_list)
items_df.loc[items_df['subtopics_str'].apply(len) ==0, 'subtopics_str'] = None

# add col: get click / basket / order flag
transactions_df['click_flg'] = np.where(transactions_df['click'] > 0, 1, 0)
transactions_df['basket_flg'] = np.where(transactions_df['basket'] > 0, 1, 0)
transactions_df['order_flg'] = np.where(transactions_df['order'] > 0, 1, 0)

In [136]:
# show dfs after initial pre-processing
print(f'items_df after first pre-processing:')
display(items_df.head(2))

print(f'transactions_df after first pre-processing:')
display(transactions_df.head(2))

items_df after first pre-processing:


Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,mt_len,mt_0,subtopics_str
0,21310,Princess Poppy: The Big Mix Up,Janey Louise Jones,Penguin Random House Children's UK,YFB,[5AH],3.0,Y,5AH
1,73018,Einfach zeichnen! Step by Step,Wiebke Krabbe,Schwager und Steinlein,AGZ,"[5AJ,AGZ,WFA,YBG,YBL,YNA,YPA]",3.0,A,"5AJ,AGZ,WFA,YBG,YBL,YNA,YPA"


transactions_df after first pre-processing:


Unnamed: 0,sessionID,itemID,click,basket,order,click_flg,basket_flg,order_flg
0,0,21310,1,0,0,1,0,0
1,1,73018,1,0,0,1,0,0


## Overview statistics per relation / attribute

### items

#### author

In [None]:
# count of books per author
books_per_author = pd.DataFrame.from_dict(Counter(items_df.loc[:,'author']),
                                    orient='index',
                                    columns=['book_cnt']).sort_values(by='book_cnt', ascending=False)
books_per_author['frac[%]'] = books_per_author['book_cnt'] * 100 / books_per_author['book_cnt'].sum()

print(f'# books per author:')
display(books_per_author.head(10))

print(f'summary statistics of books per author:')
display(books_per_author.describe())

# distribution of cnt af books among authors
books_per_author_cnts = pd.DataFrame(books_per_author['book_cnt'].value_counts().reset_index()).rename(columns={'index': 'book_cnt', 
                                                                                                                'book_cnt': 'author_cnt'})
books_per_author_cnts['author_cnt.cum'] = books_per_author_cnts['author_cnt'].cumsum()
books_per_author_cnts['frac[%]'] = books_per_author_cnts['author_cnt'] * 100 / books_per_author_cnts['author_cnt'].sum()
books_per_author_cnts['frac.cum[%]'] = books_per_author_cnts['frac[%]'].cumsum()

print(f'distribution of books per author:')
display(books_per_author_cnts.head(10))
sns.set_theme()
sns.histplot(books_per_author[books_per_author['book_cnt']<50]['book_cnt'], binwidth=1)
plt.show()

#### publisher

In [None]:
# count of books per publisher
books_per_publisher = pd.DataFrame.from_dict(Counter(items_df.loc[:,'publisher']),
                                    orient='index',
                                    columns=['book_cnt']).sort_values(by='book_cnt', ascending=False)
books_per_publisher['frac[%]'] = books_per_publisher['book_cnt'] * 100 / books_per_publisher['book_cnt'].sum()

print(f'# books per publisher:')
display(books_per_publisher.head(10))

print(f'summary statistics of books per publisher:')
display(books_per_publisher.describe())

# distribution of cnt af books among publishers
books_per_publisher_cnts = pd.DataFrame(books_per_publisher['book_cnt'].value_counts().reset_index()).rename(columns={'index': 'book_cnt', 
                                                                                                                'book_cnt': 'publisher_cnt'})
books_per_publisher_cnts['publisher_cnt.cum'] = books_per_publisher_cnts['publisher_cnt'].cumsum()
books_per_publisher_cnts['frac[%]'] = books_per_publisher_cnts['publisher_cnt'] * 100 / books_per_publisher_cnts['publisher_cnt'].sum()
books_per_publisher_cnts['frac.cum[%]'] = books_per_publisher_cnts['frac[%]'].cumsum()

print(f'distribution of books per publisher:')
display(books_per_publisher_cnts.head(10))
sns.set_theme()
sns.histplot(books_per_publisher[books_per_publisher['book_cnt']<50]['book_cnt'], binwidth=1)
plt.show()

#### main topics

In [None]:
# get depth of main topic tree
print(f'str len main topics:')
display(pd.DataFrame(items_df["mt_len"].describe()))
mt_len_hist = sns.histplot(items_df['mt_len']).set_title(f'distribution of len of main topics')

# count of books per main topic (=mt) combo
books_per_mt = pd.DataFrame.from_dict(Counter(items_df.loc[:,'main topic']),
                                    orient='index',
                                    columns=['book_cnt']).sort_values(by='book_cnt', ascending=False)
books_per_mt['frac[%]'] = books_per_mt['book_cnt'] * 100 / books_per_mt['book_cnt'].sum()

# plot mt_0 distribution
sns.set_theme()
sns.histplot(items_df['mt_0'].astype(str).sort_values())

# count of books per first element of mt
books_per_mt_0 = pd.DataFrame.from_dict(Counter(items_df.loc[:,'mt_0']),
                                    orient='index',
                                    columns=['book_cnt']).sort_values(by='book_cnt', ascending=False).reset_index()
books_per_mt_0 = books_per_mt_0.rename(columns={'index': 'Notation'})
books_per_mt_0['frac[%]'] = books_per_mt_0['book_cnt'] * 100 / books_per_mt_0['book_cnt'].sum()

# join with category heading
books_per_mt_0 = books_per_mt_0.merge(subject_cats_0, on='Notation', how='left')
print(f'top 5 high level cats:')
display(books_per_mt_0.head(5))

### transactions

#### sessionid

In [None]:
# get cnt of items per sessionid
items_per_session = transactions_df[['sessionID', 'itemID']].groupby('sessionID')['itemID'].count().reset_index().\
    sort_values(by='itemID', ascending=False).rename(columns={'itemID': 'item_cnt'})
items_per_session['frac[%]'] = items_per_session['item_cnt'] * 100 / items_per_session['item_cnt'].sum()

print(f'items per session:')
display(items_per_session.head(10))

print(f'items per session summary stats:')
display(items_per_session.describe())

#### click

In [None]:
# get cnt of clicks per item
clicks_per_item = transactions_df[['itemID', 'click']].groupby('itemID')['click'].sum().reset_index().\
    sort_values(by='click', ascending=False).rename(columns={'click': 'click_cnt'})
clicks_per_item['frac[%]'] = clicks_per_item['click_cnt'] * 100 / clicks_per_item['click_cnt'].sum()

print(f'clicks per item:')
display(clicks_per_item.head(10))

print(f'clicks per item summary stats:')
display(clicks_per_item.describe())

#### basket vs order
- basket: items that were added to basekt but not necessarily bought
- order: items that where finally bought

In [None]:
# get cnt of orders per session
orders_per_session = transactions_df[['sessionID', 'order_flg']].groupby('sessionID')['order_flg'].sum().reset_index().\
    sort_values(by='order_flg', ascending=False).rename(columns={'order_flg': 'order_cnt'})
orders_per_session['frac[%]'] = orders_per_session['order_cnt'] * 100 / orders_per_session['order_cnt'].sum()

print(f'distinct orders per session (binary, w/o qty):')
display(orders_per_session.head(10))

print(f'distinct orders per session summary stats:')
display(orders_per_session.describe())

# get cnt of orders per item
orders_per_item = transactions_df[['itemID', 'order_flg']].groupby('itemID')['order_flg'].sum().reset_index().\
    sort_values(by='order_flg', ascending=False).rename(columns={'order_flg': 'order_cnt'})
orders_per_item['frac[%]'] = orders_per_item['order_cnt'] * 100 / orders_per_item['order_cnt'].sum()

print(f'distinct orders per item (binary, w/o qty):')
display(orders_per_item.head(10))

print(f'distinct orders per item summary stats:')
display(orders_per_item.describe())

# get frac of items that were added to basket but not bought
items_per_basket_order = transactions_df[['itemID',
                                          'basket_flg',
                                          'order_flg']].groupby(['basket_flg',
                                                                 'order_flg'])['itemID'].count().reset_index().rename(columns={'itemID': 'item_cnt'})
items_per_basket_order['frac[%]'] = items_per_basket_order['item_cnt'] * 100 / items_per_basket_order['item_cnt'].sum()
print(f'basket to order conversion:')
display(items_per_basket_order)

## Anomaly Detection

### Missing Values
- 9 items w/o publisher: 
    - could be anything, cannot be imputed
    - not such a crucial information to be missing
    - thus: no handling 
- 3240 items w/o author:
    - correct author might not be uniquely determinable or there might not even be a senseful author
    - thus: no handling
- 258 items w/o main topic:
    - at least subtopic is given
    - only 32 of these also have the author missing
- 36,904 items w/o subtopic:
    - in all of the cases, a main topic is given 
    - thus: still enough information available

In [31]:
# get cnt of missing values per column
missing_values = pd.DataFrame(items_df.isnull().sum()).rename(columns={0: 'cnt'})
missing_values['frac[%]'] = missing_values['cnt'] * 100 / len(items_df)
print(f'null values per column:')
display(missing_values.transpose())

# get cnt of combined null values: sum null values per row and cnt rows with #null > 1
print(f'\n# rows with null values in more than one col: {(items_df[initial_cols + ["subtopics_str"]].isnull().sum(axis=1) > 1).sum()}')
print(f'\ndistribution of null values over cols (1=null, 0=not null):')
display(pd.DataFrame((items_df[initial_cols + ['subtopics_str']].isnull() * 1).value_counts().reset_index()))

null values per column:


Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,mt_len,mt_0,subtopics_str
cnt,0.0,0.0,3240.0,9.0,258.0,1.0,258.0,258.0,36904.0
frac[%],0.0,0.0,4.152249,0.011534,0.330642,0.001282,0.330642,0.330642,47.29463



# rows with null values in more than one col: 901

distribution of null values over cols (1=null, 0=not null):


Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,subtopics_str,0
0,0,0,0,0,0,0,0,38519
1,0,0,0,0,0,0,1,36035
2,0,0,1,0,0,0,0,2348
3,0,0,1,0,0,0,1,860
4,0,0,0,0,1,0,0,226
5,0,0,1,0,1,0,0,32
6,0,0,0,1,0,0,1,9
7,0,0,0,0,0,1,0,1


#### Missing publisher

In [None]:
# check all items with missing publisher
print('all items with missing publisher:')
display(items_df[items_df['publisher'].isnull()])

# check whether there might be other entries with publisher given
missing_publisher_title = items_df[items_df['publisher'].isnull()]['title']
print(f'books with same title that appear twice: {(items_df[items_df["title"].isin(missing_publisher_title)].groupby("title")["itemID"].count() > 1).sum()}\n')

# inspect sample with missing publisher 
# > missing publisher is most likely to be 'TEKTIME' > however: could also be different
print('entries for title "Back to Earth" with missing publisher for some editions:')
display(items_df[items_df['title'].str.contains('Back to Earth')])
print('entries for author "Danilo Clementoni" with missing publisher for some items:')
display(items_df[items_df['author'] == 'Danilo Clementoni'])

#### Missing author
- __problem__: 
    - there is a lot of items with very generalistic titles like 'Dinosaurier' or 'Die Weihnachtsgeschichte' that do not allow to uniquely determine the correct author
    - there might not even be a unique author, like for 'Freundebuch - Einhorn-Paradies - Meine Freunde' or 'Kritzkratz-Spaß Glitzer'
    - there might be the same item but several different authors, like for 'Goldilocks and the Three Bears'
    
- __approach__: 
    - try to not impute author, use other attributes instead, e.g. topic or publisher

In [None]:
# check all items with missing author
print('first 10 items with missing author:')
display(items_df[items_df['author'].isnull()].head(10))

# check whether there might be other entries with author given
missing_author_title = items_df[items_df['author'].isnull()]['title']
missing_author_cnt_dups = pd.DataFrame(items_df[items_df["title"].isin(missing_author_title)].groupby("title")["itemID"].count())
print(f'\nbooks with same title that appear twice (see df below): {(missing_author_cnt_dups["itemID"] > 1).sum()}')

# check whether author can be retried
missing_author_dups = missing_author_cnt_dups[missing_author_cnt_dups["itemID"] > 1].reset_index()['title']
display(items_df[items_df['title'].isin(missing_author_dups)].sort_values(by='title'))


#### Missing topic

In [None]:
# check whether there are items with no topic at all
print(f'cnt of items with both, main topic and subtopic == null: {((items_df["subtopics_str"].isnull()) & (items_df["main topic"].isnull())).sum()}')

##### main

In [None]:
# check all items with missing topic
print('first 10 items with missing topic:')
display(items_df[items_df['main topic'].isnull()].head(10))

# check whether there might be other entries with topic given
missing_topic_title = items_df[items_df['main topic'].isnull()]['title']
missing_topic_cnt_dups = pd.DataFrame(items_df[items_df["title"].isin(missing_topic_title)].groupby("title")["itemID"].count())
print(f'\nbooks with same title that appear twice (see df below): {(missing_topic_cnt_dups["itemID"] > 1).sum()}')

# check whether topic can be retried
missing_topic_dups = missing_topic_cnt_dups[missing_topic_cnt_dups["itemID"] > 1].reset_index()['title']
display(items_df[items_df['title'].isin(missing_topic_dups)].sort_values(by='title'))

# check cnt of items with main topic and subtopic missing


##### sub
- no scalable solution for imputing subtopics
- out of the 36,904 missing subtopics, only 2,668 items appear multiple times
    - out of these, only 1,574 actually have a duplicate with a subtopic given

In [None]:
# check all items with missing topic
print('first 10 items with missing topic:')
display(items_df[items_df['subtopics_str'].isnull()])

# check whether there might be other entries with topic given
missing_topic_title = items_df[items_df['subtopics_str'].isnull()]['title']
missing_topic_cnt_dups = pd.DataFrame(items_df[items_df["title"].isin(missing_topic_title)].groupby("title")["itemID"].count())
print(f'\nbooks with same title that appear twice (see df below): {(missing_topic_cnt_dups["itemID"] > 1).sum()}')

# check whether topic can be retried
missing_topic_dups = missing_topic_cnt_dups[(missing_topic_cnt_dups["itemID"] > 1)].reset_index()['title']
display(items_df[(items_df['title'].isin(missing_topic_dups)) & (~items_df['subtopics_str'].isnull())].sort_values(by='title'))

### Duplicates

__To keep in mind:__
1. other relevant attributes are not given, e.g.:
    - actual __language__ might not be that of title
    - __publication date__ might differ between itemIDs (=Neuauflage)
    - title might not be complete (e.g. __subtitle__ missing)
        - e.g. '[Ära der Lichtwächter](https://www.amazon.com/s?k=%C3%84ra+der+Lichtw%C3%A4chter&ref=nb_sb_noss)' from 'Klaus Pfrommer' (itemID = (40200,18242)) is collection with differing subtitles "Die Täuschung", "Das Vermächtnis", "Die Unschuld" 
    - thus: itemID would be unique identifier for actually different items
2. __transactions__ might help to differentiate between items and __rank their relevance__

#### duplicate entries per column

In [44]:
# cnt column-wise duplication
sc_cnt = 1
for col in initial_cols:
    print(f'cnt of duplicate {col}: {(items_df[col].value_counts() > 1).sum()}')
    
# inspect title duplicates
title_cnts = (items_df["title"].value_counts().reset_index())
title_dups_lst = title_cnts[title_cnts["title"]>1]["index"]
items_df[(items_df["title"].isin(title_dups_lst))].sort_values(by="title")

cnt of duplicate itemID: 0
cnt of duplicate title: 4193
cnt of duplicate author: 10120
cnt of duplicate publisher: 3426
cnt of duplicate main topic: 478
cnt of duplicate subtopics: 2599


#### everything identical except of single column
- only cases for duplicated items with same attributes but different itemID

In [48]:
col_list = initial_cols #['itemID'] 
for col in col_list:
    
    # check all cols except of current one
    col_list_lim = [c for c in items_df.columns if c != col]
    #print(f'{col}: {col_list_lim}')

    # compute duplicate cnt
    dup = pd.DataFrame(items_df.groupby(col_list_lim)[col].count().reset_index())
    print(f'everything identical except of {col} = {(dup[col] > 1).sum()}')
    #display(dup[dup[col] > 1].sort_values(by=col))
    #display(dup.sort_values(by=col))

everything identical except of itemID = 464
everything identical except of title = 0
everything identical except of author = 0
everything identical except of publisher = 0
everything identical except of main topic = 0
everything identical except of subtopics = 0


In [None]:
# deep dive: everything identical except of ID
print(f'sample entry for sc1: everything identical except of itemID')
display(items_df[items_df['title']=='Reisestipendien'])

### String normalization

1. capitalization
- e.g. publisher = 'TEKTIME' or 'Tektime' or author = 'V. S. Nesby' and 'Vs Nesby'
2. special characters
3. unicode characters
    - remove any Unicode character, i.e. in Switzerland, multiple German (ä,ö,ü), French (à,é,è) or Italian umlaut are used, and we want to get rid of them as well.
4. weird entries
    - author: der Authhhhor
    - diverse Autoren, Autoren
5. 

In [88]:
pd.set_option("display.max_rows", 100)

In [None]:
items_df.dtypes

In [143]:
# generate copy of original df
items_df_cl = items_df.copy()
display(items_df_cl.head(10))
#display(items_df.head(10))

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,mt_len,mt_0,subtopics_str
0,21310,Princess Poppy: The Big Mix Up,Janey Louise Jones,Penguin Random House Children's UK,YFB,[5AH],3.0,Y,5AH
1,73018,Einfach zeichnen! Step by Step,Wiebke Krabbe,Schwager und Steinlein,AGZ,"[5AJ,AGZ,WFA,YBG,YBL,YNA,YPA]",3.0,A,"5AJ,AGZ,WFA,YBG,YBL,YNA,YPA"
2,19194,Red Queen 1,Victoria Aveyard,Orion Publishing Group,YFH,"[5AP,FBA]",3.0,Y,"5AP,FBA"
3,40250,Meine Kindergarten-Freunde (Pirat),,Ars Edition GmbH,YB,"[5AC,5AD,YBG,YBL,YF]",2.0,Y,"5AC,5AD,YBG,YBL,YF"
4,46107,Mein großes Schablonen-Buch - Wilde Tiere,Elizabeth Golding,Edition Michael Fischer,WFTM,"[WD,WFTM,YBG,YBL,YBLD,YBLN1]",4.0,W,"WD,WFTM,YBG,YBL,YBLD,YBLN1"
5,34217,Ewig geliebt,J. R. Ward,Heyne Taschenbuch,FMR,"[1KBB-US-NAK,FMX,FRX,3MRBF]",3.0,F,"1KBB-US-NAK,FMX,FRX,3MRBF"
6,31436,Meine Sticker-Tiere,,Ars Edition GmbH,YBG,"[5AD,YBG,YBLL]",3.0,Y,"5AD,YBG,YBLL"
7,14576,Unsterblich 01 - Tor der Dämmerung,Julie Kagawa,Heyne Taschenbuch,YFE,"[5AQ,FM,YFE,YFH]",3.0,Y,"5AQ,FM,YFE,YFH"
8,17731,Unsterblich 02 - Tor der Nacht,Julie Kagawa,Heyne Taschenbuch,YFH,"[5AQ,FM,YFE,YFH]",3.0,Y,"5AQ,FM,YFE,YFH"
9,58723,Pedro und die Bettler von Cartagena,Ursula Hasler,dtv Verlagsgesellschaft,YFB,"[5AM,1KLSC]",3.0,Y,"5AM,1KLSC"


In [154]:
# convert all strings to lowercase
items_df_cl = items_df.applymap(lambda s:s.lower() if type(s) == str else s)
#display(items_df_cl.head(10))

In [157]:
# insert dot after single characters
items_df_cl["author_cl"] = items_df_cl["author"].astype(str).apply(lambda x: re.sub(r'([A-Z])\.?(?![a-z])\s*', r'\g<1>. ', x))

In [153]:
items_df_cl[items_df_cl["author_cl"].str.contains('Nesby')]

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,mt_len,mt_0,subtopics_str,author_cl
30784,69287,13 Kings,V. S. Nesby,Xlibris,FL,[],2.0,F,,V. S. Nesby
55425,55553,13 Kings,Vs Nesby,Xlibris US,FL,[],2.0,F,,Vs Nesby
59340,60335,13Kings,Vs Nesby,Xlibris US,FL,[],2.0,F,,Vs Nesby


In [158]:
# inspect df
items_df_cl[~items_df_cl['author'].isna()].sort_values(by='author').head(200)[['itemID','title', 'author','author_cl']]

Unnamed: 0,itemID,title,author,author_cl
42562,30689,"''2046 (back to 2046,chinese edition)",'',''
2011,46571,triad blood,'nathan burgoine,'nathan burgoine
47587,9838,el jaguar y el aguila/the jaguar and the eagle,(mixtli) graycloud,(mixtli) graycloud
34388,38680,can you survive an alien invasion?: an interac...,",blake hoena",",blake hoena"
34398,44836,christmas carol,",charles dickens",",charles dickens"
...,...,...,...,...
72017,1409,dark stars - complete trilogy,a. k. duboff,a. k. duboff
31969,29050,schattenkrieger,a. k. hardware,a. k. hardware
56624,73329,god is my superhero,a. k. kronicles,a. k. kronicles
72539,7358,imperium heirs,a. k. kuykendall,a. k. kuykendall


## Feature Engineering

### Language flag

### Topic Similarity