# Data Pre-Processing

## To Dos

## Settings & User Input

In [None]:
########################################################################################################################
# Imports & Settings
########################################################################################################################

import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import re
import time
import pycountry
from pandas.core.common import flatten
from functools import reduce

In [None]:
# cap DataFrame display at 10 rows; pass None instead of 10 to display all rows (with scrollbar)
pd.set_option("display.max_rows", 10) #pd.set_option("display.max_rows", None)

In [None]:
########################################################################################################################
# User Input
########################################################################################################################

# source data file paths
transactions_path = '../data/external/transactions.csv'
evaluation_path = '../data/external/evaluation.csv'
items_path = '../data/external/items.csv'
subject_cats_0_path = '../data/external/subject_cats_0.csv'
gbooks_path = '../data/external/gbooks_final.json'

# pre-processed data file paths (incl. language flags)
transactions_path_pp = '../data/processed/transactions_pp.csv'
items_path_pp = '../data/processed/items_pp.csv'
header_items_path_pp = '../data/processed/header_items_pp.csv'
gbooks_volumeInfo_path_pp = '../data/processed/gbooks_volumeInfo_pp.feather'
header_items_20210517_path = '../data/processed/20210517_header_items_df.csv'
header_items_20210519_path = '../data/processed/20210519_header_items_df.csv'

# seaborn color palette
palette_blue = "Blues_d"
dark_blue = "#011f4b"
middle_blue = "#005b96"
light_blue = "#b3cde0"

# determine: re-calculate certain details
recompute_lg_flg = False # calculated language flags 
recompute_gbooks_volumeInfo = False # volumeInfo per book pulled from GoogleAPI

## Functions

In [None]:
########################################################################################################################
# Functions
########################################################################################################################

def clean_alt_list(list_):
    """Return the stringified list without its square brackets, e.g. '[a,b]' -> 'a,b'."""
    without_opening = list_.replace('[', '')
    return without_opening.replace(']', '')


def items_initial_col_processing(items_df, drop_original=True):
    """Derive cleaned topic list columns from the raw 'main topic' / 'subtopics' columns.

    Adds three columns to ``items_df`` (the passed frame is modified in place):

    * ``mt_cl``    - unique main topics as a list
    * ``st_cl``    - unique subtopics as a list, or ``None`` if the subtopic list is empty
    * ``mt_st_cl`` - subtopics followed by main topics (empty subtopics contribute nothing)

    Parameters
    ----------
    items_df : pandas.DataFrame
        Frame holding the string columns 'main topic' and 'subtopics'
        (stringified lists such as ``'[A,B]'``).
    drop_original : bool, default True
        Drop the two raw topic columns from the returned frame.

    Returns
    -------
    pandas.DataFrame
        The frame including the new columns (a new object if ``drop_original`` is set).
    """

    def _to_unique_list(raw):
        # same bracket stripping as clean_alt_list(); dict.fromkeys de-duplicates while
        # keeping the input order, so the result is reproducible (a set is not)
        tokens = str(raw).replace('[', '').replace(']', '').split(',')
        return list(dict.fromkeys(tokens))

    # add col: unique main topics
    items_df['mt_cl'] = items_df['main topic'].apply(_to_unique_list)

    # add col: unique subtopics; an empty source list ('[]') parses to [''] and is stored as None.
    # (the cells are lists, so the check has to be done per element - comparing the whole
    # column against the set {''} never matches)
    subtopic_lists = items_df['subtopics'].apply(_to_unique_list)
    items_df['st_cl'] = subtopic_lists.apply(lambda tokens: None if tokens == [''] else tokens)

    # add col: combination of sub- and main topics (empty subtopics must not add a '' entry)
    non_empty_subtopics = subtopic_lists.apply(lambda tokens: [] if tokens == [''] else tokens)
    items_df['mt_st_cl'] = non_empty_subtopics + items_df['mt_cl']

    # drop initial topic cols
    if drop_original:
        items_df = items_df.drop(columns=['main topic', 'subtopics'])

    return items_df


def tr_initial_col_processing(transactions_df):
    """Append binary click / basket / order flags (1 if the respective count is > 0, else 0)."""
    for action in ('click', 'basket', 'order'):
        happened = transactions_df[action] > 0
        transactions_df[f'{action}_flg'] = np.where(happened, 1, 0)

    return transactions_df


def extract_gbook_volumeInfo(data, target_keys):
    """Flatten the Google Books matches of every item into one row per match.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns ``itemIdx`` and ``items``; each ``items`` cell is
        expected to hold a list of match dicts carrying a ``volumeInfo`` dict.
        Cells that are not a list (NaN / None) or are empty are skipped.
    target_keys : iterable of str
        ``volumeInfo`` keys to extract. Keys missing for a match become NaN.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns ``target_keys`` (caller's order) followed
        by ``itemIdx``, on a fresh 0..n-1 index. Empty (but with all columns) if
        nothing could be extracted.
    """
    # keep the caller's key order (deterministic column order), dropping duplicates
    wanted_keys = list(dict.fromkeys(target_keys))
    total = len(data)

    # collect plain dicts and build the frame once at the end; concatenating a
    # DataFrame per match inside the loop copies the whole result on every step
    records = []

    for position, (item_idx, items) in enumerate(zip(data["itemIdx"], data["items"])):

        # print progress report
        if position % 1000 == 0:
            print(f'{position}/{total}')

        # rows without matches hold NaN / None / [] -> nothing to extract
        if not isinstance(items, (list, tuple)):
            continue

        for item in items:
            volume_info = item['volumeInfo']
            record = {key: volume_info[key] for key in wanted_keys if key in volume_info}
            record["itemIdx"] = item_idx
            records.append(record)

    return pd.DataFrame(records, columns=wanted_keys + ["itemIdx"])


def remove_special_characters(list_):
    """Drop punctuation / special characters from a string and turn hyphens into spaces."""
    # punctuation & special characters are deleted ...
    without_punctuation = re.sub(r'[®,\.!?\"\(\)\'\:#]', '', list_)
    # ... whereas a hyphen separates words and therefore becomes a blank
    return re.sub(r'-', ' ', without_punctuation)


def remove_next_sign(list_):
    """Delete every newline character from the given string."""
    newline_pattern = r'[\n]'
    return re.sub(newline_pattern, '', list_)


def remove_nontitle_substrings(list_):
    """Strip binding / edition remarks that are not part of the actual book title.

    Removed are (case-sensitive, titles are expected to be lowercased already):

    * bracketed remarks mentioning a binding type (taschenbuch / hardcover / hardback);
      the closing bracket is optional to cope with truncated titles
    * '- ... <binding> ...' suffixes and ': ... <binding> ...' suffixes
    * '... <binding> ...:' prefixes
    * the bare binding word incl. directly neighbouring 'special' / 'book' / 'edition'
    * '(light novel)', '(novel)' and bracketed '... edition ...' remarks

    Parameters
    ----------
    list_ : object
        Title; converted with ``str()`` first, so non-strings are accepted.

    Returns
    -------
    str
        The title without the remarks and without leading / trailing whitespace.
    """
    list_ = str(list_)

    # whole words that may accompany a binding word, e.g. 'special edition hardcover'.
    # NOTE: this has to be an alternation - a character class such as
    # [(special)(book)(edition)\s*] matches single letters and eats parts of the title
    companion_words = r'(?:\s*\b(?:special|book|edition)\b)*\s*'

    # type of book
    for book_type in ['taschenbuch', 'hardcover', 'hardback']:
        # bracketed remark; [^()]* keeps the match inside this one pair of brackets
        list_ = re.sub(rf'\([^()]*{book_type}[^()]*\)?', '', list_)
        list_ = re.sub(rf'-\s*(\w*\s*){book_type}.*', '', list_)
        list_ = re.sub(rf':.*{book_type}.*', '', list_)
        list_ = re.sub(rf'(.*{book_type}[\w\d\s]*):', '', list_)
        # bare binding word (+ companions); replaced by a blank so that the neighbouring
        # words are not glued together. This also covers any remaining plain occurrence.
        list_ = re.sub(rf'{companion_words}{book_type}{companion_words}', ' ', list_)

    # (light novel) / (novel)
    list_ = re.sub(r'\(light novel\)', '', list_)
    list_ = re.sub(r'\(novel\)', '', list_)

    # (... edition ...)
    list_ = re.sub(r'\([^()]*edition[^()]*\)', '', list_)

    # the removals leave dangling blanks at the borders, which would defeat title matching
    return list_.strip()


def convert_umlaute(list_):
    """Transliterate the lowercase German umlauts ä / ü / ö to ae / ue / oe."""
    umlaut_table = str.maketrans({"ä": "ae", "ü": "ue", "ö": "oe"})
    return list_.translate(umlaut_table)


def remove_duplicate_whitespace(list_):
    """Collapse every run of two or more blanks into a single blank.

    Parameters
    ----------
    list_ : str
        Text to normalize.

    Returns
    -------
    str
        The text with single blanks only (leading / trailing blanks are kept as one blank).
    """
    # raw string, NOT an f-string: inside an f-string `{2,}` is evaluated as the tuple (2,),
    # which turns the pattern into ' (2,)'. The replacement is one blank, otherwise the
    # neighbouring words would be glued together.
    return re.sub(r' {2,}', ' ', list_)


def generate_header_set(items_df):
    """
    generates header set of items that combines attributes of several items with same title that e.g. only differ in itemID
    or other attributes
    > headerID can be used to replace itemID in transactions_df

    Parameters
    ----------
    items_df : pandas.DataFrame
        Items with the columns title, author, publisher, mt_st_cl (list of topics), language,
        number_pages, recommended_age, release_date and description.

    Returns
    -------
    pandas.DataFrame
        One row per distinct title (rows with a missing title are dropped by the groupby) with the
        columns headerID, title and - as sets of the values found among the sub-items - the
        attribute columns listed above.
    """
    attribute_cols = ['author', 'publisher', 'mt_st_cl', 'language', 'number_pages',
                      'recommended_age', 'release_date', 'description']

    def _union_of_topics(topic_lists):
        # flatten the per-item topic lists into one set; done explicitly instead of via
        # apply(sum), which only worked because pandas redirected the builtin to its own sum
        return {topic
                for topics in topic_lists if isinstance(topics, (list, tuple, set))
                for topic in topics}

    # generate header attribute sets from sub-items -> important: generate sets to prevent duplication
    items_by_title = items_df.groupby('title')
    header_attributes = {}
    for col in attribute_cols:
        to_set = _union_of_topics if col == 'mt_st_cl' else set
        header_attributes[col] = items_by_title[col].apply(to_set)

    # all attribute series share the same title index -> combine them side by side
    header_items_df = pd.concat(header_attributes, axis=1).rename_axis('title').reset_index()

    # generate new header index
    header_items_df = header_items_df.reset_index().rename(columns={'index':'headerID'})

    # result inspection
    print(f'shape of header_items_df vs. items_df: {header_items_df.shape} vs. {items_df.shape}')
    print(f'cnt of duplicate "title" in header_df: {(header_items_df["title"].value_counts() > 1).sum()}')

    return header_items_df

## Data load & initial pre-processing

### DMC Source Data

In [None]:
########################################################################################################################
# Load Data
########################################################################################################################

# Load the dmc source data
# NOTE: `sep` and `delimiter` are aliases of the same read_csv option - passing both is rejected by
# current pandas versions, so only the '|' delimiter is given for the pipe-separated files

# - clicks/baskets/order over a period of 3M
# - rows: one transaction for single item
transactions_df = pd.read_csv(transactions_path, delimiter='|', encoding='utf-8')

# - list of product ids (subset of products from items_df) to be used for prediction
# NOTE(review): '.' as separator is kept from the original call; presumably harmless for a
# single-column id file - TODO confirm against the file layout
evaluation_df = pd.read_csv(evaluation_path, sep='.', encoding='utf-8')
items_df = pd.read_csv(items_path, delimiter='|', encoding='utf-8')

# load category lookup table (manually created)
subject_cats_0 = pd.read_csv(subject_cats_0_path, delimiter=';', encoding='utf-8')

########################################################################################################################
# Preprocessing for further inspection
########################################################################################################################

# extract list of base cols (before any derived column is added)
initial_cols= list(items_df.columns)

# add/pre-process cols
items_df = items_initial_col_processing(items_df, drop_original=True)
transactions_df = tr_initial_col_processing(transactions_df)

########################################################################################################################
# Inspection of dfs after initial pre-processing
########################################################################################################################

# show dfs after initial pre-processing
print(f'items_df after first pre-processing:')
display(items_df.head(2))

print(f'transactions_df after first pre-processing:')
display(transactions_df.head(2))

### Pre-Processed Header DF

In [None]:
# load header df 
header_items_20210517 = pd.read_csv(header_items_20210517_path)
header_items_20210517.head(5)

### [DEV] Google API Extract

__To do:__
1. process remaining batches
2. reduce to one match per item
3. include details into items_df

#### Data Load & Pre-Processing

In [None]:
# Load the gbooks details (df)
gbooks_df = pd.read_json(gbooks_path, orient='records')

# reset index (to simplify later join with items_df)
if 'index' in gbooks_df.columns:
    gbooks_df = gbooks_df.drop(columns='index')
gbooks_df.reset_index(inplace=True)
gbooks_df = gbooks_df.rename(columns={'index':'itemIdx'})

# get df stats
print(f'gbooks_df:')
display(gbooks_df.head())

print(f'shape gbooks_df: {gbooks_df.shape}')
print(f'shape items_df: {items_df.shape}\n')

# inspect distribution ot total items
# plt.hist(gbooks_df['totalItems'])

In [None]:
len(gbooks_df)

In [None]:
# 11.05.2021 - 09:57 - max batch index = 20k
# 11.05.2021 - 10:31 - 20k-30k
# 11.05.2021 - 10:55 - 30k-40k

batch_start_index = 40000
batch_end_index = len(gbooks_df)
volumeInfo_df = extract_gbook_volumeInfo(gbooks_df.iloc[batch_start_index+1:batch_end_index+1,:],
                                        target_keys=['title','publisher','authors','publishedDate','description','printType',
                                                       'categories','maturityRating', 'language'])
# inspect head of df
# display(volumeInfo_df.head())

# shape
print(f'shape volumeInfo_df: {volumeInfo_df.shape}\n')

# get cnt of nas
print(f'na per col: \n{volumeInfo_df.isna().sum()}\n')

# value counts specific cols
for col in ['maturityRating', 'printType','language']:
    display(pd.DataFrame(volumeInfo_df[col].value_counts()).transpose())

In [None]:
volumeInfo_df.tail()

In [None]:
# exclude magazines (keep rows whose printType is 'BOOK') and
# reset index before saving as feather
# volumeInfo_df = volumeInfo_df.drop(columns='printType') #check whether only books in subsequent batches
volumeInfo_df = volumeInfo_df[volumeInfo_df['printType'] == 'BOOK'].reset_index(drop=True)

In [None]:
# save table as feather file (for simplified later load)
gbooks_volumeInfo_path_pp = f'../data/interim/gbooks_volumeInfo_{int(batch_start_index / 1000)}k-{int(batch_end_index / 1000)}k.feather'
volumeInfo_df.to_feather(gbooks_volumeInfo_path_pp)

#### Testing 

In [None]:
test_itemIdx = 9378

display(volumeInfo_df.loc[volumeInfo_df['itemIdx']==test_itemIdx,:])
display(items_df.iloc[test_itemIdx,:])

## Thalia data

In [None]:
# load the additional thalia features
# NOTE(review): unpickling can execute arbitrary code - only load this file if it comes from a trusted source
thalia_data = pd.read_pickle('../data/external/thalia_features.pkl')
print(thalia_data.head(5))

In [None]:
items_df = items_df.merge(thalia_data, on='itemID')

## [DEV] Outlier Detection
- only for __transactions__: remove transactions with suspiciously high #of clicks/basket/order

In [None]:
print('Original shape:', transactions_df.shape)

In [None]:
sns.boxplot(transactions_df['click'])

In [None]:
transactions_df = transactions_df[transactions_df['click'] < np.quantile(transactions_df.click, 0.99)]

In [None]:
sns.boxplot(transactions_df['basket'])

In [None]:
transactions_df = transactions_df[transactions_df['basket'] < 2]

In [None]:
sns.boxplot(transactions_df['order'])

In [None]:
transactions_df = transactions_df[transactions_df['order'] < 2]

In [None]:
print('After outlier removal shape:', transactions_df.shape)

## String normalization

__Applied:__
1. conversion to lowercase, e.g. publisher = 'TEKTIME' or 'Tektime' to 'tektime'
2. removal of leading special characters, e.g. ",william shakespeare"
3. conversion of unicode characters (ä,ö,ü)

__No fix yet:__
1. weird entries
    - author: der Authhhhor
    - diverse Autoren, Autoren
2. unicode characters like (à,é,è,°o)

# generate copy of original df for testing of pre-processing
items_df_cl = items_df.copy()
display(items_df_cl.head(5))

In [None]:
items_df_cl = items_df.copy()
display(items_df_cl.head(5))

### pre-processing

In [None]:
cols_pp = ['title', 'author', 'publisher']

# convert all strings to lowercase (non-string cells, e.g. NaN, are left untouched)
for col in cols_pp:
    items_df[col] = items_df[col].apply(lambda s: s.lower() if isinstance(s, str) else s)

for col in cols_pp:

    col_cl = col + '_cl'

    # add additional col for pp titles
    items_df[col_cl] = items_df[col]

    # clean strings
    # NOTE(review): str() / astype(str) turn missing values into the literal string 'nan', so they
    # are not caught by the empty-title filter below - TODO confirm that this is intended
    if col == 'title':
        items_df[col_cl] = items_df[col_cl].apply(remove_nontitle_substrings)
    items_df[col_cl] = items_df[col_cl].astype(str).apply(remove_special_characters)
    items_df[col_cl] = items_df[col_cl].apply(convert_umlaute)

    # reduce all spaces in the articles to single spaces
    items_df[col_cl] = items_df[col_cl].apply(remove_duplicate_whitespace)

    # print stats (before = lowercased original column, after = cleaned column)
    col_cnt_unique = items_df[col].nunique()
    col_cl_cnt_unique = items_df[col_cl].nunique()
    print(f'# unique {col} (before preprocessing): {col_cnt_unique} / {len(items_df)}')
    print(f'# unique {col} (after preprocessing): {col_cl_cnt_unique} / {len(items_df)}')
    print(f'# reduction in unique {col}: {col_cnt_unique-col_cl_cnt_unique}\n')

# replace original cols by pre-processed cols
items_df = items_df.drop(columns=cols_pp)
items_df = items_df.rename(columns={'title_cl': 'title', 'author_cl': 'author', 'publisher_cl': 'publisher'})

# remove items with missing title after pre-processing
print(f"remove items with missing/empty title after pp: {(items_df['title']=='').sum()}")
items_df = items_df[items_df['title']!='']

# display cleaned df head
display(items_df.head(10))

### validation

#### title

In [None]:
# generate titles df (with comparison column for original and cleaned title)
titles_df = pd.DataFrame(items_df_cl["title"].unique()).rename(columns={0: "title"})
titles_df['title_cl'] = titles_df['title']

# convert all strings to lowercase (non-string cells, e.g. NaN, are left untouched)
titles_df = titles_df.apply(lambda column: column.map(lambda s: s.lower() if isinstance(s, str) else s))

# clean strings - same order as in the pre-processing cell: the non-title remarks have to be
# removed BEFORE the special characters, otherwise the brackets the remark patterns rely on
# are already gone and the stats below do not reflect the real pipeline
titles_df['title_cl'] = titles_df['title_cl'].apply(remove_nontitle_substrings)
titles_df['title_cl'] = titles_df['title_cl'].astype(str).apply(remove_special_characters)
titles_df['title_cl'] = titles_df['title_cl'].apply(convert_umlaute)

# reduce all spaces in the articles to single spaces
titles_df['title_cl'] = titles_df['title_cl'].apply(remove_duplicate_whitespace)

# print stats
title_cnt_unique = titles_df["title"].nunique()
title_cl_cnt_unique = titles_df["title_cl"].nunique()
print(f'# unique titles (before preprocessing): {title_cnt_unique} / {len(titles_df)}')
print(f'# unique titles (after preprocessing): {title_cl_cnt_unique} / {len(titles_df)}')
print(f'# reduction in unique titles: {title_cnt_unique-title_cl_cnt_unique}')

# display cleaned df head
display(titles_df.head(10))

In [None]:
# Testing of removal    
for book in ['unsterblich 02 - tor der nacht','meine kindergarten-freunde (pferde)']:
    book = re.sub(r'-',' ',book)
    print(book)

In [None]:
# print cnt of items including special terms
print(f'#items with title including:')
col = "title_cl"
for entry in ['hardcover','taschenbuch','edition','novel','hardback']:
    cnt = titles_df[col].str.contains(f'{entry}').sum()
    print(f'\t{entry}: {cnt}')

In [None]:
# search for specific entry
#pd.set_option("display.max_rows", None)
#pd.set_option('display.max_colwidth', None)

#search_entry = r' +'
#display(titles_df.loc[titles_df['title'].str.contains(f'{search_entry}'), :])

In [None]:
# inspect matches for specific terms/patterns
pd.set_option("display.max_rows", None)
# p = re.compile('\(.*\)')
p = re.compile(r'edition')
col = "title_cl"
matches = titles_df[col].apply(lambda s: p.findall(s))
matches = pd.DataFrame(set(flatten([x for x in matches if x])))
matches


# (1) -> elfengeist (1)
# (dt. ausgabe)
# the dark artifices box set (3 bände im schuber)
# star wars(tm) - schülerin der dunklen seite
# (sammelband) / (filmausgabe)
# (neuauflage) / (sonderausgabe)
# (roman) / (light novel)
# (großdruck)
# (gift edition) / (signed limited edition)
# (manga)
# (1-3 jahre)
# (greek edition) / (german edition) / (greek book for kids) -> additional column with language tag extracted?
# (spanish language edition of the things m -> check if error during reading in
# (hardback)

In [None]:
# cnt unique items per title
title_cnt_bpp = titles_df.groupby('title').count().reset_index().rename(columns={'title_cl': 'cnt'})
title_cnt_app = titles_df.groupby('title_cl').count().reset_index().rename(columns={'title': 'cnt'})
# display(title_cnt_bpp)
# display(title_cnt_app)

# merge both cnts to get comparison
titles_w_cnt = titles_df.merge(title_cnt_bpp, on='title', how='left')
titles_w_cnt = titles_w_cnt.merge(title_cnt_app, on='title_cl', how='left')
# display(titles_w_cnt)

# inspect differences
print(f'items with additional title matches: {len(titles_w_cnt[(titles_w_cnt["cnt_x"] < titles_w_cnt["cnt_y"])].drop_duplicates())}')

In [None]:
display(titles_w_cnt[(titles_w_cnt['cnt_x'] < titles_w_cnt['cnt_y']) & 
                     (titles_w_cnt['cnt_y'] > 1)].drop_duplicates())

In [None]:
# inspect exemplary items (explicit display: a cell only renders its last bare expression)
display(titles_df[titles_df['title_cl'] == 'the dungeon masters wife'])
display(titles_df[titles_df['title_cl'] == 'z rex'])

#### author

In [None]:
# generate authors df (with comparison column for original and cleaned author)
author_df = pd.DataFrame(items_df_cl["author"].unique()).rename(columns={0: "author"})
author_df['author_cl'] = author_df['author']

# convert all strings to lowercase
author_df = author_df.applymap(lambda s:s.lower() if type(s) == str else s)

# clean strings
author_df['author_cl'] = author_df['author_cl'].astype(str).apply(remove_special_characters)
author_df['author_cl'] = author_df['author_cl'].apply(convert_umlaute)

# reduce all spaces in the articles to single spaces
author_df['author_cl'] = author_df['author_cl'].apply(remove_duplicate_whitespace)

# print stats
author_cnt_unique = author_df["author"].nunique()
author_cl_cnt_unique = author_df["author_cl"].nunique()
print(f'# unique authors (before preprocessing): {author_cnt_unique} / {len(author_df)}')
print(f'# unique authors (after preprocessing): {author_cl_cnt_unique} / {len(author_df)}')
print(f'# reduction in unique authors: {author_cnt_unique-author_cl_cnt_unique}')

# display cleaned df head
display(author_df.head(10))

In [None]:
# cnt unique items per author
author_cnt_bpp = author_df.groupby('author').count().reset_index().rename(columns={'author_cl': 'cnt'})
author_cnt_app = author_df.groupby('author_cl').count().reset_index().rename(columns={'author': 'cnt'})
# display(author_cnt_bpp)
# display(author_cnt_app)

# merge both cnts to get comparison
authors_w_cnt = author_df.merge(author_cnt_bpp, on='author', how='left')
authors_w_cnt = authors_w_cnt.merge(author_cnt_app, on='author_cl', how='left')
# display(authors_w_cnt)

# inspect differences
print(f'items with additional author matches: {len(authors_w_cnt[(authors_w_cnt["cnt_x"] < authors_w_cnt["cnt_y"])].drop_duplicates())}')

In [None]:
display(authors_w_cnt[(authors_w_cnt['cnt_x'] < authors_w_cnt['cnt_y']) & 
                     (authors_w_cnt['cnt_y'] > 1)].drop_duplicates())

In [None]:
# inspect exemplary item
author_df[author_df['author_cl'] == 'larry w miller jr']

#### publisher

In [None]:
# generate publishers df (with comparison column for original and cleaned publisher)
publisher_df = pd.DataFrame(items_df_cl["publisher"].unique()).rename(columns={0: "publisher"})
publisher_df['publisher_cl'] = publisher_df['publisher']

# convert all strings to lowercase
publisher_df = publisher_df.applymap(lambda s:s.lower() if type(s) == str else s)

# clean strings
publisher_df['publisher_cl'] = publisher_df['publisher_cl'].astype(str).apply(remove_special_characters)
publisher_df['publisher_cl'] = publisher_df['publisher_cl'].apply(convert_umlaute)

# reduce all spaces in the articles to single spaces
publisher_df['publisher_cl'] = publisher_df['publisher_cl'].apply(remove_duplicate_whitespace)

# print stats
publisher_cnt_unique = publisher_df["publisher"].nunique()
publisher_cl_cnt_unique = publisher_df["publisher_cl"].nunique()
print(f'# unique publishers (before preprocessing): {publisher_cnt_unique} / {len(publisher_df)}')
print(f'# unique publishers (after preprocessing): {publisher_cl_cnt_unique} / {len(publisher_df)}')
print(f'# reduction in unique publishers: {publisher_cnt_unique-publisher_cl_cnt_unique}')

# display cleaned df head
display(publisher_df.head(10))

In [None]:
# cnt unique items per publisher
publisher_cnt_bpp = publisher_df.groupby('publisher').count().reset_index().rename(columns={'publisher_cl': 'cnt'})
publisher_cnt_app = publisher_df.groupby('publisher_cl').count().reset_index().rename(columns={'publisher': 'cnt'})
# display(publisher_cnt_bpp)
# display(publisher_cnt_app)

# merge both cnts to get comparison
publishers_w_cnt = publisher_df.merge(publisher_cnt_bpp, on='publisher', how='left')
publishers_w_cnt = publishers_w_cnt.merge(publisher_cnt_app, on='publisher_cl', how='left')
# display(publishers_w_cnt)

# inspect differences
print(f'items with additional publisher matches: {len(publishers_w_cnt[(publishers_w_cnt["cnt_x"] < publishers_w_cnt["cnt_y"])].drop_duplicates())}')

In [None]:
display(publishers_w_cnt[(publishers_w_cnt['cnt_x'] < publishers_w_cnt['cnt_y']) & 
                     (publishers_w_cnt['cnt_y'] > 1)].drop_duplicates())

In [None]:
# inspect exemplary item
publisher_df[publisher_df['publisher_cl'] == 'digital scanning inc']

## Header-Set 

__Approach:__
1. __[done]__ Generate new header-set with new IDs to unify same books that appear multiple times in the items and transactions table
    a. generate new IDs
    b. unify information
2. __[done]__ Replace the subset IDs in transactions table by superset IDs

3. Pull data on header level from external sources (e.g. google doc incl. publication date and language flag)

### generation

In [None]:
# generate header set with unique ids for "super-items"
header_items_df = generate_header_set(items_df)
header_items_df.head()

In [None]:
## generate header set with unique ids for "super-items"
#header_items_df = generate_header_set(items_df)
#header_items_df.head()

In [None]:
# add headerID to items_df (drop before join if already existent)
if 'headerID' in items_df.columns:
    items_df = items_df.drop(columns=['headerID'])
items_df = items_df.merge(header_items_df[['title','headerID']], left_on='title', right_on='title',how='left') 
display(items_df.head())
print(f'missing headerIDs in items_df: {items_df["headerID"].isnull().sum()}')

# generate lookup table
header_items_lookup_df = items_df[['itemID','headerID']].drop_duplicates()
print(f'shape of items_df vs. header_items_lookup_df: {items_df.shape} vs. {header_items_lookup_df.shape}')

In [None]:
# add headerID to transactions_df (drop before join if already existent)
if 'headerID' in transactions_df.columns:
    transactions_df = transactions_df.drop(columns=['headerID'])
transactions_df = transactions_df.merge(header_items_lookup_df, left_on='itemID', right_on='itemID',how='left') 

# inspect results
display(transactions_df.head())
print(f'# missing headerIDs in transactions_df: {transactions_df["headerID"].isnull().sum()}')
print(f'# unique items in transactions_df: {transactions_df["itemID"].nunique()}')
print(f'# unique headers in transactions_df: {transactions_df["headerID"].nunique()}')

### [DEV] merge with external data

In [None]:
# read in preprocessed data
g_data = pd.read_feather('../data/processed/gbooks_volumeInfo_pp.feather')
g_data.head(1)

In [None]:
cols_pp = ['title', 'authors', 'publisher']

# convert all strings to lowercase
g_data[cols_pp] = g_data[cols_pp].applymap(lambda s:s.lower() if type(s) == str else s)

In [None]:
header_items_df.loc[:, 'author_'] = header_items_df.author.map(lambda x: next(iter(x)))
header_items_df.loc[:, 'publisher_'] = header_items_df.publisher.map(lambda x: next(iter(x)))
header_items_df.loc[:, 'release_date_'] = header_items_df.release_date.map(lambda x: next(iter(x)))

In [None]:
for col in cols_pp:

    col_cl = col + '_cl'

    # add additional col for pp titles
    g_data[col_cl] = g_data[col]

    # clean strings
    if col == 'title':
        g_data[col_cl] = g_data[col_cl].apply(remove_nontitle_substrings)
    g_data[col_cl] = g_data[col_cl].astype(str).apply(remove_special_characters)
    g_data[col_cl] = g_data[col_cl].apply(convert_umlaute)

    # reduce all spaces in the articles to single spaces
    g_data[col_cl] = g_data[col_cl].apply(remove_duplicate_whitespace)

    # print stats
    #col_cnt_unique = g_data[col].nunique()
    #col_cl_cnt_unique = g_data[col_cl].nunique()
    #print(f'# unique {col} (before preprocessing): {col_cnt_unique} / {len(items_df)}')
    #print(f'# unique {col} (after preprocessing): {col_cnt_unique} / {len(items_df)}')
    #print(f'# reduction in unique {col}: {col_cnt_unique-col_cl_cnt_unique}\n')

# replace original cols by pre-processed cols
# (mapping derived from cols_pp: here the author column is called 'authors', so a hard-coded
# 'author_cl' key would silently leave 'authors_cl' un-renamed)
g_data = g_data.drop(columns=cols_pp)
g_data = g_data.rename(columns={col + '_cl': col for col in cols_pp})

# remove items with missing title after pre-processing
print(f"remove items with missing/empty title after pp: {(g_data['title']=='').sum()}")
g_data = g_data[g_data['title']!='']

# display cleaned df head
display(g_data.head(3))

In [None]:
#pd.merge(header_items_df, g_data[['title', 'publisher','publishedDate', 'language', 'maturityRating']], left_on=['title','release_date_'], right_on=['title', 'publishedDate'])

## Feature Engineering

### Language flag

__Idea:__
Flag Language of title in order to improve same language recommendations

__Lookup Links:__
1. [stackoverflow:](https://stackoverflow.com/questions/39142778/python-how-to-determine-the-language) comparison of different language detection modules
2. [tds](https://towardsdatascience.com/benchmarking-language-detection-for-nlp-8250ea8b67c) performance evaluation -> recommends __fasttext__

In [None]:
# define test strings (one english, one german title) used to smoke-test the language detectors
str_en = "romeo and juliet: the graphic novel"
str_de = "sternenschweif. zauberhafter schulanfang"

# define whether to use existing flags and df
# NOTE(review): `items_df_pp` is not assigned anywhere in the visible part of this notebook; unless it
# is defined elsewhere this line fails with a NameError when recompute_lg_flg is False - presumably the
# pre-processed items should be loaded from `items_path_pp` first, TODO confirm
if not recompute_lg_flg:
    items_df = items_df_pp

#### module testing

In [None]:
# module detector dict
lan_detector = {'ld': 'langdetect', 'gl': 'guess_language', 'lg': 'langid'}

##### langdetect (=title_ld)
[langdetect](https://pypi.org/project/langdetect/)
- important: use try-catch block to handle e.g. numerics, urls etc
- non-deterministic approach: remember to set seed for reproducible results

In [None]:
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

In [None]:
# test detector on sample strings
print(detect(str_en))
print(detect(str_de))

In [None]:
if recompute_lg_flg:
    # get start time for performance evaluation
    start_time_ld = time.time()

    # set seed for reproducability
    DetectorFactory.seed = 0

    # option 1: pre-calculate list of languages
    title_ld = []
    for title in items_df['title']:
        try:
            title_ld.append(detect(title))
    #         print(f'{title}: {detect(title)}')
        except LangDetectException:
            title_ld.append(None)
    #         print(f'{title}: "undefined"')

    # compute execution time
    end_time_ld = time.time()
    print(f'exection time langdetect: {end_time_ld - start_time_ld} seconds')

    items_df['title_ld'] = title_ld

    # option 2: use apply and title col
    # items_df['title_ld'] = items_df['title'].apply(lambda x: detect(x) if not x.isnumeric() else None)

In [None]:
# inspect items w/o language specification -> only numeric !
print(f'cnt of items without language flag: {items_df["title_ld"].isnull().sum()}')
display(items_df[items_df["title_ld"].isnull()].head(10))

# inspect results
# both columns are named explicitly: the labels produced by value_counts().reset_index()
# differ between pandas 1.x ('index' / 'title_ld') and pandas 2.x ('title_ld' / 'count')
ld_vc = items_df['title_ld'].value_counts().rename_axis('language').reset_index(name='cnt')
display(ld_vc.transpose())

# show barplot with # items with title in given language
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(x='language', y='cnt', ax=ax, data=ld_vc, palette=palette_blue).set(
    xlabel='languages determined by "langdetect"',
    ylabel='# items with title in given language'
)
plt.xticks(rotation=90)
plt.show()

##### guess_language (=title_gl)

- Can detect very short samples

In [None]:
from guess_language import guess_language

In [None]:
# run the detector on both sample strings and print the guessed language codes
for sample_text in (str_en, str_de):
    print(guess_language(sample_text))

In [None]:
if recompute_lg_flg:

    # get start time for performance evaluation
    start_time_gl = time.time()

    # detect language of titles; purely numeric titles get no language flag
    items_df['title_gl'] = items_df['title'].apply(
        lambda x: guess_language(x) if not x.isnumeric() else None
    )

    # store the 'UNKNOWN' marker as missing value so that isnull() counts it
    # like the failed detections of the other detectors
    items_df.loc[items_df['title_gl'] == 'UNKNOWN', 'title_gl'] = None

    # compute execution time
    end_time_gl = time.time()
    print(f'execution time guess_language: {end_time_gl - start_time_gl} seconds')

In [None]:
# inspect results: one row per detected language with the number of titles.
# Index and values are named explicitly so that the columns are always
# ('index', 'title_gl'); a bare value_counts().reset_index() yields
# ('title_gl', 'count') from pandas 2.0 on, which breaks the plot below.
gl_vc = items_df['title_gl'].value_counts().rename_axis('index').reset_index(name='title_gl')
display(gl_vc.transpose())

# show barplot with # items with title in given language
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(x='index', y='title_gl', ax=ax, data=gl_vc, palette=palette_blue).set(
    xlabel='languages determined by "guess_language"',
    ylabel='# items with title in given language'
)
plt.xticks(rotation=90)
plt.show()

##### textblob
Requires NLTK package, uses Google -> API blocked with "HTTP Error 429: Too Many Requests"

##### spacy
- [spaCy documentation](https://spacy.io/universe/project/spacy-langdetect): could not get it to work

##### langid (=title_lg)

In [None]:
import langid

In [None]:
# test detector on sample strings; both results are printed because a notebook
# cell only echoes its last bare expression
print(langid.classify(str_en))
print(langid.classify(str_de))

In [None]:
if recompute_lg_flg:

    # get start time for performance evaluation
    start_time_lg = time.time()

    # classify every title exactly once; classifying a second time just to print
    # the result doubled the work inside the timed section and flooded the output
    title_lg = [langid.classify(title) for title in items_df['title']]

    # compute execution time
    end_time_lg = time.time()
    print(f'execution time langid: {end_time_lg - start_time_lg} seconds')

    # add col to df: the first element of each classification result is the language code
    items_df['title_lg'] = [t[0] for t in title_lg]

In [None]:
# inspect items w/o language specification
print(f'cnt of items without language flag: {items_df["title_lg"].isnull().sum()}')

# inspect results: one row per detected language with the number of titles.
# Index and values are named explicitly so that the columns are always
# ('index', 'title_lg'); a bare value_counts().reset_index() yields
# ('title_lg', 'count') from pandas 2.0 on, which breaks the plot below.
lg_vc = items_df['title_lg'].value_counts().rename_axis('index').reset_index(name='title_lg')
display(lg_vc.transpose())

# show barplot with # items with title in given language
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(x='index', y='title_lg', ax=ax, data=lg_vc, palette=palette_blue).set(
    xlabel='languages determined by "langid"',
    ylabel='# items with title in given language'
)
plt.xticks(rotation=90)
plt.show()

##### fasttext
- official Python binding module by Facebook
- problems with installation on windows

#### module performance evaluation

In [None]:
# compare execution time and items w/o flag
if recompute_lg_flg:
    # (start, end) timestamps recorded by the detector cells, keyed by column suffix;
    # an explicit mapping replaces the former eval() lookups of the variable names
    detector_timings = {
        'ld': (start_time_ld, end_time_ld),
        'gl': (start_time_gl, end_time_gl),
        'lg': (start_time_lg, end_time_lg),
    }
    lan_detector_eval_df = pd.DataFrame(
        {
            'execution time [s]': [
                detector_timings[suffix][1] - detector_timings[suffix][0]
                for suffix in lan_detector
            ],
            '#items w/o language flg': [
                items_df[f'title_{suffix}'].isnull().sum() for suffix in lan_detector
            ],
        },
        index=list(lan_detector.values()),
    )
    display(lan_detector_eval_df)

# merge results dfs (one count column per detector, joined on the language code)
ld_gl_vc = ld_vc.merge(gl_vc, on='index', how='outer')
ld_gl_lg_vc = ld_gl_vc.merge(lg_vc, on='index', how='outer')
display(ld_gl_lg_vc.transpose())
# .copy(): the column assignments below must modify an independent frame,
# not a slice of the merge result
ld_gl_lg_vc = ld_gl_lg_vc.head(10).copy()

# rename columns
ld_gl_lg_vc.columns = ['index', 'langdetect', 'guess_language', 'langid']


def _language_name(code):
    """Return the English name of the language with two-letter code *code*.

    The code is looked up among languages, not countries (the country lookup
    maps e.g. 'sv' to El Salvador). Codes pycountry does not know are returned
    unchanged instead of raising.
    """
    try:
        language = pycountry.languages.get(alpha_2=code)
    except KeyError:
        # presumably only older pycountry releases raise here instead of returning None
        language = None
    return language.name if language is not None else code


# add language name
ld_gl_lg_vc['language_name'] = ld_gl_lg_vc['index'].apply(_language_name)

# transform model cols into identifier column for plotting
ld_gl_lg_vc = pd.melt(ld_gl_lg_vc, id_vars=["index", "language_name"],
                  var_name="flag_m", value_name="idCnt")

# Draw a nested barplot by language detector
sns.set_theme()
fig, ax = plt.subplots(figsize=(5,4))
g = sns.barplot(y="language_name", x="idCnt", hue="flag_m", data=ld_gl_lg_vc, palette=palette_blue, orient='h')
g.set(xlabel="# itemID", ylabel = "")
g.legend(loc='lower right')
plt.show()

### [DEV] Topic Similarity


In [None]:
# creation of sentence embeddings for each category

import pandas as pd
from gensim.models import KeyedVectors

# load the pre-trained word vectors and the translated category names
embeddings_file = '../tempData/fetchedData/fasttext.wiki.en.300.vocab_200K.vec'
categories_file = '../tempData/processedData/categories_translated.json'
model = KeyedVectors.load_word2vec_format(embeddings_file)
df = pd.read_json(categories_file, typ='series')

# create cat strings
def cats(row):
    """Build a lower-cased string of category names for a category code.

    The code *row* is extended one character at a time; every prefix that is a
    label of the global ``df`` series contributes its lower-cased value.

    Args:
        row: Iterable of characters (normally the category code string).

    Returns:
        The matching names in prefix order, each followed by a single space;
        an empty string when no prefix is known.
    """
    names = []
    prefix = ''
    for char in row:
        prefix = prefix + char
        try:
            names.append(df.loc[prefix].lower())
        except (KeyError, AttributeError):
            # KeyError: the prefix is not a known category;
            # AttributeError: the entry is not a string (e.g. a missing value)
            continue
    return ''.join(f'{name} ' for name in names)

# calculate averaged sentence embedding
def calc(words):
    """Return the average embedding of the distinct in-vocabulary words in *words*.

    Args:
        words: Whitespace-separated string of words.

    Returns:
        Element-wise mean of ``model[word]`` over the distinct words found in
        the global ``model``. If no word is found (including the empty
        string) the scalar ``0`` is returned instead of dividing by zero.
    """
    vectors = []
    # dict.fromkeys removes duplicates like set() but keeps first-seen order,
    # so the floating-point summation order is the same on every run
    for word in dict.fromkeys(words.split()):
        try:
            vectors.append(model[word])
        except KeyError:
            # out-of-vocabulary word: contributes nothing to the average
            continue
    if not vectors:
        return 0
    return sum(vectors) / len(vectors)

In [None]:
# compute the averaged embedding of every category; each result is wrapped in a
# one-element list so that it can be stored as a single df cell
avgs = [[calc(cats(code))] for code in df.index]

In [None]:
# one row per label of df.index, holding the averaged embedding in column 'fasttext_emb'
embs = pd.DataFrame(avgs, index=df.index, columns=['fasttext_emb'])
# preview the first two rows
embs.head(2)

In [None]:
def f(col):
    """Average the category embeddings of all codes in *col*.

    NOTE: a later cell re-defines the name ``f`` (age binning); re-run this
    cell before using it for the embeddings again.

    Args:
        col: List of category codes that are labels of the global ``embs`` frame.

    Returns:
        Mean of the ``fasttext_emb`` entries of the codes; ``0`` if *col* is
        empty or at least one code has no row in ``embs``.
    """
    if len(col) == 0:
        # nothing to average: same placeholder as for unknown codes
        return 0
    total = 0
    try:
        for code in col:
            # label-based cell access; `embs.loc[code][0]` relied on the
            # deprecated positional fallback of Series.__getitem__
            total = total + embs.loc[code, 'fasttext_emb']
    except KeyError:
        return 0
    return total / len(col)

In [None]:
# turn every entry of 'mt_st_cl' into a list and store it in the new column 'mt_st_'
header_items_df.loc[:, 'mt_st_'] = header_items_df['mt_st_cl'].map(list)

In [None]:
# join the entries of each list with spaces and split again on whitespace,
# so that multi-token entries end up as separate list elements
header_items_df.mt_st_ = header_items_df['mt_st_'].map(
    lambda tokens: ' '.join(tokens).split()
)

In [None]:
# the source column is no longer needed once 'mt_st_' exists
header_items_df.drop(columns='mt_st_cl', inplace=True)

In [None]:
# add categorical embeddings: f is applied row by row to the code list in 'mt_st_'
# NOTE(review): `f` is re-defined further down (age binning) — this cell needs the
# embedding-averaging `f` defined above to be the current binding
header_items_df['emb_cats'] = header_items_df.apply(lambda x: f(x['mt_st_']), axis=1)

### Thalia Book Descriptions

#### Pre-processing

In [None]:
# keep the first element each 'description' entry yields when iterated
# (presumably a set, so which element comes first is arbitrary)
header_items_df.loc[:, 'description_'] = header_items_df['description'].map(
    lambda values: next(iter(values))
)

In [None]:
# Preprocessing: run the description column through the cleaning steps in order
cleaned_desc = header_items_df['description_']
# convert all strings to lowercase (non-string entries are passed through unchanged)
cleaned_desc = cleaned_desc.apply(lambda s: s.lower() if type(s) == str else s)
cleaned_desc = cleaned_desc.apply(remove_nontitle_substrings)
# everything is cast to str before the special characters are removed
cleaned_desc = cleaned_desc.astype(str).apply(remove_special_characters)
# remaining steps; the last one reduces all spaces to single spaces
for cleaning_step in (convert_umlaute, remove_next_sign, remove_duplicate_whitespace):
    cleaned_desc = cleaned_desc.apply(cleaning_step)
header_items_df['description_'] = cleaned_desc
# display cleaned df head
display(header_items_df.head(1))

In [None]:
# the raw column is no longer needed once 'description_' exists
header_items_df.drop(columns='description', inplace=True)

#### Translation

##### Google Translator

__Note:__
- not suitable because it is quite unreliable
- blocks further requests after 20k requests

##### Helsinki NLP Translation Model

__open:__
- clean header translation tags
- make translations per language (only select top 5?)
- detect whether a description is actually written in the source language! E.g. for Finnish ("Finnisch"), most of the descriptions seem to be in English.

In [None]:
from transformers import MarianTokenizer, MarianMTModel

In [None]:
# number of items per language value. Index and values are named explicitly so
# that the columns are always ('index', 'language'); a bare
# value_counts().reset_index() yields ('language', 'count') from pandas 2.0 on,
# which would make the count filter below compare strings with an int.
lan_dist = header_items_20210517['language'].value_counts().rename_axis('index').reset_index(name='language')
# keep only languages that occur more than 4 times
lan_dist_sub = lan_dist[lan_dist['language'] > 4]
print(len(lan_dist_sub))
display(lan_dist_sub)
sns.barplot(y='index', x='language', data=lan_dist_sub, orient='h')

In [None]:
# inspect finnish books: rows flagged as Finnish whose description is not just a newline
is_finnish = header_items_20210517['language'] == "{'Finnisch'}"
has_description = header_items_20210517['description'] != "{'\\n'}"
finnish_desc = header_items_20210517.loc[is_finnish & has_description, :]
display(finnish_desc.head(5))
print(type(finnish_desc))
print(len(finnish_desc))

In [None]:
# inspect italian books: rows flagged as Italian whose description is not just a newline
is_italian = header_items_20210517['language'] == "{'Italienisch'}"
has_description = header_items_20210517['description'] != "{'\\n'}"
italian_desc = header_items_20210517.loc[is_italian & has_description, :]
display(italian_desc.head(5))
print(type(italian_desc))
print(len(italian_desc))

In [None]:
# source and target language codes that select the translation model
lang = "it"
target_lang = "en"
model_name = f'Helsinki-NLP/opus-mt-{lang}-{target_lang}'
print(model_name)

# Download the model and the tokenizer
# NOTE(review): this rebinds the global name `model`, which the `calc` helper of the
# topic-similarity section reads for its word vectors — re-load the embeddings
# before calling `calc` again.
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

In [None]:
print(f'desc to translate: {len(italian_desc)}')

# language code -> value of the `language` column (not used by the loop below)
lan_dict = {'it': "{'Italienisch'}",
           'de': "{'Deutsch'}"}

# one entry per description; each entry is the list returned by batch_decode
translation_hel_lst = []
for text in italian_desc['description']:
    # truncation=True cuts inputs that exceed the model's maximum input length,
    # so that long descriptions do not make the translation fail
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    gen = model.generate(**inputs)
    translation = tokenizer.batch_decode(gen, skip_special_tokens=True)
    print(f'original: \t{text}\n')
    print(f'translation: \t{translation}\n\n')
    translation_hel_lst.append(translation)

### Thalia Recommended age

##### bin into children/ teenagers / adults
##### fill empty values with adults

In [None]:
# unwrap the set structure: keep only one value per entry
header_items_df.loc[:, 'recommended_age'] = header_items_df['recommended_age'].map(
    lambda values: next(iter(values))
)

In [None]:
# extract all digit groups from the age text; raw string, because '\d' is an
# invalid escape sequence in a normal string literal
header_items_df['rec_age'] = header_items_df.recommended_age.str.findall(r'\d+')

In [None]:
# preview the extracted ages as integers (displayed only, not stored); the lists
# are materialised because lazy map objects show no values, and entries that are
# not lists (e.g. missing values) are passed through unchanged
header_items_df['rec_age'].apply(lambda x: [int(age) for age in x] if isinstance(x, list) else x)

In [None]:
# preview the age text with the literal 'ab' removed (displayed only, not stored);
# str.replace requires the replacement argument, the call failed without it
header_items_df.recommended_age.str.replace('ab', '', regex=False)

In [None]:
# show all rows whose recommended-age text contains the character '5'
contains_five = header_items_df['recommended_age'].str.contains('5')
header_items_df[contains_five]

In [None]:
def f(cell):
    """Map the ages extracted from a recommended-age text to an age group.

    Args:
        cell: Iterable of ages (ints or digit strings). Anything that cannot
            be converted, e.g. a missing value, counts as "no age given".

    Returns:
        'newborn'      if every age is below 3,
        'small child'  if the youngest age is at most 7,
        'child'        if the youngest age is between 8 and 11,
        'teen'         if the youngest age is between 12 and 18,
        'adult'        otherwise, and whenever no age is available.
    """
    try:
        ages = [int(x) for x in cell]
    except (TypeError, ValueError):
        # missing value (not iterable) or non-numeric entry: empty values count as adults
        return 'adult'
    if not ages:
        return 'adult'

    youngest, oldest = min(ages), max(ages)
    if oldest < 3:
        return 'newborn'
    if youngest <= 7:
        # also covers ranges that start below 3 but reach beyond it (e.g. 2-5),
        # which previously fell through to 'adult'
        return 'small child'
    if youngest < 12:
        return 'child'
    if youngest <= 18:
        return 'teen'
    return 'adult'

In [None]:
# assign an age group to every item based on its extracted ages
header_items_df['rec_age_'] = header_items_df['rec_age'].map(f)

In [None]:
# number of items per age group
header_items_df['rec_age_'].value_counts()

In [None]:
# write header_items_df to the current working directory
# NOTE(review): the settings cell defines header_items_path_pp, which is not used
# here — confirm the intended target file
header_items_df.to_csv(path_or_buf='header_items_df.csv')

## Export of final pre-processed dfs

In [None]:
# write transactions_df to the pre-processed transactions file
transactions_df.to_csv(path_or_buf=transactions_path_pp)

In [None]:
# additional copy of transactions_df in the current working directory
transactions_df.to_csv(path_or_buf='transactions_wo.csv')

In [None]:
# write header_items_df to the current working directory
# NOTE(review): the settings cell defines header_items_path_pp, which is not used
# here — confirm the intended target file
header_items_df.to_csv(path_or_buf='header_items_df.csv')

In [None]:
# display items_df; the empty subscript `items_df[]` was a syntax error
# NOTE(review): if a column selection or an export to items_path_pp was intended, add it here
items_df

In [None]:
# display the final header_items_df
header_items_df