## Recommendation Model

In [None]:
#!pip install pyarrow
#!pip install fuzzywuzzy
#!pip install python-Levenshtein

import csv
import sys
from difflib import SequenceMatcher
from re import search

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sn
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from langdetect import detect
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# The corpora can only be fetched once `nltk` itself has been imported;
# calling nltk.download() above the import raised a NameError.
nltk.download('stopwords')
nltk.download('punkt')


# Raw challenge data, all pipe-separated.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# pandas 2.0 - replace it with on_bad_lines='skip' when the environment is upgraded.
df_items = pd.read_csv('items.csv', sep='|', quoting=csv.QUOTE_NONE, error_bad_lines=False)
df_transactions = pd.read_csv('transactions.csv', sep = '|')
df_evaluation = pd.read_csv('evaluation.csv', sep = '|')

In [None]:
# Locations of the pre-processed input files.
items_path = '20210525_items_df.csv'
path = '20210525_header_items_df.csv'
mt_path = 'items_pp.csv'

# Item table: load it and drop three columns before merging.
items_df = pd.read_csv(items_path, delimiter=',', encoding='utf-8')
items_df = items_df.drop(columns=['description', 'recommended_age', 'number_pages'])

# Header table: load it and drop four columns
# (presumably to avoid duplicated columns after the merge - TODO confirm).
header_df = pd.read_csv(path, lineterminator='\n')
header_df = header_df.drop(columns=['title', 'author', 'publisher', 'item_lang_en'])

# Table providing the 'mt' column.
mt_df = pd.read_csv(mt_path, delimiter=',', encoding='utf-8')

# Attach the header information to every item (left join on headerID), then
# copy the 'mt' column over; the assignment aligns on the row index.
items_df = items_df.merge(header_df, how='left', left_on=['headerID'], right_on=['headerID'])
items_df['mt'] = mt_df['mt']

In [None]:
# Build one submission row (base book plus its best-scoring recommendations)
# for every item listed in the evaluation file and write them to 'rec_final.csv'.
n_recommendations = 5
recommendation_columns = ['recommendation_{}'.format(slot + 1) for slot in range(n_recommendations)]

# Rows are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append() copies the whole frame on every call and no longer exists in pandas 2.x.
final_rows = []

# Untouched copy of the full item table; `items_df` is narrowed to one language per iteration.
x = items_df.copy()
for index, row in df_evaluation.iterrows():

    items_df = x.copy()
    print(items_df.shape)

    # get information from base book
    base_book = row['itemID']
    print(base_book)
    is_base_book = items_df['itemID'] == base_book
    author = items_df['author'][is_base_book].to_string(index=False).lstrip()
    mtopic = items_df['mt'][is_base_book].to_string(index=False).strip(' []')
    lang = items_df['language'][is_base_book].to_string(index=False).lstrip()
    # .iloc[0] instead of int(Series): calling int() on a Series is deprecated
    headerID = int(items_df['headerID'][is_base_book].iloc[0])

    # filter data set according to the language of the base book
    items_df = filter_on_lang(items_df, lang).reset_index(drop=True)

    # get different scores; each slice is copied so that adding the score
    # column never writes into a view of `items_df`
    df_authorscores = items_df[['itemID', 'title', 'author']].copy()
    df_authorscores['author_score'] = get_authorscores_new(df_authorscores, author)

    df_topicscore = items_df[['headerID', 'itemID', 'title', 'mt']].copy()
    df_topicscore['mtopic_score'] = get_mtopicscores_new(df_topicscore, mtopic)

    df_titlescores = items_df[['itemID', 'title', 'author']].copy()
    df_titlescores['title_score'] = get_titlescores(df_titlescores, items_df, base_book)

    result_transactions = recommend_based_on_transactions(df_transactions, df_items, base_book)

    # get final scores, drop the base book's own header and duplicated headers
    result = get_totalscore(df_titlescores, df_authorscores, df_topicscore, result_transactions)
    result = result[result['headerID'] != headerID]
    result = result.drop_duplicates(subset='headerID', keep='first')
    result = result.sort_values(by='total_score', ascending=False)

    recommendations = result.iloc[0:n_recommendations, :]

    # Column 0 of the result frame is the itemID. Pad with None when fewer than
    # five candidates survive so that a short result no longer raises IndexError.
    top_ids = recommendations.iloc[:, 0].tolist()
    top_ids.extend([None] * (n_recommendations - len(top_ids)))

    # collect the books with the top 5 scores for the output file
    submission_row = {'book_id': row['itemID'],
                      'model_id': 'first',
                      'team_id': 'dataminerz'}
    submission_row.update(zip(recommendation_columns, top_ids))
    final_rows.append(submission_row)
    print(len(final_rows))


# dtype=object keeps the ids as integers even if a padded None is present in a column
final_df = pd.DataFrame(final_rows,
                        columns=['book_id', 'model_id', 'team_id'] + recommendation_columns,
                        dtype=object)
final_df.to_csv('rec_final.csv')


## Several Functions to calculate the scores

In [None]:
def filter_on_lang(df_item, language):
    """Restrict the item table to one language when enough items remain.

    Args:
        df_item: DataFrame with a 'language' column.
        language: Language value to filter on.

    Returns:
        The rows of ``df_item`` whose 'language' equals ``language`` if there
        are more than five of them, otherwise ``df_item`` unchanged.

    The size of the returned frame is printed as progress output.
    """
    # Use the `language` argument; the previous code read a global named `lang`
    # and therefore only worked when the caller happened to define one.
    same_language = df_item[df_item['language'] == language]

    if len(same_language) > 5:
        print(len(same_language))
        return same_language

    print(len(df_item))
    return df_item
    

def get_authorscores(df_authorscores, author):
    """Score every item by how well its author matches ``author``.

    An exact match scores 2 and everything else 0. If fewer than three exact
    matches exist, all rows are re-scored with fuzzy matching instead: a row
    gets ``partial_ratio + ratio`` (both scaled to 0..1) when each of them
    exceeds 0.70, otherwise 0. Rows without an author always score 0.

    Args:
        df_authorscores: DataFrame with an 'author' column. The scores are
            also stored in its 'author_score' column.
        author: Author of the base book.

    Returns:
        The 'author_score' column of ``df_authorscores``.
    """
    candidates = df_authorscores['author']

    # NaN never compares equal, so missing authors score 0 here.
    exact_match = candidates.eq(author)
    scores = exact_match.astype(float) * 2.0

    if exact_match.sum() < 3:

        def fuzzy_score(candidate):
            if pd.isnull(candidate):
                return 0.0
            fuzzratio = fuzz.ratio(author, candidate) / 100
            fuzzpartial = fuzz.partial_ratio(author, candidate) / 100
            # Both ratios are scaled to 0..1. The previous threshold of 7.0 on
            # `fuzzratio` could never be reached, so this pass zeroed every score.
            if fuzzpartial > 0.70 and fuzzratio > 0.70:
                return fuzzpartial + fuzzratio
            return 0.0

        scores = candidates.apply(fuzzy_score).astype(float)

    df_authorscores['author_score'] = scores
    return df_authorscores['author_score']


def get_authorscores_new(df_authorscore, author):
    """Score every item by exact or token-wise author match.

    An exact match scores 2 and everything else 0. If fewer than three exact
    matches exist, every row whose author shares at least two
    whitespace-separated name tokens with ``author`` is scored 2.0 as well.

    Args:
        df_authorscore: DataFrame with an 'author' column. The scores are also
            stored in its 'author_score' column; 'author' itself is not modified.
        author: Author of the base book.

    Returns:
        The 'author_score' column of ``df_authorscore``.
    """
    # Work on the argument only; the previous body mixed the parameter with a
    # global `df_authorscores` and failed unless the caller defined that global.
    candidates = df_authorscore['author']

    # NaN never compares equal, so missing authors score 0 here.
    exact_match = candidates.eq(author)
    scores = exact_match.astype(float) * 2.0

    if exact_match.sum() < 3:
        # split() without an argument drops empty tokens, so doubled spaces
        # cannot count as a shared "name".
        wanted_tokens = set(str(author).split())
        shares_two_tokens = pd.Series(
            [len(wanted_tokens.intersection(str(name).split())) >= 2
             for name in candidates.fillna('')],
            index=candidates.index,
            dtype=bool,
        )
        # Assign through a boolean mask on the Series itself; the old chained
        # `df[mask]['author_score'] = 2.0` wrote to a temporary copy and was lost.
        scores[shares_two_tokens] = 2.0

    df_authorscore['author_score'] = scores
    return df_authorscore['author_score']


def get_authorscores_fuzzy(df_authorscores, author):
    """Score every item by fuzzy similarity between its author and ``author``.

    A row scores ``1 + ratio`` when both the partial ratio and the plain ratio
    (each scaled to 0..1) exceed 0.7, otherwise 0. Rows without an author score 0.
    Rows are addressed by the labels 0, 1, 2, ... in iteration order.

    Returns the 'author_score' column, which is also written into the frame.
    """
    for position, _ in enumerate(df_authorscores['itemID']):
        candidate = df_authorscores['author'][position]

        score = 0
        if not pd.isnull(candidate):
            ratio = fuzz.ratio(author, candidate) / 100
            partial = fuzz.partial_ratio(author, candidate) / 100
            if partial > 0.70 and ratio > 0.7:
                score = 1 + ratio

        df_authorscores.at[position, 'author_score'] = score

    return df_authorscores['author_score']

def get_mtopicscores_new(df_mtopicscores, mtopic):
    """Score every item by string similarity of its main topic to ``mtopic``.

    Args:
        df_mtopicscores: DataFrame with an 'mt' column.
        mtopic: Main topic of the base book.

    Returns:
        Series with one ``SequenceMatcher`` ratio per row; rows whose 'mt'
        value is missing (None, NaN) or empty score 0.
    """
    target = str(mtopic)

    def get_mt_similarity(mt_sample):
        # NaN is truthy, so `not mt_sample` alone let missing values through and
        # compared the literal text "nan" against the base topic.
        is_nan = isinstance(mt_sample, float) and mt_sample != mt_sample
        if mt_sample is None or is_nan or not mt_sample:
            return 0
        return SequenceMatcher(None, str(mt_sample).strip('[]'), target).ratio()

    return df_mtopicscores["mt"].apply(get_mt_similarity)

def get_mtopicscores_emb(df_mtopicscores, emb):
    """Return the cosine similarity between ``emb`` and each 'emb_cats' entry.

    The per-row results of ``cosine_similarity`` are stacked and flattened
    into a one-dimensional array.
    """
    similarities = [
        cosine_similarity(emb, item_embedding)
        for item_embedding in df_mtopicscores['emb_cats']
    ]
    return np.array(similarities).ravel()


def get_titlescores(df_titlescores, items_df, base_book):
    """Score every item by TF-IDF cosine similarity of its title to the base book.

    Args:
        df_titlescores: DataFrame with a 'title' column; one TF-IDF row is
            built per entry (values are converted with ``str``).
        items_df: DataFrame with an 'itemID' column, row-aligned with
            ``df_titlescores``; used to locate the base book.
        base_book: itemID of the base book.

    Returns:
        Flat numpy array of similarities, one per row of ``df_titlescores``
        when exactly one row matches ``base_book``.
    """
    # Creating tfidf-matrix
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_items = tfidf_vectorizer.fit_transform(df_titlescores['title'].map(str))

    # Retrieving book for recommendations
    is_base_book = (items_df['itemID'] == base_book).to_numpy()
    test_tfidf = tfidf_items[is_base_book]

    # One vectorised call replaces a separate cosine_similarity() call per row.
    # Items come first so the flattened output keeps the original ordering.
    return np.asarray(cosine_similarity(tfidf_items, test_tfidf)).ravel()
    
def get_totalscore(df1, df2, df3, df_trans):
    """Combine the individual scores into one frame with a total score.

    Columns are taken by position, so the inputs must have this layout:

    Args:
        df1: title frame - columns 0, 2, 3 are itemID, author, title_score.
        df2: author frame - column 3 is author_score.
        df3: topic frame - columns 0, 2, 3, 4 are headerID, title, mt, mtopic_score.
        df_trans: iterable of itemIDs recommended from the transactions.

    Returns:
        DataFrame indexed like ``df1`` with the columns itemID, headerID,
        title, author, mt, title_score, author_score, mtopic_score,
        trans_score (1 for items in ``df_trans``, else 0) and total_score
        (sum of the four scores).
    """
    result = pd.DataFrame(
        {
            'itemID': df1.iloc[:, 0],
            'headerID': df3.iloc[:, 0],
            'title': df3.iloc[:, 2],
            'author': df1.iloc[:, 2],
            'mt': df3.iloc[:, 3],
            'title_score': df1.iloc[:, 3],
            'author_score': df2.iloc[:, 3],
            'mtopic_score': df3.iloc[:, 4],
        },
        index=df1.index,
    )

    # Flag transaction-based recommendations with a single .loc assignment; the
    # former chained `result['trans_score'][mask] = 1` writes to a temporary
    # object and is not guaranteed to reach `result`.
    result['trans_score'] = 0
    result.loc[result['itemID'].isin(list(df_trans)), 'trans_score'] = 1

    result['total_score'] = (
        result['title_score']
        + result['author_score']
        + result['mtopic_score'].astype(float)
        + result['trans_score'].astype(float)
    )

    return result

In [None]:
def recommend_based_on_transactions(df_transactions, df_items, item_id, sort_by="sum",
                                    max_number_recommendation=10, verbose=False):
    """Recommend items that share sessions with ``item_id``.

    All sessions containing ``item_id`` are collected, the click, basket and
    order counts of every item in those sessions are summed, and the items are
    ranked by that sum in descending order. The base item itself and items
    whose title equals the base title (per ``compare_strings``) are skipped.

    Args:
        df_transactions: Transaction table passed to the session helpers.
        df_items: Item table with 'itemID', 'title' and 'author' columns.
        item_id: itemID of the base book.
        sort_by: Ranking mode; only "sum" is supported.
        max_number_recommendation: Maximum number of ids to return.
        verbose: Print the base book and the recommendations when True.

    Returns:
        List of at most ``max_number_recommendation`` recommended itemIDs.

    Raises:
        SystemExit: If ``item_id`` is not in ``df_items`` or ``sort_by`` is
            not "sum".
    """
    original_book = convert_id_to_name(df_items, item_id)

    # Validate before touching any field: indexing into an empty frame raised
    # IndexError and the intended message was never reached.
    if original_book.empty:
        sys.exit("ITEM ID NOT FOUND!")
    if sort_by != "sum":
        sys.exit("INCORRECT ARGUMENT FOR sort_by!")

    original_title = original_book["title"].iloc[0]
    original_author = original_book["author"].iloc[0]

    if verbose:
        print("Find recommendations based on: ")
        print("{} by {}.".format(original_title, original_author))
        print("\n")
        print("We recommend: ")

    session_ids = find_all_sessions_id(df_transactions, item_id)
    itemID_rows = find_all_item_id(df_transactions, session_ids)
    item_properties = find_number_click_basket_order(itemID_rows)

    summed_item_properties = sum_click_basket_order(item_properties)
    sorted_itemID = sorted(summed_item_properties, key=summed_item_properties.get, reverse=True)

    recommendation_id = []
    for single_itemID in sorted_itemID:
        if len(recommendation_id) >= max_number_recommendation:
            break
        if single_itemID == item_id:
            continue

        rank_book = convert_id_to_name(df_items, single_itemID)
        if rank_book.empty:
            # The transaction refers to an item missing from the item table;
            # it cannot be compared or recommended, so skip it instead of crashing.
            continue

        rank_title = rank_book["title"].iloc[0]
        if compare_strings(original_title, rank_title):
            continue

        recommendation_id.append(single_itemID)
        if verbose:
            print("{}. {} by {}.".format(len(recommendation_id),
                                         rank_title,
                                         rank_book["author"].iloc[0]))

    if verbose:
        # Report the slots that could not be filled.
        for slot in range(len(recommendation_id), max_number_recommendation):
            print("{}. Not enough data to give recommendation".format(slot + 1))

    return recommendation_id


def find_all_sessions_id(df, item_id):
    """Return the 'sessionID' values of all rows whose 'itemID' equals ``item_id``."""
    contains_item = df['itemID'] == item_id
    return df.loc[contains_item, 'sessionID'].tolist()


def find_all_item_id(df, session_ids):
    """Return every row of ``df`` whose 'sessionID' is one of ``session_ids``."""
    in_sessions = df['sessionID'].isin(session_ids)
    return df[in_sessions]


def find_number_click_basket_order(itemID_rows):
    """Sum the 'click', 'basket' and 'order' values per 'itemID'.

    Returns a dict mapping each itemID to ``[clicks, baskets, orders]``, with
    keys in order of first appearance.
    """
    totals = {}

    for _, record in itemID_rows.iterrows():
        item = record['itemID']
        counts = [record['click'], record['basket'], record['order']]

        if item not in totals:
            totals[item] = counts
            continue

        running = totals[item]
        for slot, amount in enumerate(counts):
            running[slot] += amount

    return totals


def sum_click_basket_order(item_properties):
    """Collapse each item's list of counts into a single total per item."""
    return {item: sum(counts) for item, counts in item_properties.items()}


def convert_id_to_name(df_items, itemID):
    """Return the rows of ``df_items`` whose 'itemID' equals ``itemID``."""
    is_requested_item = df_items['itemID'] == itemID
    return df_items.loc[is_requested_item]


def compare_strings(text1, text2):
    """Return True when both texts are equal after stop-word removal and lower-casing."""
    first, second = (remove_stopwords(raw).lower() for raw in (text1, text2))
    return first == second


def remove_stopwords(text):
    """Return ``text`` with all NLTK stop words removed.

    The text is tokenised with ``word_tokenize``; tokens found in
    ``stopwords.words()`` are dropped and the rest is joined with single
    spaces. Matching is case-sensitive, exactly as before.
    """
    # stopwords.words() was re-read and scanned linearly for every single
    # token. Load it once, keep it as a frozenset on the function object and
    # reuse it for all later calls.
    known_stopwords = getattr(remove_stopwords, "_stopword_cache", None)
    if known_stopwords is None:
        known_stopwords = frozenset(stopwords.words())
        remove_stopwords._stopword_cache = known_stopwords

    text_tokens = word_tokenize(text)
    tokens_without_sw = [word for word in text_tokens if word not in known_stopwords]
    return " ".join(tokens_without_sw)