In [1]:
import sqlite3
import pandas as pd
import networkx as nx
from collections import defaultdict
from tqdm.auto import tqdm
from itertools import combinations
import math
import matplotlib.pyplot as plt
import numpy as np
import json

# Input database: queried below for circulation records and the book catalogue.
db = sqlite3.connect("datasets.db")
# Output database: receives the `recommendations` table written at the end.
db_out = sqlite3.connect("predictions.db")

cur = db.cursor()
cur_out = db_out.cursor()

## compile history

In [2]:
# One row per loan whose book has a known author; within each reader the rows
# come back ordered by startDate descending.
history_query = """
SELECT readerID, catalogueRecordID, author_id
FROM circulation_short 
JOIN books ON books.id = circulation_short.catalogueRecordID
WHERE author_id IS NOT NULL
ORDER BY readerID ASC, startDate DESC
"""

# Collapse to one row per reader holding two parallel lists (book ids, author ids)
# in the order delivered by the query.
book_history = (
    pd.read_sql_query(history_query, con=db)
    .groupby("readerID")
    .agg({"catalogueRecordID": list, "author_id": list})
)

In [3]:
# Preview the first few readers' aggregated book-id / author-id lists.
book_history.head()

Unnamed: 0_level_0,catalogueRecordID,author_id
readerID,Unnamed: 1_level_1,Unnamed: 2_level_1
232,"[1594580, 234836, 170268, 521679, 255761]","[95006, 113922, 85642, 96845, 57022]"
233,"[1306443, 33791, 356999, 15646]","[32370, 71580, 134200, 121969]"
234,"[32293, 2024117, 477533, 2117315, 2107758]","[17749, 171045, 43023, 194663, 157395]"
235,"[42447, 2000683, 1322907, 984881, 1593991, 200...","[48682, 131044, 137032, 141468, 131044, 131044]"
236,"[1334309, 1963836, 1719236]","[19983, 52171, 19561]"


## model authors

In [4]:
# Keep only readers whose history has more than one author entry. The length
# test counts list entries, so repeated loans of the same author also qualify.
multi_author_mask = book_history["author_id"].map(len) > 1
authors_agg = book_history.loc[multi_author_mask, ["author_id"]]
authors_agg.shape

(115674, 1)

In [5]:
# Co-reading counts keyed by an (author, author) tuple:
#   (a, b) with a < b -> number of readers whose history contains both authors
#   (a, a)            -> number of history entries for author a
edges = defaultdict(int)

for row in tqdm(authors_agg["author_id"].values):
    # Sort the distinct authors so every unordered pair maps to one canonical
    # (smaller, larger) key. Iterating a bare set yields an order that depends
    # on insertion order, which split a single pair across (a, b) and (b, a).
    for pair in combinations(sorted(set(row)), 2):
        edges[pair] += 1
    # Self-pairs are counted per occurrence (duplicates in `row` included).
    for e in row:
        edges[(e, e)] += 1

  0%|          | 0/115674 [00:00<?, ?it/s]

In [6]:
# Undirected, weighted co-reading graph: nodes are author ids, edge weight is
# the count accumulated in `edges` (self-loops carry the per-author count).
G = nx.Graph()

for (u, v), cnt in tqdm(edges.items()):
    if G.has_edge(u, v):
        # `edges` may hold both (u, v) and (v, u) for the same undirected pair;
        # add the counts together instead of letting the later key overwrite
        # the earlier weight.
        G[u][v]["weight"] += cnt
    else:
        G.add_edge(u, v, weight=cnt)

  0%|          | 0/651204 [00:00<?, ?it/s]

In [7]:
from collections import Counter

In [8]:
# Per-author neighbour weights: author id -> Counter({neighbour id: edge weight}).
# A node with a self-loop lists itself among its neighbours.
adfs = {
    int(node): Counter({nbr: attrs["weight"] for nbr, attrs in G[node].items()})
    for node in G.nodes()
}

# Set of known author ids, used for membership tests.
is_adfs = set(adfs)

In [9]:
def get_for_one(authors, k=10):
    """Rank authors by summed co-reading weight for one reader's history.

    Parameters
    ----------
    authors : iterable
        Author ids from the reader's history. An id that appears several times
        contributes its neighbour weights once per occurrence; ids not present
        in ``is_adfs`` are ignored.
    k : int, default 10
        Maximum number of author ids to return.

    Returns
    -------
    list
        Up to ``k`` author ids, highest accumulated weight first. Authors from
        the history itself can be returned when they have a self-loop weight in
        ``adfs``. Empty when none of ``authors`` is known.
    """
    total = Counter()
    for author_id in authors:
        if author_id in is_adfs:
            # `update` adds counts in place. The previous `total += ...` also
            # rescanned the whole accumulator on every addition to drop
            # non-positive entries, which cannot occur with the positive
            # weights stored in `adfs`.
            total.update(adfs[author_id])
    return [author for author, _ in total.most_common(k)]

In [10]:
# Lookup table: book id -> {"id", "title", "author"} record, ready for JSON output.
books_frame = pd.read_sql_query("SELECT id, title, author_fullName FROM books", con=db)
book_data = {
    book_id: {"id": book_id, "title": title, "author": author}
    for book_id, title, author in books_frame.values
}

In [11]:
# For each author: a list of sets of book ids, one set per `collapse_field`
# value, ordered by the summed `available` count (largest first).
# NOTE(review): `collapse_field` presumably groups editions of the same work — confirm.
author_book = (
    pd.read_sql_query(
        "SELECT author_id, collapse_field, id, available as n FROM books WHERE author_id IS NOT NULL",
        con=db,
    )
    # Merge rows sharing (author, collapse_field): collect ids, total the counts.
    # The string "sum" selects pandas' own group sum; passing the builtin `sum`
    # is deprecated in recent pandas releases.
    .groupby(["author_id", "collapse_field"], as_index=False)
    .agg({"id": list, "n": "sum"})
    .sort_values(by="n", ascending=False)
    # Rows keep their sorted order inside each author group.
    .groupby("author_id")
    .agg({"id": lambda id_lists: [set(ids) for ids in id_lists]})
)

In [12]:
# author_book.head()

In [13]:
# readerid, a, b

In [14]:
def api_view(readerid, hauthor, hbooks):
    """Build the stored (reader, history, recommendations) record for one reader.

    Parameters
    ----------
    readerid
        Reader identifier; returned unchanged as the first tuple element.
    hauthor : list
        Author ids from the reader's history, passed to ``get_for_one``.
    hbooks : list
        Book ids from the reader's history.

    Returns
    -------
    tuple
        ``(readerid, history_json, recommendations_json)``.
        ``history_json`` is a JSON array of ``book_data`` records for the
        history books found in ``book_data``, in reverse of the order given.
        ``recommendations_json`` is a JSON array with at most one book per
        ranked author, or ``None`` when nothing can be recommended.
    """
    history = [book_data[idx] for idx in reversed(hbooks) if idx in book_data]

    already_read = set(hbooks)
    recommendations = []
    for author_id in get_for_one(hauthor):
        if author_id not in author_book.index:
            # No catalogue groups for this author: skip it rather than let a
            # KeyError abort the caller's whole export.
            continue
        # Take the first group (in stored order) the reader has not touched.
        for booklist in author_book.loc[author_id]["id"]:
            if booklist & already_read:
                continue
            # The smallest id stands in for the whole group.
            recommendations.append(book_data[min(booklist)])
            break

    if recommendations:
        recommendations = json.dumps(recommendations, ensure_ascii=False)
    else:
        recommendations = None
    history = json.dumps(history, ensure_ascii=False)
    return readerid, history, recommendations

In [15]:
# %load_ext line_profiler

In [16]:
# print(a)
# %lprun -f api_view api_view(readerid, a, b)

In [27]:
# NOTE(review): scratch inspection cell. None of these names is defined by an
# earlier cell (hidden kernel state), so it fails under Restart & Run All.
row, readerid, a, b

(([301242], [119883]), 122415, [301242], [119883])

In [25]:
# NOTE(review): scratch call. `readerid`, `a` and `b` are not defined by any
# earlier cell (hidden kernel state), so it fails under Restart & Run All.
api_view(readerid, a, b)

(122415, '[]', None)

In [29]:
# Recreate the output table from scratch so the export can be re-run.
cur_out.execute("DROP TABLE IF EXISTS recommendations")
# The type must be spelled exactly INTEGER for `id` to become an alias of the
# rowid; the former "INTEGERE" typo created an ordinary indexed column instead.
cur_out.execute("CREATE TABLE recommendations (id INTEGER PRIMARY KEY, history TEXT, recommendations TEXT)")
db_out.commit()

# Number of rows buffered before each INSERT batch and commit.
BATCH_SIZE = 1000

data = []
# Each row of the reset frame is (readerID, book-id list, author-id list).
# Iterating the rows directly avoids building a throwaway dict and keeps the
# 1-based batch counter `k` in `enumerate`.
for k, (readerid, b, a) in enumerate(tqdm(book_history.reset_index().values), start=1):
    # api_view takes the author list before the book list.
    data.append(api_view(readerid, a, b))
    if k % BATCH_SIZE == 0:
        cur_out.executemany("INSERT INTO recommendations VALUES (?, ?, ?)", data)
        db_out.commit()
        data = []

# Flush whatever is left from the final, partially filled batch.
cur_out.executemany("INSERT INTO recommendations VALUES (?, ?, ?)", data)
db_out.commit()

  0%|          | 0/173138 [00:00<?, ?it/s]

In [None]:
# NOTE(review): leftover scratch arithmetic in an unexecuted cell, presumably a
# check of the export loop's commit interval. Safe to delete.
67000 % 1000