In [1]:
import datetime

# Wall-clock timestamp taken at the top of the notebook; the last cell
# subtracts it from the current time to report the total run duration.
start = datetime.datetime.now()
start

datetime.datetime(2021, 11, 6, 9, 56, 4, 694725)

In [2]:
import sqlite3
import pandas as pd
import networkx as nx
from tqdm.auto import tqdm
import numpy as np
import json
import pickle
import zlib

# Input database: queried below for the circulation history, book metadata
# and the precomputed prediction tables.
db = sqlite3.connect("datasets.db")
cur = db.cursor()

# Output database: receives the `recommendations` table written further down.
# NOTE(review): neither connection is closed in the visible cells.
db_out = sqlite3.connect("predictions.db")
cur_out = db_out.cursor()

In [3]:
# One row per reader with three comma-joined strings built by group_concat:
# rubric ids and author ids (NULLs replaced by 0 via COALESCE) and the `smart`
# ids of the circulation rows that matched a book.
# The inner query sorts by readerID and then by startDate descending before
# the rows are grouped.
# NOTE(review): SQLite documents the element order of group_concat as
# arbitrary, so the subquery ORDER BY may not be preserved — confirm the
# three lists really come out aligned and newest-first on this data.
# NOTE(review): group_concat skips NULLs; `smart` is not COALESCEd, so this
# presumably relies on `smart` never being NULL — verify.
book_history = pd.read_sql_query("""
SELECT 
    readerID, 
    group_concat(COALESCE(rubric1, 0)) as rubrics, 
    group_concat(COALESCE(author_id, 0)) as authors,
    group_concat(smart) as booklist
FROM (
    SELECT readerID, rubric1, author_id, smart
    FROM circulation_short
    JOIN books ON circulation_short.smart = books.smart_collapse_field
    ORDER BY readerID, startDate DESC
) as t
GROUP BY readerID

""", con=db)
book_history.shape

(467892, 4)

In [4]:
def _parse_id_list(raw):
    """Turn a comma-joined id string into a list of ints, mapping "0" to None.

    "0" is the placeholder the loading query substitutes for NULL ids.
    A value that is already a list is returned unchanged, so re-running this
    cell on columns it has already converted is a no-op instead of an
    AttributeError (lists have no ``.split``).
    """
    if isinstance(raw, list):
        return raw
    return [int(token) if token != "0" else None for token in raw.split(",")]


# The same conversion applies to all three group_concat columns.
for _column in ("rubrics", "authors", "booklist"):
    book_history[_column] = book_history[_column].apply(_parse_id_list)

book_history.head(2)

Unnamed: 0,readerID,rubrics,authors,booklist
0,163,"[657, 252, 479]","[None, 91143, 130512]","[849520, 716481, 173828]"
1,165,[124],[133250],[1112960]


In [5]:
# Readers whose history holds more than 50 books.
history_lengths = book_history["booklist"].map(len)
book_history[history_lengths > 50]

Unnamed: 0,readerID,rubrics,authors,booklist
35,232,"[479, 681, 479, 479, 479, 657, 657, 479, 161, ...","[95006, None, 133036, 139854, 43704, 139870, 1...","[471837, 872371, 1246730, 468760, 437587, 1587..."
36,233,"[630, 218, 11, 207, 630, 657, 479, 479, 2, 610...","[32370, 93531, 157558, 71571, 192999, None, 11...","[401605, 1243740, 872860, 846496, 1539171, 141..."
37,234,"[620, 674, 479, 11, 479, 98, 66, 479, 681, 403...","[157395, 194663, 126931, 126425, 43057, 127828...","[1587549, 1583025, 1231715, 1532013, 313007, 1..."
39,236,"[479, 681, 402, 479, 681, 11, 479, 479, 479, 4...","[52171, None, None, 19983, None, None, 111093,...","[1247607, 469847, 874354, 435008, 1242520, 124..."
46,243,"[479, 479, 479, 479, 479, 479, 479, 479, 479, ...","[170966, 57657, 131454, 171093, 107019, 138148...","[877223, 471912, 1242048, 1247532, 1587163, 46..."
...,...,...,...,...
463362,724686,"[681, None, None, 681, 681, None, None, None, ...","[None, None, None, None, None, None, None, Non...","[472678, 472679, 1230524, 436764, 1230523, 158..."
463589,725123,"[630, 693, 620, 620, 104, 104, 89, 89, 89, 89,...","[None, None, 92161, None, 164412, 130149, None...","[840982, 829150, 386604, 850286, 1206533, 3991..."
463604,725161,"[630, 630, 630, 657, 657, 657, 657, 114, 218, ...","[92940, 20087, None, None, None, None, None, 1...","[190971, 1197131, 416227, 448664, 842313, 8341..."
464269,726439,"[681, 681, None, None, None, 681, 681, 681, No...","[None, None, None, None, None, None, None, Non...","[1553377, 848483, 441448, 472515, 441446, 4416..."


## Загрузить модели

In [6]:
# Load the pickled `authors` object. predict_one indexes it as
# authors[rubric][author] and reads a "weight" entry for every neighbour.
# NOTE(review): presumably a dict of networkx graphs keyed by rubric id
# (networkx is imported above) — confirm against the notebook that wrote it.
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# when it comes from a trusted source.
with open("author_graphs.pkl", "rb") as f:
    authors = pickle.load(f)

In [7]:
def _int_lists_by_key(rows):
    """Map each (key, "a,b,c") row to key -> [a, b, c] with the ids cast to int."""
    parsed = {}
    for key, joined in rows:
        parsed[key] = list(map(int, joined.split(",")))
    return parsed


# Follow-up books for each source book, taken from prediction_author_sequence;
# the subquery sorts candidates by `cnt` descending before they are joined.
cur.execute("""
SELECT collapse_in, group_concat(collapse_out) 
FROM (SELECT collapse_in, collapse_out FROM prediction_author_sequence ORDER BY cnt DESC)
GROUP BY collapse_in
""")
series = _int_lists_by_key(cur.fetchall())

# Same shape of mapping, read from prediction_nouthor_sequence.
cur.execute("""
SELECT collapse_in, group_concat(collapse_out) 
FROM (SELECT collapse_in, collapse_out FROM prediction_nouthor_sequence ORDER BY cnt DESC)
GROUP BY collapse_in
""")
noauthor = _int_lists_by_key(cur.fetchall())

In [8]:
def _split_ids(joined):
    """Parse a group_concat result ("1,2,3") into a list of ints."""
    return [int(token) for token in joined.split(",")]


def _nest_by_author_and_rubric(rows):
    """Build {author_id: {rubric_id: [book ids]}} from (author, rubric, ids) rows.

    Each (author, rubric) pair arrives as its own row, so the per-author dict
    has to be accumulated. A plain ``{author: {rubric: ids}}`` comprehension
    would replace the inner dict on every row and keep only one rubric per
    author.
    """
    nested = {}
    for author_id, rubric_id, joined in rows:
        nested.setdefault(author_id, {})[rubric_id] = _split_ids(joined)
    return nested


# Books of every author. The ordering by `available` is done in a subquery
# because an ORDER BY placed after GROUP BY would only sort the groups.
# NOTE(review): SQLite documents the element order of group_concat as
# arbitrary, so the subquery order may not be preserved — confirm.
cur.execute("""
SELECT author_id, group_concat(smart_collapse_field) 
FROM (SELECT author_id, smart_collapse_field FROM books WHERE author_id IS NOT NULL ORDER BY available DESC)
GROUP BY author_id
""")
author_book = {author_id: _split_ids(joined) for author_id, joined in cur.fetchall()}

# Books of every author split by rubric; rows without a rubric are excluded.
cur.execute("""
SELECT author_id, rubric1, group_concat(smart_collapse_field) 
FROM (SELECT author_id, rubric1, smart_collapse_field FROM books WHERE author_id IS NOT NULL AND rubric1 IS NOT NULL ORDER BY available DESC)
GROUP BY author_id, rubric1
""")
author_rubric = _nest_by_author_and_rubric(cur.fetchall())

In [9]:
def _rubric_fallback_ids(joined):
    """Parse the joined ids, drop the final one, and keep at most 25."""
    ids = [int(token) for token in joined.split(",")]
    return ids[:-1][:25]


# Per-rubric fallback list used by predict_one when nothing better is found.
# NOTE(review): this query sorts by `available` ascending while the author
# queries sort descending, and the last id of every group is discarded —
# confirm both are intentional.
# NOTE(review): rows with a NULL rubric1 form their own group under key None.
cur.execute("""
SELECT rubric1, group_concat(smart_collapse_field)
FROM (SELECT rubric1, smart_collapse_field, available FROM books ORDER BY available) as t
GROUP BY rubric1
""")
rubric_books = {
    rubric_id: _rubric_fallback_ids(joined)
    for rubric_id, joined in cur.fetchall()
}

### Информация о книгах

In [10]:
# Display metadata per book: smart_collapse_field -> (id, title, author name).
# The LEFT JOIN keeps books without a matching author row; their author name
# is None.
cur.execute("""
SELECT smart_collapse_field, id, title, author_fullName 
FROM books LEFT JOIN authors ON books.author_id = authors.author_id
""")
book_info = {row[0]: tuple(row[1:]) for row in cur.fetchall()}

## Предсказания

In [11]:
def predict_one(rubric, author, book):
    """Return candidate book ids for a single (rubric, author, book) history entry.

    Sources are tried in order and the first non-empty one wins:

    1. ``series[book]``, then ``noauthor[book]``.
    2. Books of the authors adjacent to ``author`` in ``authors[rubric]``:
       the 10 neighbours with the smallest ``"weight"`` are taken, each
       contributes up to 5 books (its books in this rubric if known,
       otherwise its books overall), and the books are interleaved by their
       position in each author's list, then by author rank. When fewer than
       5 books result and the rubric has a fallback list, that list is
       appended.
    3. ``rubric_books[rubric]``, or an empty list if the rubric is unknown.

    NOTE(review): neighbours are sorted by weight ascending — confirm that a
    smaller weight means a closer author.
    """
    for lookup in (series, noauthor):
        direct = lookup.get(book)
        if direct:
            return direct

    if rubric in authors and author in authors[rubric]:
        neighbours = dict(authors[rubric][author])
        by_weight = sorted((attrs["weight"], other) for other, attrs in neighbours.items())

        ranked = []
        for author_rank, (_, other) in enumerate(by_weight[:10]):
            if other in author_rubric and rubric in author_rubric[other]:
                titles = author_rubric[other][rubric]
            elif other in author_book:
                titles = author_book[other]
            else:
                continue
            ranked.extend((pos, author_rank, title) for pos, title in enumerate(titles[:5]))
        ranked.sort()

        candidates = [title for _, _, title in ranked]
        if candidates:
            if len(candidates) < 5 and rubric in rubric_books:
                return candidates + rubric_books[rubric]
            return candidates

    # Nothing author-based was found: fall back to the rubric list.
    return rubric_books[rubric] if rubric in rubric_books else []

In [12]:
def api_predictions(rubrics, authors, booklist, k=10):
    """Merge the per-book predictions for one reader into a top-``k`` list.

    Args:
        rubrics, authors, booklist: parallel sequences describing the reader's
            history; entries are passed position by position to
            ``predict_one``. ``zip`` stops at the shortest sequence.
        k: maximum number of book ids to return.

    Returns:
        Up to ``k`` book ids. Candidates are ordered by their rank inside each
        ``predict_one`` result first and by the position of the originating
        history entry second, so the first suggestions for every history
        entry come before anyone's second suggestion. Books already in
        ``booklist`` and repeated candidates are dropped.
    """
    seen = set(booklist)

    ranked = []
    for history_pos, (rubric, author, book) in enumerate(zip(rubrics, authors, booklist)):
        for rank, candidate in enumerate(predict_one(rubric, author, book)):
            ranked.append((rank, history_pos, candidate))
    # (rank, history_pos) is unique per tuple, so the candidate itself never
    # takes part in the comparison.
    ranked.sort()

    final_res = []
    for _, _, candidate in ranked:
        if candidate not in seen:
            final_res.append(candidate)
            seen.add(candidate)
    return final_res[:k]

In [13]:
def pretty_predict(rubrics, authors, booklist, k=10):
    """Build the stored (history, recommendations) payloads for one reader.

    The first 10 entries of ``booklist`` and the ids returned by
    ``api_predictions`` are each turned into a list of
    ``{"id", "title", "author"}`` dicts using ``book_info``; ids missing from
    ``book_info`` become ``{"id": 0, "title": "", "author": ""}``.

    Returns:
        A 2-tuple of bytes: zlib-compressed (level 9) UTF-8 JSON for the
        history and for the predictions, in that order.
    """
    def describe(book):
        record = book_info.get(book, [0, "", ""])
        return {"id": record[0], "title": record[1], "author": record[2]}

    def pack(entries):
        encoded = json.dumps(entries, ensure_ascii=False).encode("utf-8")
        return zlib.compress(encoded, level=9)

    history = [describe(book) for book in booklist[:10]]
    preds = [describe(book) for book in api_predictions(rubrics, authors, booklist, k=k)]
    return pack(history), pack(preds)

In [14]:
# Recreate the output table so every run starts from an empty table.
# `history` and `recommendations` receive the zlib-compressed JSON blobs
# produced by pretty_predict (and the default row inserted with id 0).
cur_out.execute("DROP TABLE IF EXISTS recommendations")
cur_out.execute("CREATE TABLE recommendations (id INTEGER PRIMARY KEY, history BLOB, recommendations BLOB)")
db_out.commit()

In [15]:
# Default recommendations taken from the zero_prediction table, packed the
# same way as pretty_predict's output: an empty history plus the book list.
cur.execute("SELECT id, title, author FROM zero_prediction")
zero_rows = [dict(zip(("id", "title", "author"), row)) for row in cur.fetchall()]
data = tuple(
    zlib.compress(json.dumps(payload, ensure_ascii=False).encode("utf-8"), level=9)
    for payload in ([], zero_rows)
)

In [16]:
# Store the default (history, recommendations) pair held in `data` under id 0.
cur_out.execute("INSERT INTO recommendations VALUES (0, ?, ?)", data)
db_out.commit()

In [17]:
# Compute the payloads for every reader and write them in batches of 1000
# rows, committing after each batch.
cnt = 0
data = []
for reader_id, reader_rubrics, reader_authors, reader_books in tqdm(book_history.values):
    payload = pretty_predict(reader_rubrics, reader_authors, reader_books, k=10)
    data.append((reader_id, *payload))
    cnt += 1
    if cnt % 1000:
        continue
    cur_out.executemany("INSERT INTO recommendations VALUES (?, ?, ?)", data)
    db_out.commit()
    data = []

# Flush whatever is left from the last, partial batch.
cur_out.executemany("INSERT INTO recommendations VALUES (?, ?, ?)", data)
db_out.commit()

  0%|          | 0/467892 [00:00<?, ?it/s]

In [18]:
# Report the finish time and the time elapsed since `start` was recorded in
# the first cell.
print(datetime.datetime.now())
print(datetime.datetime.now() - start)

2021-11-06 10:00:34.580336
0:04:29.885700
