In [1]:
import datetime

start = datetime.datetime.now()
start

datetime.datetime(2021, 11, 5, 21, 56, 51, 643652)

In [2]:
import sqlite3
import pandas as pd
import networkx as nx
from collections import defaultdict
from tqdm.auto import tqdm
import math
from itertools import combinations
import pickle

db = sqlite3.connect("datasets.db")
cur = db.cursor()

In [3]:
book_history = pd.read_sql_query("""
SELECT 
    circulation_short.readerID, 
    group_concat(books.rubric1) as rubrics, 
    group_concat(books.author_id) as authors
FROM circulation_short
JOIN books ON circulation_short.smart = books.smart_collapse_field
WHERE rubric1 IS NOT NULL AND author_id IS NOT NULL
GROUP BY readerID
HAVING count(smart) > 1
""", con=db)
book_history.shape

(304096, 3)

In [4]:
book_history["rubrics"] = book_history["rubrics"].apply(lambda x: [int(i) for i in x.split(",")])
book_history["authors"] = book_history["authors"].apply(lambda x: [int(i) for i in x.split(",")])

In [5]:
book_history.head(3)

Unnamed: 0,readerID,rubrics,authors
0,163,"[252, 479]","[91143, 130512]"
1,170,"[610, 610]","[121213, 27741]"
2,171,"[479, 479, 479, 479, 479, 479, 479, 479, 479, 18]","[47875, 132224, 134206, 104699, 94970, 141471,..."


In [6]:
edges = defaultdict(lambda: defaultdict(int))
count_rubric = defaultdict(lambda: defaultdict(int))
count_name = defaultdict(int)

for row in tqdm(book_history.values):
    row_dict = defaultdict(list)
    
    for r, a in zip(row[1], row[2]):
        row_dict[r].append(a)
        count_rubric[a][r] += 1
        count_name[a] += 1
    
    for r in row_dict:
        for o in combinations(row_dict[r], 2):
            edges[o][r] += 1

  0%|          | 0/304096 [00:00<?, ?it/s]

In [7]:
del book_history

In [8]:
edges2 = defaultdict(lambda: defaultdict(int))
pairs = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

for pair, links in tqdm(edges.items()):
    if sum(links.values()) >= 3:
        for rubric, val in links.items():
            if val > 1:
                coef = 14 - math.log(val / (count_rubric[pair[0]][rubric] + count_rubric[pair[1]][rubric]))
                edges2[rubric][pair] = coef
                pairs[rubric][pair[0]][pair[1]] = coef
                pairs[rubric][pair[1]][pair[0]] = coef

pairs2 = defaultdict(lambda: defaultdict(set))
for r in pairs:
    for i in pairs[r]:
        pairs2[r][i] = set([i for i in sorted(pairs[r][i], key=pairs[r][i].get)][:10])

  0%|          | 0/9086950 [00:00<?, ?it/s]

In [9]:
graphs = defaultdict(lambda: nx.Graph())

for pair, links in tqdm(edges.items()):
    if sum(links.values()) >= 3:
        for rubric, val in links.items():
            if val > 1:
                if pair[0] in pairs2[rubric][pair[1]] or pair[1] in pairs2[rubric][pair[0]]:
                    graphs[rubric].add_edge(*pair, weight=coef)

  0%|          | 0/9086950 [00:00<?, ?it/s]

In [10]:
count_rubric.get(23203)

defaultdict(int, {479: 2808, 534: 470, 218: 97, 541: 49})

In [11]:
view = graphs[479][23203]
sorted(view.items(), key=lambda x: x[1]["weight"])[:5]

[(15583, {'weight': 22.144195273445575}),
 (134228, {'weight': 22.144195273445575}),
 (45648, {'weight': 22.144195273445575}),
 (23203, {'weight': 22.144195273445575}),
 (50649, {'weight': 22.144195273445575})]

In [12]:
with open("author_graphs.pkl", "wb") as f:
    pickle.dump(dict(graphs), f)

In [13]:
print(datetime.datetime.now())
print(datetime.datetime.now() - start)

2021-11-05 21:57:52.754328
0:01:01.110839
