In [1]:
import datetime

# Record the wall-clock time at the top of the notebook; the final cell
# subtracts it from the current time to report the total runtime.
start = datetime.datetime.now()
start

datetime.datetime(2021, 11, 6, 9, 15, 19, 466695)

In [2]:
import sqlite3
import pandas as pd
from collections import defaultdict
from tqdm.auto import tqdm

# Open the SQLite file that the queries below read from and that the
# resulting tables are written back to.
db = sqlite3.connect("datasets.db")
# NOTE(review): `cur` is not used by any cell visible in this notebook —
# confirm nothing else needs it before removing.
cur = db.cursor()

In [3]:
# Load circulation_short rows joined to their matching books rows (matched on
# circulation_short.smart = books.smart_collapse_field), keeping only rows
# where author_id is set. The SQL ORDER BY is not relied upon: the next cell
# re-sorts the frame in pandas before grouping.
book_info = pd.read_sql_query("""
SELECT circulation_short.readerID, circulation_short.startDate, circulation_short.smart, books.author_id, books.title
FROM circulation_short
JOIN books ON circulation_short.smart = books.smart_collapse_field
WHERE author_id IS NOT NULL
ORDER BY circulation_short.readerID, books.author_id, circulation_short.startDate ASC
""", con=db
)

In [4]:
# For every (readerID, author_id) pair, collect the `smart` ids into a list
# ordered by startDate (ties broken by title). Lists with a single entry are
# dropped because they contain no consecutive pair.
author_series = (
    book_info
    .sort_values(by=["readerID", "author_id", "startDate", "title"])
    .groupby(["readerID", "author_id"], as_index=False)
    .agg({"smart": list})
    .loc[lambda grouped: grouped["smart"].map(len) > 1]
)

In [5]:
# Preview: each row is one (readerID, author_id) pair with its ordered list of `smart` ids.
author_series.head()

Unnamed: 0,readerID,author_id,smart
43,186,14599,"[876142, 877385]"
46,187,70570,"[814783, 824321]"
48,188,33144,"[1236341, 405584]"
71,204,67552,"[21089, 1260896]"
82,207,71203,"[442626, 467840, 365819]"


In [6]:
# Count direct transitions between consecutive entries: for each per-(reader,
# author) list in `author_series["smart"]`, every adjacent pair (f, s) adds one
# to edges[(f, s)].
edges = defaultdict(int)

for row in tqdm(author_series["smart"].values):
    for f, s in zip(row[:-1], row[1:]):
        # A repeated id (e.g. the same book appearing twice in a row) would
        # create a self-transition f -> f. Such an edge is never a useful
        # "next book", and it could outrank the real successor once only the
        # top edge per source is kept by the later drop_duplicates, so skip it.
        if f == s:
            continue
        edges[(f, s)] += 1

# Keep only transitions seen at least 5 times, ordered by descending count.
# sorted() is stable, so equally frequent edges stay in first-seen order; the
# later drop_duplicates(subset=["collapse_in"]) relies on this ordering to keep
# the most frequent successor of each source id.
edges = {key: value for key, value in sorted(edges.items(), key=lambda x: x[1], reverse=True) if value >= 5}

  0%|          | 0/245347 [00:00<?, ?it/s]

In [7]:
# Flatten the {(source, target): count} mapping into (source, target, count) rows.
data = [(source, target, count) for (source, target), count in edges.items()]

In [8]:
# Turn the edge rows into a frame and keep only the first row per
# `collapse_in`; the rows arrive sorted by descending count, so the first row
# is the most frequent successor. Then store the result in SQLite, replacing
# any existing table of the same name.
data = (
    pd.DataFrame(data, columns=["collapse_in", "collapse_out", "cnt"])
    .drop_duplicates(subset=["collapse_in"])
)
data.to_sql(
    "prediction_author_sequence",
    con=db,
    if_exists="replace",
    index=False,
)

In [9]:
# Same join as the author-based query, but for rows where author_id is NULL;
# books.rubric1 is selected instead of author_id and is used as the grouping
# key in the next cell. This rebinds `book_info`, discarding the earlier frame.
book_info = pd.read_sql_query("""
SELECT circulation_short.readerID, circulation_short.startDate, circulation_short.smart, books.rubric1, books.title
FROM circulation_short
JOIN books ON circulation_short.smart = books.smart_collapse_field
WHERE author_id IS NULL
ORDER BY circulation_short.readerID, books.rubric1, circulation_short.startDate ASC
""", con=db
)

In [10]:
# For every (readerID, rubric1) pair, collect the `smart` ids into a list
# ordered by startDate (ties broken by title). Lists with a single entry are
# dropped because they contain no consecutive pair.
series = (
    book_info
    .sort_values(by=["readerID", "rubric1", "startDate", "title"])
    .groupby(["readerID", "rubric1"], as_index=False)
    .agg({"smart": list})
    .loc[lambda grouped: grouped["smart"].map(len) > 1]
)

In [11]:
# Count direct transitions between consecutive entries: for each per-(reader,
# rubric1) list in `series["smart"]`, every adjacent pair (f, s) adds one to
# edges[(f, s)].
edges = defaultdict(int)

for row in tqdm(series["smart"].values):
    for f, s in zip(row[:-1], row[1:]):
        # A repeated id (e.g. the same book appearing twice in a row) would
        # create a self-transition f -> f. Such an edge is never a useful
        # "next book", and it could outrank the real successor once only the
        # top edge per source is kept by the later drop_duplicates, so skip it.
        if f == s:
            continue
        edges[(f, s)] += 1

# Keep only transitions seen at least 10 times, ordered by descending count.
# sorted() is stable, so equally frequent edges stay in first-seen order; the
# later drop_duplicates(subset=["collapse_in"]) relies on this ordering to keep
# the most frequent successor of each source id.
edges = {key: value for key, value in sorted(edges.items(), key=lambda x: x[1], reverse=True) if value >= 10}

  0%|          | 0/409551 [00:00<?, ?it/s]

In [12]:
# Flatten the {(source, target): count} mapping into (source, target, count) rows.
data = [(source, target, count) for (source, target), count in edges.items()]

In [13]:
# Turn the edge rows into a frame and keep only the first row per
# `collapse_in`; the rows arrive sorted by descending count, so the first row
# is the most frequent successor.
data = (
    pd.DataFrame(data, columns=["collapse_in", "collapse_out", "cnt"])
    .drop_duplicates(subset=["collapse_in"])
)

In [14]:
# Store the deduplicated edges in SQLite, replacing any existing table of this name.
# NOTE(review): "nouthor" in the table name looks like a typo for "noauthor" —
# confirm with whatever reads this table before renaming it.
data.to_sql("prediction_nouthor_sequence", index=False, con=db, if_exists="replace")

In [15]:
# Read the clock once so that the printed finish time and the elapsed duration
# refer to the same instant (two separate now() calls drift apart slightly).
end = datetime.datetime.now()
print(end)
print(end - start)

2021-11-06 09:16:23.514802
0:01:04.048197
