In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [3]:
from datetime import datetime
from collections import defaultdict

### Books predictor

In [4]:
books_engineered = pd.read_csv('data_algo/books_engineered.csv', index_col=0)

  mask |= (ar1 == a)


In [5]:
reader_clusters = books_engineered.copy()

In [6]:
reader_clusters = reader_clusters[reader_clusters['id_book'].notna() == True]

In [7]:
reader_clusters['reader_book_id'] = reader_clusters['id_reader'].astype('str') + reader_clusters['id_book'].astype('str')

In [8]:
reader_clusters.drop_duplicates(['reader_book_id'], inplace=True)

In [9]:
reader_clusters.head()

Unnamed: 0,id_issue,id_reader,id_book,author,name,reader_age,genre,is_novice,is_old,is_series,is_age_cencored,is_18_cencored,reader_book_id
0,1,179,5190.0,Семенова Мария Васильевна,Год Людоеда,30.0,russian_fiction,N,N,N,N,N,1795190.0
1,2,179,5185.0,,Сказки о солдате,30.0,poetry_folklor,N,N,Y,N,N,1795185.0
7,8,163,5190.0,Семенова Мария Васильевна,Год Людоеда,38.0,russian_fiction,N,N,N,N,N,1635190.0
9,10,232,5456.0,Андреева Наталья Вячеславовна,Москва не принимает,80.0,russian_fiction,N,N,Y,N,N,2325456.0
10,11,232,5324.0,Акунин Борис,Пелагия и белый бульдог,80.0,russian_fiction,N,N,Y,N,N,2325324.0


In [10]:
reader_book = reader_clusters[['id_reader', 'id_book']]

In [11]:
user_book_dict = defaultdict(list)
for row in reader_book.itertuples():
    user_book_dict[row[1]].append(row[2])

In [14]:
def extract_sequence(books: list):
    if len(books) == 1:
        return []
    prev_index = 0
    cur_index = 1
    sequence = []
    while cur_index <= len(books)-1:
        sequence.append((books[prev_index], books[cur_index]))
        prev_index += 1
        cur_index += 1
    return sequence

In [15]:
%%time
all_seqs = []
for key in user_book_dict.keys():
    books = user_book_dict[key]
    sequences = extract_sequence(books)
    all_seqs.extend(sequences)

Wall time: 2.67 s


In [16]:
frequency_dict = defaultdict(int)
for seq in all_seqs:
    frequency_dict[seq] += 1
    
freq_df = pd.DataFrame.from_dict(frequency_dict, orient='index')
freq_df.reset_index(inplace=True)
freq_df.rename(columns={'index':'seq', 0:'freq'}, inplace=True)

freq_df = freq_df.sort_values('freq', ascending=False)

freq_df['prev'] = freq_df['seq'].apply(lambda x: x[0])
freq_df['next'] = freq_df['seq'].apply(lambda x: x[1])

freq_df.head()

Unnamed: 0,seq,freq,prev,next
481,"(438933.0, 439155.0)",1753,438933.0,439155.0
7600,"(439155.0, 438933.0)",1299,439155.0,438933.0
151197,"(438933.0, 445127.0)",1025,438933.0,445127.0
7601,"(438933.0, 416677.0)",1015,438933.0,416677.0
482,"(439155.0, 416677.0)",1013,439155.0,416677.0


In [17]:
seq_predictions_dict = defaultdict(list)
for row in freq_df.itertuples():
    seq_predictions_dict[row[3]].append(row[4])

In [19]:
top_10_predictor = defaultdict()
for key, values in seq_predictions_dict.items():
    top_10_predictor[key] = values[:11]

In [20]:
top_10_predictor

defaultdict(None,
            {438933.0: [439155.0,
              445127.0,
              416677.0,
              440040.0,
              440017.0,
              416679.0,
              440047.0,
              439858.0,
              416678.0,
              1270341.0,
              1287581.0],
             439155.0: [438933.0,
              416677.0,
              445127.0,
              416678.0,
              444493.0,
              440047.0,
              416679.0,
              327034.0,
              279900.0,
              440034.0,
              440017.0],
             445127.0: [438933.0,
              416677.0,
              439155.0,
              327034.0,
              440047.0,
              327814.0,
              416679.0,
              416678.0,
              439858.0,
              327824.0,
              279921.0],
             416677.0: [438933.0,
              445127.0,
              439155.0,
              416679.0,
              440047.0,
              416678.0,
 

In [22]:
import pickle

In [23]:
with open('important_backlog/last_book_predictor.pickle', 'wb') as f:
    pickle.dump(top_10_predictor, f)

### Courses predictor

In [25]:
courses_engineered = pd.read_csv('data_algo/courses_merged.csv', index_col=0)

In [26]:
courses_engineered.head()

Unnamed: 0,status,pupil_id,service_id,Наименование_услуги,course_duration,is_newly_created,is_free,is_individual,Service_group,gender,age_cat,Onboard_year
0,3.0,25969.0,39178,Архитектура,yearly,N,Y,Y,art_drawing,M,mid_high_school,2015.0
1,3.0,158129.0,39178,Архитектура,yearly,N,Y,Y,art_drawing,F,student,2010.0
2,3.0,304476.0,39178,Архитектура,yearly,N,Y,Y,art_drawing,M,mid_high_school,2014.0
3,3.0,1139915.0,39178,Архитектура,yearly,N,Y,Y,art_drawing,M,mid_high_school,2014.0
4,3.0,298767.0,39178,Архитектура,yearly,N,Y,Y,art_drawing,F,mid_high_school,2014.0


In [28]:
pupil_service = courses_engineered[['pupil_id', 'service_id']]

In [29]:
pupil_service_dict = defaultdict(list)
for row in pupil_service.itertuples():
    pupil_service_dict[row[1]].append(row[2])

In [30]:
%%time
all_seqs = []
for key in pupil_service_dict.keys():
    books = pupil_service_dict[key]
    sequences = extract_sequence(books)
    all_seqs.extend(sequences)

Wall time: 2min 59s


In [31]:
frequency_dict = defaultdict(int)
for seq in all_seqs:
    frequency_dict[seq] += 1
    
freq_df = pd.DataFrame.from_dict(frequency_dict, orient='index')
freq_df.reset_index(inplace=True)
freq_df.rename(columns={'index':'seq', 0:'freq'}, inplace=True)

freq_df = freq_df.sort_values('freq', ascending=False)

freq_df['prev'] = freq_df['seq'].apply(lambda x: x[0])
freq_df['next'] = freq_df['seq'].apply(lambda x: x[1])

freq_df.head()

Unnamed: 0,seq,freq,prev,next
314,"(232803, 232803)",1439,232803,232803
776,"(228903, 228903)",1288,228903,228903
1636,"(230558, 230558)",1005,230558,230558
1333,"(147968, 147968)",937,147968,147968
163,"(87688, 87688)",715,87688,87688


In [32]:
seq_predictions_dict = defaultdict(list)
for row in freq_df.itertuples():
    seq_predictions_dict[row[3]].append(row[4])

In [33]:
top_10_predictor = defaultdict()
for key, values in seq_predictions_dict.items():
    top_10_predictor[key] = values[:11]

In [34]:
top_10_predictor

defaultdict(None,
            {232803: [232803,
              229485,
              43670,
              219837,
              216441,
              212319,
              301309,
              224217,
              229938,
              219785,
              513273],
             228903: [228903,
              229232,
              215668,
              229303,
              42233,
              42497,
              220326,
              508064,
              384235,
              42109,
              1068454],
             230558: [230558,
              627749,
              315395,
              315394,
              484212,
              511206,
              230549,
              511608,
              720574,
              513012,
              212378],
             147968: [147968,
              230619,
              216400,
              43668,
              44629,
              60898,
              485043,
              159673,
              307723,
              218952,
       

In [35]:
with open('important_backlog/last_course_predictor.pickle', 'wb') as f:
    pickle.dump(top_10_predictor, f)