In [1]:
import numpy as np
import pandas as pd

from dataset_creator import DatasetCreator
from preprocessor import Preprocessor

In [2]:
# creator = DatasetCreator(created=True)
# Shared preprocessor; the cells below read its qa_data, prof_data and stud_data frames.
# NOTE(review): `created=True` presumably reuses already-built data instead of rebuilding it — confirm in preprocessor.py
pp = Preprocessor(created=True)

---

# Preprocessing for BatchGenerator

## Question and professional feature dicts

In [5]:
# Attach the student-level columns to every question/answer row (join key: students_id)
qa_data = pd.merge(pp.qa_data, pp.stud_data, on='students_id')
qa_data.shape

(49722, 36)

In [6]:
# "Content" features of a question.
# Disabled candidates: students_average_question_body_length, students_average_answer_body_length
que_content_columns = [
    'students_state',
    'students_location',
    'questions_body_length',
]
que_content = qa_data[que_content_columns]

# Time-related features of a question.
# Disabled candidates: students_questions_asked, students_average_question_age
que_time_columns = [
    'students_date_joined_time',
    'students_date_joined_doy_sin',
    'students_date_joined_doy_cos',
    'questions_date_added_time',
    'questions_date_added_doy_sin',
    'questions_date_added_doy_cos',
]
que_time = qa_data[que_time_columns]

In [7]:
# Take the ids from the merged frame: que_content / que_time are slices of `qa_data`, and the
# feature dicts below pair ids with feature rows one-to-one, so both must come from the same frame
# (a merge returns a fresh index and is not obliged to keep pp.qa_data's row order).
que = qa_data.questions_id

In [9]:
# Map question id -> feature vector, pairing rows by position. This avoids one `.loc` lookup per row
# and does not rely on the index labels being exactly 0..n-1.
# A question with several answers appears on several rows; the last row wins, as before.
assert len(que) == len(que_content) == len(que_time), "ids and feature rows must line up one-to-one"
que_content_dict = dict(zip(que.values, que_content.values))
que_time_dict = dict(zip(que.values, que_time.values))

In [10]:
# "Content" features of a professional.
# Disabled candidates: professionals_average_question_body_length, professionals_average_answer_body_length
pro_content_columns = [
    'professionals_state',
    'professionals_location',
    'professionals_industry',
]
pro_content = pp.prof_data[pro_content_columns]

# Time-related features of a professional.
# Disabled candidates: professionals_questions_answered, professionals_average_question_age,
# professionals_email_activated
pro_time_columns = [
    'professionals_date_joined_time',
    'professionals_date_joined_doy_sin',
    'professionals_date_joined_doy_cos',
]
pro_time = pp.prof_data[pro_time_columns]

In [11]:
# Professional ids, in the same row order as pro_content / pro_time
pro = pp.prof_data['professionals_id']

In [12]:
# Map professional id -> feature vector, pairing rows by position. This avoids one `.loc` lookup
# per row and does not rely on the index labels being exactly 0..n-1.
assert len(pro) == len(pro_content) == len(pro_time), "ids and feature rows must line up one-to-one"
pro_content_dict = dict(zip(pro.values, pro_content.values))
pro_time_dict = dict(zip(pro.values, pro_time.values))

In [14]:
import pickle

# Persist the four feature dictionaries; BatchGenerator loads them back by these file names
feature_dicts_by_file = {
    'que_content_dict.pickle': que_content_dict,
    'que_time_dict.pickle': que_time_dict,
    'pro_content_dict.pickle': pro_content_dict,
    'pro_time_dict.pickle': pro_time_dict,
}
for file_name, feature_dict in feature_dicts_by_file.items():
    with open(file_name, 'wb') as out:
        pickle.dump(feature_dict, out)

## Time-related feature dicts

In [5]:
# Two-column (id, scaled time) views that the next cell turns into lookup dictionaries
ans_prev_answer_date = pp.qa_data.loc[:, ['answers_id', 'professionals_prev_answer_date_time']]
ans_date_added = pp.qa_data.loc[:, ['answers_id', 'answers_date_added_time']]
que_date_added = pp.qa_data.loc[:, ['questions_id', 'questions_date_added_time']]
ans_date_added.head()

Unnamed: 0,answers_id,answers_date_added_time
0,4e5f01128cae4f6d8fd697cec5dca60c,-0.699372
1,334f6735d31e45589e43da5ae7056e50,0.747848
2,e5d66281cc314675b95ddbb799b75473,0.747848
3,e5c0da2a29ff414fa76b9da6e86337fc,-0.560503
4,f3519ab99a1a4a13a8a9ecb814287d2a,-0.500911


In [54]:
# id -> scaled time lookups; each row of `.values` is an (id, time) pair, later rows overwrite earlier ones
ans_date_added_dict = dict(map(tuple, ans_date_added.values))
que_date_added_dict = dict(map(tuple, que_date_added.values))
ans_prev_answer_date_dict = dict(map(tuple, ans_prev_answer_date.values))

In [76]:
# Change last_answer_date feature for incorrect professionals:
# collect, per professional, the ascending list of scaled previous-answer dates
prev_dates_by_pro = pp.qa_data.groupby('professionals_id')['professionals_prev_answer_date_time']
pro_answer_dates_dict = {pro_id: prev_dates.sort_values().values.tolist()
                         for pro_id, prev_dates in prev_dates_by_pro}

In [77]:
# Latest (scaled) answer time of every professional
pro_last_anwer_date_dict = {}
for pro_id, pro_answers in pp.qa_data.groupby('professionals_id'):
    pro_last_anwer_date_dict[pro_id] = pro_answers['answers_date_added_time'].max()

# Load preprocessors here
import pickle
with open('preprocessors.pickle', 'rb') as preproc_file:
    preproc = pickle.load(preproc_file)

In [78]:
# Re-express each last answer date: undo the 'answers_date_added_time' scaling, then apply the
# 'professionals_last_answer_date_time' scaling.
# NOTE(review): the values are later appended to lists scaled with 'professionals_prev_answer_date_time',
# and BatchGenerator scales cur_time with that preprocessor too — confirm both preprocessors agree.
# NOTE(review): the dict is updated in place, so running this cell twice rescales the values twice.
undo_answer_scaling = preproc['answers_date_added_time'].inverse_transform
apply_last_answer_scaling = preproc['professionals_last_answer_date_time'].transform
for pro_id in list(pro_last_anwer_date_dict):
    raw_date = undo_answer_scaling([[pro_last_anwer_date_dict[pro_id]]])
    pro_last_anwer_date_dict[pro_id] = apply_last_answer_scaling(raw_date)[0][0]

In [79]:
# Close every professional's date list with the date of their last answer
# (appends in place: re-running this cell appends the value again)
for pro_id, answer_dates in pro_answer_dates_dict.items():
    answer_dates.append(pro_last_anwer_date_dict[pro_id])

In [86]:
# Persist the per-professional date lists; BatchGenerator reads this file
with open('pro_answer_dates_dict.pickle', 'wb') as out:
    pickle.dump(pro_answer_dates_dict, out)

In [64]:
def _unscale_join_date(scaled):
    """Undo the 'professionals_date_joined_time' scaling for a single value."""
    return preproc['professionals_date_joined_time'].inverse_transform([[scaled]])[0][0]


# Professionals ordered by registration date, with the date converted back to its unscaled value
pro_reg_dates = (pp.prof_data[['professionals_id', 'professionals_date_joined_time']]
                 .sort_values(by='professionals_date_joined_time'))
pro_reg_dates['professionals_date_joined_time'] = (
    pro_reg_dates['professionals_date_joined_time'].apply(_unscale_join_date))

# Parallel sequences: ids and their ascending registration dates
pro_list = pro_reg_dates['professionals_id'].values.tolist()
pro_reg_date_list = pro_reg_dates['professionals_date_joined_time'].values

In [66]:
import pickle

# Persist the time-related lookups; BatchGenerator loads them back by these file names
time_artifacts_by_file = {
    'ans_date_added_dict.pickle': ans_date_added_dict,
    'que_date_added_dict.pickle': que_date_added_dict,
    'ans_prev_answer_date_dict.pickle': ans_prev_answer_date_dict,
    'pro_list.pickle': pro_list,
    'pro_reg_date_list.pickle': pro_reg_date_list,
}
for file_name, artifact in time_artifacts_by_file.items():
    with open(file_name, 'wb') as out:
        pickle.dump(artifact, out)

In [9]:
# Load preprocessors (mapping: feature name -> fitted preprocessor)
import pickle
with open('preprocessors.pickle', 'rb') as preproc_file:
    preproc = pickle.load(preproc_file)

In [None]:
# FIXME: stray scratch cell that was never executed. `qa` is not defined anywhere in this notebook,
# so running it raises NameError and breaks Restart & Run All. Disabled until the intended
# expression is known.
# qa.data_data['answers']

In [17]:
# Answer ids and, in the same row order, their answer dates with the scaling undone
ans_list = pp.qa_data['answers_id'].values
answers_date_scaler = preproc['answers_date_added_time']
ans_dates = pp.qa_data['answers_date_added_time'].apply(
    lambda scaled: answers_date_scaler.inverse_transform([[scaled]])[0][0]).values

In [18]:
# Chronological split: answers before `date` go to train, the rest to test
date = 2018.67
is_train = ans_dates < date
is_test = ans_dates >= date
train_ans_list = ans_list[is_train]
test_ans_list = ans_list[is_test]
train_ans_list.shape, test_ans_list.shape

((44502,), (5220,))

In [19]:
# Persist both splits; BatchGenerator picks one of them according to its `mode`
for file_name, split_ans_list in (('train_ans_list.pickle', train_ans_list),
                                  ('test_ans_list.pickle', test_ans_list)):
    with open(file_name, 'wb') as out:
        pickle.dump(split_ans_list, out)

In [93]:
# Reload the per-professional date lists written earlier in this notebook
with open('pro_answer_dates_dict.pickle', 'rb') as src:
    pro_answer_dates_dict = pickle.load(src)

In [94]:
# Flatten {professional: [dates]} into (date, professional) records
answer_date_pro_list = []
for pro_id, pro_dates in pro_answer_dates_dict.items():
    answer_date_pro_list.extend((answer_date, pro_id) for answer_date in pro_dates)

In [95]:
# Total number of (date, professional) records
len(answer_date_pro_list)

59737

In [96]:
# Tabulate the records and order them chronologically (the original row labels are kept)
answer_times = (
    pd.DataFrame.from_records(answer_date_pro_list, columns=['prev_answer_date_time', 'professionals_id'])
    .sort_values(by='prev_answer_date_time')
)

In [97]:
# Earliest records after sorting
answer_times.head()

Unnamed: 0,prev_answer_date_time,professionals_id
2928,-4.133939,0c673e046d824ec0ad0ebe012a0673e4
35302,-4.087291,977428d851b24183b223be0eb8619a8c
47892,-4.018243,c9bfa93898594cbbace436deca644c64
2992,-3.989516,0c9a2748560541be9fe2df0d7be88282
47577,-3.989516,c79d4e4fd9af4ab7a7e6b6f433128476


In [98]:
# Sanity check: position at which the scaled value 0 would be inserted into the sorted dates
np.searchsorted(answer_times.prev_answer_date_time.values, 0)

26740

In [99]:
# Persist the sorted (date, professional) table
answer_times.to_pickle('answer_times.pkl')

In [100]:
# Spot-check the recorded dates of a single professional
answer_times.set_index('professionals_id').loc['66a6d2d213da44fbbf6e9407df46be73']

Unnamed: 0_level_0,prev_answer_date_time
professionals_id,Unnamed: 1_level_1
66a6d2d213da44fbbf6e9407df46be73,-3.914846
66a6d2d213da44fbbf6e9407df46be73,-3.914846


---

## Get embeddings for questions from doc2vec

In [114]:
from doc2vec import train, save
import re

from utils import TextProcessor

In [115]:
# Raw question body before any cleaning
pp.questions.loc[0, 'questions_body']

'What  is  a  maths  teacher?   what  is  a  maths  teacher  useful? #college #professor #lecture'

In [116]:
def _clean_question_body(raw):
    """Replace tags/newlines with spaces, collapse runs of spaces, trim, and drop '#' characters."""
    text = re.sub(r'(<[^>]*[/]?>|[\r]?\n)', ' ', str(raw))
    text = re.sub(r' +', ' ', text).strip()
    return text.replace('#', '')


pp.questions.questions_body = pp.questions.questions_body.apply(_clean_question_body)
pp.questions.questions_body.loc[0]

'What is a maths teacher? what is a maths teacher useful? college professor lecture'

In [117]:
# Same cleaning as for the body: tags/newlines -> space, collapse spaces, trim, drop '#'
tag_or_newline = re.compile(r'(<[^>]*[/]?>|[\r]?\n)')
space_run = re.compile(r' +')
pp.questions.questions_title = pp.questions.questions_title.apply(
    lambda raw: space_run.sub(' ', tag_or_newline.sub(' ', str(raw))).strip().replace('#', ''))
pp.questions.questions_title.loc[0]

'Teacher career question'

In [118]:
# Run both text columns through TextProcessor.process with stop words disallowed
tp = TextProcessor()
for column in ('questions_title', 'questions_body'):
    pp.questions[column] = pp.questions[column].apply(
        lambda text: tp.process(text, allow_stopwords=False))

In [119]:
# Example of a question body after TextProcessor
pp.questions.questions_body.loc[2]

'plan go abroad first job teach job seriou career idea know job would work stay home instead assum stay leav makeba huge differ care unless find someth first job think way go abroad seen good bad know side respect employ willl side work abroad employ oversea'

In [120]:
# Train doc2vec on the processed title and body columns, keyed by questions_id.
# NOTE(review): the trailing 10 is presumably the embedding size (BatchGenerator falls back to
# np.zeros(10) for missing tag/industry vectors) — confirm against doc2vec.train
questions_embs = train(pp.questions, 'questions_id', ['questions_title', 'questions_body'], 10)

In [121]:
# NOTE(review): presumably writes 'questions_embs.pkl', the file BatchGenerator loads — confirm in doc2vec.save
save(questions_embs, 'questions')

In [76]:
tag_users = pp.tag_users
pro = pp.professionals
stu = pp.students
tags = pp.tags

# One row per (professional, followed tag), with the tag name attached
pro_tag_rows = (pro.merge(tag_users, left_on='professionals_id', right_on='tag_users_user_id')
                   .merge(tags, left_on='tag_users_tag_id', right_on='tags_tag_id'))
# Space-join the tag names of each professional ...
pro_tag_names = (pro_tag_rows[['professionals_id', 'tags_tag_name']]
                 .groupby(by='professionals_id', as_index=False)
                 .aggregate(lambda names: ' '.join(names)))
# ... and split them back on whitespace into a professional -> list of tag words mapping
pro_tags = {pro_id: names.split()
            for pro_id, names in zip(pro_tag_names['professionals_id'], pro_tag_names['tags_tag_name'])}

In [100]:
# Ids of professionals that follow at least one tag
pros_with_tags = list(pro_tags)

In [103]:
# Number of rows in pp.prof_data
pp.prof_data.shape[0]

10015

In [102]:
# How many tagged professionals also appear in pp.prof_data (inner merge on professionals_id)
pd.DataFrame({'professionals_id': pros_with_tags}).merge(pp.prof_data[['professionals_id']], on='professionals_id').shape[0]

9300

In [77]:
# One row per (student, followed tag), with the tag name attached
stu_tag_rows = (stu.merge(tag_users, left_on='students_id', right_on='tag_users_user_id')
                   .merge(tags, left_on='tag_users_tag_id', right_on='tags_tag_id'))
# Space-join the tag names of each student, then split on whitespace into student -> tag words
stu_tag_names = (stu_tag_rows[['students_id', 'tags_tag_name']]
                 .groupby(by='students_id', as_index=False)
                 .aggregate(lambda names: ' '.join(names)))
stu_tags = {stu_id: names.split()
            for stu_id, names in zip(stu_tag_names['students_id'], stu_tag_names['tags_tag_name'])}

In [104]:
# Ids of students that follow at least one tag
stu_with_tags = list(stu_tags)

In [99]:
# Number of rows in pp.stud_data
pp.stud_data.shape[0]

11985

In [105]:
# How many tagged students also appear in pp.stud_data (inner merge on students_id)
pd.DataFrame({'students_id': stu_with_tags}).merge(pp.stud_data[['students_id']], on='students_id').shape[0]

3673

---

# BatchGenerator class

In [15]:
import pickle
import random
from scipy.stats import cauchy

import keras
import numpy as np
import pandas as pd

from utils import TextProcessor

In [21]:
class BatchGenerator(keras.utils.Sequence):
    """
    Generates batches of data in train and test modes.

    Every batch stacks ``pos_size`` positive (question, professional) pairs, taken from
    consecutive answers of the mode's answer list, on top of ``neg_size`` randomly sampled
    negative pairs. Raw CSV files are read from ``data_path``; all precomputed artifacts
    (``*.pickle`` / ``*.pkl``) are read from the current working directory.

    NOTE: artifacts are loaded with ``pickle``, which can execute arbitrary code on load,
    so they must come from a trusted source.
    """

    # Length of the zero vector substituted for a tag / industry without an embedding
    _EMB_DIM = 10
    # Upper bound on resampling attempts while drawing one negative pair
    _MAX_SAMPLING_ATTEMPTS = 100

    def __init__(self, pos_size, neg_size, mode='train', data_path='../../data/'):
        """
        :param pos_size: number of positive pairs per batch
        :param neg_size: number of negative pairs per batch
        :param mode: 'train' or 'test'; selects which pickled answer list is iterated
        :param data_path: prefix (including the trailing separator) of the raw CSV files
        :raises ValueError: if mode is neither 'train' nor 'test'
        """
        # Fail fast: an unknown mode used to leave self.ans_list unset and only surfaced
        # later as an AttributeError inside __len__ / __getitem__
        if mode not in ('train', 'test'):
            raise ValueError("mode must be 'train' or 'test', got {!r}".format(mode))

        self.pos_size = pos_size
        self.neg_size = neg_size

        que = pd.read_csv(data_path + 'questions.csv')
        tag_que = pd.read_csv(data_path + 'tag_questions.csv')
        tags = pd.read_csv(data_path + 'tags.csv')
        pro = pd.read_csv(data_path + 'professionals.csv')
        stu = pd.read_csv(data_path + 'students.csv')
        ans = pd.read_csv(data_path + 'answers.csv')
        tag_users = pd.read_csv(data_path + 'tag_users.csv')

        self.tp = TextProcessor()
        pro['professionals_industry'] = pro['professionals_industry'].apply(self.tp.process)
        tags['tags_tag_name'] = tags['tags_tag_name'].apply(lambda x: self.tp.process(x, allow_stopwords=True))

        # Professional id -> processed industry string
        self.pro_ind = dict(zip(pro['professionals_id'], pro['professionals_industry']))

        # Entity id -> list of processed tag words
        self.que_tags = self._tags_by_owner(que, tag_que, tags, 'questions_id',
                                            'tag_questions_question_id', 'tag_questions_tag_id')
        self.pro_tags = self._tags_by_owner(pro, tag_users, tags, 'professionals_id',
                                            'tag_users_user_id', 'tag_users_tag_id')
        self.stu_tags = self._tags_by_owner(stu, tag_users, tags, 'students_id',
                                            'tag_users_user_id', 'tag_users_tag_id')

        ans_que_pro = (ans.merge(que, left_on='answers_question_id', right_on='questions_id')
                          .merge(pro, left_on='answers_author_id', right_on='professionals_id')
                          .merge(stu, left_on='questions_author_id', right_on='students_id'))

        # Single pass over the joined answers builds all three lookups:
        #   answer -> (question, professional), question -> student, set of answered pairs
        self.ans_que_pro_dict = {}
        self.que_stu_dict = {}
        self.que_pro_set = set()
        for ans_id, que_id, pro_id, stu_id in zip(ans_que_pro['answers_id'], ans_que_pro['questions_id'],
                                                  ans_que_pro['professionals_id'], ans_que_pro['students_id']):
            self.ans_que_pro_dict[ans_id] = (que_id, pro_id)
            self.que_stu_dict[que_id] = stu_id
            self.que_pro_set.add((que_id, pro_id))

        self.tag_emb = self._load_pickle('tags_embs.pkl')
        self.ind_emb = self._load_pickle('industries_embs.pkl')

        # Load que and pro content and time related features
        self.que_content_dict = self._load_pickle('que_content_dict.pickle')
        self.que_time_dict = self._load_pickle('que_time_dict.pickle')
        self.pro_content_dict = self._load_pickle('pro_content_dict.pickle')
        self.pro_time_dict = self._load_pickle('pro_time_dict.pickle')

        # Load time related dicts and lists
        self.pro_answer_dates_dict = self._load_pickle('pro_answer_dates_dict.pickle')
        self.ans_date_added_dict = self._load_pickle('ans_date_added_dict.pickle')
        self.que_date_added_dict = self._load_pickle('que_date_added_dict.pickle')
        self.ans_prev_answer_date_dict = self._load_pickle('ans_prev_answer_date_dict.pickle')
        # pro_list and pro_reg_date_list are parallel and ordered by registration date
        self.pro_list = self._load_pickle('pro_list.pickle')
        self.pro_reg_date_list = self._load_pickle('pro_reg_date_list.pickle')

        # Load preprocessors
        self.preproc = self._load_pickle('preprocessors.pickle')

        # Load answer list depending on the mode
        self.ans_list = self._load_pickle(mode + '_ans_list.pickle')

        # Load questions embeddings
        self.que_emb = self._load_pickle('questions_embs.pkl')

    @staticmethod
    def _load_pickle(path):
        """Unpickle and return the object stored at ``path`` (trusted files only)."""
        with open(path, 'rb') as f:
            return pickle.load(f)

    @staticmethod
    def _tags_by_owner(owners, links, tags, owner_key, link_owner_key, link_tag_key):
        """
        Build a mapping owner id -> list of tag words.

        :param owners: frame holding the owning entities (questions, professionals or students)
        :param links: frame linking owners to tag ids
        :param tags: frame with 'tags_tag_id' and 'tags_tag_name'
        :param owner_key: id column of ``owners``
        :param link_owner_key: column of ``links`` holding the owner id
        :param link_tag_key: column of ``links`` holding the tag id
        :return: dict; owners without any tag are absent from it
        """
        merged = (owners.merge(links, left_on=owner_key, right_on=link_owner_key)
                        .merge(tags, left_on=link_tag_key, right_on='tags_tag_id'))
        joined = (merged[[owner_key, 'tags_tag_name']]
                  .groupby(by=owner_key, as_index=False)
                  .aggregate(lambda x: ' '.join(x)))
        # Names are space-joined and split again, so a multi-word tag name yields several words
        return {owner: names.split() for owner, names in zip(joined[owner_key], joined['tags_tag_name'])}

    def __len__(self):
        """Number of full batches; a trailing partial batch of positives is dropped."""
        return len(self.ans_list) // self.pos_size

    def __getitem__(self, index):
        """
        Assemble batch number ``index``.

        :return: ``([que_features, pro_features], target)`` where both feature matrices have one
                 row per pair (positives first, then negatives) and ``target`` is a column of
                 ones for positives followed by zeros for negatives
        """
        pos, neg = self._empty_group(), self._empty_group()

        for ans in self.ans_list[self.pos_size * index: self.pos_size * (index + 1)]:
            que, pro = self.ans_que_pro_dict[ans]
            self.__add_pair(pos, que, pro, self.ans_prev_answer_date_dict[ans], self.ans_date_added_dict[ans])

        # Negatives are drawn uniformly among eligible professionals (a recency-weighted
        # sampling scheme was prototyped here earlier and is not used)
        for _ in range(self.neg_size):
            que, pro, cur_time = self.__sample_negative()
            prev_date = self.__negative_que_prev_answer_date(pro, cur_time)
            self.__add_pair(neg, que, pro, prev_date, cur_time)

        pos_que, pos_pro = self.__assemble(pos)
        neg_que, neg_pro = self.__assemble(neg)

        return_list = [np.vstack([pos_que, neg_que]), np.vstack([pos_pro, neg_pro])]
        # Size the target from the pairs actually collected so it always matches the feature rows
        target = np.vstack([np.ones((len(pos['pairs']), 1)), np.zeros((len(neg['pairs']), 1))])

        return return_list, target

    @staticmethod
    def _empty_group():
        """Fresh accumulator for the per-pair features of one group (positives or negatives)."""
        return {'pairs': [], 'que_content': [], 'que_time': [], 'pro_content': [], 'pro_time': [],
                'prev_dates': [], 'cur_times': []}

    def __add_pair(self, group, que, pro, prev_date, cur_time):
        """Append the features and dates of one (que, pro) pair to ``group``."""
        group['pairs'].append((que, pro))
        group['que_content'].append(self.que_content_dict[que])
        group['que_time'].append(self.que_time_dict[que])
        group['pro_content'].append(self.pro_content_dict[pro])
        group['pro_time'].append(self.pro_time_dict[pro])
        group['prev_dates'].append(prev_date)
        group['cur_times'].append(cur_time)

    def __assemble(self, group):
        """
        Turn an accumulated group into its question and professional feature matrices.

        Question columns: content, embeddings, time features, current time.
        Professional columns: content, embeddings, time features, previous answer date, current time.
        """
        que_embeddings, pro_embeddings = self.__convert(group['pairs'])
        cur_times = np.array(group['cur_times'])[:, np.newaxis]

        que_features = np.hstack([
            np.array(group['que_content']),
            que_embeddings,
            np.array(group['que_time']),
            cur_times,
        ])
        pro_features = np.hstack([
            np.array(group['pro_content']),
            pro_embeddings,
            np.array(group['pro_time']),
            np.array(group['prev_dates'])[:, np.newaxis],
            cur_times,
        ])
        return que_features, pro_features

    def __sample_negative(self):
        """
        Draw one negative example.

        :return: ``(que, pro, cur_time)`` where ``pro`` registered before the sampled moment and has
                 not answered ``que``; ``cur_time`` is scaled with the
                 'professionals_prev_answer_date_time' preprocessor
        :raises RuntimeError: if no eligible pair is found within the attempt budget (previously an
                              IndexError on an empty candidate list, or an endless loop)
        """
        for _question_try in range(self._MAX_SAMPLING_ATTEMPTS):
            ans = random.choice(self.ans_list)
            que, _answerer = self.ans_que_pro_dict[ans]

            # Delay after the question is the absolute value of a Cauchy random variable.
            # It is divided by 365 before being added to the unscaled question date
            # (presumably days added to a date in fractional years — confirm).
            delay = np.abs(cauchy.rvs(loc=0, scale=12.5))
            cur_time = (self.preproc['questions_date_added_time']
                        .inverse_transform([[self.que_date_added_dict[que]]])[0][0] + delay / 365)

            # Only professionals whose registration date is below cur_time are eligible; they are
            # the first n_valid entries of pro_list because it is ordered by registration date
            n_valid = int(np.searchsorted(self.pro_reg_date_list, cur_time))
            if n_valid == 0:
                continue  # nobody has registered yet at that moment: draw another question

            for _pro_try in range(self._MAX_SAMPLING_ATTEMPTS):
                # Index into the prefix directly instead of copying it for every negative
                pro = self.pro_list[random.randrange(n_valid)]
                if (que, pro) not in self.que_pro_set:
                    # Transform current time with preprocessor for professionals_prev_answer_date_time
                    cur_time = self.preproc['professionals_prev_answer_date_time'].transform([[cur_time]])[0][0]
                    return que, pro, cur_time

        raise RuntimeError("Could not sample a negative (question, professional) pair.")

    def __negative_que_prev_answer_date(self, pro, cur_time):
        """
        Return the latest recorded date of ``pro`` that lies strictly before ``cur_time``.

        :raises ValueError: if every recorded date of ``pro`` is at or after ``cur_time``
        """
        pro_dates = self.pro_answer_dates_dict[pro]

        index = np.searchsorted(pro_dates, cur_time)
        if index == 0:
            raise ValueError("Index cannot be zero.")
        return pro_dates[index-1]

    def __mean_tag_embedding(self, tag_names):
        """Average the embeddings of ``tag_names``; unknown tags and empty input count as zero vectors."""
        vectors = [self.tag_emb.get(tag, np.zeros(self._EMB_DIM)) for tag in tag_names]
        if len(vectors) == 0:
            vectors.append(np.zeros(self._EMB_DIM))
        return np.vstack(vectors).mean(axis=0).reshape(-1)

    def __convert(self, pairs):
        """
        Embed every (question, professional) pair.

        :return: two matrices with one row per pair: question embeddings and professional embeddings
        """
        x_que = [self.convert_que(que) for que, _ in pairs]
        x_pro = [self.convert_pro(pro) for _, pro in pairs]
        return np.vstack(x_que), np.vstack(x_pro)

    def convert_que(self, que):
        """Return a one-row matrix: question text embedding followed by its mean tag embedding."""
        que_tag_emb = self.__mean_tag_embedding(self.que_tags.get(que, []))
        que_emb = self.que_emb[que]
        return np.vstack([np.hstack([que_emb, que_tag_emb])])

    def convert_pro(self, pro):
        """Return a one-row matrix: industry embedding followed by the mean embedding of the followed tags."""
        pro_tag_emb = self.__mean_tag_embedding(self.pro_tags.get(pro, []))
        ind_emb = self.ind_emb.get(self.pro_ind[pro], np.zeros(self._EMB_DIM))
        return np.vstack([np.hstack([ind_emb, pro_tag_emb])])

    def on_epoch_end(self):
        """Shuffle the answer list in place so the next epoch sees different positive batches."""
        np.random.shuffle(self.ans_list)

In [22]:
# 64 positive and 64 negative pairs per batch, iterating over the training answers
generator = BatchGenerator(64, 64, mode='train')

In [24]:
# Shape of the professional feature matrix of the first batch (second element of the inputs list)
generator[0][0][1].shape

(128, 28)

In [25]:
%%timeit
# Time the assembly of one batch (positives plus sampled negatives)
generator[0]

24.8 ms ± 777 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [55]:
%lprun -f generator.__getitem__ generator.__getitem__(0)

In [54]:
# Enables the %lprun magic (requires the line_profiler package)
%load_ext line_profiler

---

## Precompute, for every answer, some professionals sampled from the previous-answer-date distribution

In [58]:
# Inputs of the precomputation below
with open('ans_date_added_dict.pickle', 'rb') as f:
    ans_date_added_dict = pickle.load(f)
# NOTE(review): no cell of this notebook writes 'pro_last_answer_dates_dict.pickle' (the cells above
# write 'pro_answer_dates_dict.pickle'), and the sweep below calls `.size` on its values, so it
# expects arrays rather than lists — confirm where this artifact is produced
with open('pro_last_answer_dates_dict.pickle', 'rb') as f:
    pro_last_answer_dates_dict = pickle.load(f)

In [46]:
# answer -> (question, professional) lookup and the set of pairs that really occurred
qa_ids = pp.qa_data[['answers_id', 'questions_id', 'professionals_id']]
ans_que_pro_dict = {}
que_pro_set = set()
for ans_id, que_id, pro_id in zip(qa_ids['answers_id'], qa_ids['questions_id'], qa_ids['professionals_id']):
    ans_que_pro_dict[ans_id] = (que_id, pro_id)
    que_pro_set.add((que_id, pro_id))
pro_list = pp.prof_data['professionals_id'].tolist()

In [8]:
# Answers in chronological order of their (scaled) creation time
ans_date_added_df = pp.qa_data.loc[:, ['answers_id', 'answers_date_added_time']].sort_values(
    by='answers_date_added_time')

In [9]:
# Chronologically ordered answer ids, persisted for later use
ans_list = ans_date_added_df['answers_id'].tolist()
with open('ans_list.pickle', 'wb') as out:
    pickle.dump(ans_list, out)

In [12]:
# Number of distinct answer ids (equals the row count when there are no duplicates)
np.unique(np.array(ans_list)).size

49722

In [105]:
# Scratch check: log1p of a tiny positive number is approximately that number.
# Here `inf` holds 1e-9; the next cell rebinds it to 1e9.
inf = 1 / (np.float64(1e9))
np.log1p(inf)

9.999999995e-10

In [135]:
# Sweep over all answers in chronological order and, for each one, sample professionals with a
# probability that grows as the gap to their most recent recorded date shrinks.
import time
start = time.time()
# Stand-in for "infinitely far away": -inf means "no date reached yet", +inf means "no dates left"
inf = 1e9

# Per-professional sweep state, indexed by position in pro_list:
# pro_cur_date  - latest recorded date the sweep has already passed
# pro_next_date - next recorded date still ahead of the sweep
# pro_cur_idx   - number of advances made so far minus one
pro_cur_date = np.ones(len(pro_list)) * -inf
pro_next_date = np.zeros(len(pro_list))
pro_cur_idx = np.ones(len(pro_list), dtype=int) * -1
pro_last_answer_dates_list = []

ans_sampled_profs_dict = {}

for i, pro in enumerate(pro_list):
    pro_next_date[i] = pro_last_answer_dates_dict[pro][0]
    pro_last_answer_dates_list.append(pro_last_answer_dates_dict[pro])

for _, row in ans_date_added_df.iterrows():
    ans = row['answers_id']
    ans_date_added = row['answers_date_added_time']
    
    # Professionals whose next recorded date now lies strictly before this answer
    changed_pros_idx = np.nonzero(pro_next_date < ans_date_added)[0]
    
    # NOTE(review): every professional advances at most one step per answer row. On the first
    # advance idx is 0, so pro_next_date is re-assigned dates[0] (the value it already holds);
    # dates[idx + 1] may have been intended — confirm.
    for i in changed_pros_idx:
        pro_cur_idx[i] += 1
        idx = pro_cur_idx[i]
        
        pro_cur_date[i] = pro_next_date[i]
        
        # `.size` requires the per-professional dates to be numpy arrays
        if idx < pro_last_answer_dates_list[i].size:
            pro_next_date[i] = pro_last_answer_dates_list[i][idx]
        else:
            pro_next_date[i] = inf
    
    # Subtract last answer dates from the actual date the answer was added
    # (strictly positive, because pro_cur_date only takes values that were < ans_date_added)
    distances = ans_date_added - np.array(pro_cur_date)
    
    # Apply log1p transformation to 1 / distances and normalize each entry
    distances = np.log1p(1 / distances)
    distances /= distances.sum()
    
    # Sample 50 professionals from the distribution of distances and choose 10 or fewer unique among them
    # (the draw is unseeded, and the set makes the order of the kept professionals arbitrary)
    sampled_pro_set = set(np.random.choice(pro_list, 50, p=distances))
    
    # Drop professionals that really answered this question
    que, _ = ans_que_pro_dict[ans]
    pros = sampled_pro_set.copy()
    for pro in sampled_pro_set:
        if (que, pro) in que_pro_set:
            pros.remove(pro)
    
    pros = list(pros)[: min(10, len(pros))]
    ans_sampled_profs_dict[ans] = pros

total = time.time() - start
print(total)

96.62652683258057


In [162]:
# Show a single entry of the result (nothing is printed for an empty dict)
first_entry = next(iter(ans_sampled_profs_dict.items()), None)
if first_entry is not None:
    print(first_entry[0], ':', first_entry[1])

bcb3d96f74104351938c362893b77e33 : ['0c70d8f19f074a0581c1c05c9922b03b', '977bfe665d674798948d6fa481ced3ac']


In [164]:
# List the answers for which no professional was kept; no output means every answer has candidates
answers_without_samples = [ans_id for ans_id, sampled in ans_sampled_profs_dict.items() if len(sampled) == 0]
for ans_id in answers_without_samples:
    print(ans_id)

In [150]:
import pickle

# Persist the precomputed candidates per answer
with open('ans_sampled_profs_dict.pickle', 'wb') as out:
    pickle.dump(ans_sampled_profs_dict, out)

In [13]:
import pickle

# Reload the precomputed candidates (lets a fresh kernel skip the expensive sweep)
with open('ans_sampled_profs_dict.pickle', 'rb') as src:
    ans_sampled_profs_dict = pickle.load(src)