# DatasetCreator class

In [2]:
import numpy as np
import pandas as pd

import re, os

In [64]:
class DatasetCreator:
    """
    Class that imports initial datasets and creates additional datasets for convenience
    """
    
    def __init__(self, data_path='../../data/', created=False):
        # Add data_path to class properties
        self.data_path = data_path
        
        # Import all initial datasets
        self.emails = pd.read_csv(data_path + 'emails.csv')
        self.questions = pd.read_csv(data_path + 'questions.csv')
        self.professionals = pd.read_csv(data_path + 'professionals.csv')
        self.comments = pd.read_csv(data_path + 'comments.csv')
        self.tag_users = pd.read_csv(data_path + 'tag_users.csv')
        self.group_memberships = pd.read_csv(data_path + 'group_memberships.csv')
        self.tags = pd.read_csv(data_path + 'tags.csv')
        self.students = pd.read_csv(data_path + 'students.csv')
        self.groups = pd.read_csv(data_path + 'groups.csv')
        self.tag_questions = pd.read_csv(data_path + 'tag_questions.csv')
        self.matches = pd.read_csv(data_path + 'matches.csv')
        self.answers = pd.read_csv(data_path + 'answers.csv')
        self.school_memberships = pd.read_csv(data_path + 'school_memberships.csv')
        
        if created:
            # Load additional datasets from disk
            self.qa_data = pd.read_csv(self.data_path + 'qa_data.csv')
            self.prof_data = pd.read_csv(self.data_path + 'prof_data.csv')
            self.stud_data = pd.read_csv(self.data_path + 'stud_data.csv')
        else:
            # Create additional datasets and save them to disk
            self.additional_datasets_creation()
    
    
    def additional_datasets_creation(self):
        """
        Creates additional datasets for futher processing and save them to disk.
        """
        # Create temporary dataset for further processing
        all_data = self.all_data_creation()
        
        # Create question-answer pairs dataset called qa_data
        self.qa_data = self.qa_data_creation(all_data)
        
        # Create dataset called prof_data compirising data of professionals
        # who answered at least one question
        self.prof_data = self.prof_data_creation(all_data)
        
        # Create dataset called stud_data compirising data of students
        # who asked at least one answered question
        self.stud_data = self.stud_data_creation(all_data)
        
        # Save new datasets to disc
        self.qa_data.to_csv(self.data_path + 'qa_data.csv', index=False)
        self.prof_data.to_csv(self.data_path + 'prof_data.csv', index=False)
        self.stud_data.to_csv(self.data_path + 'stud_data.csv', index=False)
    
    
    def all_data_creation(self):
        """
        Merges questions, answers, professionals and students datasets
        to get temporary dataset for further processing
        """
        # Merge questions with answers and delete not answered questions
        all_data = self.questions.merge(self.answers, left_on='questions_id', right_on='answers_question_id')
        
        # Merge with professionals and students (students asked, professionals answered)
        # Maybe change this in the future by taking care of professional who change status to students and vise versa
        all_data = all_data.merge(self.professionals, left_on='answers_author_id', right_on='professionals_id')
        all_data = all_data.merge(self.students, left_on='questions_author_id', right_on='students_id')
        
        # Transform dates from string representation to datetime object
        all_data.answers_date_added = pd.to_datetime(all_data.answers_date_added)
        all_data.questions_date_added = pd.to_datetime(all_data.questions_date_added)
        
        # Add questions_age feature, which represents amount of time
        # from question emergence to a particular answer to that question
        all_data['questions_age'] = all_data.answers_date_added - all_data.questions_date_added
        
        # Delete html tags and extra spaces from question and answer body
        all_data.questions_body = (all_data.questions_body
                                   .apply(lambda x: re.sub(r'(<[^>]*[/]?>|[\r]?\n)', ' ', str(x)))
                                   .apply(lambda x: re.sub(r' +', ' ', x).strip()))
        all_data.answers_body = (all_data.answers_body
                                 .apply(lambda x: re.sub(r'(<[^>]*[/]?>|[\r]?\n)', ' ', str(x)))
                                 .apply(lambda x: re.sub(r' +', ' ', x).strip()))
        
        # Count the number of words in question and answer body and add two new features
        all_data['questions_body_length'] = all_data.questions_body.apply(lambda x: len(x.split(' ')))
        all_data['answers_body_length'] = all_data.answers_body.apply(lambda x: len(x.split(' ')))
        
        return all_data
    
    
    def qa_data_creation(self, all_data):
        """
        Creates question-answer pairs dataset called qa_data_data
        """
        # Temporary qa_data representation
        qa_data = all_data.copy()
        
        # Select only unique professionals
        temp = qa_data[['professionals_id', 'answers_date_added', 'answers_id']]
        prof_unique = pd.DataFrame(temp.professionals_id.unique(), columns=['professionals_id'])
        prof_unique = prof_unique.merge(self.professionals, how='left', on='professionals_id')
        
        # For every professional add a "dummy" question with answer date being professional's registration date
        prof_unique['answers_id'] = list(None for _ in range(prof_unique.shape[0]))
        prof_unique['answers_date_added'] = prof_unique['professionals_date_joined']
        prof_unique = prof_unique[['professionals_id', 'answers_date_added', 'answers_id']]
        
        # Add "dummy" questions to all questions
        temp = pd.concat([temp, prof_unique])
        
        # Sort by professionals and answer dates
        temp = temp.sort_values(by=['professionals_id', 'answers_date_added']).reset_index(drop=True)
        
        # Get the sorted representation of the answers_date_added and shift the index down by one
        # so that current question is aligned with previous question answer date
        last_answer_date = pd.DataFrame({'professionals_last_answer_date': temp.answers_date_added})
        last_answer_date.index += 1
        
        # Add the professionals_last_answer_date column to temp
        temp = temp.merge(last_answer_date, left_index=True, right_index=True)
        temp.dropna(subset=['answers_id'], inplace=True)
        temp.drop(columns=['professionals_id', 'answers_date_added'], inplace=True)
        
        # Add professionals_last_answer_date column to qa_data 
        qa_data = qa_data.merge(temp, on='answers_id')
        
        # Transform dates from string representation to datetime object
        qa_data.professionals_last_answer_date = pd.to_datetime(qa_data.professionals_last_answer_date)
        
        print(qa_data[qa_data.professionals_id == '003cc21be89d4e42bc4424131a378e86']
              [['answers_date_added', 'professionals_last_answer_date']].sort_values(by='answers_date_added'))
        
        # Final qa_data representation
        qa_data = qa_data[[
            'students_id', 'questions_id', 'questions_title', 'questions_body',
            'questions_body_length', 'questions_date_added', 'professionals_id',
            'answers_id', 'answers_body', 'answers_date_added', 'professionals_last_answer_date'
        ]]
        
        return qa_data
    
    
    def prof_data_creation(self, all_data):
        """
        Creates dataset called prof_data compirising data of professionals who answered at least one question
        """
        # Select only professionals who answered at least one question
        active_professionals = pd.DataFrame({'professionals_id': all_data.professionals_id.unique()})
        prof_data = self.professionals.merge(active_professionals, how='right', on='professionals_id')
        
        # Extract state or country from location
        prof_data['professionals_state'] = prof_data['professionals_location'].apply(lambda loc: str(loc).split(', ')[-1])
        
        # Transform dates from string representation to datetime object
        prof_data.professionals_date_joined = pd.to_datetime(prof_data.professionals_date_joined)
        
        # Count the number of answered questions by each professional
        number_answered = all_data[['questions_id', 'professionals_id']].groupby('professionals_id').count()
        number_answered = number_answered.rename({'questions_id': 'professionals_questions_answered'}, axis=1)
        
        # Add professionals_questions_answered feature to prof_data
        prof_data = prof_data.merge(number_answered, left_on='professionals_id', right_index=True)
        
        # Get average question age for every professional among questions he answered
        average_question_age = (
            all_data.groupby('professionals_id')
            .questions_age.mean(numeric_only=False)
        )
        average_question_age = pd.DataFrame({'professionals_average_question_age': average_question_age})
        
        # Add professionals_average_question_age feature to prof_data
        prof_data = prof_data.merge(average_question_age, on='professionals_id')
        
        # Get all emails that every acting professional received
        prof_emails_received = pd.merge(
            prof_data[['professionals_id']], self.emails,
            left_on='professionals_id', right_on='emails_recipient_id')
        
        # Get all questions every acting professional received in emails
        prof_email_questions = prof_emails_received.merge(
            self.matches, how='inner', left_on='emails_id', right_on='matches_email_id')
        
        # Get answered questions about which professionals were notified by email
        questions_answered_from_emails = prof_email_questions.merge(
            self.qa_data[['professionals_id', 'questions_id']],
            left_on=['professionals_id', 'matches_question_id'],
            right_on=['professionals_id', 'questions_id'])
        
        # Count the number of answered questions about which professionals were notified by email
        email_activated = (questions_answered_from_emails
                           .groupby('professionals_id')[['questions_id']].count()
                           .rename(columns={'questions_id': 'professionals_email_activated'}))
        
        # Add professionals_email_activated feature to prof_data
        # This feature is percent of answered questions about which professionals were notified by email
        prof_data = prof_data.merge(email_activated, on='professionals_id', how='left')
        prof_data.professionals_email_activated.fillna(0, inplace=True)
        prof_data.professionals_email_activated /= prof_data.professionals_questions_answered
        
        # Compute average question and answer body length for each professional
        average_question_body_length = all_data.groupby('professionals_id')[['questions_body_length']].mean().reset_index()
        average_answer_body_length = all_data.groupby('professionals_id')[['answers_body_length']].mean().reset_index()
        
        # Add average question and answer body length features to prof_data
        prof_data = (prof_data.merge(average_question_body_length, on='professionals_id')
                     .rename(columns={'questions_body_length': 'professionals_average_question_body_length'}))
        prof_data = (prof_data.merge(average_answer_body_length, on='professionals_id')
                     .rename(columns={'answers_body_length': 'professionals_average_answer_body_length'}))
        
        return prof_data
    
    
    def stud_data_creation(self, all_data):
        """
        Creates dataset called stud_data compirising data of students who asked at least one answered question
        """
        # Select only students who asked at least one answered question
        active_students = pd.DataFrame({'students_id': all_data.students_id.unique()})
        stud_data = self.students.merge(active_students, how='right', on='students_id')
        
        # Extract state or country from location
        stud_data['students_state'] = stud_data['students_location'].apply(lambda loc: str(loc).split(', ')[-1])
        
        # Transform dates from string representation to datetime object
        stud_data.students_date_joined = pd.to_datetime(stud_data.students_date_joined)
        
        # Count the number of asked questions by each student
        number_asked = all_data[['questions_id', 'students_id']].groupby('students_id').count()
        number_asked = number_asked.rename({'questions_id': 'students_questions_asked'}, axis=1)
        
        # Add students_questions_answered feature to stud_data
        stud_data = stud_data.merge(number_asked, left_on='students_id', right_index=True)
        
        # Get average question age for every student among questions he asked that were answered
        average_question_age = (
            all_data.groupby('students_id')
            .questions_age.mean(numeric_only=False)
        )
        average_question_age = pd.DataFrame({'students_average_question_age': average_question_age})
        
        # Add professionals_average_question_age feature to prof_data
        stud_data = stud_data.merge(average_question_age, on='students_id')
        
        # Compute average question and answer body length for each student
        average_question_body_length = all_data.groupby('students_id')[['questions_body_length']].mean().reset_index()
        average_answer_body_length = all_data.groupby('students_id')[['answers_body_length']].mean().reset_index()
        
        # Add average question and answer body length features to stud_data
        stud_data = (stud_data.merge(average_question_body_length, on='students_id')
                     .rename(columns={'questions_body_length': 'students_average_question_body_length'}))
        stud_data = (stud_data.merge(average_answer_body_length, on='students_id')
                     .rename(columns={'answers_body_length': 'students_average_answer_body_length'}))
        
        return stud_data

In [65]:
# created=False: build qa_data, prof_data and stud_data from the initial csv files and save them to disk
creator = DatasetCreator(created=False)

       answers_date_added professionals_last_answer_date
35243 2017-10-19 18:16:16            2018-09-12 16:42:17
34793 2017-10-19 18:34:17            2017-10-19 18:16:16
34620 2017-10-19 18:47:15            2017-10-19 18:34:17
35255 2017-10-19 18:53:27            2017-10-19 18:47:15


In [74]:
# Show the first row of the question-answer pairs dataset
creator.qa_data.head(1)

Unnamed: 0,students_id,questions_id,questions_title,questions_body,questions_body_length,questions_date_added,professionals_id,answers_id,answers_body,answers_date_added,professionals_last_answer_date
0,8f6f374ffd834d258ab69d376dd998f5,332a511f1569444485cf7a7a556a5e54,Teacher career question,What is a maths teacher? what is a maths teach...,14,2016-04-26 11:14:26,36ff3b3666df400f956f8335cf53e09e,4e5f01128cae4f6d8fd697cec5dca60c,Hi! You are asking a very interesting question...,2016-04-29 19:40:14,2016-04-29 14:15:00


---

# Preprocessor class

In [4]:
import numpy as np
import pandas as pd

import os, pickle, json, re

from sklearn.preprocessing import StandardScaler, LabelEncoder

In [5]:
class Preprocessor(DatasetCreator):
    """
    Class for qa_data, prof_data and stud_data feature preprocessing
    """
    
    def __init__(self, created=False):
        """
        Sets up the parent DatasetCreator, restores preprocessors that were
        already fit (when preprocessors.pickle exists), reads cat_features.json
        and preprocesses qa_data, prof_data and stud_data
        """
        # Parent constructor loads (or creates) all datasets
        super().__init__(created=created)

        # Start with no preprocessors and replace them with the pickled ones if present
        self.pp = {}
        if os.path.isfile('preprocessors.pickle'):
            with open('preprocessors.pickle', 'rb') as file:
                self.pp = pickle.load(file)

        # Number of categories for categorical features
        with open('cat_features.json') as f:
            self.cat_features = json.load(f)

        # Preprocess the datasets in the fixed order qa -> prof -> stud
        for preprocess in (
            self.qa_data_preprocessing,
            self.prof_data_preprocessing,
            self.stud_data_preprocessing,
        ):
            preprocess()
    
    
    def qa_data_preprocessing(self):
        """
        Preprocesses qa_data: expands its three date columns (with time of day)
        and scales the numerical columns
        """
        # Expand every date column into time / day-of-year / day-of-week / hour features
        for date_feature in ('questions_date_added', 'answers_date_added', 'professionals_last_answer_date'):
            Preprocessor.datetime(self.qa_data, date_feature, hour=True)

        # Scale numerical features
        numerical_features = (
            'questions_date_added_time', 'questions_date_added_dow',
            'answers_date_added_time', 'answers_date_added_dow',
            'professionals_last_answer_date_time', 'professionals_last_answer_date_dow',
            'questions_body_length',
        )
        for numerical_feature in numerical_features:
            Preprocessor.numerical(self.qa_data, numerical_feature, self.pp)
    
    
    def prof_data_preprocessing(self):
        """
        Preprocesses prof_data: date and timedelta columns first, then numerical
        scaling, then label encoding of the categorical columns
        """
        data = self.prof_data

        # Datetime and timedelta features
        Preprocessor.datetime(data, 'professionals_date_joined')
        Preprocessor.timedelta(data, 'professionals_average_question_age')

        # Numerical features
        numerical_features = (
            'professionals_questions_answered', 'professionals_date_joined_time',
            'professionals_date_joined_dow', 'professionals_average_question_age',
            'professionals_average_question_body_length', 'professionals_average_answer_body_length',
        )
        for numerical_feature in numerical_features:
            Preprocessor.numerical(data, numerical_feature, self.pp)

        # Keep the textual industry before it is label encoded (needed in BatchGenerator)
        data['professionals_industry_textual'] = data['professionals_industry']

        # Categorical features; the encoders are always fit anew
        n_cats = self.cat_features['n_cats']['prof']
        for categorical_feature in ('professionals_industry', 'professionals_location', 'professionals_state'):
            Preprocessor.categorical(
                data, categorical_feature, n_cats[categorical_feature], self.pp, oblige_fit=True
            )
    
    
    def stud_data_preprocessing(self):
        """
        Preprocesses stud_data: date and timedelta columns first, then numerical
        scaling, then label encoding of the categorical columns
        """
        data = self.stud_data

        # Datetime and timedelta features
        Preprocessor.datetime(data, 'students_date_joined')
        Preprocessor.timedelta(data, 'students_average_question_age')

        # Numerical features
        numerical_features = (
            'students_questions_asked', 'students_date_joined_time',
            'students_date_joined_dow', 'students_average_question_age',
            'students_average_question_body_length', 'students_average_answer_body_length',
        )
        for numerical_feature in numerical_features:
            Preprocessor.numerical(data, numerical_feature, self.pp)

        # Categorical features; the encoders are always fit anew
        n_cats = self.cat_features['n_cats']['ques']
        for categorical_feature in ('students_location', 'students_state'):
            Preprocessor.categorical(
                data, categorical_feature, n_cats[categorical_feature], self.pp, oblige_fit=True
            )
    
    
    @staticmethod
    def datetime(df: pd.DataFrame, feature: str, hour: bool = False):
        """
        Generates a bunch of new datetime features and drops the original feature inplace

        New columns (prefixed with the feature name):
        _time (fractional year), _doy_sin / _doy_cos (day of year on a circle),
        _dow (day of week, Monday is 0) and, when hour is set, _hour_sin / _hour_cos.

        :param df: Data to work with.
        :param feature: Name of a column in df that contains date.
        :param hour: Whether feature contains time.
        """
        dates = pd.to_datetime(df[feature])

        # Vectorized .dt accessors instead of a Python-level apply per row
        day_of_year = dates.dt.dayofyear
        # A year is taken as 365 days for every year
        doy_angle = 2 * np.pi * day_of_year / 365

        df[feature + '_time'] = dates.dt.year + (day_of_year + dates.dt.hour / 24) / 365
        df[feature + '_doy_sin'] = np.sin(doy_angle)
        df[feature + '_doy_cos'] = np.cos(doy_angle)
        df[feature + '_dow'] = dates.dt.weekday

        if hour:
            hour_angle = 2 * np.pi * (dates.dt.hour + dates.dt.minute / 60) / 24
            df[feature + '_hour_sin'] = np.sin(hour_angle)
            df[feature + '_hour_cos'] = np.cos(hour_angle)

        df.drop(columns=feature, inplace=True)
    
    
    @staticmethod
    def timedelta(df: pd.DataFrame, feature: str):
        """
        Replaces a timedelta column inplace with its length in days

        :param df: Data to work with.
        :param feature: Name of a column in df that contains timedelta.
        """
        one_day = pd.Timedelta("1 day")

        # Parse to timedelta and divide by one day in a single step
        df[feature] = pd.to_timedelta(df[feature]) / one_day
    
    
    @staticmethod
    def _get_preprocessor(fit_data: np.array, feature: str, base, pp: dict, oblige_fit: bool):
        """
        Returns the preprocessor stored in pp for the feature or, when there is none
        or a refit is obliged, fits a new one, stores it in pp and dumps pp to
        preprocessors.pickle

        :param fit_data: NumPy array of data to fit new preprocessor.
        :param feature: Feature name used as the key in pp.
        :param base: Preprocessor's class.
        :param pp: Object with preprocessors.
        :param oblige_fit: Whether to fit new preprocessor on feature even if there already exists one.
        :returns: Preprocessor object.
        """
        # Reuse the stored preprocessor unless a refit is requested
        if not oblige_fit and feature in pp:
            return pp[feature]

        fitted = base()
        fitted.fit(fit_data)
        pp[feature] = fitted

        # Persist all preprocessors so that the next run can reuse them
        with open('preprocessors.pickle', 'wb') as file:
            pickle.dump(pp, file)

        return fitted
    
    
    @staticmethod
    def numerical(df: pd.DataFrame, feature: str, pp: dict, oblige_fit: bool = False):
        """
        Scales a numerical column inplace with a StandardScaler

        :param df: Data to work with.
        :param feature: Name of a column in df that contains numerical data.
        :param pp: Object with preprocessors.
        :param oblige_fit: Whether to fit new StandardScaler on feature even if there already exists one.
        """
        # The scaler works on a float column vector of shape (n_rows, 1)
        column = df[feature].values.astype('float64').reshape(-1, 1)

        scaler = Preprocessor._get_preprocessor(column, feature, StandardScaler, pp, oblige_fit)
        scaled = scaler.transform(column)

        df[feature] = scaled
    
    
    @staticmethod
    def categorical(df: pd.DataFrame, feature: str, n: int, pp: dict, oblige_fit: bool = False):
        """
        Encodes top n most popular values with different labels from 0 to n-1,
        remaining values with n and NaNs with n+1

        The column is replaced inplace by a column of integer codes.

        :param df: Data to work with.
        :param feature: Name of a column in df that contains categorical data.
        :param n: Number of top by popularity values to move in separate categories.
                  0 to encode everything with different labels.
        :param pp: Object with preprocessors.
        :param oblige_fit: Whether to fit new LabelEncoder on feature even if there already exists one.
        """
        original = df[feature]

        vc = original.value_counts()
        n = len(vc) if n == 0 else n

        # value_counts is sorted by popularity, so the first n entries are the top values
        top = set(vc.iloc[:n].index)

        fit_data = df.loc[original.isin(top), feature]
        le = Preprocessor._get_preprocessor(fit_data, feature, LabelEncoder, pp, oblige_fit)

        # Both masks are computed on the untouched column, so freshly written
        # integer labels can never be mistaken for raw values
        is_known = original.isin(set(le.classes_))
        is_rare = original.notna() & ~is_known

        # Codes are written into a new column that starts as "missing" (n + 1).
        # This replaces the inplace fillna on a selected column, which does not
        # reach the frame when Copy-on-Write is active.
        encoded = pd.Series(n + 1, index=df.index, dtype='int64')
        encoded.loc[is_known] = le.transform(original[is_known])
        encoded.loc[is_rare] = n

        df[feature] = encoded

In [7]:
# created=True: load qa_data, prof_data and stud_data from disk, then preprocess them
pp = Preprocessor(created=True)

In [8]:
# Show the first row of qa_data after preprocessing
pp.qa_data.head(1)

Unnamed: 0,students_id,questions_id,questions_title,questions_body,questions_body_length,professionals_id,answers_id,answers_body,questions_date_added_time,questions_date_added_doy_sin,...,answers_date_added_doy_cos,answers_date_added_dow,answers_date_added_hour_sin,answers_date_added_hour_cos,professionals_last_answer_date_time,professionals_last_answer_date_doy_sin,professionals_last_answer_date_doy_cos,professionals_last_answer_date_dow,professionals_last_answer_date_hour_sin,professionals_last_answer_date_hour_cos
0,8f6f374ffd834d258ab69d376dd998f5,332a511f1569444485cf7a7a556a5e54,Teacher career question,What is a maths teacher? what is a maths teach...,-0.683257,36ff3b3666df400f956f8335cf53e09e,4e5f01128cae4f6d8fd697cec5dca60c,Hi! You are asking a very interesting question...,-0.400852,0.903356,...,-0.474951,0.748932,-0.906308,0.422618,-0.645208,0.880012,-0.474951,0.771827,-0.55557,-0.83147


---

# BatchGenerator class

In [6]:
import numpy as np
import pandas as pd

import keras
from keras.utils import Sequence

from sklearn.utils import shuffle

from utils import TextProcessor

Using TensorFlow backend.


In [7]:
class BatchGenerator(Sequence):
    """
    Generates batches of data to feed into the model

    Every batch stacks batch_size true (question, professional) pairs taken from qa_data
    on top of batch_size negative pairs, which are the same rows with the professional
    replaced by a randomly drawn one.
    """

    def __init__(self, pp: Preprocessor, batch_size: int = 50, shuffle: bool = True):
        """
        Loads required datasets from pp, batch_size and shuffle parameters

        :param pp: Preprocessor object; its qa_data, stud_data, prof_data, tags and
                   tag_questions datasets are read (none of them is modified).
        :param batch_size: Number of positive pairs per batch (the same number of negatives is added).
        :param shuffle: Whether to shuffle qa_data initially and after each epoch.
        """
        self.qa_data = pp.qa_data.merge(pp.stud_data, on='students_id')
        self.prof_data = pp.prof_data

        # Unique professionals present in prof_data; negatives are drawn from this pool
        self.unique_profs = pp.prof_data.professionals_id.unique()

        #----------------------------------------------------------------------------
        #               INTEGRATION WITH NIKITA'S BATCH GENERATOR
        #----------------------------------------------------------------------------

        # Load required datasets (their names are left as they were in Nikita's batch generator)
        tag_que = pp.tag_questions
        que = pp.qa_data

        # Import precomputed embeddings
        with open('tags_embs.pickle', 'rb') as file:
            self.tag_emb = pickle.load(file)
        with open('industries_embs.pickle', 'rb') as file:
            self.ind_emb = pickle.load(file)

        # Preprocess professionals industries and tag names.
        # The processed text is kept in new objects so that pp's datasets are not modified
        # (and are not processed a second time when another generator is built from the same pp).
        self.tp = TextProcessor()

        industries = pp.prof_data['professionals_industry_textual'].apply(self.tp.process)
        tags = pp.tags.assign(
            tags_tag_name=pp.tags['tags_tag_name'].apply(lambda x: self.tp.process(x, allow_stopwords=True)))

        # Map professionals_id to professionals_industry_textual
        self.pro_ind = dict(zip(pp.prof_data['professionals_id'], industries))

        # Create string of tags for every question
        que_tags = (que.merge(tag_que, left_on='questions_id', right_on='tag_questions_question_id')
                       .merge(tags, left_on='tag_questions_tag_id', right_on='tags_tag_id'))
        que_tags = (que_tags[['questions_id', 'tags_tag_name']]
                    .groupby('questions_id', as_index=False)
                    .aggregate(lambda x: ' '.join(x)))

        # Map questions_id to list of tag words
        self.que_tag = {que_id: tag_string.split()
                        for que_id, tag_string in zip(que_tags['questions_id'], que_tags['tags_tag_name'])}

        #----------------------------------------------------------------------------

        # Set batch_size and shuffle parameters
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Initial shuffle
        self.on_epoch_end()

    def __len__(self):
        """
        Denotes the number of batches per epoch
        (rows that do not fill a whole batch are left out)
        """
        return self.qa_data.shape[0] // (self.batch_size)

    def __getitem__(self, index):
        """
        Generates one batch of data

        :param index: Number of the batch within the epoch.
        :return: ([x_que, x_pro], y) where the first batch_size rows are positive pairs (y = 1)
                 and the last batch_size rows are negative pairs (y = 0).
        """
        # Positive batch is selected by index
        positive_batch = self.qa_data.iloc[index * self.batch_size : (index + 1) * self.batch_size, :]

        # Choose random professionals for negative batch
        true_profs = positive_batch.professionals_id.values
        new_profs = np.random.choice(self.unique_profs, self.batch_size)

        # Check if all professionals from negative batch are different from true professionals
        while np.any(true_profs == new_profs):
            # If not (tiny probability), resample random professionals
            new_profs = np.random.choice(self.unique_profs, self.batch_size)

        # Assign random professionals to negative batch.
        # DataFrame.assign returns a new frame, so its result has to be kept; otherwise
        # the negative batch would be an exact copy of the positive one.
        # NOTE: the professionals_last_answer_date_* columns of the negative rows are still
        # the ones of the original (true) pair, since only professionals_id is replaced.
        negative_batch = positive_batch.assign(professionals_id=new_profs)

        # Concatenate positive and negative batches into a single batch
        single_batch = pd.concat([positive_batch, negative_batch])

        # Add professionals data to single_batch
        single_batch = single_batch.merge(self.prof_data, on='professionals_id')

        # Select statistical question features
        x_que_features = single_batch[[
            'students_location', 'students_state', 'students_questions_asked',
            'students_average_question_age', 'students_average_question_body_length',
            'students_average_answer_body_length',

            'students_date_joined_time', 'students_date_joined_doy_sin',
            'students_date_joined_doy_cos', 'students_date_joined_dow',

            'questions_body_length',

            'questions_date_added_time', 'questions_date_added_doy_sin',
            'questions_date_added_doy_cos', 'questions_date_added_dow',
            'questions_date_added_hour_sin', 'questions_date_added_hour_cos',
        ]].values

        # Select statistical professional features
        x_pro_features = single_batch[[
            'professionals_industry', 'professionals_location', 'professionals_state',
            'professionals_questions_answered', 'professionals_average_question_age',
            'professionals_average_question_body_length', 'professionals_average_answer_body_length',
            'professionals_email_activated',

            'professionals_date_joined_time', 'professionals_date_joined_doy_sin',
            'professionals_date_joined_doy_cos', 'professionals_date_joined_dow',

            'professionals_last_answer_date_time', 'professionals_last_answer_date_doy_sin',
            'professionals_last_answer_date_doy_cos', 'professionals_last_answer_date_dow',
            'professionals_last_answer_date_hour_sin', 'professionals_last_answer_date_hour_cos',
        ]].values

        #----------------------------------------------------------------------------
        #               INTEGRATION WITH NIKITA'S BATCH GENERATOR
        #----------------------------------------------------------------------------

        # Extract embeddings from batch questions and professionals
        x_que_embeddings, x_pro_embeddings = self.__convert(
            single_batch[['questions_id', 'professionals_id']].values)

        # Stack statistical features and embeddings
        x_que = np.hstack((x_que_features, x_que_embeddings))
        x_pro = np.hstack((x_pro_features, x_pro_embeddings))

        #----------------------------------------------------------------------------

        # Create target array: ones for the positive half, zeros for the negative half
        y = np.concatenate([np.ones(self.batch_size), np.zeros(self.batch_size)])

        return [x_que, x_pro], y

    def on_epoch_end(self):
        """
        Shuffle qa_data after each epoch
        """
        if self.shuffle:
            self.qa_data = shuffle(self.qa_data)

    def __convert(self, batch):
        """
        Computes embeddings for questions based on average of precomputed tag embeddings
        and embeddings for professionals based on precomputed industry embeddings

        :param batch: Iterable of (questions_id, professionals_id) pairs.
        :return: Tuple (question embeddings, professional embeddings), one row per pair.
        """
        x_que, x_pro = [], []
        for que, pro in batch:
            # Unknown tags and questions without tags fall back to a zero vector of length 10
            tmp = [self.tag_emb.get(tag, np.zeros(10)) for tag in self.que_tag.get(que, [])]
            if len(tmp) == 0:
                tmp.append(np.zeros(10))
            x_que.append(np.vstack(tmp).mean(axis = 0))
            x_pro.append(self.ind_emb.get(self.pro_ind[pro], np.zeros(10)))

        return np.vstack(x_que), np.vstack(x_pro)

In [9]:
generator = BatchGenerator(pp)

In [10]:
generator.__getitem__(0)[0][1].shape

(100, 28)

In [11]:
%%timeit
generator.__getitem__(0)

17.5 ms ± 161 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [249]:
%lprun -f generator.__getitem__ generator.__getitem__(0)

In [None]:
%load_ext line_profiler

---

## Additional code

In [None]:
# Write the column names of the question and professional feature frames to JSON
# (the file name is prefixed with _)
with open('_feature_list.json', 'w') as out_file:
    feature_names = {
        'ques': list(x_ques.columns),
        'prof': list(x_prof.columns),
    }
    json.dump(feature_names, out_file, indent=2)

In [None]:
# Change last_answer_date feature for incorrect professionals
# NOTE(review): this fragment references `self`, `new_profs` and `negative_batch`, so it
# presumably belongs inside BatchGenerator (dict creation in __init__, loop in __getitem__);
# it cannot run as a standalone cell.

# For every professional: the rows of qa_data they appear in, sorted by last answer date
self.prof_ques_dict = {prof_id:df_slice.sort_values(by='professionals_last_answer_date_time')
                       for prof_id, df_slice in self.qa_data.groupby('professionals_id')}

for i, prof in enumerate(new_profs):
    prof_ques = self.prof_ques_dict[prof]
    # Position at which the i-th negative row's last answer date would be inserted
    # among the sorted last answer dates of the sampled professional
    index = np.searchsorted(np.array(prof_ques.professionals_last_answer_date_time),
                            negative_batch.professionals_last_answer_date_time.iloc[i])
    # Clamp so that index-1 below never goes before the professional's first row
    if index < 1:
        index = 1

    # Copy columns 13 onwards from the chosen row of the sampled professional
    # NOTE(review): 13 is a positional column offset — confirm which columns it covers
    negative_batch.iloc[i, 13:] = prof_ques.iloc[index-1, 13:]

# Preprocessing for other version of BatchGenerator

In [9]:
qa_data = pp.qa_data.merge(pp.stud_data, on='students_id')
qa_data.shape

(49722, 36)

In [10]:
# Statistical student and question features
# (professionals_last_answer_date columns are not included here)
que_feature_columns = [
    # student features
    'students_location', 'students_state', 'students_questions_asked',
    'students_average_question_age', 'students_average_question_body_length',
    'students_average_answer_body_length',
    # date the student joined
    'students_date_joined_time', 'students_date_joined_doy_sin',
    'students_date_joined_doy_cos', 'students_date_joined_dow',
    # question features
    'questions_body_length',
    # date the question was added
    'questions_date_added_time', 'questions_date_added_doy_sin',
    'questions_date_added_doy_cos', 'questions_date_added_dow',
    'questions_date_added_hour_sin', 'questions_date_added_hour_cos',
]
que_features = qa_data[que_feature_columns]

In [11]:
que_features.shape

(49722, 17)

In [13]:
que = pp.qa_data.questions_id

In [14]:
# Map every questions_id to the feature vector of its row.
# Keys are taken from qa_data, the frame que_features was selected from, and are paired with
# the rows positionally, so the mapping does not depend on another frame sharing the same
# 0..n-1 index and avoids one label lookup per row.
que_feature_dict = dict(zip(qa_data['questions_id'], que_features.values))

In [15]:
# Statistical professional features (without the last answer date columns)
pro_feature_columns = [
    # professional features
    'professionals_industry', 'professionals_location', 'professionals_state',
    'professionals_questions_answered', 'professionals_average_question_age',
    'professionals_average_question_body_length', 'professionals_average_answer_body_length',
    'professionals_email_activated',
    # date the professional joined
    'professionals_date_joined_time', 'professionals_date_joined_doy_sin',
    'professionals_date_joined_doy_cos', 'professionals_date_joined_dow',
]
pro_features = pp.prof_data[pro_feature_columns]

In [16]:
pro = pp.prof_data.professionals_id

In [17]:
# Map every professionals_id to the feature vector of its row.
# Ids and rows are paired positionally, which does not require prof_data to carry
# a 0..n-1 index and avoids one label lookup per row.
pro_feature_dict = dict(zip(pro, pro_features.values))

In [18]:
import pickle

# Save both feature dictionaries to disk, one pickle file per dictionary
for file_name, feature_dict in (('que_feature_dict.pickle', que_feature_dict),
                                ('pro_feature_dict.pickle', pro_feature_dict)):
    with open(file_name, 'wb') as out_file:
        pickle.dump(feature_dict, out_file)

## Last answer date features

In [32]:
pp.qa_data.head(1)

Unnamed: 0,students_id,questions_id,questions_title,questions_body,questions_body_length,professionals_id,answers_id,answers_body,questions_date_added_time,questions_date_added_doy_sin,...,answers_date_added_doy_cos,answers_date_added_dow,answers_date_added_hour_sin,answers_date_added_hour_cos,professionals_last_answer_date_time,professionals_last_answer_date_doy_sin,professionals_last_answer_date_doy_cos,professionals_last_answer_date_dow,professionals_last_answer_date_hour_sin,professionals_last_answer_date_hour_cos
0,8f6f374ffd834d258ab69d376dd998f5,332a511f1569444485cf7a7a556a5e54,Teacher career question,What is a maths teacher? what is a maths teach...,-0.683257,36ff3b3666df400f956f8335cf53e09e,4e5f01128cae4f6d8fd697cec5dca60c,Hi! You are asking a very interesting question...,-0.400852,0.903356,...,-0.474951,0.748932,-0.906308,0.422618,-0.645208,0.880012,-0.474951,0.771827,-0.55557,-0.83147


In [20]:
# Two-column frames keyed by answers_id: the professional's last answer date
# and the date the answer itself was added
answer_key = 'answers_id'
ans_last_answer_date = pp.qa_data[[answer_key, 'professionals_last_answer_date_time']]
ans_date_added = pp.qa_data[[answer_key, 'answers_date_added_time']]
ans_date_added.head()

Unnamed: 0,answers_id,answers_date_added_time
0,4e5f01128cae4f6d8fd697cec5dca60c,-0.699372
1,334f6735d31e45589e43da5ae7056e50,0.747848
2,e5d66281cc314675b95ddbb799b75473,0.747848
3,e5c0da2a29ff414fa76b9da6e86337fc,-0.560503
4,f3519ab99a1a4a13a8a9ecb814287d2a,-0.500911


In [21]:
# answers_id -> date the answer was added
ans_date_added_dict = dict(zip(ans_date_added['answers_id'],
                               ans_date_added['answers_date_added_time']))
# answers_id -> last answer date of the professional recorded for that answer
ans_last_answer_date_dict = dict(zip(ans_last_answer_date['answers_id'],
                                     ans_last_answer_date['professionals_last_answer_date_time']))

In [46]:
# For every professional: ascending array of the last answer dates recorded in qa_data
# (used to change the last_answer_date feature for incorrect professionals)
last_dates_by_pro = pp.qa_data.groupby('professionals_id')['professionals_last_answer_date_time']
pro_answer_dates_dict = {pro_id: np.sort(dates.values) for pro_id, dates in last_dates_by_pro}

In [49]:
# For every professional: the latest date on which one of their answers was added
answer_dates_by_pro = pp.qa_data.groupby('professionals_id')['answers_date_added_time']
pro_last_anwer_date_dict = {pro_id: dates.max() for pro_id, dates in answer_dates_by_pro}

In [54]:
(pp.qa_data[pp.qa_data.professionals_id == '003cc21be89d4e42bc4424131a378e86']
 [['answers_date_added_time', 'professionals_last_answer_date_time']])

Unnamed: 0,answers_date_added_time,professionals_last_answer_date_time
34620,0.448552,0.488031
34793,0.448552,0.488031
35243,0.448552,1.179823
35255,0.448552,0.488031


In [57]:
(creator.qa_data[creator.qa_data.professionals_id == '003cc21be89d4e42bc4424131a378e86']
 [['answers_date_added', 'professionals_last_answer_date']]).sort_values(by='answers_date_added')

Unnamed: 0,answers_date_added,professionals_last_answer_date
35243,2017-10-19 18:16:16,2018-09-12 16:42:17
34793,2017-10-19 18:34:17,2017-10-19 18:16:16
34620,2017-10-19 18:47:15,2017-10-19 18:34:17
35255,2017-10-19 18:53:27,2017-10-19 18:47:15


In [47]:
pro_answer_dates_dict

{'00009a0f9bda43eba47104e9ac62aff5': array([-0.74198844, -0.74198844]),
 '000d4635e5da41e3bfd83677ee11dda4': array([-0.74198844, -0.64925134, -0.63843934]),
 '00271cc10e0245fba4a35e76e669c281': array([-0.64925134,  1.17982321,  1.18087804,  1.1831635 ,  1.1831635 ,
         1.18500946,  1.18500946,  1.18500946,  1.18711912,  1.18720702,
         1.19239326,  1.19546985,  1.19546985,  1.19546985,  1.19775531,
         1.19775531,  1.20004078,  1.20004078,  1.20004078,  1.20188673,
         1.20188673,  1.20188673,  1.20399639,  1.29058032]),
 '003cc21be89d4e42bc4424131a378e86': array([0.48803079, 0.48803079, 0.48803079, 1.17982321]),
 '0046ab8089c04b3a8df3f8c28621a818': array([0.14626599, 0.14626599, 0.14626599, 0.14626599, 0.48803079]),
 '004cb439b2fb4abcbf823380a1170e83': array([0.14600229, 1.13402602, 1.13402602]),
 '0053becb71d94164b014a5a7d9673210': array([-0.23347267,  1.13402602]),
 '005cbd7ae54949db98efaa35641a3f17': array([-0.23347267]),
 '00680f924e8f49d4962876df5b38eb94': arr

In [50]:
pro_last_anwer_date_dict

{'00009a0f9bda43eba47104e9ac62aff5': -0.0020843733816376065,
 '000d4635e5da41e3bfd83677ee11dda4': -0.6869018378598925,
 '00271cc10e0245fba4a35e76e669c281': 1.2854223424833042,
 '003cc21be89d4e42bc4424131a378e86': 0.4485518847863769,
 '0046ab8089c04b3a8df3f8c28621a818': 0.10231287796710671,
 '004cb439b2fb4abcbf823380a1170e83': 1.1031725333882676,
 '0053becb71d94164b014a5a7d9673210': -0.27759691469674164,
 '005cbd7ae54949db98efaa35641a3f17': 1.1780855780846828,
 '00680f924e8f49d4962876df5b38eb94': -0.5848205664898392,
 '007e521571a248378a7f335fccb67658': -0.3108223197761869,
 '0082a7f7a9834612999dfa4c338da164': 0.09518678572480319,
 '00894b0c1d2a43c0bda8c2ccf9519a56': 1.1314987500510738,
 '0089d879abb94898af27cfd7b2998f19': 0.6560993213413161,
 '008a26bb0e2842efaa1ba6b41723f574': 0.7650394564944158,
 '009e7c8c8d9d41f79f809e64cc6e6fd5': 1.1421878884144403,
 '009edb66cac04815a2e03a07fef38255': 0.4559452054877602,
 '00a08850b6f943929ae9fe00899ddebd': 1.2844425048000119,
 '00a0eaf104ad4d14a9

In [29]:
# NOTE(review): unfinished scratch cell — it cannot run as written:
#   * `.append[...]` subscripts the method instead of calling it;
#   * `pro_answer_date_dict` and `value` are not assigned in any visible cell
#     (the dictionary built above is named `pro_answer_dates_dict`).
# Presumably the intent was to extend each professional's date collection with the
# date stored in pro_last_anwer_date_dict — confirm before reusing.
for pro in pro_answer_date_dict.keys():
    pro_answer_date_dict[pro].append[pro_last_anwer_date_dict[pro]] + value

In [42]:
pp.qa_data[pp.qa_data['professionals_id'] == '0046ab8089c04b3a8df3f8c28621a818'].answers_date_added_time.iloc[0]

0.10222380181408014

In [30]:
pro_last_answer_dates_dict

{'00009a0f9bda43eba47104e9ac62aff5': array([-0.74198844, -0.74198844]),
 '000d4635e5da41e3bfd83677ee11dda4': array([-0.74198844, -0.64925134, -0.63843934]),
 '00271cc10e0245fba4a35e76e669c281': array([-0.64925134,  1.17982321,  1.18087804,  1.1831635 ,  1.1831635 ,
         1.18500946,  1.18500946,  1.18500946,  1.18711912,  1.18720702,
         1.19239326,  1.19546985,  1.19546985,  1.19546985,  1.19775531,
         1.19775531,  1.20004078,  1.20004078,  1.20004078,  1.20188673,
         1.20188673,  1.20188673,  1.20399639,  1.29058032]),
 '003cc21be89d4e42bc4424131a378e86': array([0.48803079, 0.48803079, 0.48803079, 1.17982321]),
 '0046ab8089c04b3a8df3f8c28621a818': array([0.14626599, 0.14626599, 0.14626599, 0.14626599, 0.48803079]),
 '004cb439b2fb4abcbf823380a1170e83': array([0.14600229, 1.13402602, 1.13402602]),
 '0053becb71d94164b014a5a7d9673210': array([-0.23347267,  1.13402602]),
 '005cbd7ae54949db98efaa35641a3f17': array([-0.23347267]),
 '00680f924e8f49d4962876df5b38eb94': arr

In [25]:
import pickle
# Save the date lookup dictionaries to disk, one pickle file per dictionary
with open('ans_date_added_dict.pickle', 'wb') as f:
    pickle.dump(ans_date_added_dict, f)
with open('ans_last_answer_date_dict.pickle', 'wb') as f:
    pickle.dump(ans_last_answer_date_dict, f)
# NOTE(review): no visible cell assigns `pro_last_answer_dates_dict`; the cell that builds
# this kind of mapping names it `pro_answer_dates_dict` — confirm which object is meant
with open('pro_last_answer_dates_dict.pickle', 'wb') as f:
    pickle.dump(pro_last_answer_dates_dict, f)

## Precompute 10 professionals from last answer date distribution for every answer

In [58]:
# Restore the previously pickled lookup dictionaries:
# answers_id -> date the answer was added
with open('ans_date_added_dict.pickle', 'rb') as f:
    ans_date_added_dict = pickle.load(f)
# professionals_id -> array of last answer dates
with open('pro_last_answer_dates_dict.pickle', 'rb') as f:
    pro_last_answer_dates_dict = pickle.load(f)

In [46]:
# Lookup structures built from qa_data:
question_ids = pp.qa_data['questions_id']
professional_ids = pp.qa_data['professionals_id']

# answers_id -> (questions_id, professionals_id)
ans_que_pro_dict = dict(zip(pp.qa_data['answers_id'], zip(question_ids, professional_ids)))
# set of all (questions_id, professionals_id) pairs present in qa_data
que_pro_set = set(zip(question_ids, professional_ids))
# professionals_id of every row of prof_data, in row order
pro_list = pp.prof_data['professionals_id'].tolist()

In [8]:
# Answers with the date they were added, ordered from the earliest to the latest
ans_date_added_df = pp.qa_data[['answers_id', 'answers_date_added_time']]
ans_date_added_df = ans_date_added_df.sort_values(by='answers_date_added_time')

In [9]:
# Answer ids in the order of ans_date_added_df, saved to disk for the batch generator
ans_list = ans_date_added_df['answers_id'].tolist()
with open('ans_list.pickle', 'wb') as out_file:
    pickle.dump(ans_list, out_file)

In [12]:
np.unique(np.array(ans_list)).size

49722

In [105]:
inf = 1 / (np.float64(1e9))
np.log1p(inf)

9.999999995e-10

In [135]:
# Precompute, for every answer, up to 10 "negative" professionals sampled with a probability
# that grows as the professional's tracked last answer date gets closer to the answer's date.
import time
start = time.time()
# Sentinel used as -inf / +inf for dates
# (presumably far outside the range of the scaled date values — TODO confirm)
inf = 1e9

# Per-professional state, indexed like pro_list:
# pro_cur_date  - last answer date currently tracked for the professional (-inf until the first advance)
# pro_next_date - date that has to be passed for the professional to advance
# pro_cur_idx   - index into the professional's array of last answer dates (-1 until the first advance)
pro_cur_date = np.ones(len(pro_list)) * -inf
pro_next_date = np.zeros(len(pro_list))
pro_cur_idx = np.ones(len(pro_list), dtype=int) * -1
pro_last_answer_dates_list = []

# answers_id -> list of sampled professionals_id
ans_sampled_profs_dict = {}

# NOTE(review): requires every professional in pro_list to be a key of pro_last_answer_dates_dict
# with a non-empty array
for i, pro in enumerate(pro_list):
    pro_next_date[i] = pro_last_answer_dates_dict[pro][0]
    pro_last_answer_dates_list.append(pro_last_answer_dates_dict[pro])

# Answers are visited in the row order of ans_date_added_df
for _, row in ans_date_added_df.iterrows():
    ans = row['answers_id']
    ans_date_added = row['answers_date_added_time']

    # Professionals whose next date lies strictly before the current answer date
    changed_pros_idx = np.nonzero(pro_next_date < ans_date_added)[0]

    # Advance each of them by exactly one step (at most one step per processed answer)
    for i in changed_pros_idx:
        pro_cur_idx[i] += 1
        idx = pro_cur_idx[i]

        pro_cur_date[i] = pro_next_date[i]

        # NOTE(review): on the first advance idx is 0, so pro_next_date is re-assigned the
        # professional's first date again; the move to the second date happens on a later answer
        if idx < pro_last_answer_dates_list[i].size:
            pro_next_date[i] = pro_last_answer_dates_list[i][idx]
        else:
            # No more dates left for this professional
            pro_next_date[i] = inf

    # Subtract last answer dates from the actual date the answer was added
    distances = ans_date_added - np.array(pro_cur_date)

    # Apply log1p transformation to 1 / distances and normalize so the entries sum to one
    distances = np.log1p(1 / distances)
    distances /= distances.sum()

    # Sample 50 professionals from distribution of distances; the set keeps the unique ones
    sampled_pro_set = set(np.random.choice(pro_list, 50, p=distances))

    # Drop professionals that form a (question, professional) pair present in que_pro_set
    que, _ = ans_que_pro_dict[ans]
    pros = sampled_pro_set.copy()
    for pro in sampled_pro_set:
        if (que, pro) in que_pro_set:
            pros.remove(pro)

    # Keep 10 or less of the remaining professionals
    pros = list(pros)[: min(10, len(pros))]
    ans_sampled_profs_dict[ans] = pros

# Report elapsed wall-clock time in seconds
total = time.time() - start
print(total)

96.62652683258057


In [162]:
# Show a single entry of the dictionary (nothing is printed when it is empty)
first_entry = next(iter(ans_sampled_profs_dict.items()), None)
if first_entry is not None:
    print(first_entry[0], ':', first_entry[1])

bcb3d96f74104351938c362893b77e33 : ['0c70d8f19f074a0581c1c05c9922b03b', '977bfe665d674798948d6fa481ced3ac']


In [164]:
# Print every answer for which no professionals were sampled
answers_without_samples = [ans for ans, profs in ans_sampled_profs_dict.items() if len(profs) == 0]
for ans in answers_without_samples:
    print(ans)

In [150]:
import pickle
# Save the answers_id -> sampled professionals mapping to disk
with open('ans_sampled_profs_dict.pickle', 'wb') as f:
    pickle.dump(ans_sampled_profs_dict, f)

In [13]:
import pickle
# Restore the answers_id -> sampled professionals mapping from disk
with open('ans_sampled_profs_dict.pickle', 'rb') as f:
    ans_sampled_profs_dict = pickle.load(f)

# Other version of BatchGenerator

In [177]:
import pickle
import random

import keras
import numpy as np
import pandas as pd

from utils import TextProcessor


class BatchGenerator(keras.utils.Sequence):
    """
    Generates batches made of pos_size true (question, professional) pairs followed by
    neg_size negative pairs, using statistical features, embeddings and negative samples
    that were precomputed and pickled beforehand.
    """

    def __init__(self, pos_size, neg_size, data_path='../../data/'):
        """
        Loads the raw datasets from data_path and the precomputed pickles from the working directory

        :param pos_size: Number of positive pairs per batch.
        :param neg_size: Number of negative pairs per batch.
        :param data_path: Directory with the initial csv datasets.
        """
        self.pos_size = pos_size
        self.neg_size = neg_size

        que = pd.read_csv(data_path + 'questions.csv')
        tag_que = pd.read_csv(data_path + 'tag_questions.csv')
        tags = pd.read_csv(data_path + 'tags.csv')
        pro = pd.read_csv(data_path + 'professionals.csv')
        stu = pd.read_csv(data_path + 'students.csv')
        ans = pd.read_csv(data_path + 'answers.csv')

        # Preprocess professionals industries and tag names
        self.tp = TextProcessor()
        pro['professionals_industry'] = pro['professionals_industry'].apply(self.tp.process)
        tags['tags_tag_name'] = tags['tags_tag_name'].apply(lambda x: self.tp.process(x, allow_stopwords=True))

        # Map professionals_id to processed industry
        self.pro_ind = dict(zip(pro['professionals_id'], pro['professionals_industry']))

        # Map questions_id to list of tag words
        que_tags = (que.merge(tag_que, left_on = 'questions_id', right_on = 'tag_questions_question_id')
                       .merge(tags, left_on = 'tag_questions_tag_id', right_on = 'tags_tag_id'))
        que_tags = (que_tags[['questions_id', 'tags_tag_name']]
                    .groupby(by = 'questions_id', as_index = False)
                    .aggregate(lambda x: ' '.join(x)))
        self.que_tag = {que_id: tag_string.split()
                        for que_id, tag_string in zip(que_tags['questions_id'], que_tags['tags_tag_name'])}

        # Join answers with their question, answering professional and asking student
        ans_que = ans.merge(que, left_on = 'answers_question_id', right_on = 'questions_id')
        ans_que_pro = ans_que.merge(pro, left_on = 'answers_author_id', right_on = 'professionals_id')
        ans_que_pro = ans_que_pro.merge(stu, left_on = 'questions_author_id', right_on = 'students_id')

        self.ques = list(set(ans_que_pro['questions_id']))
        self.pros = list(set(ans_que_pro['professionals_id']))

        # (question, professional) pair of every answer, in row order
        que_pro_pairs = list(zip(ans_que_pro['questions_id'], ans_que_pro['professionals_id']))
        self.que_pro_set = set(que_pro_pairs)
        self.que_pro_list = list(self.que_pro_set)

        # Import precomputed embeddings
        with open('tags_embs.pickle', 'rb') as file:
            self.tag_emb = pickle.load(file)
        with open('industries_embs.pickle', 'rb') as file:
            self.ind_emb = pickle.load(file)

        #------------------------------------------------------------------
        #                    THE CODE I ADDED
        #------------------------------------------------------------------

        # Add answer list and a dictionary mapping answer to (question, professional) pair
        self.ans_que_pro_dict = dict(zip(ans_que_pro['answers_id'], que_pro_pairs))

        # Load que and pro statistical features
        with open('que_feature_dict.pickle', 'rb') as f:
            self.que_feature_dict = pickle.load(f)
        with open('pro_feature_dict.pickle', 'rb') as f:
            self.pro_feature_dict = pickle.load(f)

        # Load pro last answer dates dict and que answer date dict
        with open('pro_last_answer_dates_dict.pickle', 'rb') as f:
            self.pro_last_answer_dates_dict = pickle.load(f)
        with open('ans_date_added_dict.pickle', 'rb') as f:
            self.ans_date_added_dict = pickle.load(f)
        with open('ans_last_answer_date_dict.pickle', 'rb') as f:
            self.ans_last_answer_date_dict = pickle.load(f)
        with open('ans_sampled_profs_dict.pickle', 'rb') as f:
            self.ans_sampled_profs_dict = pickle.load(f)
        with open('ans_list.pickle', 'rb') as f:
            self.ans_list = pickle.load(f)
        #------------------------------------------------------------------

    def __len__(self):
        """
        Denotes the number of batches per epoch

        Positive batches are slices of ans_list (see __getitem__), so the number of full
        batches is derived from ans_list rather than from the set of unique pairs.
        """
        return len(self.ans_list) // self.pos_size

    def __convert(self, pairs):
        """
        Computes embeddings for questions based on average of precomputed tag embeddings
        and embeddings for professionals based on precomputed industry embeddings

        :param pairs: Iterable of (questions_id, professionals_id) pairs.
        :return: Tuple (question embeddings, professional embeddings), one row per pair.
        """
        x_que, x_pro = [], []
        for que, pro in pairs:
            # Unknown tags and questions without tags fall back to a zero vector of length 10
            tmp = [self.tag_emb.get(tag, np.zeros(10)) for tag in self.que_tag.get(que, [])]
            if len(tmp) == 0:
                tmp.append(np.zeros(10))

            x_que.append(np.vstack(tmp).mean(axis = 0))
            x_pro.append(self.ind_emb.get(self.pro_ind[pro], np.zeros(10)))

        return np.vstack(x_que), np.vstack(x_pro)

    def __negative_que_last_answer_date(self, ans, pro):
        """
        Returns the last answer date of professional pro as of the moment answer ans was added

        :param ans: answers_id whose date of addition is used as the reference date.
        :param pro: professionals_id of the (negative) professional.
        :raises ValueError: If none of the professional's dates precedes the answer date.
        """
        ans_date = self.ans_date_added_dict[ans]
        pro_dates = self.pro_last_answer_dates_dict[pro]

        # pro_dates is searched as a sorted array; index is the number of dates before ans_date
        index = np.searchsorted(pro_dates, ans_date)
        if index == 0:
            raise ValueError("Index cannot be zero.")
        return pro_dates[index-1]

    def __getitem__(self, index):
        """
        Generates one batch of data

        :param index: Number of the batch within the epoch.
        :return: ([x_que, x_pro], y) where the first pos_size rows are positive pairs (y = 1)
                 and the last neg_size rows are negative pairs (y = 0).
        """
        pos_pairs = []
        neg_pairs = []

        pos_last_dates = []
        neg_last_dates = []

        pos_que_features, pos_pro_features = [], []
        neg_que_features, neg_pro_features = [], []

        # Positive pairs come from the slice of ans_list selected by index
        pos_ans = self.ans_list[self.pos_size * index: self.pos_size * (index + 1)]
        for ans in pos_ans:
            que, pro = self.ans_que_pro_dict[ans]
            pos_pairs.append((que, pro))
            pos_que_features.append(self.que_feature_dict[que])
            pos_pro_features.append(self.pro_feature_dict[pro])
            pos_last_dates.append(self.ans_last_answer_date_dict[ans])

        # Negative pairs: question of a random answer paired with one of the
        # professionals precomputed for that answer
        for i in range(self.neg_size):
            ans = random.choice(self.ans_list)
            que, _ = self.ans_que_pro_dict[ans]
            pro = random.choice(self.ans_sampled_profs_dict[ans])

            # Add que and pro data to all required lists
            last_date = self.__negative_que_last_answer_date(ans, pro)
            neg_pairs.append((que, pro))
            neg_que_features.append(self.que_feature_dict[que])
            neg_pro_features.append(self.pro_feature_dict[pro])
            neg_last_dates.append(last_date)

        pos_que_embeddings, pos_pro_embeddings = self.__convert(pos_pairs)
        neg_que_embeddings, neg_pro_embeddings = self.__convert(neg_pairs)

        # Question side: statistical features followed by embeddings
        x_pos_que = np.hstack([np.array(pos_que_features), pos_que_embeddings])
        x_neg_que = np.hstack([np.array(neg_que_features), neg_que_embeddings])

        # Professional side: statistical features, last answer date as one column, embeddings
        x_pos_pro = np.hstack([np.array(pos_pro_features), np.array(pos_last_dates)[:, np.newaxis], pos_pro_embeddings])
        x_neg_pro = np.hstack([np.array(neg_pro_features), np.array(neg_last_dates)[:, np.newaxis], neg_pro_embeddings])

        return [np.vstack([x_pos_que, x_neg_que]), np.vstack([x_pos_pro, x_neg_pro])], \
                np.vstack([np.ones((self.pos_size, 1)), np.zeros((self.neg_size, 1))])

    def on_epoch_end(self):
        """
        Shuffle ans_list after each epoch
        """
        np.random.shuffle(self.ans_list)

In [178]:
generator = BatchGenerator(64, 64)

In [182]:
generator[0][0][1].shape

(128, 23)

In [184]:
%%timeit
generator[0]

4.31 ms ± 71.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [185]:
%lprun -f generator.__getitem__ generator.__getitem__(0)

In [23]:
%load_ext line_profiler