In [4]:
import numpy as np
import pandas as pd
import os, pickle
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Create class for data preprocessing required for training

In [2]:
class DatasetCreator:
    """
    Class that imports initial datasets and creates additional datasets for convenience
    """
    
    def __init__(self, data_path='../../data/', created=False):
        # Add data_path to class properties
        self.data_path = data_path
        
        # Import all initial datasets
        self.emails = pd.read_csv(data_path + 'emails.csv')
        self.questions = pd.read_csv(data_path + 'questions.csv')
        self.professionals = pd.read_csv(data_path + 'professionals.csv')
        self.comments = pd.read_csv(data_path + 'comments.csv')
        self.tag_users = pd.read_csv(data_path + 'tag_users.csv')
        self.group_memberships = pd.read_csv(data_path + 'group_memberships.csv')
        self.tags = pd.read_csv(data_path + 'tags.csv')
        self.students = pd.read_csv(data_path + 'students.csv')
        self.groups = pd.read_csv(data_path + 'groups.csv')
        self.tag_questions = pd.read_csv(data_path + 'tag_questions.csv')
        self.matches = pd.read_csv(data_path + 'matches.csv')
        self.answers = pd.read_csv(data_path + 'answers.csv')
        self.school_memberships = pd.read_csv(data_path + 'school_memberships.csv')
        
        if created:
            # Load additional datasets from disk
            self.qa_data = pd.read_csv(self.data_path + 'qa_data.csv')
            self.prof_data = pd.read_csv(self.data_path + 'prof_data.csv')
            self.stud_data = pd.read_csv(self.data_path + 'stud_data.csv')
        else:
            # Create additional datasets and save them to disk
            self.additional_datasets_creation()
    
    
    def additional_datasets_creation(self):
        """
        Creates additional datasets for futher processing and save them to disk.
        """
        # Create temporary dataset for further processing
        all_data = self.all_data_creation()
        
        # Create question-answer pairs dataset called qa_data
        self.qa_data = self.qa_data_creation(all_data)
        
        # Create dataset called prof_data compirising data of professionals
        # who answered at least one question
        self.prof_data = self.prof_data_creation(all_data)
        
        # Create dataset called stud_data compirising data of students
        # who asked at least one answered question
        self.stud_data = self.stud_data_creation(all_data)
        
        # Save new datasets to disc
        self.qa_data.to_csv(self.data_path + 'qa_data.csv', index=False)
        self.prof_data.to_csv(self.data_path + 'prof_data.csv', index=False)
        self.stud_data.to_csv(self.data_path + 'stud_data.csv', index=False)
    
    
    def all_data_creation(self):
        """
        Merges questions, answers, professionals and students datasets
        to get temporary dataset for further processing
        """
        # Merge questions with answers and delete not answered questions
        all_data = self.questions.merge(self.answers, how='right', left_on='questions_id', right_on='answers_question_id')
        
        # Merge with professionals and students (students asked, professionals answered)
        # Maybe change this in the future by taking care of professional who change status to students and vise versa
        all_data = all_data.merge(self.professionals, how='inner', left_on='answers_author_id', right_on='professionals_id')
        all_data = all_data.merge(self.students, how='inner', left_on='questions_author_id', right_on='students_id')
        
        # Transform dates from string representation to datetime object
        all_data.answers_date_added = pd.to_datetime(all_data.answers_date_added)
        all_data.questions_date_added = pd.to_datetime(all_data.questions_date_added)
        
        # Add questions_age feature, which represents amount of time
        # from question emergence to a particular answer to that question
        all_data['questions_age'] = all_data.answers_date_added - all_data.questions_date_added
        
        return all_data
    
    
    def qa_data_creation(self, all_data):
        """
        Creates question-answer pairs dataset called qa_data_data
        """
        # Temporary qa_data representation
        qa_data = all_data
        
        # Select only unique professionals
        temp = qa_data[['professionals_id', 'answers_date_added', 'answers_id']]
        prof_unique = pd.DataFrame(temp.professionals_id.unique(), columns=['professionals_id'])
        prof_unique = prof_unique.merge(self.professionals, how='left', on='professionals_id')
        
        # For every professional add a "dummy" question with answer date being professional's registration date
        prof_unique['answers_id'] = list(None for _ in range(prof_unique.shape[0]))
        prof_unique['answers_date_added'] = prof_unique['professionals_date_joined']
        prof_unique = prof_unique[['professionals_id', 'answers_date_added', 'answers_id']]
        
        # Add "dummy" questions to all questions
        temp = pd.concat([temp, prof_unique])
        
        # Sort by professionals and answer dates
        temp = temp.sort_values(by=['professionals_id', 'answers_date_added']).reset_index(drop=True)
        
        # Get the sorted representation of the answers_date_added and shift the index down by one
        # so that current question is aligned with previous question answer date
        last_answer_date = pd.DataFrame({'professionals_last_answer_date': temp.answers_date_added})
        last_answer_date.index += 1
        
        # Add the professionals_last_answer_date column to temp
        temp = temp.merge(last_answer_date, left_index=True, right_index=True)
        temp.dropna(subset=['answers_id'], inplace=True)
        temp.drop(columns=['professionals_id', 'answers_date_added'], inplace=True)
        
        # Add professionals_last_answer_date column to qa_data 
        qa_data = qa_data.merge(temp, on='answers_id')
        
        # Transform dates from string representation to datetime object
        qa_data.professionals_last_answer_date = pd.to_datetime(qa_data.professionals_last_answer_date)
        
        # Final qa_data representation
        qa_data = qa_data[[
            'students_id', 'questions_id', 'questions_title', 'questions_body',
            'questions_date_added', 'professionals_id', 'answers_id', 'answers_body',
            'professionals_last_answer_date'
        ]]
        
        return qa_data
    
    
    def prof_data_creation(self, all_data):
        """
        Creates dataset called prof_data compirising data of professionals who answered at least one question
        """
        # Select only professionals who answered at least one question
        active_professionals = pd.DataFrame({'professionals_id': all_data.professionals_id.unique()})
        prof_data = self.professionals.merge(active_professionals, how='right', on='professionals_id')
        
        # Extract state or country from location
        prof_data['professionals_state'] = prof_data['professionals_location'].apply(lambda loc: str(loc).split(', ')[-1])
        
        # Transform dates from string representation to datetime object
        prof_data.professionals_date_joined = pd.to_datetime(prof_data.professionals_date_joined)
        
        # Count the number of answered questions by each professional
        number_answered = all_data[['questions_id', 'professionals_id']].groupby('professionals_id').count()
        number_answered = number_answered.rename({'questions_id': 'professionals_questions_answered'}, axis=1)
        
        # Add professionals_questions_answered feature to prof_data
        prof_data = prof_data.merge(number_answered, left_on='professionals_id', right_index=True)
        
        # Get average question age for every professional among questions he answered
        average_question_age = (
            all_data.groupby('professionals_id')
            .questions_age.mean(numeric_only=False)
        )
        average_question_age = pd.DataFrame({'professionals_average_question_age': average_question_age})
        
        # Add professionals_average_question_age feature to prof_data
        prof_data = prof_data.merge(average_question_age, on='professionals_id')
        
        return prof_data
    
    
    def stud_data_creation(self, all_data):
        """
        Creates dataset called stud_data compirising data of students who asked at least one answered question
        """
        # Select only students who asked at least one answered question
        active_students = pd.DataFrame({'students_id': all_data.students_id.unique()})
        stud_data = self.students.merge(active_students, how='right', on='students_id')
        
        # Extract state or country from location
        stud_data['students_state'] = stud_data['students_location'].apply(lambda loc: str(loc).split(', ')[-1])
        
        # Transform dates from string representation to datetime object
        stud_data.students_date_joined = pd.to_datetime(stud_data.students_date_joined)
        
        # Count the number of asked questions by each student
        number_asked = all_data[['questions_id', 'students_id']].groupby('students_id').count()
        number_asked = number_asked.rename({'questions_id': 'students_questions_asked'}, axis=1)
        
        # Add students_questions_answered feature to stud_data
        stud_data = stud_data.merge(number_asked, left_on='students_id', right_index=True)
        
        # Get average question age for every student among questions he asked that were answered
        average_question_age = (
            all_data.groupby('students_id')
            .questions_age.mean(numeric_only=False)
        )
        average_question_age = pd.DataFrame({'students_average_question_age': average_question_age})
        
        # Add professionals_average_question_age feature to prof_data
        stud_data = stud_data.merge(average_question_age, on='students_id')
        
        return stud_data

In [31]:
creator = DatasetCreator(created=False)

In [32]:
creator.stud_data.head()

Unnamed: 0,students_id,students_location,students_date_joined,students_state,students_questions_asked,students_average_question_age
0,12a89e96755a4dba83ff03e03043d9c0,,2011-12-16 14:19:24,,2,794 days 06:00:53
1,5bdd2eb44dd944a9a7ab9aba068d1ef2,,2012-01-01 05:00:00,,2,2 days 16:05:06.500000
2,9658267bc2564a85bad1e802de5fb597,"Wayne, Pennsylvania",2012-01-01 05:00:00,Pennsylvania,6,377 days 15:36:03
3,7b1900c458e34573bfeb0d57ffbd260a,,2012-01-01 05:00:00,,3,8 days 13:05:38
4,e9efc4d6e06e49c7ae5afe1aad8c5bd5,,2012-01-01 05:00:00,,1,21 days 21:56:35


In [5]:
class Preprocessor(DatasetCreator):
    """
    Class for qa_data, prof_data and stud_data feature preprocessing
    """
    
    def __init__(self, created=False):
        """
        Initializes DatasetCreator class and loads existing
        preprocessors that were already fit to data
        """
        # Initialize DatasetCreator
        super().__init__(created=created)
        
        # Load existing preprocessors that were already fit to data
        if os.path.isfile('preprocessors.pickle'):
            with open('preprocessors.pickle', 'rb') as file:
                self.pp = pickle.load(file)
        else:
            self.pp = {}
        
        # Carry out preprocessing of all datasets
        self.qa_data_preprocessing()
        self.prof_data_preprocessing()
        self.stud_data_preprocessing()
    
    
    def qa_data_preprocessing(self):
        """
        Preprocesses qa_data dataset
        """
        # Preprocess datetime and timedelta features
        Preprocessor.datetime(self.qa_data, 'questions_date_added', hour=True)
        Preprocessor.datetime(self.qa_data, 'professionals_last_answer_date', hour=True)
        
        # Preprocess numerical features
        for feature in [
            'questions_date_added_time', 'questions_date_added_doy_sin',
            'professionals_last_answer_date_time', 'professionals_last_answer_date_dow'
        ]:
            Preprocessor.numerical(self.qa_data, feature, self.pp)
    
    
    def prof_data_preprocessing(self):
        """
        Preprocesses prof_data dataset
        """
        # Preprocess datetime and timedelta features
        Preprocessor.datetime(self.prof_data, 'professionals_date_joined')
        Preprocessor.timedelta(self.prof_data, 'professionals_average_question_age')
        
        # Preprocess numerical features
        for feature in [
            'professionals_questions_answered', 'professionals_date_joined_time',
            'professionals_date_joined_dow', 'professionals_average_question_age'
        ]:
            Preprocessor.numerical(self.prof_data, feature, self.pp)
        
        # Preprocess categorical features
        Preprocessor.categorical(self.prof_data, 'professionals_location', 100, self.pp, oblige_fit=True)
        Preprocessor.categorical(self.prof_data, 'professionals_state', 40, self.pp, oblige_fit=True)
        Preprocessor.categorical(self.prof_data, 'professionals_industry', 100, self.pp, oblige_fit=True)
    
    
    def stud_data_preprocessing(self):
        """
        Preprocesses stud_data dataset
        """
        # Preprocess datetime and timedelta features
        Preprocessor.datetime(self.stud_data, 'students_date_joined')
        Preprocessor.timedelta(self.stud_data, 'students_average_question_age')
        
        # Preprocess numerical features
        for feature in [
            'students_questions_asked', 'students_date_joined_time',
            'students_date_joined_dow', 'students_average_question_age'
        ]:
            Preprocessor.numerical(self.stud_data, feature, self.pp)
        
        # Preprocess categorical features
        Preprocessor.categorical(self.stud_data, 'students_location', 100, self.pp, oblige_fit=True)
        Preprocessor.categorical(self.stud_data, 'students_state', 40, self.pp, oblige_fit=True)
    
    
    @staticmethod
    def datetime(df: pd.DataFrame, feature: str, hour: bool = False):
        """
        Generates a bunch of new datetime features and drops the original feature inplace

        :param df: Data to work with.
        :param feature: Name of a column in df that contains date.
        :param hour: Whether feature contains time.
        """
        df[feature] = pd.to_datetime(df[feature])

        df[feature + '_time'] = df[feature].apply(lambda d: d.year + d.dayofyear / 365)
        df[feature + '_doy_sin'] = df[feature].apply(lambda d: np.sin(2 * np.pi * d.dayofyear / 365))
        df[feature + '_doy_cos'] = df[feature].apply(lambda d: np.cos(2 * np.pi * d.dayofyear / 365))
        df[feature + '_dow'] = df[feature].apply(lambda d: d.weekday())

        if hour:
            df[feature + '_hour_sin'] = df[feature].apply(lambda d: np.sin(2 * np.pi * (d.hour + d.minute / 60) / 24))
            df[feature + '_hour_cos'] = df[feature].apply(lambda d: np.cos(2 * np.pi * (d.hour + d.minute / 60) / 24))

        df.drop(columns=feature, inplace=True)
    
    
    @staticmethod
    def timedelta(df: pd.DataFrame, feature: str):
        """
        Generates the new timedelta feature

        :param df: Data to work with.
        :param feature: Name of a column in df that contains timedelta.
        """
        df[feature] = pd.to_timedelta(df[feature])

        df[feature] = df[feature] / pd.Timedelta("1 day")
    
    
    @staticmethod
    def _get_preprocessor(fit_data: np.array, feature: str, base, pp: dict, oblige_fit: bool):
        """
        Creates new preprocessor having class base or uses existing one in preprocessors.pickle
        Returns this preprocessor

        :param fit_data: NumPy array of data to fit new preprocessor.
        :param feature: Feature name to search for in preprocessors.pickle.
        :param base: Preprocessor's class.
        :param pp: Object with preprocessors.
        :param oblige_fit: Whether to fit new preprocessor on feature even if there already exists one.
        :returns: Preprocessor object.
        """    
        if feature in pp and not oblige_fit:
            preproc = pp[feature]
        else:
            preproc = base()
            preproc.fit(fit_data)
            pp[feature] = preproc
            with open('preprocessors.pickle', 'wb') as file:
                pickle.dump(pp, file)
        return preproc
    
    
    @staticmethod
    def numerical(df: pd.DataFrame, feature: str, pp: dict, oblige_fit: bool = False):
        """
        Transforms via StandardScaler

        :param df: Data to work with.
        :param feature: Name of a column in df that contains numerical data.
        :param pp: Object with preprocessors.
        :param oblige_fit: Whether to fit new StandardScaler on feature even if there already exists one.
        """
        fit_data = df[feature].values.reshape(-1, 1).astype('float64')
        sc = Preprocessor._get_preprocessor(fit_data, feature, StandardScaler, pp, oblige_fit)
        df[feature] = sc.transform(fit_data)
    
    
    @staticmethod
    def categorical(df: pd.DataFrame, feature: str, n: int, pp: dict, oblige_fit: bool = False):
        """
        Encodes top n most popular values with different labels from 0 to n-1,
        remaining values with n and NaNs with n+1

        :param df: Data to work with.
        :param feature: Name of a column in df that contains categorical data.
        :param n: Number of top by popularity values to move in separate categories.
                  0 to encode everything with different labels.
        :param pp: Object with preprocessors.
        :param oblige_fit: Whether to fit new LabelEncoder on feature even if there already exists one.
        """
        vc = df[feature].value_counts()
        n = len(vc) if n == 0 else n

        top = set(vc[:n].index)
        isin_top = df[feature].isin(top)

        fit_data = df.loc[isin_top, feature]
        le = Preprocessor._get_preprocessor(fit_data, feature, LabelEncoder, pp, oblige_fit)

        isin_le = df[feature].isin(set(le.classes_))
        df.loc[isin_le, feature] = le.transform(df.loc[isin_le, feature])

        bottom = set(vc.index) - set(le.classes_)
        isin_bottom = df[feature].isin(bottom)
        df.loc[isin_bottom, feature] = n
        df[feature].fillna(n + 1, inplace=True)

In [6]:
pp = Preprocessor(created=True)

In [39]:
ques_groups = pp.qa_data.groupby('professionals_id').groups
ques_groups

{'00009a0f9bda43eba47104e9ac62aff5': Int64Index([9214, 28000], dtype='int64'),
 '000d4635e5da41e3bfd83677ee11dda4': Int64Index([3570, 37903, 44504], dtype='int64'),
 '00271cc10e0245fba4a35e76e669c281': Int64Index([  696,   697,  5206,  7592, 13212, 32576, 34734, 34735, 35902,
             35916, 35917, 36724, 38546, 38549, 38552, 38553, 38555, 38559,
             38561, 38562, 38567, 38568, 38573, 38576],
            dtype='int64'),
 '003cc21be89d4e42bc4424131a378e86': Int64Index([34620, 34793, 35243, 35255], dtype='int64'),
 '0046ab8089c04b3a8df3f8c28621a818': Int64Index([1520, 4381, 4382, 5454, 16367], dtype='int64'),
 '004cb439b2fb4abcbf823380a1170e83': Int64Index([3601, 12758, 27312], dtype='int64'),
 '0053becb71d94164b014a5a7d9673210': Int64Index([27930, 48226], dtype='int64'),
 '005cbd7ae54949db98efaa35641a3f17': Int64Index([32586], dtype='int64'),
 '00680f924e8f49d4962876df5b38eb94': Int64Index([4527, 7319, 32603], dtype='int64'),
 '007e521571a248378a7f335fccb67658': Int64Index(

In [40]:
ques_group = ques_groups['0126facaedba48199f6af1141f84f791']
ques_group = pp.qa_data.iloc[ques_group].sort_values(by='professionals_last_answer_date_time')
ques_group

Unnamed: 0,students_id,questions_id,questions_title,questions_body,professionals_id,answers_id,answers_body,questions_date_added_time,questions_date_added_doy_sin,questions_date_added_doy_cos,questions_date_added_dow,questions_date_added_hour_sin,questions_date_added_hour_cos,professionals_last_answer_date_time,professionals_last_answer_date_doy_sin,professionals_last_answer_date_doy_cos,professionals_last_answer_date_dow,professionals_last_answer_date_hour_sin,professionals_last_answer_date_hour_cos
1793,a54d9eba469b41c8a3545e080c081576,6c0a00c20ed3430b9720d7b66d87e8dc,Should I study at home or abroad?,Are domestic or foreign universities the best ...,0126facaedba48199f6af1141f84f791,200978804da84aa0920035ec03fd2cd9,"<p>hello, For me it was a great experience to...",0.388247,0.806343,-0.658402,5,-0.622515,0.782608,0.205864,0.39359,-0.919286,0.224369,0.746057,-0.665882
3987,e8ad02608ea94b3f8a0bc7e1f0a72204,2562698ccb754ab3bc7f8a8a93cdf24a,"What is the best way to answer those ""worst fl...","I know that you aren't supposed to say ""I don'...",0126facaedba48199f6af1141f84f791,48b0194384a844ada210ccdd60db3825,"<p>hello! make sure you´re honest, but letting...",0.3841,0.837446,-0.632103,3,-0.833886,-0.551937,0.205864,0.39359,-0.919286,0.224369,0.829038,-0.559193
9332,52105da2608b4f25b0ca2c78b0bf592f,91abfec3e39843c38addd8e6379427a1,what are the career options for a computer sci...,what are the available career options for a co...,0126facaedba48199f6af1141f84f791,170f3aafe5b24a018936488480374b70,<p>hello! Sometimes companies offer rotation ...,0.276271,1.121589,0.209315,0,-0.639439,-0.768842,0.205864,0.39359,-0.919286,0.224369,0.785317,-0.619094
45072,4d59f2b7698140a9812656c798cb516e,996ffcff0d2646439f8e80ce0b441162,Where is a good place to look for summer inter...,I am a second year chemistry student on a 4 ye...,0126facaedba48199f6af1141f84f791,e80d84453f6d4fbfb519b2ae3fe82a6b,<p>hello! there are some websites that might ...,0.27005,1.104638,0.259512,4,0.152123,0.988362,0.205864,0.39359,-0.919286,0.224369,0.809017,-0.587785
13655,2ed49637afe340ee9aeef360971e8a10,1ed48d2fbf0941dd8d19eff03c8cf8d7,"When you have an video interview, what is the ...","I had a video interview before, and I was dres...",0126facaedba48199f6af1141f84f791,e764b7e40a914900a16a96ae6265f4fe,"<p>hello! Apart from all the good advice, we a...",0.112455,-0.247365,1.0,4,-0.965926,-0.258819,0.326114,-0.54524,-0.83828,0.771827,0.573576,-0.819152


In [41]:
index = np.searchsorted(ques_group.professionals_last_answer_date_time,
                        ques_group.professionals_last_answer_date_time.iloc[4])[0]
print(index)
ques_group.professionals_last_answer_date_time.iloc[4] = ques_group.professionals_last_answer_date_time.iloc[index-1]

4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [42]:
ques_group

Unnamed: 0,students_id,questions_id,questions_title,questions_body,professionals_id,answers_id,answers_body,questions_date_added_time,questions_date_added_doy_sin,questions_date_added_doy_cos,questions_date_added_dow,questions_date_added_hour_sin,questions_date_added_hour_cos,professionals_last_answer_date_time,professionals_last_answer_date_doy_sin,professionals_last_answer_date_doy_cos,professionals_last_answer_date_dow,professionals_last_answer_date_hour_sin,professionals_last_answer_date_hour_cos
1793,a54d9eba469b41c8a3545e080c081576,6c0a00c20ed3430b9720d7b66d87e8dc,Should I study at home or abroad?,Are domestic or foreign universities the best ...,0126facaedba48199f6af1141f84f791,200978804da84aa0920035ec03fd2cd9,"<p>hello, For me it was a great experience to...",0.388247,0.806343,-0.658402,5,-0.622515,0.782608,0.205864,0.39359,-0.919286,0.224369,0.746057,-0.665882
3987,e8ad02608ea94b3f8a0bc7e1f0a72204,2562698ccb754ab3bc7f8a8a93cdf24a,"What is the best way to answer those ""worst fl...","I know that you aren't supposed to say ""I don'...",0126facaedba48199f6af1141f84f791,48b0194384a844ada210ccdd60db3825,"<p>hello! make sure you´re honest, but letting...",0.3841,0.837446,-0.632103,3,-0.833886,-0.551937,0.205864,0.39359,-0.919286,0.224369,0.829038,-0.559193
9332,52105da2608b4f25b0ca2c78b0bf592f,91abfec3e39843c38addd8e6379427a1,what are the career options for a computer sci...,what are the available career options for a co...,0126facaedba48199f6af1141f84f791,170f3aafe5b24a018936488480374b70,<p>hello! Sometimes companies offer rotation ...,0.276271,1.121589,0.209315,0,-0.639439,-0.768842,0.205864,0.39359,-0.919286,0.224369,0.785317,-0.619094
45072,4d59f2b7698140a9812656c798cb516e,996ffcff0d2646439f8e80ce0b441162,Where is a good place to look for summer inter...,I am a second year chemistry student on a 4 ye...,0126facaedba48199f6af1141f84f791,e80d84453f6d4fbfb519b2ae3fe82a6b,<p>hello! there are some websites that might ...,0.27005,1.104638,0.259512,4,0.152123,0.988362,0.205864,0.39359,-0.919286,0.224369,0.809017,-0.587785
13655,2ed49637afe340ee9aeef360971e8a10,1ed48d2fbf0941dd8d19eff03c8cf8d7,"When you have an video interview, what is the ...","I had a video interview before, and I was dres...",0126facaedba48199f6af1141f84f791,e764b7e40a914900a16a96ae6265f4fe,"<p>hello! Apart from all the good advice, we a...",0.112455,-0.247365,1.0,4,-0.965926,-0.258819,0.205864,-0.54524,-0.83828,0.771827,0.573576,-0.819152


In [7]:
import numpy as np
import pandas as pd
import keras
from sklearn.utils import shuffle

Using TensorFlow backend.


In [8]:
from tqdm import tqdm
import time

In [14]:
class DataGenerator(keras.utils.Sequence):
    """
    """
    
    def __init__(self, pp, batch_size=50, shuffle=True):
        """
        """
        self.qa_data = pp.qa_data
        self.prof_data = pp.prof_data
        self.stud_data = pp.stud_data
        self.unique_profs = pp.prof_data.professionals_id.unique()
        
        #self.prof_ques_dict = {prof_id:df_slice.sort_values(by='professionals_last_answer_date_time')
        #                       for prof_id, df_slice in self.qa_data.groupby('professionals_id')}
        
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()
    
    
    def __len__(self):
        """
        Denotes the number of batches per epoch
        """
        return self.qa_data.shape[0] // (self.batch_size)
    
    
    def __getitem__(self, index):
        """
        Generates one batch of data
        """
        positive_batch = self.qa_data.iloc[index * self.batch_size : (index + 1) * self.batch_size, :]
        negative_batch = positive_batch
        
        cur_profs = negative_batch.professionals_id
        new_profs = np.random.choice(self.unique_profs, self.batch_size)
        
        while np.sum(cur_profs == new_profs) > 0:
            new_profs = np.random.choice(self.unique_profs, self.batch_size)
        
        negative_batch.assign(professionals_id=new_profs)
        
        """
        for i, prof in tqdm(enumerate(new_profs)):
            prof_ques = self.prof_ques_dict[prof]
            index = np.searchsorted(np.array(prof_ques.professionals_last_answer_date_time),
                                    negative_batch.professionals_last_answer_date_time.iloc[i])
            if index < 1:
                index = 1
            
            negative_batch.iloc[i, 13:] = prof_ques.iloc[index-1, 13:]
        """
        
        single_batch = pd.concat([positive_batch, negative_batch])
        single_batch = single_batch.merge(self.prof_data, on='professionals_id')
        single_batch = single_batch.merge(self.stud_data, on='students_id')
        
        x_ques = single_batch[
            ['students_location', 'students_state',
            'students_questions_asked', 'students_average_question_age'] + \
            list(single_batch.loc[:, 'students_date_joined_time':'students_date_joined_dow'].columns) + \
            list(single_batch.loc[:, 'questions_date_added_time':'questions_date_added_hour_cos'].columns)
        ]
        
        x_prof = single_batch[
            ['professionals_industry', 'professionals_location', 'professionals_state',
            'professionals_questions_answered', 'professionals_average_question_age'] + \
            list(single_batch.loc[:, 'professionals_date_joined_time':'professionals_date_joined_dow'].columns) + \
            list(single_batch.loc[:, 'professionals_last_answer_date_time':'professionals_last_answer_date_hour_cos'].columns)
        ]
        
        y = np.concatenate([np.ones(self.batch_size), np.zeros(self.batch_size)])
        
        return [x_ques.values, x_prof.values], y
    
    
    def on_epoch_end(self):
        """
        Shuffle qa_data after each epoch
        """
        if self.shuffle == True:
            self.qa_data = shuffle(self.qa_data)

In [15]:
generator = DataGenerator(pp)

In [18]:
%%timeit
generator.__getitem__(0)

14.2 ms ± 187 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
%lprun -f generator.__getitem__ generator.__getitem__(0)

In [None]:
%load_ext line_profiler