## Prepare data

In [2]:
import numpy as np
import pandas as pd
import pickle, re
from keras import Model
from sklearn.neighbors import KDTree
from processors import QueProc
from models import ContentModel

In [3]:
def _read_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``."""
    # NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pickled training tables for questions, students and professionals.
train_que_data = _read_pickle('proc_data/train_que_data.pkl')
train_stu_data = _read_pickle('proc_data/train_stu_data.pkl')
train_pro_data = _read_pickle('proc_data/train_pro_data.pkl')

In [4]:
def _embedding_mask(columns):
    """Return a boolean array that is True where the column name contains 'emb'."""
    return np.array([re.search(r'emb', name) is not None for name in columns], dtype=bool)


# Feature names: every column after the first two of each table, plus a
# trailing "current time" feature.
que_cols = [*train_stu_data.columns[2:], *train_que_data.columns[2:], 'questions_current_time']
pro_cols = [*train_pro_data.columns[2:], 'professionals_current_time']
print(len(que_cols), len(pro_cols))

# The masks cover every feature except the trailing "current time" one,
# so each mask is one element shorter than its column list.
que_content_mask = _embedding_mask(que_cols[:-1])
pro_content_mask = _embedding_mask(pro_cols[:-1])
print(que_content_mask.size, pro_content_mask.size)

28 33
27 32


In [5]:
# Instantiate the content model and load its stored weights.
# NOTE(review): the trailing 10 and 5 are positional ContentModel arguments;
# their meaning is defined in models.ContentModel - confirm there.
que_input_dim = len(que_cols)
pro_input_dim = len(pro_cols)
content_model = ContentModel(
    que_input_dim, que_content_mask,
    pro_input_dim, pro_content_mask,
    10, 5,
)
content_model.load_weights('content_model.h5')

## Predictor class

In [6]:
class Predictor:
    """
    Class that builds a KD-tree over professionals' latent vectors
    and which is used to find closest professionals for a particular question
    """

    def __init__(self, content_model: 'ContentModel'):
        """
        Prepare required datasets and create KD-tree for professionals
        based on latent vectors from content model
        :param content_model: compiled model of class ContentModel
        """
        # load raw datasets
        self.pro = pd.read_csv('../../data/professionals.csv')

        tags = pd.read_csv('../../data/tags.csv')
        tag_pro = pd.read_csv('../../data/tag_users.csv').merge(
            tags, left_on='tag_users_tag_id', right_on='tags_tag_id')

        # append aggregated subscribed tags to each professional
        # (sorted so the joined string is the same on every run)
        pro_tags = tag_pro.groupby('tag_users_user_id', as_index=False)[['tags_tag_name']] \
            .aggregate(lambda x: ' '.join(sorted(set(x)))) \
            .rename({'tag_users_user_id': 'professionals_id',
                     'tags_tag_name': 'professionals_subscribed_tags'}, axis=1)
        self.pro = self.pro.merge(pro_tags, how='left', on='professionals_id')

        # load datasets with preprocessed features
        # NOTE: pickle.load can execute arbitrary code; only load trusted files
        with open('proc_data/test_stu_data.pkl', 'rb') as f:
            self.stu_data = pickle.load(f)
        with open('proc_data/test_pro_data.pkl', 'rb') as f:
            self.pro_data = pickle.load(f)

        self.content_model = content_model

        # prepare professional features: last row of every professional,
        # without the first two columns
        pro_dict = {pro: group.values[-1, 2:] for pro, group in self.pro_data.groupby('professionals_id')}
        self.pros = np.array(list(pro_dict.keys()))
        # np.vstack requires a real sequence; a dict_values view is not one
        pro_feat = np.vstack(list(pro_dict.values()))
        # append a trailing zero column
        # NOTE(review): presumably the "current time" input fixed at 0 - confirm against training
        pro_feat = np.hstack([pro_feat, np.zeros((pro_feat.shape[0], 1))])

        # prepare student features: last row of every student, without the
        # first two columns; after the transpose rows are indexed by students_id
        stu_dict = {stu: group.iloc[-1, 2:] for stu, group in self.stu_data.groupby('students_id')}
        self.stu_data = pd.DataFrame.from_dict(stu_dict).transpose()

        # compute latent vectors for professionals from content model
        pro_lat_model = Model(inputs=content_model.inputs[1], outputs=content_model.pro_encoded)
        pro_lat_vecs = pro_lat_model.predict(pro_feat)

        # create model that receives question features and returns question latent vector
        self.que_lat_model = Model(inputs=content_model.inputs[0], outputs=content_model.que_encoded)

        # create KD-tree consisting of professional latent vectors;
        # row i of the tree corresponds to self.pros[i]
        self.lat_vec_tree = KDTree(pro_lat_vecs)

        # initialize QueProc
        self.que_proc = QueProc(oblige_fit=False, path='dump/')


    def predict_df(self, que_df: pd.DataFrame, que_tags: pd.DataFrame,
                   top: int=10, pro_expand: bool=False) -> pd.DataFrame:
        """
        Returns top professionals for given questions
        :param que_df: DataFrame of question data; it is not modified
        :param que_tags: DataFrame of question_tags
        :param top: how many top professionals to return; capped at the
            number of professionals in the tree
        :param pro_expand: whether to add professional data to returned DataFrame
        :return: DataFrame with one row per (question, professional) pair and
            columns questions_id, professionals_id, professionals_score,
            where the score is exp(-distance) in latent space
        """
        # parse dates on a copy so the caller's DataFrame is left untouched
        que_df = que_df.assign(
            questions_date_added=pd.to_datetime(que_df['questions_date_added']))

        # prepare student features
        # NOTE(review): authors missing from self.stu_data get NaN features
        # from the left merge - confirm how the model should treat them
        stu_feat = que_df[['questions_author_id']] \
            .merge(self.stu_data, how='left', left_on='questions_author_id', right_index=True).values[:, 1:]

        # prepare question features and add them to student features,
        # followed by a trailing zero column
        que_feat = self.que_proc.transform(que_df, que_tags).values[:, 2:]
        que_feat = np.hstack([stu_feat, que_feat, np.zeros((que_feat.shape[0], 1))])

        # the tree cannot return more neighbours than it holds points
        k = min(top, len(self.pros))

        # get top professionals for questions
        que_lat_vecs = self.que_lat_model.predict(que_feat)
        dists, nearest = self.lat_vec_tree.query(que_lat_vecs, k=k)
        pros = self.pros[nearest]
        scores = np.exp(-dists)

        # build the que-pro-score table: every question id is repeated k
        # times to line up with the row-major flattened neighbour arrays
        score_df = pd.DataFrame(
            {
                'questions_id': np.repeat(np.asarray(que_df['questions_id']), k),
                'professionals_id': pros.ravel(),
                'professionals_score': scores.ravel(),
            },
            columns=['questions_id', 'professionals_id', 'professionals_score'],
        )

        if pro_expand:
            # add professionals features
            score_df = score_df.merge(self.pro, how='left', on='professionals_id')

        return score_df


    def predict_dict(self, que_dict: dict, top: int=10, pro_expand: bool=False) -> pd.DataFrame:
        """
        Converts dictionary of questions into desired form, then call predict_df method
        :param que_dict: dictionary of question data; must contain
            'questions_id' and 'questions_tags' (space-separated tag names)
            besides the columns predict_df needs
        :param top: how many top professionals to return
        :param pro_expand: whether to add professional data to returned DataFrame
        :return: DataFrame produced by predict_df
        """
        # get DataFrame from dict
        que_df = pd.DataFrame.from_dict(que_dict)

        # create que-tag tuples, one per tag of every question
        tag_pairs = [
            (que, tag)
            for que, tags in zip(que_df['questions_id'].values, que_df['questions_tags'].values)
            for tag in tags.split(' ')
        ]

        # create DataFrame from que-tag tuples
        que_tags = pd.DataFrame(tag_pairs, columns=['tag_questions_question_id', 'tags_tag_name'])

        que_df = que_df.drop(columns='questions_tags')

        # pass computed DataFrames to predict_df method
        return self.predict_df(que_df, que_tags, top, pro_expand)

## Test Predictor class

In [11]:
# Build the predictor once; construction loads the data files and fits the KD-tree.
predictor = Predictor(content_model=content_model)

In [8]:
# A single sample question; every field is a one-element list so the dict
# converts directly into a one-row DataFrame.
que_dict = dict(
    questions_id=['332a511f1569444485cf7a7a556a5e54'],
    questions_author_id=['8f6f374ffd834d258ab69d376dd998f5'],
    questions_date_added=['2016-04-26 11:14:26'],
    questions_title=['Teacher   career   question'],
    questions_body=['What  is  a  maths  teacher?   what  is  a  maths  teacher  useful? #college #professor #lecture'],
    questions_tags=['college professor lecture'],
)

In [9]:
# Top professionals for the sample question, without professional details.
predictor.predict_dict(que_dict=que_dict, pro_expand=False)

Unnamed: 0,questions_id,professionals_id,professionals_score
0,332a511f1569444485cf7a7a556a5e54,505fb5d32c1e41c5affe896328644832,0.858492
1,332a511f1569444485cf7a7a556a5e54,fe151e90cc154d7da63b14fe6ed3c3e5,0.846544
2,332a511f1569444485cf7a7a556a5e54,3ce003197d28478884ae0183a645d968,0.840861
3,332a511f1569444485cf7a7a556a5e54,eab382099e2b4b03abdcbb4d85b5ec0d,0.828587
4,332a511f1569444485cf7a7a556a5e54,d754969c1565445db381bb2d75273ee1,0.808249
5,332a511f1569444485cf7a7a556a5e54,dcffe989be294141a523d71f3908c6bb,0.806587
6,332a511f1569444485cf7a7a556a5e54,bf6edb3e76b94594a982c5776764cf75,0.797939
7,332a511f1569444485cf7a7a556a5e54,aad8c9eaabf24305912b30a5d58137f6,0.795392
8,332a511f1569444485cf7a7a556a5e54,9bf67236d34743768be67bd789dc618e,0.793254
9,332a511f1569444485cf7a7a556a5e54,c2c6bf81c2e444ff834d98b58ab37687,0.792261


In [10]:
%%timeit
# Time one full prediction, including the merge that attaches professional data.
predictor.predict_dict(que_dict, pro_expand=True)

46 ms ± 881 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
