## Prepare data

In [1]:
import numpy as np
import pandas as pd
import pickle, re
from keras import Model
from sklearn.neighbors import KDTree
from processors import QueProc, ProProc
from models import ContentModel

Using TensorFlow backend.


In [2]:
# Load the preprocessed training feature tables (questions, students, professionals).
# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
_train_tables = []
for _pkl_path in ('proc_data/train_que_data.pkl',
                  'proc_data/train_stu_data.pkl',
                  'proc_data/train_pro_data.pkl'):
    with open(_pkl_path, 'rb') as _fh:
        _train_tables.append(pickle.load(_fh))
train_que_data, train_stu_data, train_pro_data = _train_tables

In [3]:
# Model input feature names: the first two columns of every table are skipped,
# and a *_current_time feature is appended at the end.
que_cols = [*train_stu_data.columns[2:], *train_que_data.columns[2:], 'questions_current_time']
pro_cols = [*train_pro_data.columns[2:], 'professionals_current_time']
print(len(que_cols), len(pro_cols))

# Boolean masks flagging the columns whose name contains 'emb'.
# The trailing *_current_time column is excluded, so each mask is one element
# shorter than the corresponding column list.
que_content_mask = np.array(
    [re.search(r'emb', name) is not None for name in que_cols[:-1]], dtype=bool)
pro_content_mask = np.array(
    [re.search(r'emb', name) is not None for name in pro_cols[:-1]], dtype=bool)
print(que_content_mask.size, pro_content_mask.size)

28 33
27 32


In [4]:
# Build the content model from the question/professional feature counts and
# their content masks, then restore the saved weights.
# NOTE(review): the meaning of the positional arguments 10 and 5 is defined by
# ContentModel, which is not visible in this notebook - confirm before changing.
content_model = ContentModel(len(que_cols), que_content_mask, len(pro_cols), pro_content_mask, 10, 5)
content_model.load_weights('content_model.h5')

---

## Predictor class

In [5]:
class Predictor:
    """
    Class that creates KNN trees over the latent vectors of questions and professionals
    and which is used to find:
      - closest professionals for a particular question,
      - closest (similar) questions for a particular question,
      - closest questions for a particular professional.
    """

    # The annotation is a string so that the class can be defined (e.g. for testing)
    # without importing the models package.
    def __init__(self, content_model: 'ContentModel'):
        """
        Prepare required datasets and create KNN trees for questions and professionals
        based on latent vectors from content model
        :param content_model: compiled model of class ContentModel
        """
        self.content_model = content_model

        # load raw datasets
        self.que = pd.read_csv('../../data/questions.csv')
        self.ans = pd.read_csv('../../data/answers.csv')
        self.stu = pd.read_csv('../../data/students.csv')
        self.pro = pd.read_csv('../../data/professionals.csv')

        # process date columns
        for df, col in [(self.que, 'questions_date_added'), (self.ans, 'answers_date_added')]:
            df[col] = pd.to_datetime(df[col])

        # form question-student pairs dataframe
        que_stu = self.que.merge(self.stu, left_on='questions_author_id', right_on='students_id') \
            [['questions_id', 'students_id']]

        # load and merge raw tag datasets
        tags = pd.read_csv('../../data/tags.csv')
        tag_que = pd.read_csv('../../data/tag_questions.csv').merge(
            tags, left_on='tag_questions_tag_id', right_on='tags_tag_id')
        tag_pro = pd.read_csv('../../data/tag_users.csv').merge(
            tags, left_on='tag_users_tag_id', right_on='tags_tag_name' if False else 'tags_tag_id')

        # append tags to each question
        que_tags = tag_que.groupby('tag_questions_question_id', as_index=False)[['tags_tag_name']] \
            .aggregate(lambda x: ' '.join(set(x))) \
            .rename({'tag_questions_question_id': 'questions_id',
                     'tags_tag_name': 'questions_tags'}, axis=1)
        self.que = self.que.merge(que_tags, how='left', on='questions_id')

        # append subscribed tags to each professional
        pro_tags = tag_pro.groupby('tag_users_user_id', as_index=False)[['tags_tag_name']] \
            .aggregate(lambda x: ' '.join(set(x))) \
            .rename({'tag_users_user_id': 'professionals_id',
                     'tags_tag_name': 'professionals_subscribed_tags'}, axis=1)
        self.pro = self.pro.merge(pro_tags, how='left', on='professionals_id')

        # load datasets with preprocessed features
        # NOTE: pickle.load can execute arbitrary code - only load trusted files
        with open('proc_data/test_que_data.pkl', 'rb') as f:
            self.que_data = pickle.load(f)
        with open('proc_data/test_stu_data.pkl', 'rb') as f:
            self.stu_data = pickle.load(f)
        with open('proc_data/test_pro_data.pkl', 'rb') as f:
            self.pro_data = pickle.load(f)

        # prepare student features: keep the last row of every student's group
        # (first two columns are dropped); result is indexed by students_id
        stu_dict = {stu: group.iloc[-1, 2:] for stu, group in self.stu_data.groupby('students_id')}
        self.stu_data = pd.DataFrame.from_dict(stu_dict).transpose()

        # prepare student features for questions from que_data
        stus = self.que_data.merge(que_stu, how='left', on='questions_id')[['students_id']]
        stu_feat = stus.merge(self.stu_data, how='left', left_on='students_id', right_index=True).values[:, 1:]

        # prepare question features and add them to student features;
        # a trailing zero column is appended as the last input feature
        # (presumably the *_current_time slot of the model input - TODO confirm)
        self.ques = self.que_data['questions_id'].values
        que_feat = self.que_data.values[:, 2:]
        que_feat = np.hstack([stu_feat, que_feat, np.zeros((que_feat.shape[0], 1))])

        # prepare professional features: last row of every professional's group
        pro_dict = {pro: group.values[-1, 2:] for pro, group in self.pro_data.groupby('professionals_id')}
        self.pros = np.array(list(pro_dict.keys()))
        # np.vstack requires a real sequence; a dict view is rejected by recent NumPy
        pro_feat = np.vstack(list(pro_dict.values()))
        pro_feat = np.hstack([pro_feat, np.zeros((pro_feat.shape[0], 1))])

        # create two models that receive question and professional features respectively
        # and return their latent vectors
        self.que_lat_model = Model(inputs=content_model.inputs[0], outputs=content_model.que_encoded)
        self.pro_lat_model = Model(inputs=content_model.inputs[1], outputs=content_model.pro_encoded)

        # compute latent vectors for questions and professionals
        que_lat_vecs = self.que_lat_model.predict(que_feat)
        pro_lat_vecs = self.pro_lat_model.predict(pro_feat)

        # create two KNN trees consisting of question and professional latent vectors
        self.que_tree = KDTree(que_lat_vecs)
        self.pro_tree = KDTree(pro_lat_vecs)

        # initialize QueProc and ProProc
        self.que_proc = QueProc(oblige_fit=False, path='dump/')
        self.pro_proc = ProProc(oblige_fit=False, path='dump/')


    def _que_latent_vectors(self, que_df: pd.DataFrame, que_tags: pd.DataFrame):
        """
        Computes latent vectors for given questions (shared by question-based searches)
        :param que_df: DataFrame of question data, left unmodified
        :param que_tags: DataFrame of question tags
        """
        # work on a copy so that the caller's DataFrame is not modified
        que_df = que_df.copy()
        que_df['questions_date_added'] = pd.to_datetime(que_df['questions_date_added'])

        # prepare student features
        stu_feat = que_df[['questions_author_id']] \
            .merge(self.stu_data, how='left', left_on='questions_author_id', right_index=True).values[:, 1:]

        # prepare question features and add them to student features
        que_feat = self.que_proc.transform(que_df, que_tags).values[:, 2:]
        que_feat = np.hstack([stu_feat, que_feat, np.zeros((que_feat.shape[0], 1))])

        return self.que_lat_model.predict(que_feat)


    @staticmethod
    def _score_frame(src_ids, found_ids, dists, columns: list) -> pd.DataFrame:
        """
        Creates DataFrame of (source id, found id, score) rows, score = exp(-distance)
        :param src_ids: ids of the queried entities, one per row of found_ids
        :param found_ids: 2D array of ids of the nearest neighbours
        :param dists: 2D array of distances to the nearest neighbours
        :param columns: names of the three resulting columns
        """
        scores = np.exp(-dists)
        rows = [(src, found, score)
                for src, found_row, score_row in zip(src_ids, found_ids, scores)
                for found, score in zip(found_row, score_row)]
        return pd.DataFrame(rows, columns=columns)


    @staticmethod
    def _id_tag_pairs(ids, tag_strings) -> list:
        """
        Creates (id, tag) tuples from whitespace-separated tag strings
        :param ids: entity ids
        :param tag_strings: tag strings, one per id
        """
        pairs = []
        for ident, tags in zip(ids, tag_strings):
            # missing tags (None / NaN) yield no rows instead of raising
            if not isinstance(tags, str):
                continue
            # split() without argument does not emit empty tags on repeated spaces
            pairs.extend((ident, tag) for tag in tags.split())
        return pairs


    def find_pros_by_que(self, que_df: pd.DataFrame, que_tags: pd.DataFrame,
                         top: int=10, expand: bool=False) -> pd.DataFrame:
        """
        Returns top professionals for given questions
        :param que_df: DataFrame of question data, left unmodified
        :param que_tags: DataFrame of question tags
        :param top: how many top professionals to return
            (limited by the number of known professionals)
        :param expand: whether to add professional data to returned DataFrame
        :return: DataFrame with questions_id, professionals_id and professionals_score columns
        """
        que_lat_vecs = self._que_latent_vectors(que_df, que_tags)

        # KDTree.query fails when k exceeds the number of indexed points
        dists, pros = self.pro_tree.query(que_lat_vecs, k=min(top, len(self.pros)))

        score_df = self._score_frame(
            que_df['questions_id'].values, self.pros[pros], dists,
            ['questions_id', 'professionals_id', 'professionals_score'])

        if expand:
            # add professionals features
            score_df = score_df.merge(self.pro, how='left', on='professionals_id')

        return score_df


    def find_ques_by_que(self, que_df: pd.DataFrame, que_tags: pd.DataFrame,
                         top: int=10, expand: bool=False) -> pd.DataFrame:
        """
        Returns top similar questions for given questions
        :param que_df: DataFrame of question data, left unmodified
        :param que_tags: DataFrame of question tags
        :param top: how many top similar questions to return
            (limited by the number of known questions)
        :param expand: whether to add similar question data to returned DataFrame
        :return: DataFrame with initial_questions_id, questions_id and questions_score columns
        """
        que_lat_vecs = self._que_latent_vectors(que_df, que_tags)

        # KDTree.query fails when k exceeds the number of indexed points
        dists, sim_ques = self.que_tree.query(que_lat_vecs, k=min(top, len(self.ques)))

        score_df = self._score_frame(
            que_df['questions_id'].values, self.ques[sim_ques], dists,
            ['initial_questions_id', 'questions_id', 'questions_score'])

        if expand:
            # add similar question features
            score_df = score_df.merge(self.que, how='left', on='questions_id')

        return score_df


    def convert_que_dict(self, que_dict: dict) -> (pd.DataFrame, pd.DataFrame):
        """
        Converts dictionary of question data into desired form
        :param que_dict: dictionary of question data; 'questions_tags' holds
            whitespace-separated tag names
        :return: question DataFrame without 'questions_tags' column
            and DataFrame of question-tag pairs
        """
        # get DataFrame from dict
        que_df = pd.DataFrame.from_dict(que_dict)

        # create DataFrame of question-tag tuples
        tuples = self._id_tag_pairs(que_df['questions_id'].values, que_df['questions_tags'].values)
        que_tags = pd.DataFrame(tuples, columns=['tag_questions_question_id', 'tags_tag_name'])

        return que_df.drop(columns='questions_tags'), que_tags


    def find_ques_by_pro(self, pro_df: pd.DataFrame, pro_tags: pd.DataFrame,
                         top: int=10, expand: bool=False) -> pd.DataFrame:
        """
        Returns top questions for given professionals
        :param pro_df: DataFrame of professional data, left unmodified
        :param pro_tags: DataFrame of professional subscribed tags
        :param top: how many top questions to return
            (limited by the number of known questions)
        :param expand: whether to add question data to returned DataFrame
        :return: DataFrame with professionals_id, questions_id and questions_score columns
        """
        # work on a copy so that the caller's DataFrame is not modified
        pro_df = pro_df.copy()
        pro_df['professionals_date_joined'] = pd.to_datetime(pro_df['professionals_date_joined'])

        # prepare professional features
        pro_feat = self.pro_proc.transform(pro_df, self.que, self.ans, pro_tags).values[:, 2:]
        pro_feat = np.hstack([pro_feat, np.zeros((pro_feat.shape[0], 1))])

        # get top questions for professionals
        pro_lat_vecs = self.pro_lat_model.predict(pro_feat)
        # KDTree.query fails when k exceeds the number of indexed points
        dists, ques = self.que_tree.query(pro_lat_vecs, k=min(top, len(self.ques)))

        score_df = self._score_frame(
            pro_df['professionals_id'].values, self.ques[ques], dists,
            ['professionals_id', 'questions_id', 'questions_score'])

        if expand:
            # add question features
            score_df = score_df.merge(self.que, how='left', on='questions_id')

        return score_df


    def convert_pro_dict(self, pro_dict: dict) -> (pd.DataFrame, pd.DataFrame):
        """
        Converts dictionary of professional data into desired form
        :param pro_dict: dictionary of professional data; 'professionals_subscribed_tags'
            holds whitespace-separated tag names
        :return: professional DataFrame without 'professionals_subscribed_tags' column
            and DataFrame of professional-tag pairs
        """
        # get DataFrame from dict
        pro_df = pd.DataFrame.from_dict(pro_dict)

        # create DataFrame of professional-tag tuples
        tuples = self._id_tag_pairs(pro_df['professionals_id'].values,
                                    pro_df['professionals_subscribed_tags'].values)
        pro_tags = pd.DataFrame(tuples, columns=['tag_users_user_id', 'tags_tag_name'])

        return pro_df.drop(columns='professionals_subscribed_tags'), pro_tags

---

## Test Predictor class

In [6]:
# Build the predictor: reads the raw CSVs and preprocessed test features,
# then creates KD-trees over question and professional latent vectors.
predictor = Predictor(content_model)

In [7]:
# Sample question used to exercise the predictor; every field is a one-element
# list so that the dict converts to a single-row DataFrame.
que_dict = dict(
    questions_id=['332a511f1569444485cf7a7a556a5e54'],
    questions_author_id=['8f6f374ffd834d258ab69d376dd998f5'],
    questions_date_added=['2016-04-26 11:14:26'],
    questions_title=['Teacher   career   question'],
    questions_body=['What  is  a  maths  teacher?   what  is  a  maths  teacher  useful? #college #professor #lecture'],
    # space-separated tag names
    questions_tags=['college professor lecture'],
)

In [8]:
# Split the dict into a question DataFrame (without 'questions_tags')
# and a DataFrame of question-tag pairs.
que_df, que_tags = predictor.convert_que_dict(que_dict)

### Recommend the question to professionals

In [9]:
# Nearest professionals to the question in latent space (default top=10);
# expand=False returns only ids and scores.
predictor.find_pros_by_que(que_df, que_tags, expand=False)

Unnamed: 0,questions_id,professionals_id,professionals_score
0,332a511f1569444485cf7a7a556a5e54,505fb5d32c1e41c5affe896328644832,0.858492
1,332a511f1569444485cf7a7a556a5e54,fe151e90cc154d7da63b14fe6ed3c3e5,0.846544
2,332a511f1569444485cf7a7a556a5e54,3ce003197d28478884ae0183a645d968,0.840861
3,332a511f1569444485cf7a7a556a5e54,eab382099e2b4b03abdcbb4d85b5ec0d,0.828587
4,332a511f1569444485cf7a7a556a5e54,d754969c1565445db381bb2d75273ee1,0.808249
5,332a511f1569444485cf7a7a556a5e54,dcffe989be294141a523d71f3908c6bb,0.806587
6,332a511f1569444485cf7a7a556a5e54,bf6edb3e76b94594a982c5776764cf75,0.797939
7,332a511f1569444485cf7a7a556a5e54,aad8c9eaabf24305912b30a5d58137f6,0.795392
8,332a511f1569444485cf7a7a556a5e54,9bf67236d34743768be67bd789dc618e,0.793254
9,332a511f1569444485cf7a7a556a5e54,c2c6bf81c2e444ff834d98b58ab37687,0.792261


### Find similar questions

In [10]:
# Nearest known questions to the sample question (default top=10).
# In the recorded output the best match is the sample question itself (score 1.0).
predictor.find_ques_by_que(que_df, que_tags, expand=False)

Unnamed: 0,initial_questions_id,questions_id,questions_score
0,332a511f1569444485cf7a7a556a5e54,332a511f1569444485cf7a7a556a5e54,1.0
1,332a511f1569444485cf7a7a556a5e54,f304f28d05ec473b8707ed88bb02b33e,0.904889
2,332a511f1569444485cf7a7a556a5e54,f7d47936c73b49b2b480c7375a52acdc,0.903413
3,332a511f1569444485cf7a7a556a5e54,f8294444f2fa4f78bc829f04ccbc06b7,0.873109
4,332a511f1569444485cf7a7a556a5e54,6c98bbce49714e53a488a27e95ca4132,0.864905
5,332a511f1569444485cf7a7a556a5e54,38f6b1a67866487b8707e389231ab4b7,0.864606
6,332a511f1569444485cf7a7a556a5e54,f1a86d3c087c474b9457c9f10a94a21e,0.864024
7,332a511f1569444485cf7a7a556a5e54,231dbfb6162b4518bb1789dd5a78fa6b,0.860059
8,332a511f1569444485cf7a7a556a5e54,b38d19e10f404ff4867f160563283b63,0.856841
9,332a511f1569444485cf7a7a556a5e54,bf41531f8a604ccdb7dd8ae9af6141af,0.851405


In [11]:
%%timeit
# Time the similar-question lookup, including the merge with raw question data (expand=True).
predictor.find_ques_by_que(que_df, que_tags, expand=True)

46.4 ms ± 3.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
# Enable the %lprun magic used below (requires the line_profiler package to be installed).
%load_ext line_profiler

In [13]:
# Line-by-line profile of find_ques_by_que for the sample question.
%lprun -f predictor.find_ques_by_que predictor.find_ques_by_que(que_df, que_tags, expand=True)

---

In [14]:
# Sample professional used to exercise the predictor; every field is a
# one-element list so that the dict converts to a single-row DataFrame.
pro_dict = dict(
    professionals_id=['44b2484ecd3642c6a47514f3876cf14a'],
    professionals_location=['Cambridge, Massachusetts'],
    professionals_industry=['Healthcare, Pharmaceuticals, Life Science'],
    professionals_headline=['Scientist and Healthcare Entrepreneur'],
    professionals_date_joined=['2012-01-25 20:40:43'],
    # space-separated tag names
    professionals_subscribed_tags=['pharmaceutical-industry medicine healthcare-it biology science healthcare research'],
)

In [15]:
# Split the dict into a professional DataFrame (without 'professionals_subscribed_tags')
# and a DataFrame of professional-tag pairs.
pro_df, pro_tags = predictor.convert_pro_dict(pro_dict)

### Recommend questions to the professional

In [16]:
# Nearest known questions to the professional in latent space (default top=10);
# expand=False returns only ids and scores.
predictor.find_ques_by_pro(pro_df, pro_tags, expand=False)

Unnamed: 0,professionals_id,questions_id,questions_score
0,44b2484ecd3642c6a47514f3876cf14a,8aa85db69de34a238abf3808565f7ff0,0.915494
1,44b2484ecd3642c6a47514f3876cf14a,e71d0e419d5a4b40bb4d6bd4f30f3d43,0.915494
2,44b2484ecd3642c6a47514f3876cf14a,b8e51f4ab25e486f8c7fb9505d94fe31,0.907321
3,44b2484ecd3642c6a47514f3876cf14a,400c82f7a66f448ea590fd37ba82d881,0.892274
4,44b2484ecd3642c6a47514f3876cf14a,1426fd64bcb24c49ad89d54c681befd2,0.885919
5,44b2484ecd3642c6a47514f3876cf14a,a141f926dfff4fe0903ab23133ae3e00,0.882401
6,44b2484ecd3642c6a47514f3876cf14a,1ba041b8731841c4bf2b289a1ba38129,0.875935
7,44b2484ecd3642c6a47514f3876cf14a,c76814fcd9f5435796af8f886f642297,0.873863
8,44b2484ecd3642c6a47514f3876cf14a,2c46a965e09f455093ad21bf1cccb826,0.873863
9,44b2484ecd3642c6a47514f3876cf14a,5e165d6bad5e46dc80e5a004bb2760f7,0.8722


In [18]:
%%timeit
# Time the question recommendation for a professional, including the merge with raw question data.
predictor.find_ques_by_pro(pro_df, pro_tags, expand=True)

132 ms ± 1.42 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [19]:
# Line-by-line profile of find_ques_by_pro for the sample professional.
%lprun -f predictor.find_ques_by_pro predictor.find_ques_by_pro(pro_df, pro_tags, expand=True)