# Gathering Data

In [1]:
# Importing libs
import numpy as np
import pandas as pd

# all lightfm imports
from lightfm.data import Dataset
from lightfm import LightFM, cross_validation
from lightfm.evaluation import precision_at_k, auc_score # type: ignore

# imports re for text cleaning
import re
from datetime import datetime, timedelta

# we will ignore pandas warnings
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Load each CSV export found under base_path into its own DataFrame.
# Columns named in parse_dates are converted to datetimes while reading;
# every other column keeps the dtype pandas infers.
base_path = './input/'

df_answer_scores = pd.read_csv(base_path + 'answer_scores.csv')
df_answers = pd.read_csv(base_path + 'answers.csv', parse_dates=['answers_date_added'])
df_comments = pd.read_csv(base_path + 'comments.csv')
df_emails = pd.read_csv(base_path + 'emails.csv')
df_group_memberships = pd.read_csv(base_path + 'group_memberships.csv')
df_groups = pd.read_csv(base_path + 'groups.csv')
df_matches = pd.read_csv(base_path + 'matches.csv')
df_professionals = pd.read_csv(base_path + 'professionals.csv', parse_dates=['professionals_date_joined'])
df_question_scores = pd.read_csv(base_path + 'question_scores.csv')
df_questions = pd.read_csv(base_path + 'questions.csv', parse_dates=['questions_date_added'])
df_school_memberships = pd.read_csv(base_path + 'school_memberships.csv')
df_students = pd.read_csv(base_path + 'students.csv', parse_dates=['students_date_joined'])
df_tag_questions = pd.read_csv(base_path + 'tag_questions.csv')
df_tag_users = pd.read_csv(base_path + 'tag_users.csv')
df_tags = pd.read_csv(base_path + 'tags.csv')

In [3]:
df_answer_scores.head()

Unnamed: 0,id,score
0,7b2bb0fc0d384e298cffa6afde9cf6ab,1
1,7640a6e5d5224c8681cc58de860858f4,5
2,3ce32e236fa9435183b2180fb213375c,2
3,fa30fe4c016043e382c441a7ef743bfb,0
4,71229eb293314c8a9e545057ecc32c93,2


In [4]:
df_answers.head()

Unnamed: 0,answers_id,answers_author_id,answers_question_id,answers_date_added,answers_body
0,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,332a511f1569444485cf7a7a556a5e54,2016-04-29 19:40:14+00:00,<p>Hi!</p>\n<p>You are asking a very interesti...
1,ada720538c014e9b8a6dceed09385ee3,2aa47af241bf42a4b874c453f0381bd4,eb80205482e4424cad8f16bc25aa2d9c,2018-05-01 14:19:08+00:00,<p>Hi. I joined the Army after I attended coll...
2,eaa66ef919bc408ab5296237440e323f,cbd8f30613a849bf918aed5c010340be,eb80205482e4424cad8f16bc25aa2d9c,2018-05-02 02:41:02+00:00,"<p>Dear Priyanka,</p><p>Greetings! I have answ..."
3,1a6b3749d391486c9e371fbd1e605014,7e72a630c303442ba92ff00e8ea451df,4ec31632938a40b98909416bdd0decff,2017-05-10 19:00:47+00:00,<p>I work for a global company who values high...
4,5229c514000446d582050f89ebd4e184,17802d94699140b0a0d2995f30c034c6,2f6a9a99d9b24e5baa50d40d0ba50a75,2017-10-13 22:07:33+00:00,I agree with Denise. Every single job I've had...


In [5]:
df_comments.head()

Unnamed: 0,comments_id,comments_author_id,comments_parent_content_id,comments_date_added,comments_body
0,f30250d3c2ca489db1afa9b95d481e08,9fc88a7c3323466dbb35798264c7d497,b476f9c6d9cd4c50a7bacdd90edd015a,2019-01-31 23:39:40 UTC+0000,"First, you speak to recruiters. They are train..."
1,ca9bfc4ba9464ea383a8b080301ad72c,de2415064b9b445c8717425ed70fd99a,ef4b6ae24d1f4c3b977731e8189c7fd7,2019-01-31 20:30:47 UTC+0000,Most large universities offer study abroad pro...
2,c354f6e33956499aa8b03798a60e9386,6ed20605002a42b0b8e3d6ac97c50c7f,ca7a9d7a95df471c816db82ee758f57d,2019-01-31 18:44:04 UTC+0000,"First, I want to put you at ease that the oppo..."
3,73a6223948714c5da6231937157e4cb7,d02f6d9faac24997a7003a59e5f34bd3,c7a88aa76f5f49b0830bfeb46ba17e4d,2019-01-31 17:53:28 UTC+0000,Your question submission was great! I just wan...
4,55a89a9061d44dd19569c45f90a22779,e78f75c543e84e1c94da1801d8560f65,c7a88aa76f5f49b0830bfeb46ba17e4d,2019-01-31 14:51:53 UTC+0000,Thank you. I'm new to this site. I'm sorry if ...


# Defining our necessary funcs

In [6]:
def generate_int_id(dataframe, id_col_name):
    """
    Attach a sequential integer id column (0 .. len-1) to a dataframe.

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe for Users or Q&A. It is not modified.
    id_col_name : String
        Name of the integer id column to create.

    Returns
    -------
    Dataframe
        A new dataframe with a fresh 0..n-1 index and the id column.
        If ``id_col_name`` already exists its values are overwritten, so
        applying the function twice gives the same result as applying it once.
    """
    # Write the column directly under its final name. Going through a
    # placeholder column and renaming it could clobber a real column with the
    # placeholder's name, or leave two columns with the same label when the
    # target column already exists (e.g. when a notebook cell is re-run).
    return dataframe.assign(
        **{id_col_name: np.arange(len(dataframe))}
    ).reset_index(drop=True)

def create_features(dataframe, features_name, id_col_name):
    """
    Pair every row id with its list of feature tokens for lightfm.

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe which contains the feature columns and the id column
    features_name : List
        Names of the feature columns to read from the dataframe
    id_col_name: String
        Name of the column holding the id each feature list maps to.
        In this notebook it is one of:
        1. questions_id_num
        2. professionals_id_num

    Returns
    -------
    List
        One tuple per row, in row order, shaped as
        (id, ['feature_1', 'feature_2', 'feature_3'])
        Ex. -> (1, ['military', 'army', '5'])
    """

    def row_to_text(row):
        # stringify every cell of the row and glue the pieces with commas
        return ','.join(row.map(str))

    joined_rows = dataframe[features_name].apply(row_to_text, axis=1)
    # splitting the joined text also breaks up cells that already held
    # comma separated values (e.g. a tag list stored as a single string)
    token_lists = joined_rows.str.split(',')
    ids = dataframe[id_col_name]
    return [(row_id, tokens) for row_id, tokens in zip(ids, token_lists)]

def generate_feature_list(dataframe, features_name):
    """
    Generate the flat list of feature tokens used for lightfm's mapping.

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe for Users or Q&A.
    features_name: List
        List of feature columns name available in dataframe.

    Returns
    -------
    Pandas Series
        Every feature token found in the selected columns, one per entry,
        in row order and with a fresh 0..n-1 index. Duplicates are kept.
    """
    # An empty frame has no rows to join; return an empty result instead of
    # failing on the string accessor further down.
    if len(dataframe) == 0:
        return pd.Series([], dtype=object)

    # stringify each row's cells and join them with commas, then split again
    # so cells that already hold comma separated values are broken up too
    features = dataframe[features_name].apply(
        lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')

    # explode() turns each list element into its own entry directly, without
    # first building a wide, NaN-padded frame with one Series per row.
    return features.explode().reset_index(drop=True)

def calculate_auc_score(lightfm_model, interactions_matrix,
                        question_features, professional_features,
                        train_interactions=None, num_threads=4):
    """
    Measure the ROC AUC metric for a model.
    A perfect score is 1.0.

    Parameters
    ----------
    lightfm_model: LightFM model
        A fitted lightfm model
    interactions_matrix:
        A lightfm interactions matrix to evaluate on
    question_features, professional_features:
        Lightfm item and user features, passed as ``item_features`` and
        ``user_features`` respectively
    train_interactions: optional
        Interactions the model was trained on. When given, they are forwarded
        to ``auc_score`` so they can be excluded from the evaluation of a
        held-out ``interactions_matrix``. Defaults to None.
    num_threads: int, optional
        Number of threads used for the evaluation. Defaults to 4.

    Returns
    -------
    Float
        Mean of the AUC scores returned by ``auc_score``
    """
    per_user_auc = auc_score(
        lightfm_model, interactions_matrix,
        train_interactions=train_interactions,
        item_features=question_features,
        user_features=professional_features,
        num_threads=num_threads)
    return per_user_auc.mean()

# Data Preprocessing and feature creation

In [7]:
# generating unique integer id for users and q&a
# Each frame is rebound to the result of generate_int_id, which carries an
# extra integer id column; the *_id_num columns are the ids handed to lightfm
# later on in place of the original string ids.
df_professionals = generate_int_id(df_professionals, 'professionals_id_num')
df_students = generate_int_id(df_students, 'students_id_num')
df_questions = generate_int_id(df_questions, 'questions_id_num')
df_answers = generate_int_id(df_answers, 'answers_id_num')

**Merging Datasets**

In [8]:
# merging dataset
# This cell rebinds df_tags, df_questions and df_professionals in place, so
# it is meant to be run exactly once after the loading cell.

# drop tag rows with missing values, then remove '#' characters from tag names
df_tags = df_tags.dropna()
df_tags['tags_tag_name'] = df_tags['tags_tag_name'].str.replace('#', '')

# merge tag_questions with tags name
# then group all tags for each question into single rows
# (one comma separated string of tag names per question id)
df_tags_question = df_tag_questions.merge(
    df_tags, how='inner',
    left_on='tag_questions_tag_id', right_on='tags_tag_id')
df_tags_question = df_tags_question.groupby(
    ['tag_questions_question_id'])['tags_tag_name'].apply(
        ','.join).reset_index()
df_tags_question = df_tags_question.rename(columns={'tags_tag_name': 'questions_tag_name'})

# merge tag_users with tags name
# then group all tags for each user into single rows
# after that rename the tag column name
df_tags_pro = df_tag_users.merge(
    df_tags, how='inner',
    left_on='tag_users_tag_id', right_on='tags_tag_id')
df_tags_pro = df_tags_pro.groupby(
    ['tag_users_user_id'])['tags_tag_name'].apply(
        ','.join).reset_index()
df_tags_pro = df_tags_pro.rename(columns={'tags_tag_name': 'professionals_tag_name'})

# attach the grouped tag strings to questions and professionals
# (left joins: rows without any tag keep NaN in the tag column)
df_questions = df_questions.merge(
    df_tags_question, how='left',
    left_on='questions_id', right_on='tag_questions_question_id')
df_professionals = df_professionals.merge(
    df_tags_pro, how='left',
    left_on='professionals_id', right_on='tag_users_user_id')

# merge questions with scores (adds the 'id' and 'score' columns)
df_questions = df_questions.merge(
    df_question_scores, how='left',
    left_on='questions_id', right_on='id')
# merge questions with students (the question author's student record)
df_questions = df_questions.merge(
    df_students, how='left',
    left_on='questions_author_id', right_on='students_id')

# merge answers with questions
# then merge professionals and questions score with that
# inner joins: only answers whose question and author are both found remain
df_merge = df_answers.merge(
    df_questions, how='inner',
    left_on='answers_question_id', right_on='questions_id')
df_merge = df_merge.merge(
    df_professionals, how='inner',
    left_on='answers_author_id', right_on='professionals_id')
# NOTE(review): df_questions already carries 'id'/'score' from the score
# merge above, so this second merge yields the suffixed id_x/score_x and
# id_y/score_y columns, and being an inner join it also drops answers whose
# question has no score row - confirm this is intended.
df_merge = df_merge.merge(
    df_question_scores, how='inner',
    left_on='questions_id', right_on='id')

**Generate some features**

In [9]:
# Derive per-row count features on df_merge; they are candidates for the
# weights used together with the interaction matrix.

# how many rows share the same grouping key (non-null count of counted_col)
for count_col, group_key, counted_col in (
    ('num_of_ans_by_professional', 'answers_author_id', 'questions_id'),
    ('num_ans_per_ques', 'questions_id', 'answers_id'),
):
    df_merge[count_col] = df_merge.groupby([group_key])[counted_col].transform('count')

# how many comma separated tags each tag string holds (NaN stays NaN)
for length_col, tag_col in (
    ('num_tags_professional', 'professionals_tag_name'),
    ('num_tags_question', 'questions_tag_name'),
):
    df_merge[length_col] = df_merge[tag_col].str.split(",").str.len()

In [10]:
df_merge.head()

Unnamed: 0,answers_id,answers_author_id,answers_question_id,answers_date_added,answers_body,answers_id_num,questions_id,questions_author_id,questions_date_added,questions_title,...,professionals_date_joined,professionals_id_num,tag_users_user_id,professionals_tag_name,id_y,score_y,num_of_ans_by_professional,num_ans_per_ques,num_tags_professional,num_tags_question
0,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,332a511f1569444485cf7a7a556a5e54,2016-04-29 19:40:14+00:00,<p>Hi!</p>\n<p>You are asking a very interesti...,0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26+00:00,Teacher career question,...,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",332a511f1569444485cf7a7a556a5e54,1,1710,1,12.0,3.0
1,f3519ab99a1a4a13a8a9ecb814287d2a,36ff3b3666df400f956f8335cf53e09e,0f1d6a4f276c4a05878dd48e03e52289,2016-07-31 15:35:54+00:00,<p>Hi Rodrigo!</p>\n<p>The important thing to ...,11,0f1d6a4f276c4a05878dd48e03e52289,585ac233015447cc9e9a217044e515e1,2016-05-19 22:16:25+00:00,what kind of college could i go to for a soc...,...,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",0f1d6a4f276c4a05878dd48e03e52289,1,1710,1,12.0,3.0
2,825f6e316a5f48328d6f8af831df9940,36ff3b3666df400f956f8335cf53e09e,0149c6d63e214040b44d4a3789bb00ba,2018-04-15 23:08:46+00:00,<p>Congratulations on being interested in find...,71,0149c6d63e214040b44d4a3789bb00ba,34217a1861d640a58c85e033414cf9cb,2018-04-12 17:13:45+00:00,What is the best way to prepare for studying e...,...,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",0149c6d63e214040b44d4a3789bb00ba,2,1710,2,12.0,3.0
3,fb2c794175304c4caeb55e654270421f,a32736b04c27437da3078374d47af1b1,0149c6d63e214040b44d4a3789bb00ba,2018-04-13 18:18:05+00:00,<p>Hi Elisabeth! </p><p><br></p><p>If you are ...,72,0149c6d63e214040b44d4a3789bb00ba,34217a1861d640a58c85e033414cf9cb,2018-04-12 17:13:45+00:00,What is the best way to prepare for studying e...,...,2018-04-13 17:48:09+00:00,18373,a32736b04c27437da3078374d47af1b1,computer-software,0149c6d63e214040b44d4a3789bb00ba,2,1,2,1.0,3.0
4,f3fc23809cda472780fc565334f35000,36ff3b3666df400f956f8335cf53e09e,acc611cfb5c44daa8a3d7d65dfffa5ff,2018-08-14 10:37:01+00:00,<p>The most important thing that you can do is...,102,acc611cfb5c44daa8a3d7d65dfffa5ff,5b751a8ee4a047f7a08ce9eb5e43e5a2,2018-08-14 04:49:33+00:00,How should I prepare myself for my job search ...,...,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",acc611cfb5c44daa8a3d7d65dfffa5ff,1,1710,1,12.0,4.0


In [11]:
# Report the largest value of each count feature computed on df_merge
for label, column in (
    ("Maximum number of answer per question : ", 'num_ans_per_ques'),
    ("Maximum number of tags per professional : ", 'num_tags_professional'),
    ("Maximum number of tags per question : ", 'num_tags_question'),
):
    print(label + str(df_merge[column].max()))

Maximum number of answer per question : 58
Maximum number of tags per professional : 82.0
Maximum number of tags per question : 54.0


**Merge answered questions tags with professional's tag**

In [12]:
# Merge professionals previous answered
# questions tags into professionals tags

# select professionals answered questions tags
# and stored as a dataframe
professionals_prev_ans_tags = df_merge[['professionals_id', 'questions_tag_name']]

# drop null values from that
professionals_prev_ans_tags = professionals_prev_ans_tags.dropna()

# because professionals answers multiple questions,
# we group all of tags of each user into single row
professionals_prev_ans_tags = professionals_prev_ans_tags.groupby(['professionals_id'])['questions_tag_name'].apply(','.join).reset_index()

# drop duplicates tags from each professionals rows
# dict.fromkeys keeps the first occurrence of every tag in its original
# position; going through a set would make the tag order depend on Python's
# per-process string hashing and therefore change from one kernel to the next
professionals_prev_ans_tags['questions_tag_name'] = (
    professionals_prev_ans_tags['questions_tag_name']
    .str.split(',')
    .apply(lambda tags: ','.join(dict.fromkeys(tags)))
)

# finally merge the dataframe with professionals dataframe
# a 'questions_tag_name' column left over from a previous run of this cell is
# dropped first, otherwise the merge would produce _x/_y suffixed columns and
# the column selection below would fail
df_professionals = df_professionals.drop(
    columns=['questions_tag_name'], errors='ignore'
).merge(professionals_prev_ans_tags, how='left', on='professionals_id')

# join professionals tags and their answered tags
# missing values are skipped, so a professional with neither gets ""
df_professionals['professional_all_tags'] = (
    df_professionals[['professionals_tag_name', 'questions_tag_name']].apply(
        lambda x: ','.join(x.dropna()),
        axis=1
    )
)

**Handling null and duplicates values**

In [13]:
# handling null values
df_questions['score'] = df_questions['score'].fillna(0)
df_questions['score'] = df_questions['score'].astype(int)
df_questions['questions_tag_name'] = df_questions['questions_tag_name'].fillna('No Tag')

# remove duplicates tags from each questions
# dict.fromkeys keeps the first occurrence of each tag in place; a set would
# make the tag order depend on Python's per-process string hashing, so the
# resulting strings would differ between kernel sessions
df_questions['questions_tag_name'] = (
    df_questions['questions_tag_name']
    .str.split(',')
    .apply(lambda tags: ','.join(dict.fromkeys(tags)))
)

# fill nan with 'No Tag' if any
df_professionals['professional_all_tags'] = df_professionals['professional_all_tags'].fillna('No Tag')

# replace "" with "No Tag", because previously we replace nan with ""
df_professionals['professional_all_tags'] = df_professionals['professional_all_tags'].replace('', 'No Tag')
df_professionals['professionals_location'] = df_professionals['professionals_location'].fillna('No Location')
df_professionals['professionals_industry'] = df_professionals['professionals_industry'].fillna('No Industry')

# remove duplicates tags from each professionals
# (same order-preserving de-duplication as for the questions above)
df_professionals['professional_all_tags'] = (
    df_professionals['professional_all_tags']
    .str.split(',')
    .apply(lambda tags: ','.join(dict.fromkeys(tags)))
)

# remove some null values from df_merge
df_merge['num_ans_per_ques']  = df_merge['num_ans_per_ques'].fillna(0)
df_merge['num_tags_professional'] = df_merge['num_tags_professional'].fillna(0)
df_merge['num_tags_question'] = df_merge['num_tags_question'].fillna(0)

In [14]:
df_merge.head()

Unnamed: 0,answers_id,answers_author_id,answers_question_id,answers_date_added,answers_body,answers_id_num,questions_id,questions_author_id,questions_date_added,questions_title,...,professionals_date_joined,professionals_id_num,tag_users_user_id,professionals_tag_name,id_y,score_y,num_of_ans_by_professional,num_ans_per_ques,num_tags_professional,num_tags_question
0,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,332a511f1569444485cf7a7a556a5e54,2016-04-29 19:40:14+00:00,<p>Hi!</p>\n<p>You are asking a very interesti...,0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26+00:00,Teacher career question,...,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",332a511f1569444485cf7a7a556a5e54,1,1710,1,12.0,3.0
1,f3519ab99a1a4a13a8a9ecb814287d2a,36ff3b3666df400f956f8335cf53e09e,0f1d6a4f276c4a05878dd48e03e52289,2016-07-31 15:35:54+00:00,<p>Hi Rodrigo!</p>\n<p>The important thing to ...,11,0f1d6a4f276c4a05878dd48e03e52289,585ac233015447cc9e9a217044e515e1,2016-05-19 22:16:25+00:00,what kind of college could i go to for a soc...,...,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",0f1d6a4f276c4a05878dd48e03e52289,1,1710,1,12.0,3.0
2,825f6e316a5f48328d6f8af831df9940,36ff3b3666df400f956f8335cf53e09e,0149c6d63e214040b44d4a3789bb00ba,2018-04-15 23:08:46+00:00,<p>Congratulations on being interested in find...,71,0149c6d63e214040b44d4a3789bb00ba,34217a1861d640a58c85e033414cf9cb,2018-04-12 17:13:45+00:00,What is the best way to prepare for studying e...,...,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",0149c6d63e214040b44d4a3789bb00ba,2,1710,2,12.0,3.0
3,fb2c794175304c4caeb55e654270421f,a32736b04c27437da3078374d47af1b1,0149c6d63e214040b44d4a3789bb00ba,2018-04-13 18:18:05+00:00,<p>Hi Elisabeth! </p><p><br></p><p>If you are ...,72,0149c6d63e214040b44d4a3789bb00ba,34217a1861d640a58c85e033414cf9cb,2018-04-12 17:13:45+00:00,What is the best way to prepare for studying e...,...,2018-04-13 17:48:09+00:00,18373,a32736b04c27437da3078374d47af1b1,computer-software,0149c6d63e214040b44d4a3789bb00ba,2,1,2,1.0,3.0
4,f3fc23809cda472780fc565334f35000,36ff3b3666df400f956f8335cf53e09e,acc611cfb5c44daa8a3d7d65dfffa5ff,2018-08-14 10:37:01+00:00,<p>The most important thing that you can do is...,102,acc611cfb5c44daa8a3d7d65dfffa5ff,5b751a8ee4a047f7a08ce9eb5e43e5a2,2018-08-14 04:49:33+00:00,How should I prepare myself for my job search ...,...,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",acc611cfb5c44daa8a3d7d65dfffa5ff,1,1710,1,12.0,4.0


# Building model in LightFM
In this step, we build our LightFM model using the LightFM Python library. First, we have to create a LightFM `Dataset` for our model. The LightFM `Dataset` class makes it easy to create the `interaction matrix`, the `weights` and the `user/item features`.

* `interaction matrix`: it is a matrix that contains user/item interactions or professional/question interactions.
* `weights`: the weights of the interactions in the interaction matrix. A lower weight gives less importance to that interaction.
* `user/item features`: user/item features supplied like this: `(user_id, ['feature_1', 'feature_2', 'feature_3'])`

If you want to know how the LightFM Python library's `Dataset` class works and how to use it, please see [Building LightFM Datasets](https://lyst.github.io/lightfm/docs/examples/dataset.html).

After that, we will build our LightFM model using the `LightFM` class, which makes it easy to define the model. Finally, we will train the model on our data.

**Creating features list for Dataset class**

In [16]:
# generating features list for mapping
# Flat series of every tag token found in the question / professional tag
# columns; they are passed to Dataset.fit later so lightfm knows the full
# feature vocabulary.
question_feature_list = generate_feature_list(
    df_questions,
    ['questions_tag_name'])

professional_feature_list = generate_feature_list(
    df_professionals,
    ['professional_all_tags'])

In [17]:
question_feature_list

0                     lecture
1                   professor
2                     college
3                    military
4                        army
                 ...         
77191         law-enforcement
77192                    java
77193    computer-engineering
77194        computer-science
77195             programming
Length: 77196, dtype: object

In [18]:
professional_feature_list

0                                        consulting
1                                            resume
2                                            No Tag
3                                          engineer
4                                       photography
                            ...                    
228831    indirect-sales-(channel)-and-direct-sales
228832                              contact-centers
228833                          women-in-leadership
228834                            staff-development
228835                                    mentoring
Length: 228836, dtype: object

In [19]:
# Interaction weight: the reciprocal of the number of answers the question
# received, so an answer to a heavily answered question weighs less.
df_merge['total_weights'] = 1 / df_merge['num_ans_per_ques']

# Build the (id, [features]) tuples that lightfm consumes, one per question
# and one per professional, and keep them next to the source rows.
df_questions['question_features'] = create_features(
    dataframe=df_questions,
    features_name=['questions_tag_name'],
    id_col_name='questions_id_num',
)

df_professionals['professional_features'] = create_features(
    dataframe=df_professionals,
    features_name=['professional_all_tags'],
    id_col_name='professionals_id_num',
)

In [20]:
df_questions.head()

Unnamed: 0,questions_id,questions_author_id,questions_date_added,questions_title,questions_body,questions_id_num,tag_questions_question_id,questions_tag_name,id,score,students_id,students_location,students_date_joined,students_id_num,question_features
0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26+00:00,Teacher career question,What is a maths teacher? what is a ma...,0,332a511f1569444485cf7a7a556a5e54,"lecture,professor,college",332a511f1569444485cf7a7a556a5e54,1,8f6f374ffd834d258ab69d376dd998f5,"Coimbatore, Tamil Nadu, India",2016-04-22 10:07:32+00:00,6890.0,"(0, [lecture, professor, college])"
1,eb80205482e4424cad8f16bc25aa2d9c,acccbda28edd4362ab03fb8b6fd2d67b,2016-05-20 16:48:25+00:00,I want to become an army officer. What can I d...,I am Priyanka from Bangalore . Now am in 10th ...,1,eb80205482e4424cad8f16bc25aa2d9c,"military,army",eb80205482e4424cad8f16bc25aa2d9c,5,acccbda28edd4362ab03fb8b6fd2d67b,"Providence, Rhode Island",2016-05-20 16:29:08+00:00,10189.0,"(1, [military, army])"
2,4ec31632938a40b98909416bdd0decff,f2c179a563024ccc927399ce529094b5,2017-02-08 19:13:38+00:00,Will going abroad for your first job increase ...,I'm planning on going abroad for my first job....,2,4ec31632938a40b98909416bdd0decff,"working-abroad,overseas",4ec31632938a40b98909416bdd0decff,2,f2c179a563024ccc927399ce529094b5,,2017-02-07 15:51:57+00:00,18023.0,"(2, [working-abroad, overseas])"
3,2f6a9a99d9b24e5baa50d40d0ba50a75,2c30ffba444e40eabb4583b55233a5a4,2017-09-01 14:05:32+00:00,To become a specialist in business management...,i hear business management is a hard way to ge...,3,2f6a9a99d9b24e5baa50d40d0ba50a75,"networking,business",2f6a9a99d9b24e5baa50d40d0ba50a75,2,2c30ffba444e40eabb4583b55233a5a4,"North Lauderdale, Florida",2017-09-01 14:02:02+00:00,20803.0,"(3, [networking, business])"
4,5af8880460c141dbb02971a1a8369529,aa9eb1a2ab184ebbb00dc01ab663428a,2017-09-01 02:36:54+00:00,Are there any scholarships out there for stude...,I'm trying to find scholarships for first year...,4,5af8880460c141dbb02971a1a8369529,"highschoolsenior,firstgeneration,college,schol...",5af8880460c141dbb02971a1a8369529,2,aa9eb1a2ab184ebbb00dc01ab663428a,"Tunnel Hill, Georgia",2017-09-01 02:29:06+00:00,20505.0,"(4, [highschoolsenior, firstgeneration, colleg..."


In [25]:
df_professionals.head()

Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num,tag_users_user_id,professionals_tag_name,questions_tag_name,professional_all_tags,professional_features
0,9ced4ce7519049c0944147afb75a8ce3,No Location,No Industry,,2011-10-05 20:35:19+00:00,0,,,"consulting,resume","consulting,resume","(0, [consulting, resume])"
1,f718dcf6d2ec4cb0a52a9db59d7f9e67,No Location,No Industry,,2011-10-05 20:49:21+00:00,1,,,,No Tag,"(1, [No Tag])"
2,0c673e046d824ec0ad0ebe012a0673e4,"New York, New York",No Industry,,2011-10-18 17:31:26+00:00,2,0c673e046d824ec0ad0ebe012a0673e4,"consulting,consulting,consulting,consulting,co...","engineer,photography,schools,sports-management...","engineer,photography,schools,sports-management...","(2, [engineer, photography, schools, sports-ma..."
3,977428d851b24183b223be0eb8619a8c,"Boston, Massachusetts",No Industry,,2011-11-09 20:39:29+00:00,3,,,"crime,public-defenders,neurosurgeon,law-enforc...","crime,public-defenders,neurosurgeon,law-enforc...","(3, [crime, public-defenders, neurosurgeon, la..."
4,e2d57e5041a44f489288397c9904c2b2,No Location,No Industry,,2011-12-10 22:14:44+00:00,4,,,,No Tag,"(4, [No Tag])"


**LightFM Dataset**: In this step we build the LightFM dataset. Then we build our interactions matrix, weights and professional/question features.

In [26]:
question_feature_list

0                     lecture
1                   professor
2                     college
3                    military
4                        army
                 ...         
77191         law-enforcement
77192                    java
77193    computer-engineering
77194        computer-science
77195             programming
Length: 77196, dtype: object

In [27]:
professional_feature_list

0                                        consulting
1                                            resume
2                                            No Tag
3                                          engineer
4                                       photography
                            ...                    
228831    indirect-sales-(channel)-and-direct-sales
228832                              contact-centers
228833                          women-in-leadership
228834                            staff-development
228835                                    mentoring
Length: 228836, dtype: object

In [28]:
len(set(df_professionals['professionals_id_num']))

28152

In [39]:
len(set(df_questions['questions_id_num']))

23931

In [46]:
df_merge.head()

Unnamed: 0,answers_id,answers_author_id,answers_question_id,answers_date_added,answers_body,answers_id_num,questions_id,questions_author_id,questions_date_added,questions_title,...,professionals_id_num,tag_users_user_id,professionals_tag_name,id_y,score_y,num_of_ans_by_professional,num_ans_per_ques,num_tags_professional,num_tags_question,total_weights
0,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,332a511f1569444485cf7a7a556a5e54,2016-04-29 19:40:14+00:00,<p>Hi!</p>\n<p>You are asking a very interesti...,0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26+00:00,Teacher career question,...,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",332a511f1569444485cf7a7a556a5e54,1,1710,1,12.0,3.0,1.0
1,f3519ab99a1a4a13a8a9ecb814287d2a,36ff3b3666df400f956f8335cf53e09e,0f1d6a4f276c4a05878dd48e03e52289,2016-07-31 15:35:54+00:00,<p>Hi Rodrigo!</p>\n<p>The important thing to ...,11,0f1d6a4f276c4a05878dd48e03e52289,585ac233015447cc9e9a217044e515e1,2016-05-19 22:16:25+00:00,what kind of college could i go to for a soc...,...,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",0f1d6a4f276c4a05878dd48e03e52289,1,1710,1,12.0,3.0,1.0
2,825f6e316a5f48328d6f8af831df9940,36ff3b3666df400f956f8335cf53e09e,0149c6d63e214040b44d4a3789bb00ba,2018-04-15 23:08:46+00:00,<p>Congratulations on being interested in find...,71,0149c6d63e214040b44d4a3789bb00ba,34217a1861d640a58c85e033414cf9cb,2018-04-12 17:13:45+00:00,What is the best way to prepare for studying e...,...,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",0149c6d63e214040b44d4a3789bb00ba,2,1710,2,12.0,3.0,0.5
3,fb2c794175304c4caeb55e654270421f,a32736b04c27437da3078374d47af1b1,0149c6d63e214040b44d4a3789bb00ba,2018-04-13 18:18:05+00:00,<p>Hi Elisabeth! </p><p><br></p><p>If you are ...,72,0149c6d63e214040b44d4a3789bb00ba,34217a1861d640a58c85e033414cf9cb,2018-04-12 17:13:45+00:00,What is the best way to prepare for studying e...,...,18373,a32736b04c27437da3078374d47af1b1,computer-software,0149c6d63e214040b44d4a3789bb00ba,2,1,2,1.0,3.0,0.5
4,f3fc23809cda472780fc565334f35000,36ff3b3666df400f956f8335cf53e09e,acc611cfb5c44daa8a3d7d65dfffa5ff,2018-08-14 10:37:01+00:00,<p>The most important thing that you can do is...,102,acc611cfb5c44daa8a3d7d65dfffa5ff,5b751a8ee4a047f7a08ce9eb5e43e5a2,2018-08-14 04:49:33+00:00,How should I prepare myself for my job search ...,...,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",acc611cfb5c44daa8a3d7d65dfffa5ff,1,1710,1,12.0,4.0,1.0


In [50]:
len(
    list(
        zip(
            df_merge['professionals_id_num'],
            df_merge['questions_id_num'],
            df_merge['total_weights']
        )
    )
)

50098

In [59]:
# Dataset building for lightFM

# define our dataset variable
# then we feed unique professionals and questions ids
# and item and professional feature list
# this will create lightfm internal mapping
#
# The ids are passed in ascending order on purpose: lightfm numbers ids in
# the order it first sees them, and recommend_questions hands the raw
# *_id_num values straight to model.predict as internal indices. Sorting makes
# "internal index == id_num" explicit instead of relying on the iteration
# order of a set (assumes the *_id_num values are the contiguous 0..n-1
# integers produced by generate_int_id).
dataset = Dataset()
dataset.fit(
    sorted(set(df_professionals['professionals_id_num'])),
    sorted(set(df_questions['questions_id_num'])),
    item_features=question_feature_list,
    user_features=professional_feature_list
)

# now we are building interactions matrix between professionals and questions.
# we are passing professional id, question id and weight as a tuple
# e.g -> pd.Series ((pro_id, question_id, weight), (pro_id, question_id, weight))
# then we use lightFM build in method for building interactions matrix
df_merge['author_question_id_tuple'] = list(
    zip(
        df_merge['professionals_id_num'],
        df_merge['questions_id_num'],
        df_merge['total_weights']
    )
)

interactions, weights = dataset.build_interactions(
    df_merge['author_question_id_tuple']
)

# now we are building our questions and professionals features
# in a way that lightFM understand.
# we are using lightFM build in method for building
# questions and professionals features
questions_features = dataset.build_item_features(
    df_questions['question_features']
)

professional_features = dataset.build_user_features(
    df_professionals['professional_features']
)

**Model building and training**

In [62]:
# Model building part

# hyper-parameters of the LightFM model
model_params = {
    'no_components': 150,
    'learning_rate': 0.05,
    'loss': 'warp',
    'random_state': 2019,
}

# training inputs: question (item) and professional (user) features,
# per-interaction weights and the training schedule
fit_params = {
    'item_features': questions_features,
    'user_features': professional_features,
    'sample_weight': weights,
    'epochs': 5,
    'num_threads': 4,
    'verbose': True,
}

# define the model, then fit it on the interactions matrix
model = LightFM(**model_params)
model.fit(interactions, **fit_params)

Epoch: 100%|██████████| 5/5 [00:21<00:00,  4.22s/it]


<lightfm.lightfm.LightFM at 0x32d6c41f0>

# Evaluating the performance of the model 
Now we have to evaluate our model to see its performance. No matter how good your model is, if you can't evaluate it correctly you can't improve or trust it. For recommendation problems there are few good evaluation metrics, but luckily LightFM provides a rich set of them. In this step, we will calculate the AUC score for our model.

**What is the AUC score in the LightFM library?**: It measures the ROC AUC metric for a model: the probability that a randomly chosen positive example has a higher score than a randomly chosen negative example. A perfect score is 1.0. 

Let's see what is our model score. 

In [63]:
# NOTE: `interactions` is the same matrix the model was fitted on, so this
# is a training-set AUC rather than a held-out estimate.
calculate_auc_score(model, interactions, questions_features, professional_features)

0.9132993

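Wow! That is really impressive. Our AUC is over 90 percent, which suggests that the overall quality of our model is very good. Keep in mind, however, that this score is computed on the same interactions the model was trained on, so it is an optimistic estimate; a held-out test split would give a more trustworthy number.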

**Make real recommendations**: We have seen how our model performs by looking at the AUC score. Now let's look at some real examples of recommendations.

In [64]:
from IPython.display import display_html
def display_side_by_side(*args):
    """
    Render several dataframes next to each other in the notebook output.

    Parameters
    ----------
    *args: Dataframe
        Dataframes to render, shown left to right in the given order.

    Returns
    -------
    None
        The combined HTML is sent to the notebook via ``display_html``.
    """
    html_str = ''.join(df.to_html() for df in args)
    # Only the opening <table ...> tags get the inline style. Replacing the
    # bare word 'table' would also rewrite the closing </table> tags and any
    # cell text that happens to contain "table" (e.g. "stable", "vegetable").
    display_html(
        html_str.replace('<table', '<table style="display:inline"'),
        raw=True)

def recommend_questions(professional_ids, num_previous=3, num_recommendations=8):
    """
    Show previously answered questions and new recommendations per professional.

    Relies on the notebook globals ``df_merge``, ``df_questions``,
    ``df_professionals``, ``model``, ``questions_features`` and
    ``professional_features``.

    Parameters
    ----------
    professional_ids: iterable of int
        ``professionals_id_num`` values to produce recommendations for.
    num_previous: int, optional
        How many previously answered questions are displayed. Defaults to 3.
    num_recommendations: int, optional
        How many top scored questions are displayed. Defaults to 8.

    Returns
    -------
    None
        Results are printed / displayed in the notebook output.
    """
    for professional in professional_ids:
        # every question this professional has already answered
        answered_q_id_num = df_merge.loc[
            df_merge['professionals_id_num'] == professional,
            'questions_id_num']

        # print their previous answered question title (a small sample only)
        previous_q_id_num = answered_q_id_num.iloc[:num_previous]
        df_previous_questions = df_questions.loc[df_questions['questions_id_num'].isin(previous_q_id_num)]
        print('Professional Id (' + str(professional) + "): Previous Answered Questions")
        display_side_by_side(
            df_previous_questions[['questions_title', 'question_features']],
            df_professionals.loc[df_professionals.professionals_id_num == professional][['professionals_id_num','professionals_tag_name']])

        # predict
        # exclude ALL answered questions from the candidates, not only the few
        # displayed above, so nothing already answered is recommended again;
        # .copy() gives an independent frame so adding the score column below
        # is not a chained assignment on a slice of df_questions
        df_use_for_prediction = df_questions.loc[
            ~df_questions['questions_id_num'].isin(answered_q_id_num)].copy()
        questions_id_for_predict = df_use_for_prediction['questions_id_num'].values.tolist()

        # NOTE(review): the raw *_id_num values are passed as lightfm internal
        # indices; this presumes the Dataset mapped every id to itself -
        # verify against the Dataset.fit call.
        scores = model.predict(
            professional,
            questions_id_for_predict,
            item_features=questions_features,
            user_features=professional_features)

        df_use_for_prediction['scores'] = scores
        df_use_for_prediction = df_use_for_prediction.sort_values(
            by='scores', ascending=False).iloc[:num_recommendations]
        print('Professional Id (' + str(professional) + "): Recommended Questions: ")
        display(df_use_for_prediction[['questions_title', 'question_features']])