In [None]:
!pip install zipfile36
!pip install lightfm

Collecting zipfile36
  Downloading https://files.pythonhosted.org/packages/fd/8a/3b7da0b0bd87d1ef05b74207827c72d348b56a0d6d83242582be18a81e02/zipfile36-0.1.3-py3-none-any.whl
Installing collected packages: zipfile36
Successfully installed zipfile36-0.1.3
Collecting lightfm
[?25l  Downloading https://files.pythonhosted.org/packages/e9/8e/5485ac5a8616abe1c673d1e033e2f232b4319ab95424b42499fabff2257f/lightfm-1.15.tar.gz (302kB)
[K     |████████████████████████████████| 307kB 4.5MB/s 
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.15-cp36-cp36m-linux_x86_64.whl size=707622 sha256=253f5a8e0aa53c5379d5fc7268db34404206b7023c13edc5ed4eb6a606cc8086
  Stored in directory: /root/.cache/pip/wheels/eb/bb/ac/188385a5da6627956be5d9663928483b36da576149ab5b8f79
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.15


In [None]:
from zipfile import ZipFile 

path = "/content/drive/My Drive/datasets/data-science-for-good-careervillage.zip"

# Open the dataset archive read-only; the context manager closes it afterwards.
with ZipFile(path, mode='r') as zipObj:
    # Unpack every member of the archive into the current working directory.
    zipObj.extractall()

In [None]:
import pandas as pd
import numpy as np

############################################
# Load every dataset CSV into its own pandas DataFrame.
############################################
base_path = '/content/'


def _load_table(file_name, date_columns=None):
    """Read one CSV located under ``base_path``.

    ``date_columns`` (optional list of column names) is forwarded to
    ``pd.read_csv`` as ``parse_dates``; when omitted, no date parsing
    is requested.
    """
    read_options = {'parse_dates': date_columns} if date_columns else {}
    return pd.read_csv(base_path + file_name, **read_options)


df_answer_scores = _load_table('answer_scores.csv')
df_answers = _load_table('answers.csv', ['answers_date_added'])
df_comments = _load_table('comments.csv')
df_emails = _load_table('emails.csv')
df_group_memberships = _load_table('group_memberships.csv')
df_groups = _load_table('groups.csv')
df_matches = _load_table('matches.csv')
df_professionals = _load_table(
    'professionals.csv', ['professionals_date_joined'])
df_question_scores = _load_table('question_scores.csv')
df_questions = _load_table('questions.csv', ['questions_date_added'])
df_school_memberships = _load_table('school_memberships.csv')
df_students = _load_table('students.csv', ['students_date_joined'])
df_tag_questions = _load_table('tag_questions.csv')
df_tag_users = _load_table('tag_users.csv')
df_tags = _load_table('tags.csv')

In [None]:
import lightfm.evaluation

def generate_int_id(dataframe, id_col_name):
    """
    Attach a sequential integer id column (0..n-1) to a dataframe.

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe for Users or Q&A. It is not modified.
    id_col_name : String
        Name of the integer id column to create.

    Returns
    -------
    Dataframe
        A new dataframe with a fresh 0..n-1 index and the id column
        holding each row's position. If ``id_col_name`` already exists
        (for example when the calling cell is re-run) the column is
        overwritten instead of being duplicated.
    """
    # reset_index returns a new frame, so the caller's dataframe is untouched.
    new_dataframe = dataframe.reset_index(drop=True)
    # Write the ids straight into the target column: no temporary column
    # name that could collide with real data, and no duplicate column names.
    new_dataframe[id_col_name] = np.arange(len(new_dataframe))
    return new_dataframe



def create_features(dataframe, features_name, id_col_name):
    """
    Pair every row's id with its list of feature tokens for lightfm.

    Parameters
    ----------
    -dataframe: Dataframe
        Pandas Dataframe which contains the feature columns
    -features_name : List
        Names of the feature columns to read from dataframe
    -id_col_name: String
        Name of the column holding the id each feature list maps to,
        e.g. questions_id_num or professionals_id_num

    Returns
    -------
    List of tuples
        One (id, [tokens]) tuple per row, in row order.
        The cells of the feature columns are converted to strings,
        joined with ',' and split on ',' again, so a cell that itself
        contains commas contributes several tokens.
        Ex. -> (1, ['military', 'army', '5'])
    """
    # One comma-joined string per row, covering all requested columns.
    stringified_rows = dataframe[features_name].apply(
        lambda row: ','.join(row.map(str)), axis=1)
    # Break each joined string back into its individual tokens.
    token_lists = stringified_rows.str.split(',')
    return [
        (row_id, tokens)
        for row_id, tokens in zip(dataframe[id_col_name], token_lists)
    ]



def generate_feature_list(dataframe, features_name):
    """
    Flatten the feature columns into one sequence of feature tokens.

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe for Users or Q&A.
    features_name : List
        List of feature columns name available in dataframe.

    Returns
    -------
    Pandas Series
        Every token of every row, in row order, on a fresh 0..n-1 index.
        Tokens are NOT de-duplicated. An empty dataframe yields an
        empty Series.
    """
    # A row-wise apply on an empty frame does not return a Series of strings,
    # so handle that case explicitly.
    if dataframe.empty:
        return pd.Series([], dtype=object)

    # One comma-joined string per row, covering all requested columns.
    joined_rows = dataframe[features_name].apply(
        lambda row: ','.join(row.map(str)), axis=1)
    # Flatten with a plain comprehension instead of apply(pd.Series).stack():
    # that idiom builds one Series per row plus a wide NaN-padded frame.
    tokens = [
        token
        for joined in joined_rows
        for token in joined.split(',')
    ]
    return pd.Series(tokens, dtype=object)


def calculate_auc_score(lightfm_model, interactions_matrix,
                        question_features, professional_features,
                        num_threads=4):
    """
    Measure the mean ROC AUC metric for a model.
    A perfect score is 1.0.

    Parameters
    ----------
    lightfm_model: LightFM model
        A fitted lightfm model
    interactions_matrix :
        A lightfm interactions matrix
    question_features, professional_features:
        Lightfm features; questions are passed as the item features
        and professionals as the user features
    num_threads : int, optional
        Forwarded to lightfm's auc_score as ``num_threads``.
        Defaults to 4, the value previously hard-coded here.

    Returns
    -------
    The mean of the values returned by lightfm.evaluation.auc_score
    """
    per_user_auc = lightfm.evaluation.auc_score(
        lightfm_model, interactions_matrix,
        item_features=question_features,
        user_features=professional_features,
        num_threads=num_threads)
    return per_user_auc.mean()

## **Data Preprocessing and feature creation**

**Generate numeric identifier:**
The LightFM Python library only accepts numeric ids, but the data CareerVillage has provided uses UUIDs to identify users, professionals and other records. In this step, I will create a unique integer identifier for each professional, student, question and answer.

In [None]:
# Give professionals, students, questions and answers a sequential integer id
# column; each frame is replaced by the copy that generate_int_id returns.
df_professionals, df_students, df_questions, df_answers = (
    generate_int_id(frame, id_column)
    for frame, id_column in (
        (df_professionals, 'professionals_id_num'),
        (df_students, 'students_id_num'),
        (df_questions, 'questions_id_num'),
        (df_answers, 'answers_id_num'),
    )
)

In [None]:
df_professionals.head()

Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num
0,9ced4ce7519049c0944147afb75a8ce3,,,,2011-10-05 20:35:19+00:00,0
1,f718dcf6d2ec4cb0a52a9db59d7f9e67,,,,2011-10-05 20:49:21+00:00,1
2,0c673e046d824ec0ad0ebe012a0673e4,"New York, New York",,,2011-10-18 17:31:26+00:00,2
3,977428d851b24183b223be0eb8619a8c,"Boston, Massachusetts",,,2011-11-09 20:39:29+00:00,3
4,e2d57e5041a44f489288397c9904c2b2,,,,2011-12-10 22:14:44+00:00,4


In [None]:
df_students.head()

Unnamed: 0,students_id,students_location,students_date_joined,students_id_num
0,12a89e96755a4dba83ff03e03043d9c0,,2011-12-16 14:19:24+00:00,0
1,e37a5990fe354c60be5e87376b08d5e3,,2011-12-27 03:02:44+00:00,1
2,12b402cceeda43dcb6e12ef9f2d221ea,,2012-01-01 05:00:00+00:00,2
3,a0f431fc79794edcb104f68ce55ab897,,2012-01-01 05:00:00+00:00,3
4,23aea4702d804bd88d1e9fb28074a1b4,,2012-01-01 05:00:00+00:00,4


# **Merging Datasets:** 
This is one of the most important steps of our solution. Our professionals, students, Q&A and tags are stored in separate datasets. For the model, we have to merge these datasets very carefully so that the result is useful for training.

All tags (q&a) are stored in a separate dataset. So firstly we merge those tags with questions and answers datasets.
Then, we merge answers with questions, because one question can have multiple answers.

In [None]:
###########################
# merging dataset
###########################

# Drop tag rows with missing values and remove the '#' characters from tag names.
# The cleaned column is built with assign() instead of being written into the
# frame returned by dropna(), which can trigger SettingWithCopyWarning.
df_tags = df_tags.dropna().assign(
    tags_tag_name=lambda tags: tags['tags_tag_name'].str.replace(
        '#', '', regex=False))


# merge tag_questions with tags name
# then group all tags for each question into single rows
df_tags_question = df_tag_questions.merge(
    df_tags, how='inner',
    left_on='tag_questions_tag_id', right_on='tags_tag_id')
df_tags_question = df_tags_question.groupby(
    ['tag_questions_question_id'])['tags_tag_name'].apply(
        ','.join).reset_index()
# ','.join sets the separator placed between the grouped values (it could be
# any string, e.g. a space or a comma): all tags that belong to one question id
# end up in a single row, separated by ','.
df_tags_question = df_tags_question.rename(columns={'tags_tag_name': 'questions_tag_name'})

# merge tag_users with tags name
# then group all tags for each user into single rows
# after that rename the tag column name
df_tags_pro = df_tag_users.merge(
    df_tags, how='inner',
    left_on='tag_users_tag_id', right_on='tags_tag_id')
df_tags_pro = df_tags_pro.groupby(
    ['tag_users_user_id'])['tags_tag_name'].apply(
        ','.join).reset_index()
# All tags related to one user id, even when they sit in separate rows,
# are merged into a single comma-separated row.
df_tags_pro = df_tags_pro.rename(columns={'tags_tag_name': 'professionals_tag_name'})


# attach the grouped tags to the questions and professionals frames
# (left merges keep questions / professionals that have no tags)
df_questions = df_questions.merge(
    df_tags_question, how='left',
    left_on='questions_id', right_on='tag_questions_question_id')
df_professionals = df_professionals.merge(
    df_tags_pro, how='left',
    left_on='professionals_id', right_on='tag_users_user_id')

# merge questions with scores
df_questions = df_questions.merge(
    df_question_scores, how='left',
    left_on='questions_id', right_on='id')
# merge questions with students
df_questions = df_questions.merge(
    df_students, how='left',
    left_on='questions_author_id', right_on='students_id')


# merge answers with questions
# then merge professionals and questions score with that
df_merge = df_answers.merge(
    df_questions, how='inner',
    left_on='answers_question_id', right_on='questions_id')
df_merge = df_merge.merge(
    df_professionals, how='inner',
    left_on='answers_author_id', right_on='professionals_id')
# NOTE(review): df_question_scores was already merged into df_questions above,
# so this second merge is what produces the id_x/score_x and id_y/score_y
# column pairs in df_merge. Kept because later cells may rely on those names.
df_merge = df_merge.merge(
    df_question_scores, how='inner',
    left_on='questions_id', right_on='id')

In [None]:
df_merge.head()

Unnamed: 0,answers_id,answers_author_id,answers_question_id,answers_date_added,answers_body,answers_id_num,questions_id,questions_author_id,questions_date_added,questions_title,questions_body,questions_id_num,tag_questions_question_id,questions_tag_name,id_x,score_x,students_id,students_location,students_date_joined,students_id_num,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num,tag_users_user_id,professionals_tag_name,id_y,score_y
0,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,332a511f1569444485cf7a7a556a5e54,2016-04-29 19:40:14+00:00,<p>Hi!</p>\n<p>You are asking a very interesti...,0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26+00:00,Teacher career question,What is a maths teacher? what is a ma...,0,332a511f1569444485cf7a7a556a5e54,"lecture,college,professor",332a511f1569444485cf7a7a556a5e54,1.0,8f6f374ffd834d258ab69d376dd998f5,"Coimbatore, Tamil Nadu, India",2016-04-22 10:07:32+00:00,6890.0,36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",332a511f1569444485cf7a7a556a5e54,1
1,f3519ab99a1a4a13a8a9ecb814287d2a,36ff3b3666df400f956f8335cf53e09e,0f1d6a4f276c4a05878dd48e03e52289,2016-07-31 15:35:54+00:00,<p>Hi Rodrigo!</p>\n<p>The important thing to ...,11,0f1d6a4f276c4a05878dd48e03e52289,585ac233015447cc9e9a217044e515e1,2016-05-19 22:16:25+00:00,what kind of college could i go to for a soc...,I like soccer because i been playing sense i w...,7,0f1d6a4f276c4a05878dd48e03e52289,"college,building,soccer",0f1d6a4f276c4a05878dd48e03e52289,1.0,585ac233015447cc9e9a217044e515e1,"Morgan Hill, California",2016-05-19 22:08:48+00:00,10014.0,36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",0f1d6a4f276c4a05878dd48e03e52289,1
2,825f6e316a5f48328d6f8af831df9940,36ff3b3666df400f956f8335cf53e09e,0149c6d63e214040b44d4a3789bb00ba,2018-04-15 23:08:46+00:00,<p>Congratulations on being interested in find...,71,0149c6d63e214040b44d4a3789bb00ba,34217a1861d640a58c85e033414cf9cb,2018-04-12 17:13:45+00:00,What is the best way to prepare for studying e...,"I am interested in Computational Neuroscience,...",33,0149c6d63e214040b44d4a3789bb00ba,"engineering,neuroscience,gradschool",0149c6d63e214040b44d4a3789bb00ba,2.0,34217a1861d640a58c85e033414cf9cb,"Austin, Texas",2018-04-12 17:09:31+00:00,26796.0,36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",0149c6d63e214040b44d4a3789bb00ba,2
3,fb2c794175304c4caeb55e654270421f,a32736b04c27437da3078374d47af1b1,0149c6d63e214040b44d4a3789bb00ba,2018-04-13 18:18:05+00:00,<p>Hi Elisabeth! </p><p><br></p><p>If you are ...,72,0149c6d63e214040b44d4a3789bb00ba,34217a1861d640a58c85e033414cf9cb,2018-04-12 17:13:45+00:00,What is the best way to prepare for studying e...,"I am interested in Computational Neuroscience,...",33,0149c6d63e214040b44d4a3789bb00ba,"engineering,neuroscience,gradschool",0149c6d63e214040b44d4a3789bb00ba,2.0,34217a1861d640a58c85e033414cf9cb,"Austin, Texas",2018-04-12 17:09:31+00:00,26796.0,a32736b04c27437da3078374d47af1b1,"San Francisco, California",Computer Software,Product Management @ Okta,2018-04-13 17:48:09+00:00,18373,a32736b04c27437da3078374d47af1b1,computer-software,0149c6d63e214040b44d4a3789bb00ba,2
4,f3fc23809cda472780fc565334f35000,36ff3b3666df400f956f8335cf53e09e,acc611cfb5c44daa8a3d7d65dfffa5ff,2018-08-14 10:37:01+00:00,<p>The most important thing that you can do is...,102,acc611cfb5c44daa8a3d7d65dfffa5ff,5b751a8ee4a047f7a08ce9eb5e43e5a2,2018-08-14 04:49:33+00:00,How should I prepare myself for my job search ...,"I am a Sociology, Political Science, and Inter...",47,acc611cfb5c44daa8a3d7d65dfffa5ff,"job-search,career-choice,job,college-jobs",acc611cfb5c44daa8a3d7d65dfffa5ff,1.0,5b751a8ee4a047f7a08ce9eb5e43e5a2,"Kingston, Pennsylvania",2018-08-14 04:47:13+00:00,28533.0,36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",acc611cfb5c44daa8a3d7d65dfffa5ff,1


**Generate some features:** In this step, we generate some features: the number of answers by each professional, the number of answers on each question, the number of tags per professional and the number of tags per question. I will not use all of these features in this model, but I will use the number of answers per question to weight the interactions, so that the model pays less attention to questions that already have a high number of answers.

In [None]:
#######################
# Derive count features on a copy of df_merge; num_ans_per_ques is later
# used to weight the interaction matrix.
#######################
original = df_merge.assign(
    # how many answer rows each professional (answer author) has
    num_of_ans_by_professional=lambda frame: frame.groupby(
        ['answers_author_id'])['questions_id'].transform('count'),
    # how many answers each question has
    num_ans_per_ques=lambda frame: frame.groupby(
        ['questions_id'])['answers_id'].transform('count'),
    # how many tags each professional has, counted from the
    # comma-separated professionals_tag_name string
    num_tags_professional=lambda frame: frame[
        'professionals_tag_name'].str.split(",").str.len(),
    # how many tags each question has, counted from the
    # comma-separated questions_tag_name string
    num_tags_question=lambda frame: frame[
        'questions_tag_name'].str.split(",").str.len(),
)


In [None]:
original.head()

Unnamed: 0,answers_id,answers_author_id,answers_question_id,answers_date_added,answers_body,answers_id_num,questions_id,questions_author_id,questions_date_added,questions_title,questions_body,questions_id_num,tag_questions_question_id,questions_tag_name,id_x,score_x,students_id,students_location,students_date_joined,students_id_num,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num,tag_users_user_id,professionals_tag_name,id_y,score_y,num_of_ans_by_professional,num_ans_per_ques,num_tags_professional,num_tags_question
0,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,332a511f1569444485cf7a7a556a5e54,2016-04-29 19:40:14+00:00,<p>Hi!</p>\n<p>You are asking a very interesti...,0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26+00:00,Teacher career question,What is a maths teacher? what is a ma...,0,332a511f1569444485cf7a7a556a5e54,"lecture,college,professor",332a511f1569444485cf7a7a556a5e54,1.0,8f6f374ffd834d258ab69d376dd998f5,"Coimbatore, Tamil Nadu, India",2016-04-22 10:07:32+00:00,6890.0,36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",332a511f1569444485cf7a7a556a5e54,1,1710,1,12.0,3.0
1,f3519ab99a1a4a13a8a9ecb814287d2a,36ff3b3666df400f956f8335cf53e09e,0f1d6a4f276c4a05878dd48e03e52289,2016-07-31 15:35:54+00:00,<p>Hi Rodrigo!</p>\n<p>The important thing to ...,11,0f1d6a4f276c4a05878dd48e03e52289,585ac233015447cc9e9a217044e515e1,2016-05-19 22:16:25+00:00,what kind of college could i go to for a soc...,I like soccer because i been playing sense i w...,7,0f1d6a4f276c4a05878dd48e03e52289,"college,building,soccer",0f1d6a4f276c4a05878dd48e03e52289,1.0,585ac233015447cc9e9a217044e515e1,"Morgan Hill, California",2016-05-19 22:08:48+00:00,10014.0,36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",0f1d6a4f276c4a05878dd48e03e52289,1,1710,1,12.0,3.0
2,825f6e316a5f48328d6f8af831df9940,36ff3b3666df400f956f8335cf53e09e,0149c6d63e214040b44d4a3789bb00ba,2018-04-15 23:08:46+00:00,<p>Congratulations on being interested in find...,71,0149c6d63e214040b44d4a3789bb00ba,34217a1861d640a58c85e033414cf9cb,2018-04-12 17:13:45+00:00,What is the best way to prepare for studying e...,"I am interested in Computational Neuroscience,...",33,0149c6d63e214040b44d4a3789bb00ba,"engineering,neuroscience,gradschool",0149c6d63e214040b44d4a3789bb00ba,2.0,34217a1861d640a58c85e033414cf9cb,"Austin, Texas",2018-04-12 17:09:31+00:00,26796.0,36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",0149c6d63e214040b44d4a3789bb00ba,2,1710,2,12.0,3.0
3,fb2c794175304c4caeb55e654270421f,a32736b04c27437da3078374d47af1b1,0149c6d63e214040b44d4a3789bb00ba,2018-04-13 18:18:05+00:00,<p>Hi Elisabeth! </p><p><br></p><p>If you are ...,72,0149c6d63e214040b44d4a3789bb00ba,34217a1861d640a58c85e033414cf9cb,2018-04-12 17:13:45+00:00,What is the best way to prepare for studying e...,"I am interested in Computational Neuroscience,...",33,0149c6d63e214040b44d4a3789bb00ba,"engineering,neuroscience,gradschool",0149c6d63e214040b44d4a3789bb00ba,2.0,34217a1861d640a58c85e033414cf9cb,"Austin, Texas",2018-04-12 17:09:31+00:00,26796.0,a32736b04c27437da3078374d47af1b1,"San Francisco, California",Computer Software,Product Management @ Okta,2018-04-13 17:48:09+00:00,18373,a32736b04c27437da3078374d47af1b1,computer-software,0149c6d63e214040b44d4a3789bb00ba,2,1,2,1.0,3.0
4,f3fc23809cda472780fc565334f35000,36ff3b3666df400f956f8335cf53e09e,acc611cfb5c44daa8a3d7d65dfffa5ff,2018-08-14 10:37:01+00:00,<p>The most important thing that you can do is...,102,acc611cfb5c44daa8a3d7d65dfffa5ff,5b751a8ee4a047f7a08ce9eb5e43e5a2,2018-08-14 04:49:33+00:00,How should I prepare myself for my job search ...,"I am a Sociology, Political Science, and Inter...",47,acc611cfb5c44daa8a3d7d65dfffa5ff,"job-search,career-choice,job,college-jobs",acc611cfb5c44daa8a3d7d65dfffa5ff,1.0,5b751a8ee4a047f7a08ce9eb5e43e5a2,"Kingston, Pennsylvania",2018-08-14 04:47:13+00:00,28533.0,36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",acc611cfb5c44daa8a3d7d65dfffa5ff,1,1710,1,12.0,4.0


In [None]:
# Report the largest value of each derived count feature.
print(f"Maximum number of answer per question : {original['num_ans_per_ques'].max()!s}")
print(f"Maximum number of tags per professional : {original['num_tags_professional'].max()!s}")
print(f"Maximum number of tags per question : {original['num_tags_question'].max()!s}")

Maximum number of answer per question : 58
Maximum number of tags per professional : 82.0
Maximum number of tags per question : 54.0


**Merge answered questions tags with professional's tags:**
Professionals can follow tags, but not every professional does, and the EDA shows that professionals sometimes answer questions that are not related to the tags they follow. For that reason, I merge the tags of the questions each professional has answered with that professional's own tags. This makes our model more robust and context aware.

In [None]:
########################
# Merge professionals previous answered
# questions tags into professionals tags
########################
# Example: a data-science professional may follow the data-science tag and also
# have answered questions carrying data-science related tags; the tags of every
# question they answered are collected and added to their own tags.

# select professionals answered questions tags
# and stored as a dataframe
professionals_prev_ans_tags = original[['professionals_id', 'questions_tag_name']]
# drop null values from that
professionals_prev_ans_tags = professionals_prev_ans_tags.dropna()
# because professionals answer multiple questions,
# we group all of tags of each user into single row
professionals_prev_ans_tags = professionals_prev_ans_tags.groupby(
    ['professionals_id'])['questions_tag_name'].apply(
        ','.join).reset_index()
# One professional id has answered many questions, each with its own tags;
# the tags of all those answered questions are now joined into one string.

# drop duplicate tags from each professional's row.
# sorted(set(...)) instead of a bare set: the iteration order of a set of
# strings can change between kernel restarts, which made the tag order (and
# everything derived from it) differ from run to run.
professionals_prev_ans_tags['questions_tag_name'] = (
    professionals_prev_ans_tags['questions_tag_name'].str.split(',').apply(
        lambda tags: ','.join(sorted(set(tags)))))

# finally merge the dataframe with professionals dataframe
df_professionals = df_professionals.merge(professionals_prev_ans_tags, how='left', on='professionals_id')

prof_copy = df_professionals.copy()
# join professionals tags and their answered tags;
# missing values are dropped before joining, so a professional with
# neither kind of tag ends up with ""
prof_copy['professional_all_tags'] = (
    prof_copy[['professionals_tag_name', 'questions_tag_name']].apply(
        lambda x: ','.join(x.dropna()),
        axis=1))

In [None]:
prof_copy.head()

Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num,tag_users_user_id,professionals_tag_name,questions_tag_name,professional_all_tags
0,9ced4ce7519049c0944147afb75a8ce3,,,,2011-10-05 20:35:19+00:00,0,,,"resume,consulting","resume,consulting"
1,f718dcf6d2ec4cb0a52a9db59d7f9e67,,,,2011-10-05 20:49:21+00:00,1,,,,
2,0c673e046d824ec0ad0ebe012a0673e4,"New York, New York",,,2011-10-18 17:31:26+00:00,2,0c673e046d824ec0ad0ebe012a0673e4,"consulting,consulting,consulting,consulting,co...","daycare,steps,veterinarian,money,doctor,colleg...","consulting,consulting,consulting,consulting,co..."
3,977428d851b24183b223be0eb8619a8c,"Boston, Massachusetts",,,2011-11-09 20:39:29+00:00,3,,,"math,public-defenders,justice,college,science,...","math,public-defenders,justice,college,science,..."
4,e2d57e5041a44f489288397c9904c2b2,,,,2011-12-10 22:14:44+00:00,4,,,,


**Handling null and duplicate values:** Now we want to clean our data a little. We handle null and duplicate values because, if we do not remove them, they will cause errors and wrong predictions. We also replace null values with a generic name or value.

In [None]:
# handling null values
df_questions['score'] = df_questions['score'].fillna(0)
df_questions['score'] = df_questions['score'].astype(int)
df_questions['questions_tag_name'] = df_questions['questions_tag_name'].fillna('No Tag')
# remove duplicate tags from each question.
# sorted(set(...)) keeps the result identical across kernel restarts; a bare
# set of strings has no stable iteration order.
df_questions['questions_tag_name'] = df_questions['questions_tag_name'].str.split(',').apply(
    lambda tags: ','.join(sorted(set(tags))))


# fill nan with 'No Tag' if any
prof_copy['professional_all_tags'] = prof_copy['professional_all_tags'].fillna('No Tag')
# replace "" with "No Tag": professionals without any tag got "" when the
# tag columns were joined
prof_copy['professional_all_tags'] = prof_copy['professional_all_tags'].replace('', 'No Tag')
prof_copy['professionals_location'] = prof_copy['professionals_location'].fillna('No Location')
prof_copy['professionals_industry'] = prof_copy['professionals_industry'].fillna('No Industry')

# remove duplicate tags from each professional (same deterministic ordering as above)
prof_copy['professional_all_tags'] = prof_copy['professional_all_tags'].str.split(',').apply(
    lambda tags: ','.join(sorted(set(tags))))



# fill remaining nulls in the derived count columns of `original` with 0
original['num_ans_per_ques']  = original['num_ans_per_ques'].fillna(0)
original['num_tags_professional'] = original['num_tags_professional'].fillna(0)
original['num_tags_question'] = original['num_tags_question'].fillna(0)

# **Building model in LightFM**
In this step, we build our LightFM model using the LightFM Python library. First, we have to create a LightFM Dataset for our model. The LightFM Dataset class makes it easy to create the interaction matrix, the weights and the user/item features.

interaction matrix: a matrix that contains the user/item interactions, here the professional/question interactions.
weights: the weights of the interaction matrix. A lower weight means the corresponding interaction is given less importance.
user/item features: user/item features supplied as like this (user_id, ['feature_1', 'feature_2', 'feature_3'])
If you want to know how the LightFM Python library's Dataset class works and how to use it, please see this link: Building LightFM Datasets.

After that, we start building our LightFM model using the LightFM class, which makes it easy to create a LightFM model. Finally, we train the model on our data.

Creating features list for Dataset class: The LightFM library has a Dataset class that makes it easy to build the information the model needs. But we have to feed it the set of all unique professional/question ids and the lists of all question and professional features. This creates the internal mapping that LightFM uses.

**Creating features list for Dataset class:** 
The LightFM library has a Dataset class that makes it easy to build the information the model needs. But we have to feed it the set of all unique professional/question ids and the lists of all question and professional features. This creates the internal mapping that LightFM uses.

In [None]:
# Flatten the question tag strings into one Series of tag tokens;
# it is later handed to the LightFM Dataset as the item feature list.
question_feature_list = generate_feature_list(
    dataframe=df_questions, features_name=['questions_tag_name'])

question_feature_list.head()


0      college
1      lecture
2    professor
3         army
4     military
dtype: object

In [None]:
# df_questions.head()

In [None]:
# Flatten the combined professional tag strings into one Series of tag tokens;
# it is later handed to the LightFM Dataset as the user feature list.
professional_feature_list = generate_feature_list(
    dataframe=prof_copy, features_name=['professional_all_tags'])

professional_feature_list.head()

0        resume
1    consulting
2        No Tag
3       daycare
4         steps
dtype: object

In [None]:
prof_copy.head()

Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num,tag_users_user_id,professionals_tag_name,questions_tag_name,professional_all_tags
0,9ced4ce7519049c0944147afb75a8ce3,No Location,No Industry,,2011-10-05 20:35:19+00:00,0,,,"resume,consulting","resume,consulting"
1,f718dcf6d2ec4cb0a52a9db59d7f9e67,No Location,No Industry,,2011-10-05 20:49:21+00:00,1,,,,No Tag
2,0c673e046d824ec0ad0ebe012a0673e4,"New York, New York",No Industry,,2011-10-18 17:31:26+00:00,2,0c673e046d824ec0ad0ebe012a0673e4,"consulting,consulting,consulting,consulting,co...","daycare,steps,veterinarian,money,doctor,colleg...","daycare,steps,veterinarian,interviews,money,do..."
3,977428d851b24183b223be0eb8619a8c,"Boston, Massachusetts",No Industry,,2011-11-09 20:39:29+00:00,3,,,"math,public-defenders,justice,college,science,...","math,public-defenders,justice,college,science,..."
4,e2d57e5041a44f489288397c9904c2b2,No Location,No Industry,,2011-12-10 22:14:44+00:00,4,,,,No Tag


In [None]:
# # calculate our weight value 

In [None]:
# Interaction weight = reciprocal of the question's answer count, so
# questions with many answers contribute less per interaction.
answers_per_question = original['num_ans_per_ques']
original['total_weights'] = 1 / answers_per_question


In [None]:
# Store, for every question, the (questions_id_num, [tag tokens]) tuple
# in the shape lightfm's feature builders expect.
df_questions['question_features'] = create_features(
    dataframe=df_questions,
    features_name=['questions_tag_name'],
    id_col_name='questions_id_num')


In [None]:
df_questions.head()
# question_features pairs each question's integer id (questions_id_num) with its list of tags


Unnamed: 0,questions_id,questions_author_id,questions_date_added,questions_title,questions_body,questions_id_num,tag_questions_question_id,questions_tag_name,id,score,students_id,students_location,students_date_joined,students_id_num,question_features
0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26+00:00,Teacher career question,What is a maths teacher? what is a ma...,0,332a511f1569444485cf7a7a556a5e54,"college,lecture,professor",332a511f1569444485cf7a7a556a5e54,1,8f6f374ffd834d258ab69d376dd998f5,"Coimbatore, Tamil Nadu, India",2016-04-22 10:07:32+00:00,6890.0,"(0, [college, lecture, professor])"
1,eb80205482e4424cad8f16bc25aa2d9c,acccbda28edd4362ab03fb8b6fd2d67b,2016-05-20 16:48:25+00:00,I want to become an army officer. What can I d...,I am Priyanka from Bangalore . Now am in 10th ...,1,eb80205482e4424cad8f16bc25aa2d9c,"army,military",eb80205482e4424cad8f16bc25aa2d9c,5,acccbda28edd4362ab03fb8b6fd2d67b,"Providence, Rhode Island",2016-05-20 16:29:08+00:00,10189.0,"(1, [army, military])"
2,4ec31632938a40b98909416bdd0decff,f2c179a563024ccc927399ce529094b5,2017-02-08 19:13:38+00:00,Will going abroad for your first job increase ...,I'm planning on going abroad for my first job....,2,4ec31632938a40b98909416bdd0decff,"working-abroad,overseas",4ec31632938a40b98909416bdd0decff,2,f2c179a563024ccc927399ce529094b5,,2017-02-07 15:51:57+00:00,18023.0,"(2, [working-abroad, overseas])"
3,2f6a9a99d9b24e5baa50d40d0ba50a75,2c30ffba444e40eabb4583b55233a5a4,2017-09-01 14:05:32+00:00,To become a specialist in business management...,i hear business management is a hard way to ge...,3,2f6a9a99d9b24e5baa50d40d0ba50a75,"business,networking",2f6a9a99d9b24e5baa50d40d0ba50a75,2,2c30ffba444e40eabb4583b55233a5a4,"North Lauderdale, Florida",2017-09-01 14:02:02+00:00,20803.0,"(3, [business, networking])"
4,5af8880460c141dbb02971a1a8369529,aa9eb1a2ab184ebbb00dc01ab663428a,2017-09-01 02:36:54+00:00,Are there any scholarships out there for stude...,I'm trying to find scholarships for first year...,4,5af8880460c141dbb02971a1a8369529,"college,highschoolsenior,firstgeneration,schol...",5af8880460c141dbb02971a1a8369529,2,aa9eb1a2ab184ebbb00dc01ab663428a,"Tunnel Hill, Georgia",2017-09-01 02:29:06+00:00,20505.0,"(4, [college, highschoolsenior, firstgeneratio..."


In [None]:
# Store, for every professional, the (professionals_id_num, [tag tokens]) tuple.
prof_copy['professional_features'] = create_features(
    dataframe=prof_copy,
    features_name=['professional_all_tags'],
    id_col_name='professionals_id_num')

# Same procedure as for the questions above, so the output is not shown again.
# professional_features pairs each professional's integer id with their list of tags.


In [None]:
prof_copy.head()

Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num,tag_users_user_id,professionals_tag_name,questions_tag_name,professional_all_tags,professional_features
0,9ced4ce7519049c0944147afb75a8ce3,No Location,No Industry,,2011-10-05 20:35:19+00:00,0,,,"resume,consulting","resume,consulting","(0, [resume, consulting])"
1,f718dcf6d2ec4cb0a52a9db59d7f9e67,No Location,No Industry,,2011-10-05 20:49:21+00:00,1,,,,No Tag,"(1, [No Tag])"
2,0c673e046d824ec0ad0ebe012a0673e4,"New York, New York",No Industry,,2011-10-18 17:31:26+00:00,2,0c673e046d824ec0ad0ebe012a0673e4,"consulting,consulting,consulting,consulting,co...","daycare,steps,veterinarian,money,doctor,colleg...","daycare,steps,veterinarian,interviews,money,do...","(2, [daycare, steps, veterinarian, interviews,..."
3,977428d851b24183b223be0eb8619a8c,"Boston, Massachusetts",No Industry,,2011-11-09 20:39:29+00:00,3,,,"math,public-defenders,justice,college,science,...","math,public-defenders,justice,college,science,...","(3, [math, public-defenders, justice, college,..."
4,e2d57e5041a44f489288397c9904c2b2,No Location,No Industry,,2011-12-10 22:14:44+00:00,4,,,,No Tag,"(4, [No Tag])"


**LightFM Dataset**: In this step we build the LightFM dataset, and then we build our interactions matrix, weights and professional/question features.

**To do this, we create a dataset and call its fit method. The first argument is an iterable of all user ids in our data, and the second is an iterable of all item ids. In this case, we use generator expressions to lazily iterate over our data and yield user and item ids:**

In [None]:
# ########################
# # Dataset building for lightfm
# ########################

# from lightfm import LightFM
# from lightfm.data import Dataset

# # define our dataset variable
# # then we feed unique professionals and questions ids
# # and item and professional feature list
# # this will create lightfm internel mapping
# dataset = Dataset()
# dataset.fit(
#     set(df_professionals['professionals_id_num']), 
#     set(df_questions['questions_id_num']),
#     item_features=question_feature_list, 
#     user_features=professional_feature_list)


# # now we are building interactions matrix between professionals and quesitons
# # we are passing professional and questions id as a tuple
# # e.g -> pd.Series((pro_id, question_id), (pro_id, questin_id))
# # then we use lightfm build in method for building interactions matrix
# original['author_question_id_tuple'] = list(zip(
#     original.professionals_id_num, original.questions_id_num, original.total_weights))

# interactions, weights = dataset.build_interactions(
#     original['author_question_id_tuple'])



# # now we are building our questions and professionals features
# # in a way that lightfm understand.
# # we are using lightfm build in method for building
# # questions and professionals features 
# questions_features = dataset.build_item_features(
#     df_questions['question_features'])

# professional_features = dataset.build_user_features(
#     df_professionals['professional_features'])

In [None]:
from lightfm.data import Dataset

# Create the LightFM dataset and fit its internal mappings.
# Professionals are passed as the users and questions as the items; the two
# feature lists register every feature name the dataset should know about.
dataset = Dataset()
dataset.fit(
    set(df_professionals['professionals_id_num']), 
    set(df_questions['questions_id_num']),
    item_features=question_feature_list, 
    user_features=professional_feature_list)

In [None]:
# Prepare the input for the interactions matrix between professionals and
# questions. Every row of `original` becomes one
# (professional_id_num, question_id_num, weight) tuple, e.g. (2410, 0, 1.0),
# stored in the 'author_question_id_tuple' column.
original['author_question_id_tuple'] = list(zip(
    original['professionals_id_num'], original.questions_id_num, original.total_weights))

original.head()
# The same author has answered multiple questions: (author, question_id, weight)

Unnamed: 0,answers_id,answers_author_id,answers_question_id,answers_date_added,answers_body,answers_id_num,questions_id,questions_author_id,questions_date_added,questions_title,questions_body,questions_id_num,tag_questions_question_id,questions_tag_name,id_x,score_x,students_id,students_location,students_date_joined,students_id_num,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num,tag_users_user_id,professionals_tag_name,id_y,score_y,num_of_ans_by_professional,num_ans_per_ques,num_tags_professional,num_tags_question,total_weights,author_question_id_tuple
0,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,332a511f1569444485cf7a7a556a5e54,2016-04-29 19:40:14+00:00,<p>Hi!</p>\n<p>You are asking a very interesti...,0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26+00:00,Teacher career question,What is a maths teacher? what is a ma...,0,332a511f1569444485cf7a7a556a5e54,"lecture,college,professor",332a511f1569444485cf7a7a556a5e54,1.0,8f6f374ffd834d258ab69d376dd998f5,"Coimbatore, Tamil Nadu, India",2016-04-22 10:07:32+00:00,6890.0,36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",332a511f1569444485cf7a7a556a5e54,1,1710,1,12.0,3.0,1.0,"(2410, 0, 1.0)"
1,f3519ab99a1a4a13a8a9ecb814287d2a,36ff3b3666df400f956f8335cf53e09e,0f1d6a4f276c4a05878dd48e03e52289,2016-07-31 15:35:54+00:00,<p>Hi Rodrigo!</p>\n<p>The important thing to ...,11,0f1d6a4f276c4a05878dd48e03e52289,585ac233015447cc9e9a217044e515e1,2016-05-19 22:16:25+00:00,what kind of college could i go to for a soc...,I like soccer because i been playing sense i w...,7,0f1d6a4f276c4a05878dd48e03e52289,"college,building,soccer",0f1d6a4f276c4a05878dd48e03e52289,1.0,585ac233015447cc9e9a217044e515e1,"Morgan Hill, California",2016-05-19 22:08:48+00:00,10014.0,36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",0f1d6a4f276c4a05878dd48e03e52289,1,1710,1,12.0,3.0,1.0,"(2410, 7, 1.0)"
2,825f6e316a5f48328d6f8af831df9940,36ff3b3666df400f956f8335cf53e09e,0149c6d63e214040b44d4a3789bb00ba,2018-04-15 23:08:46+00:00,<p>Congratulations on being interested in find...,71,0149c6d63e214040b44d4a3789bb00ba,34217a1861d640a58c85e033414cf9cb,2018-04-12 17:13:45+00:00,What is the best way to prepare for studying e...,"I am interested in Computational Neuroscience,...",33,0149c6d63e214040b44d4a3789bb00ba,"engineering,neuroscience,gradschool",0149c6d63e214040b44d4a3789bb00ba,2.0,34217a1861d640a58c85e033414cf9cb,"Austin, Texas",2018-04-12 17:09:31+00:00,26796.0,36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",0149c6d63e214040b44d4a3789bb00ba,2,1710,2,12.0,3.0,0.5,"(2410, 33, 0.5)"
3,fb2c794175304c4caeb55e654270421f,a32736b04c27437da3078374d47af1b1,0149c6d63e214040b44d4a3789bb00ba,2018-04-13 18:18:05+00:00,<p>Hi Elisabeth! </p><p><br></p><p>If you are ...,72,0149c6d63e214040b44d4a3789bb00ba,34217a1861d640a58c85e033414cf9cb,2018-04-12 17:13:45+00:00,What is the best way to prepare for studying e...,"I am interested in Computational Neuroscience,...",33,0149c6d63e214040b44d4a3789bb00ba,"engineering,neuroscience,gradschool",0149c6d63e214040b44d4a3789bb00ba,2.0,34217a1861d640a58c85e033414cf9cb,"Austin, Texas",2018-04-12 17:09:31+00:00,26796.0,a32736b04c27437da3078374d47af1b1,"San Francisco, California",Computer Software,Product Management @ Okta,2018-04-13 17:48:09+00:00,18373,a32736b04c27437da3078374d47af1b1,computer-software,0149c6d63e214040b44d4a3789bb00ba,2,1,2,1.0,3.0,0.5,"(18373, 33, 0.5)"
4,f3fc23809cda472780fc565334f35000,36ff3b3666df400f956f8335cf53e09e,acc611cfb5c44daa8a3d7d65dfffa5ff,2018-08-14 10:37:01+00:00,<p>The most important thing that you can do is...,102,acc611cfb5c44daa8a3d7d65dfffa5ff,5b751a8ee4a047f7a08ce9eb5e43e5a2,2018-08-14 04:49:33+00:00,How should I prepare myself for my job search ...,"I am a Sociology, Political Science, and Inter...",47,acc611cfb5c44daa8a3d7d65dfffa5ff,"job-search,career-choice,job,college-jobs",acc611cfb5c44daa8a3d7d65dfffa5ff,1.0,5b751a8ee4a047f7a08ce9eb5e43e5a2,"Kingston, Pennsylvania",2018-08-14 04:47:13+00:00,28533.0,36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",acc611cfb5c44daa8a3d7d65dfffa5ff,1,1710,1,12.0,4.0,1.0,"(2410, 47, 1.0)"


In [None]:
# Build the interactions matrix and the matching weights matrix from the
# (professional, question, weight) tuples stored in `original`.
interactions, weights = dataset.build_interactions(
    original['author_question_id_tuple'])

In [None]:
# Build the question (item) and professional (user) feature matrices in the
# format LightFM understands, using the dataset's built-in helpers.
# Each input element is an (id, [feature, ...]) tuple.
questions_features = dataset.build_item_features(
    df_questions['question_features'])

# NOTE(review): the professional features are read from `prof_copy`, while the
# dataset above was fitted on `df_professionals` — confirm that `prof_copy`
# carries 'professional_features' for the same professionals.
professional_features = dataset.build_user_features(
    prof_copy['professional_features'])


**Model building and training:** 
In this step, I build a LightFM model and then train it. If you want to learn more about creating a LightFM model with this library, please read this post on building a recommender for the MovieLens dataset.

In [None]:
################################
# Model building part
################################

# Define the LightFM model by specifying its hyper-parameters,
# then fit it with the interactions matrix, the item (question) and
# user (professional) features and the per-interaction sample weights.

from lightfm import LightFM
from lightfm.data import Dataset

# WARP loss with 150 latent components; random_state fixes the seed.
model = LightFM(
    no_components=150,
    learning_rate=0.05,
    loss='warp',
    random_state=2019)

# Train for 10 epochs on the full interactions matrix using 4 threads.
model.fit(
    interactions,
    item_features=questions_features,
    user_features=professional_features, sample_weight=weights,
    epochs=10, num_threads=4, verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9


<lightfm.lightfm.LightFM at 0x7f436e086828>

**Evaluating the performance of the model**
Now we have to evaluate our model to see its performance. No matter how good your model is, if you can't evaluate it correctly you can't improve it or trust it. For recommendation problems there are not many good evaluation metrics, but luckily LightFM provides a rich set of them. In this step, we calculate the AUC score for our model.

What is the AUC score in the LightFM library? It measures the ROC AUC metric for a model: the probability that a randomly chosen positive example has a higher score than a randomly chosen negative example. A perfect score is 1.0.

Let's see what is our model score.

In [None]:
# NOTE(review): `interactions` is the same matrix the model was fitted on in
# the training cell, so this is presumably a training-set score rather than a
# held-out estimate — confirm against the definition of calculate_auc_score.
calculate_auc_score(model, interactions, questions_features, professional_features)

0.9569965

**Make real recommendations:**

We have already seen how well our model performs by looking at its AUC score. Now let's look at some real recommendation examples.

In [None]:
from IPython.display import display_html
def display_side_by_side(*args):
    """
    Render several dataframes next to each other in the notebook output.

    Parameters
    ----------
    *args: Dataframe
        Pandas dataframes to display, in left-to-right order.

    Returns
    -------
    None
        The combined HTML is sent to the notebook display.
    """
    html_str = ''.join(df.to_html() for df in args)
    # Only patch the opening <table> tags. Replacing every occurrence of the
    # word 'table' would also rewrite the closing tags and any cell text that
    # happens to contain it (e.g. "vegetable").
    display_html(
        html_str.replace('<table', '<table style="display:inline"'),
        raw=True)

def recommend_questions(professional_ids):
    """
    Show previously answered and newly recommended questions.

    For every professional, up to three questions they already answered are
    displayed next to the professional's tags, followed by the eight
    highest-scoring questions according to the trained model.

    Parameters
    ----------
    professional_ids: iterable of int
        Values of `professionals_id_num` to make recommendations for.

    Returns
    -------
    None
        Everything is printed / displayed in the notebook output.
    """
    for professional in professional_ids:
        # print their previous answered question title
        answered = original['professionals_id_num'] == professional
        previous_q_id_num = original.loc[answered, 'questions_id_num'][:3]
        df_previous_questions = df_questions.loc[
            df_questions['questions_id_num'].isin(previous_q_id_num)]
        print('Professional Id (' + str(professional) + "): Previous Answered Questions")
        display_side_by_side(
            df_previous_questions[['questions_title', 'question_features']],
            df_professionals.loc[
                df_professionals.professionals_id_num == professional][
                ['professionals_id_num', 'professionals_tag_name']])

        # predict
        # NOTE(review): only the (up to three) questions displayed above are
        # excluded here, so other questions this professional has already
        # answered can still be recommended — confirm this is intended.
        discard_qu_id = df_previous_questions['questions_id_num'].values.tolist()
        # .copy() makes an independent frame, so adding the 'scores' column
        # below no longer triggers pandas' SettingWithCopyWarning.
        df_use_for_prediction = df_questions.loc[
            ~df_questions['questions_id_num'].isin(discard_qu_id)].copy()
        questions_id_for_predict = df_use_for_prediction['questions_id_num'].values.tolist()

        # NOTE(review): assumes the *_id_num values coincide with LightFM's
        # internal user/item indices — TODO confirm against dataset.mapping().
        scores = model.predict(
            professional,
            questions_id_for_predict,
            item_features=questions_features,
            user_features=professional_features)

        df_use_for_prediction['scores'] = scores
        df_use_for_prediction = df_use_for_prediction.sort_values(by='scores', ascending=False)[:8]
        print('Professional Id (' + str(professional) + "): Recommended Questions: ")
        display(df_use_for_prediction[['questions_title', 'question_features']])

In [None]:
# Show recommendations for three sample professionals (by professionals_id_num).
recommend_questions([1200 ,19897, 3])

Professional Id (1200): Previous Answered Questions


Unnamed: 0,questions_title,question_features

Unnamed: 0,professionals_id_num,professionals_tag_name
1200,1200,"marketing,strategy,entrepreneurship,management,java,advertising,python,data-analysis,online-advertising,real-estate,team-leadership,dj,analytics,display-advertising,football,blackjack,hip-hop,billiards,break"


Professional Id (1200): Recommended Questions: 


Unnamed: 0,questions_title,question_features
9330,how can i get my business heard of ?,"(9330, [entrepreneurship, marketing, advertisi..."
22676,What are some specific daily responsibilities ...,"(22676, [business, marketing-and-advertising, ..."
19011,How do you get started in starting your own bu...,"(19011, [business, marketing, management])"
18481,"What are the tried and tested, best social med...","(18481, [business, social-media, sales, market..."
14121,How is an MBA different from a Bachelor of Sci...,"(14121, [business, business-management, mba, m..."
22737,Hotel Industry,"(22737, [business-development, hospitality, ma..."
13541,How do I get a job out of college as an accou...,"(13541, [business-development, sales, marketin..."
829,Is advertising a good major?,"(829, [communications, design, advertising, bu..."


Professional Id (19897): Previous Answered Questions


Unnamed: 0,questions_title,question_features
22784,Do companies truly focus on your college major when applying for jobs?,"(22784, [major])"

Unnamed: 0,professionals_id_num,professionals_tag_name
19897,19897,"illustration,graphic-design,adobe-creative-suite,comic-books"


Professional Id (19897): Recommended Questions: 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,questions_title,question_features
19407,How can you be a successful photographer? What...,"(19407, [photography, art, graphic-design])"
9682,How to get started in animation?,"(9682, [animation, art, artist])"
2310,what is one of best things about being an anim...,"(2310, [animation, design, art, artist])"
13484,Would a Graphic Design degree be a feesible op...,"(13484, [art, graphic-design])"
19471,Graphic Design - job outlook for the next 10 y...,"(19471, [art, graphic-design])"
6058,How should you start in the Graphic Design ind...,"(6058, [design, art, graphic-design])"
11231,Do I have to live in a big city to do well in ...,"(11231, [web-design, graphic-design, design, a..."
17325,what are the required fields forgraphic design?,"(17325, [art, graphic-design])"


Professional Id (3): Previous Answered Questions


Unnamed: 0,questions_title,question_features
11339,What are the different jobs a person can do in Forensic Science?,"(11339, [justice, criminal, science, forensic])"
14818,What does a typical work day for a forensic scientist look like?,"(14818, [No Tag])"
19077,Is most of your day spent working when being a detective?,"(19077, [detective])"

Unnamed: 0,professionals_id_num,professionals_tag_name
3,3,


Professional Id (3): Recommended Questions: 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,questions_title,question_features
2423,How long does it take to become a Detective?,"(2423, [law-enforcement, police, law, criminal..."
9778,I want to be a police officer or a police disp...,"(9778, [law-enforcement, police, law, criminal..."
17184,What types of Detectives are there?,"(17184, [law-enforcement, police, law, crimina..."
11514,What does an aspiring cop have to look forward...,"(11514, [police, criminal-justice, law-enforce..."
8863,What qualifications are needed to be promoted ...,"(8863, [police, criminal-justice, law-enforcem..."
1936,What degrees do you have to have in order to g...,"(1936, [police, criminal-justice, law-enforcem..."
17823,What do I need to do to get started on crimina...,"(17823, [criminal, law-enforcement, police, la..."
6273,How do I get qualified for law enforcement?,"(6273, [detective, police, criminal-justice, l..."


# **Now Storing the model checkpoints**

In [None]:
import pickle

# Persist the trained model to 'hybrid_recommendation.pickle' using the
# highest available pickle protocol.
# NOTE: only load this file back from a trusted location — unpickling can
# execute arbitrary code.
with open('hybrid_recommendation.pickle', 'wb') as fle:
    pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)

# **NOW PREPARING CODE FOR PRODUCTION**

In [None]:
class CareerVillageDataPreparation:
    """
    Clean and process the CareerVillage data.

    This class processes the raw dataframes into a shape that is
    useful for building a LightFM dataset.
    """

    def __init__(self):
        pass

    def _assign_unique_id(self, data, id_col_name):
        """
        Generate unique integer id for users, questions and answers

        Parameters
        ----------
        data: Dataframe
            Pandas Dataframe for Users or Q&A.
        id_col_name : String
            New integer id's column name.

        Returns
        -------
        Dataframe
            Copy of `data` with a fresh 0..n-1 index and the new id column
            holding the row position.
        """
        # The ids are attached directly under their final name, so no
        # temporary column name can clash with an existing column of `data`.
        return data.reset_index(drop=True).assign(
            **{id_col_name: np.arange(len(data))})

    def _dropna(self, data, column, axis=0):
        """
        Drop entries that have a null value in specific column(s).

        Parameters
        ----------
        data: Dataframe
            Pandas Dataframe to filter.
        column: String or list of String
            Label(s) in which null values are looked for.
        axis: int or String
            Passed to `DataFrame.dropna`; 0 (the default) drops rows.

        Returns
        -------
        Dataframe
            A new dataframe without the entries that are null in `column`.
        """
        # `column` must go in as `subset`; passing it positionally would land
        # in dropna's `axis` slot and collide with the `axis` keyword.
        subset = list(column) if isinstance(column, (list, tuple)) else [column]
        return data.dropna(axis=axis, subset=subset)

    def _merge_data(self, left_data, left_key, right_data, right_key, how):
        """
        This function is used for merging two dataframe.

        Parameters
        -----------
        left_data: Dataframe
            Left side dataframe for merge
        left_key: String
            Left Dataframe merge key
        right_data: Dataframe
            Right side dataframe for merge
        right_key: String
            Right Dataframe merge key
        how: String
            Method of merge (inner, left, right, outer)


        Returns
        --------
        Dataframe
            A new dataframe merging left and right dataframe
        """
        return left_data.merge(
            right_data,
            how=how,
            left_on=left_key,
            right_on=right_key)

    def _group_tags(self, data, group_by, tag_column):
        """
        Group multiple tags into single rows separated by comma

        Parameters
        ----------
        data: Dataframe
            One row per (entity, tag) pair.
        group_by: String
            Column identifying the entity the tags belong to.
        tag_column: String
            Column holding the tag names (strings).

        Returns
        -------
        Dataframe
            One row per entity with columns `group_by` and `tag_column`,
            the latter holding the comma-joined tags.
        """
        return data.groupby(
            [group_by])[tag_column].apply(
            ','.join).reset_index()

    def _merge_cv_datasets(
        self,
        professionals,students,
        questions,answers,
        tags,tag_questions,tag_users, questions_score):
        """
        This function merges all the necessary
        CareerVillage dataset in defined way.

        Parameters
        ------------
        professionals,students,
        questions,answers,
        tags,tag_questions,
        tag_users,
        questions_score: Dataframe
            Pandas dataframe defined by it's name


        Returns
        ---------
        questions, professionals: Dataframe
            Updated dataframe after merge
        merge: Dataframe
            A new datframe after merging answers with questions
        """

        # merge tag_questions with tags name
        # then group all tags for each question into single rows
        tag_question = self._merge_data(
            left_data=tag_questions,
            left_key='tag_questions_tag_id',
            right_data=tags,
            right_key='tags_tag_id',
            how='inner')
        tag_question = self._group_tags(
            data=tag_question,
            group_by='tag_questions_question_id',
            tag_column='tags_tag_name')

        tag_question = tag_question.rename(
            columns={'tags_tag_name': 'questions_tag_name'})

        # merge tag_users with tags name
        # then group all tags for each user into single rows
        # after that rename the tag column name
        tags_pro = self._merge_data(
            left_data=tag_users,
            left_key='tag_users_tag_id',
            right_data=tags,
            right_key='tags_tag_id',
            how='inner')
        tags_pro = self._group_tags(
            data=tags_pro,
            group_by='tag_users_user_id',
            tag_column='tags_tag_name')
        tags_pro = tags_pro.rename(
            columns={'tags_tag_name': 'professionals_tag_name'})

        # attach the grouped tags to questions and professionals;
        # left joins keep entries that have no tags at all
        questions = self._merge_data(
            left_data=questions,
            left_key='questions_id',
            right_data=tag_question,
            right_key='tag_questions_question_id',
            how='left')
        professionals = self._merge_data(
            left_data=professionals,
            left_key='professionals_id',
            right_data=tags_pro,
            right_key='tag_users_user_id',
            how='left')

        # merge questions with scores
        questions = self._merge_data(
            left_data=questions,
            left_key='questions_id',
            right_data=questions_score,
            right_key='id',
            how='left')

        # merge questions with students
        questions = self._merge_data(
            left_data=questions,
            left_key='questions_author_id',
            right_data=students,
            right_key='students_id',
            how='left')

        # merge answers with questions
        # then merge the answering professionals with that;
        # inner joins keep only answers with a known question and professional
        merge = self._merge_data(
            left_data=answers,
            left_key='answers_question_id',
            right_data=questions,
            right_key='questions_id',
            how='inner')

        merge = self._merge_data(
            left_data=merge,
            left_key='answers_author_id',
            right_data=professionals,
            right_key='professionals_id',
            how='inner')

        return questions, professionals, merge

    def _drop_duplicates_tags(self, data, col_name):
        """
        Drop duplicate tags from each row of a comma separated tag column.

        Parameters
        ----------
        data: Dataframe
            Dataframe containing the tag column.
        col_name: String
            Column holding comma separated tags.

        Returns
        -------
        Pandas Series
            The column with duplicates removed from every row. Tags keep
            their first-seen order, so the result is identical on every run
            (a plain `set` would give an arbitrary order).
        """
        def _unique_tags(tags):
            # leave nulls / non-string entries untouched
            if not isinstance(tags, str):
                return tags
            # dict keys are unique and keep insertion order
            return ','.join(dict.fromkeys(tags.split(',')))

        return data[col_name].apply(_unique_tags)

    def _merge_pro_pre_ans_tags(self, professionals, merge):
        """
        Merge the tags of previously answered questions into the
        professionals' own tags.

        A professional may have answered questions outside the tags they
        follow (e.g. someone following "datascience" may have answered a
        question tagged "animation"), so the tags of those questions are
        combined with the tags the professional follows.

        Parameters
        ----------
        professionals: Dataframe
            Professionals with a 'professionals_tag_name' column.
        merge: Dataframe
            Answers merged with questions and professionals; must contain
            'professionals_id' and 'questions_tag_name'.

        Returns
        -------
        Dataframe
            `professionals` with the added columns 'questions_tag_name'
            and 'professional_all_tags'.
        """
        # select professionals answered questions tags
        # and drop the answers whose question has no tags
        professionals_prev_ans_tags = (
            merge[['professionals_id', 'questions_tag_name']]).dropna()

        # because professionals answer multiple questions,
        # we group all of the tags of each user into a single row
        professionals_prev_ans_tags = self._group_tags(
            data=professionals_prev_ans_tags,
            group_by='professionals_id',
            tag_column='questions_tag_name')

        # drop duplicates tags from each professionals rows
        professionals_prev_ans_tags['questions_tag_name'] = \
            self._drop_duplicates_tags(
                professionals_prev_ans_tags, 'questions_tag_name')

        # finally merge the dataframe with professionals dataframe
        professionals = self._merge_data(
            left_data=professionals,
            left_key='professionals_id',
            right_data=professionals_prev_ans_tags,
            right_key='professionals_id',
            how='left')

        # Combine the two tag columns into one comma separated column.
        # Null values are skipped, so a professional with no tags at all
        # ends up with "".
        # (Columns of one frame are combined with a string join; merge() is
        # what combines two dataframes.)
        professionals['professional_all_tags'] = (
            professionals[['professionals_tag_name',
                           'questions_tag_name']].apply(
                lambda x: ','.join(x.dropna()),
                axis=1))
        return professionals

    def prepare(
        self,
        professionals,students,
        questions,answers,
        tags,tag_questions,tag_users, questions_score):
        """
        This function clean and process
        CareerVillage Data sets.

        Parameters
        ----------
        professionals,students,
        questions,answers,
        tags,tag_questions,
        tag_users,
        questions_score: Dataframe
            Raw CareerVillage dataframes defined by their name.

        Returns
        -------
        df_questions: Dataframe
            Questions with integer ids, tags, score and student columns.
        df_professionals: Dataframe
            Professionals with integer ids and 'professional_all_tags'.
        df_merge: Dataframe
            Answers merged with questions and professionals, including
            'num_ans_per_ques'.
        """

        # assign unique integer id
        professionals = self._assign_unique_id(
            professionals, 'professionals_id_num')
        students = self._assign_unique_id(
            students, 'students_id_num')
        questions = self._assign_unique_id(
            questions, 'questions_id_num')
        answers = self._assign_unique_id(
            answers, 'answers_id_num')

        # drop rows with null values from tags and remove '#' from the
        # tag names; assign() builds a new frame, so the caller's `tags`
        # is not modified
        tags = tags.dropna()
        tags = tags.assign(
            tags_tag_name=tags['tags_tag_name'].str.replace(
                '#', '', regex=False))

        # merge necessary datasets
        df_questions, df_professionals, df_merge = self._merge_cv_datasets(
            professionals,students,
            questions,answers,
            tags,tag_questions,tag_users,
            questions_score)

        #######################
        # Generate some features for calculates weights
        # that will use with interaction matrix
        #######################
        # total number of answers each question received
        df_merge['num_ans_per_ques'] = df_merge.groupby(
            ['questions_id'])['answers_id'].transform('count')

        # merge pro previous answered question tags with pro tags
        df_professionals = self._merge_pro_pre_ans_tags(
            df_professionals, df_merge)

        # some more pre-processing
        # handling null values
        df_questions['score'] = df_questions['score'].fillna(0).astype(int)
        df_questions['questions_tag_name'] = \
            df_questions['questions_tag_name'].fillna('No Tag')

        # remove duplicates tags from each questions
        df_questions['questions_tag_name'] = self._drop_duplicates_tags(
            df_questions, 'questions_tag_name')

        # fill nan with 'No Tag' if any, and replace "" with "No Tag" as
        # well, because professionals without any tag were joined to ""
        df_professionals['professional_all_tags'] = \
            df_professionals['professional_all_tags'].fillna(
                'No Tag').replace('', 'No Tag')

        df_professionals['professionals_location'] = \
            df_professionals['professionals_location'].fillna(
                'No Location')

        df_professionals['professionals_industry'] = \
            df_professionals['professionals_industry'].fillna(
                'No Industry')

        # remove duplicates tags from each professionals
        df_professionals['professional_all_tags'] = \
            self._drop_duplicates_tags(
                df_professionals, 'professional_all_tags')

        # remove some null values from df_merge
        df_merge['num_ans_per_ques'] = \
            df_merge['num_ans_per_ques'].fillna(0)

        return df_questions, df_professionals, df_merge

**Building Data for LightFM Class:** From step 2 we already know that the LightFM library expects data in a very specific and elegant format. That format was discussed in step 2, so feel free to re-read it. Now we build a class that puts all the pieces of the dataset-building puzzle together in one place.

In [None]:
class LightFMDataPrep:
    """
    Turn the prepared CareerVillage dataframes into LightFM inputs:
    feature lists, interaction tuples and the sparse matrices built
    from them.
    """

    def __init__(self):
        pass

    def _split_feature_columns(self, dataframe, features_name):
        """
        Join the feature columns of every row with ',' and split the result
        back into a list of single feature strings.

        Returns
        -------
        Pandas Series
            One list of feature strings per row of `dataframe`.
        """
        joined = dataframe[features_name].apply(
            lambda x: ','.join(x.map(str)), axis=1)
        return joined.str.split(',')

    def create_features(self, dataframe, features_name, id_col_name):
        """
        Generate features that will be ready for feeding into lightfm

        Parameters
        ----------
        dataframe: Dataframe
            Pandas Dataframe which contains features
        features_name : List
            List of feature columns name avaiable in dataframe
        id_col_name: String
            Column name which contains id of the question or
            answer that the features will map to.
            There are two possible values for this variable.
            1. questions_id_num
            2. professionals_id_num

        Returns
        -------
        List
            A list with one tuple per row, ready to feed into lightfm.
            The format of each value
            will be (user_id, ['feature_1', 'feature_2', 'feature_3'])
            Ex. -> (1, ['military', 'army', '5'])
        """
        features = self._split_feature_columns(dataframe, features_name)
        return list(zip(dataframe[id_col_name], features))

    def generate_feature_list(self, dataframe, features_name):
        """
        Generate features list for mapping

        Parameters
        ----------
        dataframe: Dataframe
            Pandas Dataframe for Users or Q&A.
        features_name : List
            List of feature columns name avaiable in dataframe.

        Returns
        -------
        Pandas Series
            All features of all rows, flattened in row order
            (duplicates are kept).
        """
        features = self._split_feature_columns(dataframe, features_name)
        # Flatten with a plain comprehension; building one pd.Series per row
        # (apply(pd.Series).stack()) gives the same values but is far slower
        # on large frames.
        flattened = [feature for row in features for feature in row]
        return pd.Series(flattened, dtype=object)

    def create_data(self, questions, professionals, merge):
        """
        Build feature lists, interaction weights and per-row feature tuples.

        Note that the three input dataframes are modified in place: the
        columns 'total_weights' (merge), 'question_features' (questions)
        and 'professional_features' (professionals) are added to them.

        Parameters
        ----------
        questions: Dataframe
            Needs 'questions_tag_name' and 'questions_id_num'.
        professionals: Dataframe
            Needs 'professional_all_tags' and 'professionals_id_num'.
        merge: Dataframe
            Needs 'num_ans_per_ques'.

        Returns
        -------
        question_feature_list, professional_feature_list: Pandas Series
            All question / professional features for the lightfm mapping.
        merge, questions, professionals: Dataframe
            The input dataframes with the added columns.
        """
        question_feature_list = self.generate_feature_list(
            questions,
            ['questions_tag_name'])

        professional_feature_list = self.generate_feature_list(
            professionals,
            ['professional_all_tags'])

        # every answer to a question shares the question's weight equally
        # NOTE(review): a row with num_ans_per_ques == 0 would get an
        # infinite weight here — confirm such rows cannot reach this point.
        merge['total_weights'] = 1 / (
            merge['num_ans_per_ques'])

        # creating features for feeding into lightfm
        # question_features holds the question id together with its tags
        questions['question_features'] = self.create_features(
            questions, ['questions_tag_name'],
            'questions_id_num')

        # professional_features holds the professional id together with
        # their tags
        professionals['professional_features'] = self.create_features(
            professionals,
            ['professional_all_tags'],
            'professionals_id_num')

        return question_feature_list,\
    professional_feature_list,merge,questions,professionals

    def fit(self, questions, professionals, merge):
        """
        Build the lightfm dataset and all matrices needed for training.

        Parameters
        ----------
        questions, professionals, merge: Dataframe
            Prepared dataframes; see `create_data` for the required columns
            (`merge` additionally needs 'professionals_id_num' and
            'questions_id_num'). They are modified in place by
            `create_data`.

        Returns
        -------
        interactions, weights:
            Matrices returned by lightfm's `build_interactions`.
        questions_features, professional_features:
            Item and user feature matrices built by lightfm.
        """
        ########################
        # Dataset building for lightfm
        ########################
        question_feature_list, \
        professional_feature_list,\
        merge,questions,professionals = \
        self.create_data(questions, professionals, merge)

        # define our dataset variable
        # then we feed unique professionals and questions ids
        # and item and professional feature list
        # this will create lightfm internal mapping
        dataset = Dataset()
        dataset.fit(
            set(professionals['professionals_id_num']),
            set(questions['questions_id_num']),
            item_features=question_feature_list,
            user_features=professional_feature_list)

        # now we are building the interactions
        # matrix between professionals and questions
        # we are passing professional id, question id and weight as a tuple
        # e.g -> (pro_id, question_id, weight)
        # then we use lightfm build in method for building interactions matrix
        merge['author_question_id_tuple'] = list(zip(
            merge.professionals_id_num,
            merge.questions_id_num,
            merge.total_weights))

        interactions, weights = dataset.build_interactions(
            merge['author_question_id_tuple'])

        # now we are building our questions and
        # professionals features
        # in a way that lightfm understand.
        # we are using lightfm build in method for building
        # questions and professionals features
        questions_features = dataset.build_item_features(
            questions['question_features'])

        professional_features = dataset.build_user_features(
            professionals['professional_features'])

        return interactions,\
    weights,questions_features,professional_features
        

**Train Model Class:** In step 2, we saw how to build and train our model. Now we put all of that together in the TrainLightFM class.

In [None]:
class TrainLightFM:
    """Helpers for splitting LightFM matrices and training a LightFM model."""

    def __init__(self):
        pass
        
    def train_test_split(self, interactions, weights):
        """Randomly split the interactions and weights matrices.

        Each matrix is passed to lightfm's ``random_train_test_split`` with
        its own freshly seeded ``RandomState(2019)``, so both calls consume
        the same random stream.
        NOTE(review): this presumably keeps the two splits aligned entry for
        entry (both matrices come from the same ``build_interactions`` call)
        -- confirm against the lightfm implementation.

        Returns:
            tuple: ``(train_interactions, test_interactions,
            train_weights, test_weights)``.
        """
        # Imported locally so the method does not depend on an earlier
        # notebook cell having imported the module. The ``cross_validation``
        # parameter of ``fit`` below is unrelated to this module.
        from lightfm import cross_validation

        train_interactions, test_interactions = \
        cross_validation.random_train_test_split(
            interactions, 
            random_state=np.random.RandomState(2019))
        
        train_weights, test_weights = \
        cross_validation.random_train_test_split(
            weights, 
            random_state=np.random.RandomState(2019))
        return train_interactions,\
    test_interactions, train_weights, test_weights
    
    def fit(self, interactions, weights,
            questions_features, professional_features,
            cross_validation=False,no_components=150,
            learning_rate=0.05,
            loss='warp',
            random_state=2019,
            verbose=True,
            num_threads=4, epochs=5):
        """Build a LightFM model and fit it on the given interactions.

        Args:
            interactions: interactions matrix passed to ``LightFM.fit``.
            weights: passed to ``LightFM.fit`` as ``sample_weight``.
            questions_features: passed as ``item_features``.
            professional_features: passed as ``user_features``.
            cross_validation: accepted for compatibility; not used here.
            no_components, learning_rate, loss, random_state: forwarded to
                the ``LightFM`` constructor.
            verbose, num_threads, epochs: forwarded to ``LightFM.fit``.

        Returns:
            The fitted ``LightFM`` model.
        """
        ################################
        # Model building part
        ################################

        # Hyper-parameters are passed by keyword on purpose: the second
        # positional parameter of LightFM is ``k``, not ``learning_rate``,
        # so a positional call would silently ignore the learning rate.
        model = LightFM(
            no_components=no_components,
            learning_rate=learning_rate,
            loss=loss,
            random_state=random_state)

        # fit the model with the interactions matrix plus
        # item (question) and user (professional) features
        model.fit(
            interactions,
            item_features=questions_features,
            user_features=professional_features, sample_weight=weights,
            epochs=epochs, num_threads=num_threads, verbose=verbose)
        
        return model

**Recommendations class:** Now we are going to build a class for making recommendations. This will make it easy to serve recommendations from a Django API. The class comes with some extra features: you can use it for general prediction by giving professional ids and question features, and it also lets us choose the questions asked between two dates and make recommendations from only those questions.

This is useful because, for professionals who chose an email frequency level of "weekly" or "daily", we can select the questions from the last week (or day) and then recommend those questions.

In [None]:
class LightFMRecommendations:
    """
    Make prediction given model and professional ids

    The instance keeps the fitted model, the professional and question
    feature matrices handed to ``model.predict``, and the questions /
    professionals / merge dataframes used to select candidate questions.
    """
    def __init__(self, lightfm_model,
                 professionals_features,
                 questions_features,
                 questions,professionals,merge):
        self.model = lightfm_model
        self.professionals_features = professionals_features
        self.questions_features = questions_features
        self.questions = questions
        self.professionals = professionals
        self.merge = merge
        
    def previous_answered_questions(self, professionals_id):
        """Return the rows of ``self.questions`` whose ``questions_id_num``
        appears in ``self.merge`` for the given ``professionals_id_num``."""
        answered_id_nums = self.merge.loc[
            self.merge['professionals_id_num'] == professionals_id,
            'questions_id_num']

        return self.questions.loc[
            self.questions['questions_id_num'].isin(answered_id_nums)]
        
    
    def _filter_question_by_pro(self, professionals_id):
        """Drop questions that professional already answer"""
        previous_answered_questions = \
        self.previous_answered_questions(professionals_id)
        
        discard_qu_id = \
        previous_answered_questions['questions_id_num'].values.tolist()
        
        return self.questions.loc[
            ~self.questions['questions_id_num'].isin(discard_qu_id)]

    @staticmethod
    def _align_bound_to_column(bound, dates):
        """Make a date bound comparable with the ``dates`` column.

        When the column is timezone-aware and the bound is naive, the bound
        is localised to the column's timezone; comparing a naive datetime
        with a timezone-aware column is rejected by pandas. For a naive
        column the bound is returned untouched.
        """
        column_tz = getattr(dates.dtype, 'tz', None)
        if column_tz is None:
            return bound
        bound = pd.Timestamp(bound)
        if bound.tzinfo is None:
            bound = bound.tz_localize(column_tz)
        return bound
    
    def _filter_question_by_date(self, questions, start_date, end_date):
        """Keep questions with ``start_date < questions_date_added <= end_date``."""
        dates = questions['questions_date_added']
        start_date = self._align_bound_to_column(start_date, dates)
        end_date = self._align_bound_to_column(end_date, dates)

        mask = (dates > start_date) & (dates <= end_date)
        
        return questions.loc[mask]

    def _score_and_rank(self, professional_id, candidates, num_prediction):
        """Score ``candidates`` for one professional and return the
        ``num_prediction`` highest-scoring rows, best first.

        The score is added as a ``recommendation_score`` column on a new
        dataframe (``assign``), so the filtered slice passed in is never
        written to.
        """
        if candidates.empty:
            # Nothing to score: skip the model call and return an empty
            # frame that still carries the score column callers expect.
            return candidates.assign(
                recommendation_score=pd.Series(
                    index=candidates.index, dtype='float64'))

        score = self.model.predict(
            professional_id,
            candidates['questions_id_num'].values.tolist(), 
            item_features=self.questions_features,
            user_features=self.professionals_features)

        ranked = candidates.assign(recommendation_score=score)
        return ranked.sort_values(
            by='recommendation_score', ascending=False)[:num_prediction]
        
    
    def recommend_by_pro_id_general(self,
                                    professional_id,
                                    num_prediction=8):
        """Recommend up to ``num_prediction`` questions the professional
        has not answered yet, ordered by descending recommendation score."""
        questions_for_prediction = self._filter_question_by_pro(professional_id)
        return self._score_and_rank(
            professional_id, questions_for_prediction, num_prediction)
    
    def recommend_by_pro_id_frequency_date_range(self,
                                                 professional_id,
                                                 start_date,
                                                 end_date,
                                                 num_prediction=8):
        """Like ``recommend_by_pro_id_general`` but restricted to questions
        added after ``start_date`` and up to ``end_date``.

        ``start_date`` / ``end_date`` are ``'%Y-%m-%d'`` strings; values
        that are not strings are used as given.
        """
        questions_for_prediction = \
        self._filter_question_by_pro(professional_id)
        
        if isinstance(start_date, str):
            start_date = datetime.datetime.strptime(start_date, '%Y-%m-%d')
        if isinstance(end_date, str):
            end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
        
        questions_for_prediction = self._filter_question_by_date(
            questions_for_prediction, start_date, end_date)
        
        return self._score_and_rank(
            professional_id, questions_for_prediction, num_prediction)

**Put it all together:** Now that we have defined all our important classes, let's use each of them to build our model.

In [None]:
from lightfm import LightFM
from lightfm.data import Dataset
import datetime

# instantiate all class instances
cv_data_prep = CareerVillageDataPreparation()
light_fm_data_prep = LightFMDataPrep()
train_lightfm = TrainLightFM()

# process raw data
df_questions_p, df_professionals_p, df_merge_p = \
cv_data_prep.prepare(
    df_professionals,df_students,
    df_questions,df_answers,
    df_tags,df_tag_questions,df_tag_users,
    df_question_scores)


# prepare data for lightfm 
interactions, weights, \
questions_features, professional_features = \
light_fm_data_prep.fit(
    df_questions_p, df_professionals_p, df_merge_p)


# finally build and train our model.
# The recommendation cell below reads `model`, so it has to be created here;
# otherwise a fresh "Restart & Run All" fails with a NameError (or silently
# reuses a model left over from an earlier cell).
model = train_lightfm.fit(interactions,
                          weights,
                          questions_features,
                          professional_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Awesome! Do you see how easy it was to build our model? We can surely apply this idea when putting the model into production. Now we are going to see some real recommendations.

In [None]:
# define our recommender class
lightfm_recommendations = LightFMRecommendations(
    model,
    professional_features,questions_features,
    df_questions_p, df_professionals_p, df_merge_p)

# let's see what our model predicts for one professional.
# The id lives in a single variable so the printed label always names the
# professional that is actually queried (previously the label said 3 while
# the call used 99).
professional_id_num = 99
print("Recommendation for professional: " + str(professional_id_num))
display(lightfm_recommendations.recommend_by_pro_id_general(professional_id_num)[:8])

Recommendation for professional: 3


Unnamed: 0,questions_id,questions_author_id,questions_date_added,questions_title,questions_body,questions_id_num,tag_questions_question_id,questions_tag_name,id,score,students_id,students_location,students_date_joined,students_id_num,question_features,recommendation_score
12134,d71e4d328f7646e4a02e964f17494c65,af945d138db841eba81f3383c3b3f8b8,2016-08-26 10:41:54+00:00,i have to become a DCP Deputy commissinor of p...,about my career is to become a deputy commissi...,12134,d71e4d328f7646e4a02e964f17494c65,"professional,any",d71e4d328f7646e4a02e964f17494c65,1,af945d138db841eba81f3383c3b3f8b8,"Bengaluru, Karnataka, India",2016-08-26 10:36:18+00:00,14311.0,"(12134, [professional, any])",2.191327
2244,efae429d91f54939aa634e7ae709dfb0,c848d9a056b64eaa81d775db8f4f3692,2016-07-02 09:52:16+00:00,i am interested in nature conservation officer...,about my career is nature conservation officer...,2244,efae429d91f54939aa634e7ae709dfb0,"professional,any",efae429d91f54939aa634e7ae709dfb0,2,c848d9a056b64eaa81d775db8f4f3692,"Bengaluru, Karnataka, India",2016-07-02 09:59:02+00:00,13356.0,"(2244, [professional, any])",2.173749
15800,1932443afe3d48afbb1d331c03ee84ff,7f643bb973564d4797bb844720c8f870,2016-05-02 10:27:54+00:00,my aim is to become a education officer?which...,i am from ghs jb nagar bengaluru #any #profess...,15800,1932443afe3d48afbb1d331c03ee84ff,"professional,any",1932443afe3d48afbb1d331c03ee84ff,6,7f643bb973564d4797bb844720c8f870,"Bengaluru, Karnataka, India",2016-05-02 10:19:27+00:00,7073.0,"(15800, [professional, any])",2.160023
19480,a982ddfbcdc240bba5dae903b9ba1356,1ea759cd7f47476d9507fdd2b8f13ffe,2016-05-05 08:04:50+00:00,i want to become a doctor in future ?which sub...,hi! i am santosh from ghs j b nagar bengaluru ...,19480,a982ddfbcdc240bba5dae903b9ba1356,"professional,any",a982ddfbcdc240bba5dae903b9ba1356,1,1ea759cd7f47476d9507fdd2b8f13ffe,"Bengaluru, Karnataka, India",2016-05-05 08:01:43+00:00,7547.0,"(19480, [professional, any])",2.158695
20073,4b32182ae0284537834951cd58e251e1,0c4c682965f6443899fa23727e13ec8a,2016-07-02 10:09:16+00:00,i want to become a network administrator means...,about my career is to become a network adminis...,20073,4b32182ae0284537834951cd58e251e1,"professional,any",4b32182ae0284537834951cd58e251e1,2,0c4c682965f6443899fa23727e13ec8a,"Bengaluru, Karnataka, India",2016-07-02 10:06:02+00:00,13357.0,"(20073, [professional, any])",2.153397
11375,e62392f718ba4a71ab4c7662905a6869,1417fa2af23b4b79ac8701748d67ca04,2016-05-30 10:04:26+00:00,Is it possible learn programming languages in ...,hi! i am sushmitha i am from ghs jb nagar beng...,11375,e62392f718ba4a71ab4c7662905a6869,"professional,any",e62392f718ba4a71ab4c7662905a6869,2,1417fa2af23b4b79ac8701748d67ca04,"Bengaluru, Karnataka, India",2016-05-30 10:01:34+00:00,11768.0,"(11375, [professional, any])",2.152683
14190,ff3132755241418c887bad320c59cb2a,3070aee597fc478e94d3b39eff94f920,2016-07-18 08:02:33+00:00,i m interested in musican?please help me to wh...,about musican #any #professional,14190,ff3132755241418c887bad320c59cb2a,"professional,any",ff3132755241418c887bad320c59cb2a,3,3070aee597fc478e94d3b39eff94f920,"Bengaluru, Karnataka, India",2016-07-14 09:54:33+00:00,13476.0,"(14190, [professional, any])",2.14886
18187,8f18ff7d9470474e8b33d41960d12089,cebebcb4881f4575a59c9a3647beb4db,2016-06-30 09:32:46+00:00,my aim is to become a heart therapist?which su...,about my career is art therapist #any #profess...,18187,8f18ff7d9470474e8b33d41960d12089,"professional,any",8f18ff7d9470474e8b33d41960d12089,2,cebebcb4881f4575a59c9a3647beb4db,"Bengaluru, Karnataka, India",2016-06-30 09:12:31+00:00,13296.0,"(18187, [professional, any])",2.14482


In [None]:
# # from datetime import datetime
# import datetime

# # also let's see what our model predicts for professional 3
# # given questions between two dates
# print("Recommendations for professionals (question from 2016-1-1 to 2016-12-31): " + str(3))
# display(lightfm_recommendations.recommend_by_pro_id_frequency_date_range(3,
#                                                                  '2016-1-1','2016-12-31')[:8])

# **Rough Work**

In [None]:
# Keep only the professional id and the tag string of each answered question
# from `original`, then preview the first rows.
professionals_prev_ans_tags = original[['professionals_id', 'questions_tag_name']]
professionals_prev_ans_tags.head()

Unnamed: 0,professionals_id,questions_tag_name
0,36ff3b3666df400f956f8335cf53e09e,"lecture,college,professor"
1,36ff3b3666df400f956f8335cf53e09e,"college,building,soccer"
2,36ff3b3666df400f956f8335cf53e09e,"engineering,neuroscience,gradschool"
3,a32736b04c27437da3078374d47af1b1,"engineering,neuroscience,gradschool"
4,36ff3b3666df400f956f8335cf53e09e,"job-search,career-choice,job,college-jobs"


In [None]:
# keep only the professional id and the tags of each answered question
professionals_prev_ans_tags = original[['professionals_id', 'questions_tag_name']]
# drop null values from that
professionals_prev_ans_tags = professionals_prev_ans_tags.dropna()
# because professionals answer multiple questions,
# we group all of the tags of each user into a single row
professionals_prev_ans_tags = professionals_prev_ans_tags.groupby(
    ['professionals_id'])['questions_tag_name'].apply(
        ','.join).reset_index()
# A single professional has answered many questions and each of those
# questions carries some tags, so all the tags of the questions a professional
# answered are merged into one comma-separated string.


# drop duplicate tags from each professional's row.
# sorted() gives a reproducible order: iterating a set of strings directly
# can yield a different order on every kernel start.
professionals_prev_ans_tags['questions_tag_name'] = (
    professionals_prev_ans_tags['questions_tag_name'].str.split(',').apply(
        lambda tags: ','.join(sorted(set(tags)))))

professionals_prev_ans_tags.head()

Unnamed: 0,professionals_id,questions_tag_name
0,00009a0f9bda43eba47104e9ac62aff5,"air-force,army,photography,navy,engineer,tv-se..."
1,000d4635e5da41e3bfd83677ee11dda4,"gap-year,high-school,college,career,university..."
2,00271cc10e0245fba4a35e76e669c281,"pay,performing-arts,entrepreneurship,student,f..."
3,003cc21be89d4e42bc4424131a378e86,"graduate,law,college,criminal-justice,college-..."
4,0046ab8089c04b3a8df3f8c28621a818,"management,college,volunteering,international-..."


In [None]:
# finally merge the dataframe with professionals dataframe.
# Any 'questions_tag_name' column left by a previous run of this cell is
# dropped first, so re-running the cell does not produce suffixed
# 'questions_tag_name_x' / 'questions_tag_name_y' columns (which would break
# the next cell, which reads 'questions_tag_name').
df_professionals = df_professionals.drop(
    columns=['questions_tag_name'], errors='ignore').merge(
        professionals_prev_ans_tags, how='left', on='professionals_id')

prof_copy = df_professionals.copy()
prof_copy.head()

Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num,tag_users_user_id,professionals_tag_name,questions_tag_name
0,9ced4ce7519049c0944147afb75a8ce3,,,,2011-10-05 20:35:19+00:00,0,,,"resume,consulting"
1,f718dcf6d2ec4cb0a52a9db59d7f9e67,,,,2011-10-05 20:49:21+00:00,1,,,
2,0c673e046d824ec0ad0ebe012a0673e4,"New York, New York",,,2011-10-18 17:31:26+00:00,2,0c673e046d824ec0ad0ebe012a0673e4,"consulting,consulting,consulting,consulting,co...","hire,resume,baby,consulting,entrepreneurship,l..."
3,977428d851b24183b223be0eb8619a8c,"Boston, Massachusetts",,,2011-11-09 20:39:29+00:00,3,,,"criminal-justice,neurosurgeon,justice,detectiv..."
4,e2d57e5041a44f489288397c9904c2b2,,,,2011-12-10 22:14:44+00:00,4,,,


In [None]:
# Combine the tags a professional follows with the tags of the questions
# they answered. NaN entries are dropped before joining, so a professional
# with neither kind of tag ends up with an empty string.
both_tag_columns = prof_copy[['professionals_tag_name', 'questions_tag_name']]
prof_copy['professional_all_tags'] = both_tag_columns.apply(
    lambda row: ','.join(row.dropna()), axis=1)
prof_copy.head()


Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num,tag_users_user_id,professionals_tag_name,questions_tag_name,professional_all_tags
0,9ced4ce7519049c0944147afb75a8ce3,,,,2011-10-05 20:35:19+00:00,0,,,"resume,consulting","resume,consulting"
1,f718dcf6d2ec4cb0a52a9db59d7f9e67,,,,2011-10-05 20:49:21+00:00,1,,,,
2,0c673e046d824ec0ad0ebe012a0673e4,"New York, New York",,,2011-10-18 17:31:26+00:00,2,0c673e046d824ec0ad0ebe012a0673e4,"consulting,consulting,consulting,consulting,co...","hire,resume,baby,consulting,entrepreneurship,l...","consulting,consulting,consulting,consulting,co..."
3,977428d851b24183b223be0eb8619a8c,"Boston, Massachusetts",,,2011-11-09 20:39:29+00:00,3,,,"criminal-justice,neurosurgeon,justice,detectiv...","criminal-justice,neurosurgeon,justice,detectiv..."
4,e2d57e5041a44f489288397c9904c2b2,,,,2011-12-10 22:14:44+00:00,4,,,,


# **EXTRA WORK**

In [None]:
# just dropna from tags 
# df_tags.isna().sum()

In [None]:
# Attach the tag name to every (tag id, question id) pair.
# An inner join keeps only the pairs whose tag id exists in df_tags
# (an intersection on the tag id); the per-question grouping happens in a
# later cell.
df_tags_question = pd.merge(
    df_tag_questions, df_tags,
    how='inner',
    left_on='tag_questions_tag_id',
    right_on='tags_tag_id')
df_tags_question.head()


Unnamed: 0,tag_questions_tag_id,tag_questions_question_id,tags_tag_id,tags_tag_name
0,28930,cb43ebee01364c68ac61d347a393ae39,28930,minor
1,28930,47f55e85ce944242a5a347ab85a8ffb4,28930,minor
2,28930,ccc30a033a0f4dfdb2eb987012f25792,28930,minor
3,28930,e30b274e48d741f7bf50eb5e7171a3c0,28930,minor
4,28930,3d22742052df4989b311b4195cbb0f1a,28930,minor


In [None]:
# Collapse the per-tag rows into one row per question id.
# ','.join sets the separator, i.e. the string placed between the joined
# values (it could equally be a space); here every tag name belonging to one
# question id ends up in a single comma-separated string.
# The joined column is then renamed to 'questions_tag_name'.
df_tags_question = (
    df_tags_question
    .groupby(['tag_questions_question_id'])['tags_tag_name']
    .apply(','.join)
    .reset_index()
    .rename(columns={'tags_tag_name': 'questions_tag_name'}))
df_tags_question.head()

Unnamed: 0,tag_questions_question_id,questions_tag_name
0,0003e7bf48f24b5c985f8fce96e611f3,"internship,technology,high-school,information-..."
1,0006609dd4da40dcaa5a83e0499aba14,"psychology,law"
2,000af224bc2f4e94a19f8b62ba279cc4,"biology,marine"
3,000b30fb534b41f7b716fa9ebf9c3f35,"teaching,exercise-science,school,exercise"
4,0018752e44b44e26bb74a0a43232b4d6,"math,puremathematics"


In [None]:
# New dataframe

In [None]:
# df_tag_users.head()
# The tag_users_tag_id column holds the tag id; it is matched against
# tags_tag_id to attach the tag name to every (tag id, user id) pair.
df_tags_pro = pd.merge(
    df_tag_users, df_tags,
    how='inner',
    left_on='tag_users_tag_id',
    right_on='tags_tag_id')

df_tags_pro.head()

Unnamed: 0,tag_users_tag_id,tag_users_user_id,tags_tag_id,tags_tag_name
0,593,c72ab38e073246e88da7e9a4ec7a4472,593,computer-software
1,593,8db519781ec24f2e8bdc67c2ac53f614,593,computer-software
2,593,9ab6b54d55b24299a4795584508db4ff,593,computer-software
3,593,e327399c48584fcf81e433828a6d8715,593,computer-software
4,593,92494d9dc2124507972c5306badc6727,593,computer-software


In [None]:
# Preview the first rows of the tag lookup table.
df_tags.head()

Unnamed: 0,tags_tag_id,tags_tag_name
0,27490,college
1,461,computer-science
2,593,computer-software
3,27292,business
4,18217,doctor


In [None]:
# Here you can see that a single user follows multiple tags, so that
# questions related to those tags can be sent to them.
is_example_user = df_tags_pro["tag_users_user_id"] == "c72ab38e073246e88da7e9a4ec7a4472"
df_tags_pro.loc[is_example_user]


Unnamed: 0,tag_users_tag_id,tag_users_user_id,tags_tag_id,tags_tag_name
0,593,c72ab38e073246e88da7e9a4ec7a4472,593,computer-software
7,593,c72ab38e073246e88da7e9a4ec7a4472,593,computer-software
9,593,c72ab38e073246e88da7e9a4ec7a4472,593,computer-software
10,593,c72ab38e073246e88da7e9a4ec7a4472,593,computer-software
11,593,c72ab38e073246e88da7e9a4ec7a4472,593,computer-software
14,593,c72ab38e073246e88da7e9a4ec7a4472,593,computer-software
25,593,c72ab38e073246e88da7e9a4ec7a4472,593,computer-software
1444,1642,c72ab38e073246e88da7e9a4ec7a4472,1642,programming
1466,1642,c72ab38e073246e88da7e9a4ec7a4472,1642,programming
25620,12052,c72ab38e073246e88da7e9a4ec7a4472,12052,c


In [None]:
# Collapse to one row per user: all tag names of a user are joined into a
# single comma-separated string.
# NOTE(review): repeated tags are not removed here, so a user listed several
# times with the same tag gets that tag repeated in the string -- confirm
# whether that is intended.
df_tags_pro = df_tags_pro.groupby(
    ['tag_users_user_id'])['tags_tag_name'].apply(
        ','.join).reset_index()
        
# df_tags_pro.head()

In [None]:
# another dataframe

In [None]:
# Preview the first rows of the professionals dataframe.
df_professionals.head()

Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num
0,9ced4ce7519049c0944147afb75a8ce3,,,,2011-10-05 20:35:19+00:00,0
1,f718dcf6d2ec4cb0a52a9db59d7f9e67,,,,2011-10-05 20:49:21+00:00,1
2,0c673e046d824ec0ad0ebe012a0673e4,"New York, New York",,,2011-10-18 17:31:26+00:00,2
3,977428d851b24183b223be0eb8619a8c,"Boston, Massachusetts",,,2011-11-09 20:39:29+00:00,3
4,e2d57e5041a44f489288397c9904c2b2,,,,2011-12-10 22:14:44+00:00,4


In [None]:
# The ids in both df_professionals and tag_users_user_id are professional
# ids: every professional follows some tags, and those tags are matched
# against the professional's own id.
is_example_professional = (
    df_professionals["professionals_id"] == "00009a0f9bda43eba47104e9ac62aff5")
df_professionals.loc[is_example_professional]

Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num
4590,00009a0f9bda43eba47104e9ac62aff5,"New York, New York",Media,Digital Production & Content Consultant,2016-03-14 17:00:48+00:00,4590


In [None]:
# Attach each professional's joined tag string. A left join keeps every
# professional; those without a match in df_tags_pro get NaN in the new
# columns.
df_professionals = pd.merge(
    df_professionals, df_tags_pro,
    how='left',
    left_on='professionals_id',
    right_on='tag_users_user_id')

df_professionals.head()

Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num,tag_users_user_id,tags_tag_name
0,9ced4ce7519049c0944147afb75a8ce3,,,,2011-10-05 20:35:19+00:00,0,,
1,f718dcf6d2ec4cb0a52a9db59d7f9e67,,,,2011-10-05 20:49:21+00:00,1,,
2,0c673e046d824ec0ad0ebe012a0673e4,"New York, New York",,,2011-10-18 17:31:26+00:00,2,0c673e046d824ec0ad0ebe012a0673e4,"consulting,consulting,consulting,consulting,co..."
3,977428d851b24183b223be0eb8619a8c,"Boston, Massachusetts",,,2011-11-09 20:39:29+00:00,3,,
4,e2d57e5041a44f489288397c9904c2b2,,,,2011-12-10 22:14:44+00:00,4,,


In [None]:
# another dataframe.

In [None]:
# Preview the first rows of the question scores table.
df_question_scores.head()

Unnamed: 0,id,score
0,38436aadef3d4b608ad089cf53ab0fe7,5
1,edb8c179c5d64c9cb812a59a32045f55,4
2,333464d7484b43e3866e86096bc4ddb9,6
3,4b995e60b99d4ee18346e893e007cb8f,6
4,f6b9ca94aed04ba28256492708e74f60,6


In [None]:
# Attach the score of each question. A left join keeps every question;
# questions without a row in df_question_scores get NaN for 'id' and 'score'.
df_questions = pd.merge(
    df_questions, df_question_scores,
    how='left',
    left_on='questions_id',
    right_on='id')

df_questions.head()

Unnamed: 0,questions_id,questions_author_id,questions_date_added,questions_title,questions_body,questions_id_num,id,score
0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26+00:00,Teacher career question,What is a maths teacher? what is a ma...,0,332a511f1569444485cf7a7a556a5e54,1.0
1,eb80205482e4424cad8f16bc25aa2d9c,acccbda28edd4362ab03fb8b6fd2d67b,2016-05-20 16:48:25+00:00,I want to become an army officer. What can I d...,I am Priyanka from Bangalore . Now am in 10th ...,1,eb80205482e4424cad8f16bc25aa2d9c,5.0
2,4ec31632938a40b98909416bdd0decff,f2c179a563024ccc927399ce529094b5,2017-02-08 19:13:38+00:00,Will going abroad for your first job increase ...,I'm planning on going abroad for my first job....,2,4ec31632938a40b98909416bdd0decff,2.0
3,2f6a9a99d9b24e5baa50d40d0ba50a75,2c30ffba444e40eabb4583b55233a5a4,2017-09-01 14:05:32+00:00,To become a specialist in business management...,i hear business management is a hard way to ge...,3,2f6a9a99d9b24e5baa50d40d0ba50a75,2.0
4,5af8880460c141dbb02971a1a8369529,aa9eb1a2ab184ebbb00dc01ab663428a,2017-09-01 02:36:54+00:00,Are there any scholarships out there for stude...,I'm trying to find scholarships for first year...,4,5af8880460c141dbb02971a1a8369529,2.0


In [None]:
# Attach the student who asked each question, matching the question's
# author id against the student id. A left join keeps every question.
df_questions = pd.merge(
    df_questions, df_students,
    how='left',
    left_on='questions_author_id',
    right_on='students_id')

df_questions.head()

Unnamed: 0,questions_id,questions_author_id,questions_date_added,questions_title,questions_body,questions_id_num,id,score,students_id,students_location,students_date_joined,students_id_num
0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26+00:00,Teacher career question,What is a maths teacher? what is a ma...,0,332a511f1569444485cf7a7a556a5e54,1.0,8f6f374ffd834d258ab69d376dd998f5,"Coimbatore, Tamil Nadu, India",2016-04-22 10:07:32+00:00,6890.0
1,eb80205482e4424cad8f16bc25aa2d9c,acccbda28edd4362ab03fb8b6fd2d67b,2016-05-20 16:48:25+00:00,I want to become an army officer. What can I d...,I am Priyanka from Bangalore . Now am in 10th ...,1,eb80205482e4424cad8f16bc25aa2d9c,5.0,acccbda28edd4362ab03fb8b6fd2d67b,"Providence, Rhode Island",2016-05-20 16:29:08+00:00,10189.0
2,4ec31632938a40b98909416bdd0decff,f2c179a563024ccc927399ce529094b5,2017-02-08 19:13:38+00:00,Will going abroad for your first job increase ...,I'm planning on going abroad for my first job....,2,4ec31632938a40b98909416bdd0decff,2.0,f2c179a563024ccc927399ce529094b5,,2017-02-07 15:51:57+00:00,18023.0
3,2f6a9a99d9b24e5baa50d40d0ba50a75,2c30ffba444e40eabb4583b55233a5a4,2017-09-01 14:05:32+00:00,To become a specialist in business management...,i hear business management is a hard way to ge...,3,2f6a9a99d9b24e5baa50d40d0ba50a75,2.0,2c30ffba444e40eabb4583b55233a5a4,"North Lauderdale, Florida",2017-09-01 14:02:02+00:00,20803.0
4,5af8880460c141dbb02971a1a8369529,aa9eb1a2ab184ebbb00dc01ab663428a,2017-09-01 02:36:54+00:00,Are there any scholarships out there for stude...,I'm trying to find scholarships for first year...,4,5af8880460c141dbb02971a1a8369529,2.0,aa9eb1a2ab184ebbb00dc01ab663428a,"Tunnel Hill, Georgia",2017-09-01 02:29:06+00:00,20505.0


Unnamed: 0,questions_id,questions_author_id,questions_date_added,questions_title,questions_body,questions_id_num,id,score,students_id,students_location,students_date_joined,students_id_num


# **Extra Explanation of what's going inside**

In [None]:
# Preview the per-question tag table.
df_tags_question.head()

Unnamed: 0,tag_questions_question_id,questions_tag_name
0,0003e7bf48f24b5c985f8fce96e611f3,"internship,technology,high-school,information-..."
1,0006609dd4da40dcaa5a83e0499aba14,"psychology,law"
2,000af224bc2f4e94a19f8b62ba279cc4,"biology,marine"
3,000b30fb534b41f7b716fa9ebf9c3f35,"teaching,exercise-science,school,exercise"
4,0018752e44b44e26bb74a0a43232b4d6,"math,puremathematics"


In [None]:
# Preview the first rows of the questions dataframe.
df_questions.head()

Unnamed: 0,questions_id,questions_author_id,questions_date_added,questions_title,questions_body
0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26+00:00,Teacher career question,What is a maths teacher? what is a ma...
1,eb80205482e4424cad8f16bc25aa2d9c,acccbda28edd4362ab03fb8b6fd2d67b,2016-05-20 16:48:25+00:00,I want to become an army officer. What can I d...,I am Priyanka from Bangalore . Now am in 10th ...
2,4ec31632938a40b98909416bdd0decff,f2c179a563024ccc927399ce529094b5,2017-02-08 19:13:38+00:00,Will going abroad for your first job increase ...,I'm planning on going abroad for my first job....
3,2f6a9a99d9b24e5baa50d40d0ba50a75,2c30ffba444e40eabb4583b55233a5a4,2017-09-01 14:05:32+00:00,To become a specialist in business management...,i hear business management is a hard way to ge...
4,5af8880460c141dbb02971a1a8369529,aa9eb1a2ab184ebbb00dc01ab663428a,2017-09-01 02:36:54+00:00,Are there any scholarships out there for stude...,I'm trying to find scholarships for first year...


In [None]:
# Look up one question by its id (the first id shown in the
# df_tags_question preview).
is_example_question = df_questions['questions_id'] == '0003e7bf48f24b5c985f8fce96e611f3'
df_questions.loc[is_example_question]

Unnamed: 0,questions_id,questions_author_id,questions_date_added,questions_title,questions_body
6150,0003e7bf48f24b5c985f8fce96e611f3,02f6630914c04ae6a662cabdf7a0ecd5,2018-01-22 17:43:31+00:00,Does doing double major in tech academy at Hig...,I am a Junior in H.S. right now i am thinking ...


In [None]:
# Take a copy of df_merge for inspection, leaving df_merge itself untouched.
hh = df_merge.copy()

In [None]:
# Preview the first rows of the copied merge dataframe.
hh.head()

Unnamed: 0,answers_id,answers_author_id,answers_question_id,answers_date_added,answers_body,answers_id_num,questions_id,questions_author_id,questions_date_added,questions_title,questions_body,questions_id_num,tag_questions_question_id,questions_tag_name,id_x,score_x,students_id,students_location,students_date_joined,students_id_num,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num,tag_users_user_id,professionals_tag_name,id_y,score_y
0,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,332a511f1569444485cf7a7a556a5e54,2016-04-29 19:40:14+00:00,<p>Hi!</p>\n<p>You are asking a very interesti...,0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26+00:00,Teacher career question,What is a maths teacher? what is a ma...,0,332a511f1569444485cf7a7a556a5e54,"lecture,college,professor",332a511f1569444485cf7a7a556a5e54,1.0,8f6f374ffd834d258ab69d376dd998f5,"Coimbatore, Tamil Nadu, India",2016-04-22 10:07:32+00:00,6890.0,36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",332a511f1569444485cf7a7a556a5e54,1
1,f3519ab99a1a4a13a8a9ecb814287d2a,36ff3b3666df400f956f8335cf53e09e,0f1d6a4f276c4a05878dd48e03e52289,2016-07-31 15:35:54+00:00,<p>Hi Rodrigo!</p>\n<p>The important thing to ...,11,0f1d6a4f276c4a05878dd48e03e52289,585ac233015447cc9e9a217044e515e1,2016-05-19 22:16:25+00:00,what kind of college could i go to for a soc...,I like soccer because i been playing sense i w...,7,0f1d6a4f276c4a05878dd48e03e52289,"college,building,soccer",0f1d6a4f276c4a05878dd48e03e52289,1.0,585ac233015447cc9e9a217044e515e1,"Morgan Hill, California",2016-05-19 22:08:48+00:00,10014.0,36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",0f1d6a4f276c4a05878dd48e03e52289,1
2,825f6e316a5f48328d6f8af831df9940,36ff3b3666df400f956f8335cf53e09e,0149c6d63e214040b44d4a3789bb00ba,2018-04-15 23:08:46+00:00,<p>Congratulations on being interested in find...,71,0149c6d63e214040b44d4a3789bb00ba,34217a1861d640a58c85e033414cf9cb,2018-04-12 17:13:45+00:00,What is the best way to prepare for studying e...,"I am interested in Computational Neuroscience,...",33,0149c6d63e214040b44d4a3789bb00ba,"engineering,neuroscience,gradschool",0149c6d63e214040b44d4a3789bb00ba,2.0,34217a1861d640a58c85e033414cf9cb,"Austin, Texas",2018-04-12 17:09:31+00:00,26796.0,36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",0149c6d63e214040b44d4a3789bb00ba,2
3,fb2c794175304c4caeb55e654270421f,a32736b04c27437da3078374d47af1b1,0149c6d63e214040b44d4a3789bb00ba,2018-04-13 18:18:05+00:00,<p>Hi Elisabeth! </p><p><br></p><p>If you are ...,72,0149c6d63e214040b44d4a3789bb00ba,34217a1861d640a58c85e033414cf9cb,2018-04-12 17:13:45+00:00,What is the best way to prepare for studying e...,"I am interested in Computational Neuroscience,...",33,0149c6d63e214040b44d4a3789bb00ba,"engineering,neuroscience,gradschool",0149c6d63e214040b44d4a3789bb00ba,2.0,34217a1861d640a58c85e033414cf9cb,"Austin, Texas",2018-04-12 17:09:31+00:00,26796.0,a32736b04c27437da3078374d47af1b1,"San Francisco, California",Computer Software,Product Management @ Okta,2018-04-13 17:48:09+00:00,18373,a32736b04c27437da3078374d47af1b1,computer-software,0149c6d63e214040b44d4a3789bb00ba,2
4,f3fc23809cda472780fc565334f35000,36ff3b3666df400f956f8335cf53e09e,acc611cfb5c44daa8a3d7d65dfffa5ff,2018-08-14 10:37:01+00:00,<p>The most important thing that you can do is...,102,acc611cfb5c44daa8a3d7d65dfffa5ff,5b751a8ee4a047f7a08ce9eb5e43e5a2,2018-08-14 04:49:33+00:00,How should I prepare myself for my job search ...,"I am a Sociology, Political Science, and Inter...",47,acc611cfb5c44daa8a3d7d65dfffa5ff,"job-search,career-choice,job,college-jobs",acc611cfb5c44daa8a3d7d65dfffa5ff,1.0,5b751a8ee4a047f7a08ce9eb5e43e5a2,"Kingston, Pennsylvania",2018-08-14 04:47:13+00:00,28533.0,36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",acc611cfb5c44daa8a3d7d65dfffa5ff,1


# **END of EXTRA WORK**
# **END of Rough WORK**
