# Personalization
#### This module will handle collaborative filter based recommendation for individual employees, this will produce two outputs: 1. recommended project 2. similar person to check out

In [1]:
import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.model_selection import train_test_split
import sys

In [2]:
employee = pd.read_csv('../data/employee_M23.txt',sep = '|',index_col = 'ID')
project = pd.read_csv('../data/Projects_M23.txt',sep = '|', index_col = 'pID')
projectCategory = pd.read_csv('../data/Project_category_M23.txt',sep = '|')

In [32]:
# massage empolyee project data
projectData = pd.melt(employee[['ID','PastProjectsID']].set_index('ID')['PastProjectsID'].str.split(";", n = -1, expand = True).reset_index(),
              id_vars = ['ID'],
              value_name = 'PastProjectsID')\
        .dropna().drop(['variable'], axis = 1)\
        .groupby(['ID','PastProjectsID']).agg({'PastProjectsID':"count"})\
        .rename(columns={'PastProjectsID':'PastProjectsCount'}).reset_index()
projectData['PastProjectsCount'] = projectData['PastProjectsCount'].astype(np.int64)
projectData.head()

Unnamed: 0,ID,PastProjectsID,PastProjectsCount
0,12056,1,1
1,12056,100,1
2,12056,200,1
3,12056,300,1
4,12057,101,1


In [33]:
def split_data(data):
    train, test = train_test_split(data, test_size = 0.2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [34]:
train_data, test_data = split_data(projectData)

# Define model using Turicreate library
## Baseline: most popular items

In [38]:
user_id = 'ID'
item_id = 'PastProjectsID'
users_to_recommend = list(projectData['ID'])
n_rec = 10
n_display = 30

In [39]:
# Turicreate is a great library
def model(train_data, name, user_id, item_id, target,
         users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data,
                                                user_id = user_id,
                                                item_id = item_id,
                                                target = target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data,
                                                     user_id = user_id,
                                                     item_id = item_id,
                                                     target = target,
                                                     similarity_type = 'cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data,
                                                     user_id = user_id,
                                                     item_id = item_id,
                                                     target = target,
                                                     similarity_type = 'pearson')
#     recom = model.recommend(users = users_to_recommend, k = n_rec)
    return model

# Popularity model as baseline

In [40]:
name = 'popularity'
target = 'PastProjectsCount'
popularity_model = model(train_data, name, user_id, item_id, target,
                  users_to_recommend, n_rec, n_display)

In [41]:
popularity_model.recommend(users_to_recommend, k = n_rec).print_rows(30)

+-------+----------------+-------+------+
|   ID  | PastProjectsID | score | rank |
+-------+----------------+-------+------+
| 12056 |      134       |  1.0  |  1   |
| 12056 |       3        |  1.0  |  2   |
| 12056 |      360       |  1.0  |  3   |
| 12056 |      130       |  1.0  |  4   |
| 12056 |      437       |  1.0  |  5   |
| 12056 |      118       |  1.0  |  6   |
| 12056 |      414       |  1.0  |  7   |
| 12056 |      441       |  1.0  |  8   |
| 12056 |      128       |  1.0  |  9   |
| 12056 |      442       |  1.0  |  10  |
| 12056 |      134       |  1.0  |  1   |
| 12056 |       3        |  1.0  |  2   |
| 12056 |      360       |  1.0  |  3   |
| 12056 |      130       |  1.0  |  4   |
| 12056 |      437       |  1.0  |  5   |
| 12056 |      118       |  1.0  |  6   |
| 12056 |      414       |  1.0  |  7   |
| 12056 |      441       |  1.0  |  8   |
| 12056 |      128       |  1.0  |  9   |
| 12056 |      442       |  1.0  |  10  |
| 12056 |      134       |  1.0  |

# use collaborative filter


In [44]:
name = 'pearson'
target = 'PastProjectsCount'
pear = model(train_data, name, user_id, item_id, target,
           users_to_recommend, n_rec, n_display)

In [45]:
pear.recommend(users_to_recommend, k = n_rec).print_rows(n_display)

+-------+----------------+-------+------+
|   ID  | PastProjectsID | score | rank |
+-------+----------------+-------+------+
| 12056 |      134       |  0.0  |  1   |
| 12056 |       3        |  0.0  |  2   |
| 12056 |      360       |  0.0  |  3   |
| 12056 |      130       |  0.0  |  4   |
| 12056 |      437       |  0.0  |  5   |
| 12056 |      118       |  0.0  |  6   |
| 12056 |      414       |  0.0  |  7   |
| 12056 |      441       |  0.0  |  8   |
| 12056 |      128       |  0.0  |  9   |
| 12056 |      442       |  0.0  |  10  |
| 12056 |      134       |  0.0  |  1   |
| 12056 |       3        |  0.0  |  2   |
| 12056 |      360       |  0.0  |  3   |
| 12056 |      130       |  0.0  |  4   |
| 12056 |      437       |  0.0  |  5   |
| 12056 |      118       |  0.0  |  6   |
| 12056 |      414       |  0.0  |  7   |
| 12056 |      441       |  0.0  |  8   |
| 12056 |      128       |  0.0  |  9   |
| 12056 |      442       |  0.0  |  10  |
| 12056 |      134       |  0.0  |

In [47]:
models_w_dummy = [popularity_model, pear]

names_w_dummy = ['Popularity Model on Purchase Counts', 'Pearson Similarity on Purchase Counts']
eval_counts = tc.recommender.util.compare_models(test_data,
                                                models_w_dummy, model_names=names_w_dummy)

PROGRESS: Evaluate model Popularity Model on Purchase Counts

Precision and recall summary statistics by cutoff
+--------+----------------+-------------+
| cutoff | mean_precision | mean_recall |
+--------+----------------+-------------+
|   1    |      0.0       |     0.0     |
|   2    |      0.0       |     0.0     |
|   3    |      0.0       |     0.0     |
|   4    |      0.0       |     0.0     |
|   5    |      0.0       |     0.0     |
|   6    |      0.0       |     0.0     |
|   7    |      0.0       |     0.0     |
|   8    |      0.0       |     0.0     |
|   9    |      0.0       |     0.0     |
|   10   |      0.0       |     0.0     |
+--------+----------------+-------------+
[10 rows x 3 columns]


Overall RMSE: 0.0

Per User RMSE (best)
+-------+------+-------+
|   ID  | rmse | count |
+-------+------+-------+
| 12237 | 0.0  |   1   |
+-------+------+-------+
[1 rows x 3 columns]


Per User RMSE (worst)
+-------+------+-------+
|   ID  | rmse | count |
+-------+------+