# Personalization
#### This module will handle collaborative filter based recommendation for individual employees, this will produce two outputs: 1. recommended project 2. similar person to check out

In [1]:
import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.model_selection import train_test_split
import sys

In [60]:
employee = pd.read_csv('../data/employee_M23.txt',sep = '|',index_col = 'ID')
project = pd.read_csv('../data/project_M25_matched.txt',sep = '|', index_col = 'pID')
projectCategory = pd.read_csv('../data/Project_category_M23.txt',sep = '|')

In [62]:
# massage empolyee project data
employee['ID'] = employee.index
projectData = pd.melt(employee[['ID','PastProjectsID']].set_index('ID')['PastProjectsID'].str.split(";", n = -1, expand = True).reset_index(),
              id_vars = ['ID'],
              value_name = 'PastProjectsID')\
        .dropna().drop(['variable'], axis = 1)\
        .groupby(['ID','PastProjectsID']).agg({'PastProjectsID':"count"})\
        .rename(columns={'PastProjectsID':'PastProjectsCount'}).reset_index()
projectData['PastProjectsCount'] = projectData['PastProjectsCount'].astype(np.int64)
projectData.head()

Unnamed: 0,ID,PastProjectsID,PastProjectsCount
0,12056,1,1
1,12056,100,1
2,12056,200,1
3,12056,300,1
4,12057,101,1


In [63]:
def split_data(data):
    train, test = train_test_split(data, test_size = 0.2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [64]:
train_data, test_data = split_data(projectData)

# Define model using Turicreate library
## Baseline: most popular items

In [75]:
user_id = 'ID'
item_id = 'PastProjectsID'
project_to_recommend = list(projectData['ID'])
n_rec = 10
n_display = 30

In [76]:
# Turicreate is a great library
def model(train_data, name, user_id, item_id, target,
         project_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data,
                                                user_id = user_id,
                                                item_id = item_id,
                                                target = target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data,
                                                     user_id = user_id,
                                                     item_id = item_id,
                                                     target = target,
                                                     similarity_type = 'cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data,
                                                     user_id = user_id,
                                                     item_id = item_id,
                                                     target = target,
                                                     similarity_type = 'pearson')
#     recom = model.recommend(users = users_to_recommend, k = n_rec)
    return model

# Popularity model as baseline

In [77]:
name = 'popularity'
target = 'PastProjectsCount'
popularity_model = model(train_data, name, user_id, item_id, target,
                  project_to_recommend, n_rec, n_display)

In [78]:
popularity_model.recommend(project_to_recommend, k = n_rec).print_rows(30)

+-------+----------------+-------+------+
|   ID  | PastProjectsID | score | rank |
+-------+----------------+-------+------+
| 12056 |      178       |  1.0  |  1   |
| 12056 |       5        |  1.0  |  2   |
| 12056 |      391       |  1.0  |  3   |
| 12056 |      196       |  1.0  |  4   |
| 12056 |      314       |  1.0  |  5   |
| 12056 |      103       |  1.0  |  6   |
| 12056 |      382       |  1.0  |  7   |
| 12056 |      483       |  1.0  |  8   |
| 12056 |      469       |  1.0  |  9   |
| 12056 |      222       |  1.0  |  10  |
| 12056 |      178       |  1.0  |  1   |
| 12056 |       5        |  1.0  |  2   |
| 12056 |      391       |  1.0  |  3   |
| 12056 |      196       |  1.0  |  4   |
| 12056 |      314       |  1.0  |  5   |
| 12056 |      103       |  1.0  |  6   |
| 12056 |      382       |  1.0  |  7   |
| 12056 |      483       |  1.0  |  8   |
| 12056 |      469       |  1.0  |  9   |
| 12056 |      222       |  1.0  |  10  |
| 12056 |      178       |  1.0  |

# use collaborative filter


In [79]:
name = 'pearson'
target = 'PastProjectsCount'
pear = model(train_data, name, user_id, item_id, target,
           project_to_recommend, n_rec, n_display)

In [80]:
pear.recommend(project_to_recommend, k = n_rec).print_rows(n_display)

+-------+----------------+-------+------+
|   ID  | PastProjectsID | score | rank |
+-------+----------------+-------+------+
| 12056 |      178       |  0.0  |  1   |
| 12056 |       5        |  0.0  |  2   |
| 12056 |      391       |  0.0  |  3   |
| 12056 |      196       |  0.0  |  4   |
| 12056 |      314       |  0.0  |  5   |
| 12056 |      103       |  0.0  |  6   |
| 12056 |      382       |  0.0  |  7   |
| 12056 |      483       |  0.0  |  8   |
| 12056 |      469       |  0.0  |  9   |
| 12056 |      222       |  0.0  |  10  |
| 12056 |      178       |  0.0  |  1   |
| 12056 |       5        |  0.0  |  2   |
| 12056 |      391       |  0.0  |  3   |
| 12056 |      196       |  0.0  |  4   |
| 12056 |      314       |  0.0  |  5   |
| 12056 |      103       |  0.0  |  6   |
| 12056 |      382       |  0.0  |  7   |
| 12056 |      483       |  0.0  |  8   |
| 12056 |      469       |  0.0  |  9   |
| 12056 |      222       |  0.0  |  10  |
| 12056 |      178       |  0.0  |

In [71]:
models_w_dummy = [popularity_model, pear]

names_w_dummy = ['Popularity Model on Purchase Counts', 'Pearson Similarity on Purchase Counts']
eval_counts = tc.recommender.util.compare_models(test_data,
                                                models_w_dummy, model_names=names_w_dummy)

PROGRESS: Evaluate model Popularity Model on Purchase Counts

Precision and recall summary statistics by cutoff
+--------+-----------------------+----------------------+
| cutoff |     mean_precision    |     mean_recall      |
+--------+-----------------------+----------------------+
|   1    |  0.008474576271186442 | 0.008474576271186442 |
|   2    |  0.004237288135593221 | 0.008474576271186442 |
|   3    |  0.002824858757062149 | 0.008474576271186442 |
|   4    | 0.0021186440677966106 | 0.008474576271186442 |
|   5    |  0.003389830508474576 | 0.016949152542372885 |
|   6    |  0.004237288135593221 | 0.025423728813559324 |
|   7    | 0.0036319612590799038 | 0.025423728813559324 |
|   8    | 0.0031779661016949155 | 0.025423728813559324 |
|   9    |  0.002824858757062148 | 0.025423728813559324 |
|   10   |  0.003389830508474575 | 0.029661016949152536 |
+--------+-----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.0

Per User RMSE (best)
+-------+---

# final model

In [81]:
final_model = tc.item_similarity_recommender.create(tc.SFrame(projectData), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target='PastProjectsCount', similarity_type='cosine')
recom = final_model.recommend(users=project_to_recommend, k=n_rec)
recom.print_rows(n_display)
recom.to_dataframe().head()

+-------+----------------+--------------------+------+
|   ID  | PastProjectsID |       score        | rank |
+-------+----------------+--------------------+------+
| 12056 |      400       | 0.3535533845424652 |  1   |
| 12056 |      101       |        0.25        |  2   |
| 12056 |      103       |        0.0         |  3   |
| 12056 |      302       |        0.0         |  4   |
| 12056 |       3        |        0.0         |  5   |
| 12056 |      203       |        0.0         |  6   |
| 12056 |      102       |        0.0         |  7   |
| 12056 |      301       |        0.0         |  8   |
| 12056 |      201       |        0.0         |  9   |
| 12056 |       2        |        0.0         |  10  |
| 12056 |      400       | 0.3535533845424652 |  1   |
| 12056 |      101       |        0.25        |  2   |
| 12056 |      103       |        0.0         |  3   |
| 12056 |      302       |        0.0         |  4   |
| 12056 |       3        |        0.0         |  5   |
| 12056 | 

Unnamed: 0,ID,PastProjectsID,score,rank
0,12056,400,0.353553,1
1,12056,101,0.25,2
2,12056,103,0.0,3
3,12056,302,0.0,4
4,12056,3,0.0,5


In [82]:
def create_output(model, project_to_recommend, n_rec, print_csv=True):
    recomendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProjects'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: '|'.join(x.astype(str)))
    df_output = df_rec[['ID', 'recommendedProjects']].drop_duplicates() \
        .sort_values('ID').set_index('ID')
    if print_csv:
        df_output.to_csv('../output/option1_recommendation.csv')
        print("An output file can be found in 'output' folder with name 'option1_recommendation.csv'")
    return df_output

In [83]:
df_output = create_output(pear, project_to_recommend, n_rec, print_csv=True)
print(df_output.shape)
df_output.head()

An output file can be found in 'output' folder with name 'option1_recommendation.csv'
(199, 1)


Unnamed: 0_level_0,recommendedProjects
ID,Unnamed: 1_level_1
12056,178|5|391|196|314|103|382|483|469|222|178|5|39...
12057,178|5|391|196|314|103|382|483|469|222|178|5|39...
12058,178|5|391|196|314|103|382|483|469|222|178|5|39...
12059,178|5|391|196|314|103|382|483|469|222|178|5|39...
12060,465|178|391|196|314|103|382|483|469|222|465|17...


In [84]:
df_output.shape

(199, 1)

In [85]:
df_output.to_csv('../output/project_recommendation_table.csv',index = True)

In [86]:
temp = pd.read_csv('../output/project_recommendation_table.csv')
temp.head()

Unnamed: 0,ID,recommendedProjects
0,12056,178|5|391|196|314|103|382|483|469|222|178|5|39...
1,12057,178|5|391|196|314|103|382|483|469|222|178|5|39...
2,12058,178|5|391|196|314|103|382|483|469|222|178|5|39...
3,12059,178|5|391|196|314|103|382|483|469|222|178|5|39...
4,12060,465|178|391|196|314|103|382|483|469|222|465|17...


In [93]:
temp.head().to_json('test.json')

# user recommend

In [89]:
# load the user similarity matrix
userSimilarityMatrix = pd.read_csv('employee_similarity_matrix.csv', index_col = 'ID')

In [91]:
"""
This function returns a list of employe ID (string) with highest similarity to lowest
"""
def findTopKSimilarEmployee(eId, topK = 'all', eSimilarityMatrixFile='employee_similarity_matrix.csv'):
    import pandas as pd
    import numpy as np
    matrix = pd.read_csv(eSimilarityMatrixFile, index_col = 'ID')
    matrix.index = matrix.index.map(str)
    # retrieve ranked employ based on e-e similarity
    sim = matrix.loc[str(eId),:]
    sortedSim = sim.sort_values(ascending=False)
    # return a list of employee id from high to low 
    eIdSorted = sortedSim.iloc[1:].index.tolist()
    if(topK == 'all'):
        return eIdSorted
    else:
        return eIdSorted[0:topK]

In [92]:
print("The top 5 similar employee for employee %d is: %s"%(5, '|'.join(findTopKSimilarEmployee('12070',5))))

The top 5 similar employee for employee 5 is: 12149|12203|12147|12217|12224
