In [20]:
## import packages
from util import createGrps
from main import createGrpsV2
from groupAggMain import aggregate
import pandas as pd
import numpy as np
from lenskit.algorithms import Recommender
from lenskit.algorithms import Predictor
from lenskit.algorithms.user_knn import UserUser
from lenskit import batch
import sys

In [21]:
## Load the ratings that the user-user collaborative filtering model is trained on
# Full ratings file (about applications), one row per (UserID, JobID, Rating).
# The "Unnamed: 0" column is dropped right after reading -- presumably a
# leftover index column from an earlier to_csv(); confirm against the file.
# Negative ratings are kept on purpose: they give a better way to generate
# synthetic groups (i.e. distinguish between "negative" vs "don't cares").
data = pd.read_csv(
    "../dataset/clean/user_ratings_neg_1000_20_20_1.csv",
    delimiter=',',
).drop(columns="Unnamed: 0")
display(data)



Unnamed: 0,UserID,JobID,Rating
0,554,196603,1
1,554,300053,1
2,554,1078274,1
3,554,146817,1
4,554,654538,1
...,...,...,...
956220,1471233,1069395,-1
956221,1471233,1064781,-1
956222,1471233,1059618,-1
956223,1471233,1059605,-1


In [22]:
## Generate the synthetic user groups
# Which group generator to use: 1 -> createGrps, anything else -> createGrpsV2
version = 2

# Parameters handed to the group generator
size = 10
grpNum = 50
seed = 12345
sim_thrh = 0.8

# Input files (only read by createGrpsV2)
pathToPickle = "../groupGenv2/user_matrix_train_pivot_01.pkl"
pathToUserMatrix = "../dataset/clean/user_ratings_neg_1000_20_20_1_train.csv"

if version != 1:
    # Version 2 generator
    groups2 = createGrpsV2(size, grpNum, pathToPickle, pathToUserMatrix, sim_thrh, seed)
else:
    # Version 1 generator
    #TODO: change '../dataset/users.tsv' to match Tine's data before using option 1
    groups2 = createGrps(pd.read_csv('../dataset/users.tsv', sep='\t'), size, grpNum, seed)

# Show the generated groups, then persist them for later steps
display(groups2)
groups2.to_csv("syntheticGroups.csv")

50


Unnamed: 0,Users
0,"[1165352, 668454, 838185, 1056891, 578645, 950..."
1,"[549231, 583914, 941975, 294558, 803135, 99662..."
2,"[410122, 277505, 692599, 783536, 1420225, 5545..."
3,"[192178, 850445, 439070, 1340921, 473967, 1327..."
4,"[988920, 485759, 481324, 1294945, 669787, 1060..."
5,"[1360918, 1195068, 791001, 1132744, 1124402, 6..."
6,"[1249010, 643384, 448742, 187074, 588976, 2720..."
7,"[1187753, 440304, 928954, 1082928, 142184, 118..."
8,"[172933, 1217103, 1430857, 1074051, 1020507, 1..."
9,"[1174914, 755837, 1176363, 286772, 478643, 146..."


In [23]:
## Train the user-user collaborative filtering recommender
# Rename the columns into the (user, item, rating) format used for fitting below
df_ui = data.rename(columns=dict(UserID="user", JobID="item", Rating="rating"))

# Sanity check that the data was read and renamed properly
display(df_ui.head(10))

# User-user kNN: at most 10 neighbors, at least 3 neighbors
user_user = UserUser(nnbrs=10, min_nbrs=3)

# Recommender.adapt turns the algorithm into something batch.recommend can
# use; fit() is left as the last expression so its result is shown as output.
recsys = Recommender.adapt(user_user)
recsys.fit(df_ui)

Unnamed: 0,user,item,rating
0,554,196603,1
1,554,300053,1
2,554,1078274,1
3,554,146817,1
4,554,654538,1
5,554,336293,1
6,554,640492,1
7,554,271546,1
8,554,283949,1
9,554,1066757,1


<lenskit.algorithms.ranking.TopN at 0x7fd83d497e50>

In [24]:
# Optional, for debugging: show every row of displayed frames
# pd.set_option('display.max_rows', None)

## Collect per-member recommendations for every synthetic group so that one of
## the aggregation strategies can be applied afterwards
# Each entry of all_ratings is a pair: (users of the group, recommendations frame).
# Per the LensKit docs the frame has the columns (item, score, user, rank).
all_ratings = []

# The first column of groups2 holds the array of users of each group
for synGrp in groups2.iloc[:, 0]:
    # n=None: no limit on the number of recommendations per user
    ratings_grp = batch.recommend(recsys, synGrp, n=None, n_jobs=1)
    all_ratings.append((synGrp, ratings_grp))

In [27]:
## Aggregate the members' recommendations into one recommendation list per group
# Settings forwarded to aggregate():
num_pref = 5      # second positional argument; presumably the number of items recommended per group -- confirm in groupAggMain
limit = "sig"     # options: cap, norm, sig
strat = "misery"  # options: add, mult, misery, pleasure

# Pass the settings through instead of repeating the literals, so that editing
# the variables above actually changes the aggregation that is run.
# NOTE: the name "group_preferences_add_cap" is historical and does not reflect
# the current limit/strat; it is kept because the save cell below refers to it.
group_preferences_add_cap = aggregate(all_ratings, num_pref, limit=limit, strat=strat)

display(group_preferences_add_cap)

Unnamed: 0,Members,Recommendation
0,"[1165352, 668454, 838185, 1056891, 578645, 950...","[692024, 579894, 27261, 311691, 508864]"
1,"[549231, 583914, 941975, 294558, 803135, 99662...","[853314, 308908, 864097, 901476, 1032156]"
2,"[410122, 277505, 692599, 783536, 1420225, 5545...","[543918, 615880, 228396, 317252, 784650]"
3,"[192178, 850445, 439070, 1340921, 473967, 1327...","[1042869, 282364, 545615, 481315, 260713]"
4,"[988920, 485759, 481324, 1294945, 669787, 1060...","[156859, 263136, 940871, 1047376, 1040817]"
5,"[1360918, 1195068, 791001, 1132744, 1124402, 6...","[338737, 1055575, 184546, 284141, 297510]"
6,"[1249010, 643384, 448742, 187074, 588976, 2720...","[78579, 1040827, 1074185, 44194, 72524]"
7,"[1187753, 440304, 928954, 1082928, 142184, 118...","[451, 929404, 473965, 582118, 307079]"
8,"[172933, 1217103, 1430857, 1074051, 1020507, 1...","[1076994, 208481, 521315, 908559, 867261]"
9,"[1174914, 755837, 1176363, 286772, 478643, 146...","[201609, 980270, 25864, 165696, 195193]"


In [28]:
## Persist the group aggregation results; the evaluation step reads this file later
# to_csv also writes the frame's index as the first column (pandas default).
group_preferences_add_cap.to_csv(path_or_buf="grpAggr_full.csv")