In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from statistics import mean
from sklearn.metrics import mean_absolute_error as MAE
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Historical Users


In [None]:
# Load the per-user profile table (one row per user ID).
users = pd.read_csv('data/user_MBTI.csv')
# Load the historical ratings table.
# NOTE(review): 'trrain' looks like a typo for 'train' — confirm the file name on disk before changing it.
ratings = pd.read_csv('data/trrain_historical_ratings.csv')

## 1. MBTI only + preferences

In [None]:
# List the profile columns available for feature building.
users.columns

Index(['ID', 'Zodiacs', 'Colours', 'Numbers', 'Seasons', 'Music_Genres',
       'Musical_instruments', 'Social_Media', 'Alignments', 'Genre_1',
       'Genre_2', 'Genre_3', 'Genre_4', 'Genre_5', 'Genre_6', 'MBTI_E',
       'MBTI_I', 'MBTI_S', 'MBTI_N', 'MBTI_T', 'MBTI_F', 'MBTI_J', 'MBTI_P',
       'off_mbti'],
      dtype='object')

In [None]:
# Join profiles with ratings on ID, then discard the genre and per-letter MBTI
# columns; the frame is indexed by user ID afterwards.
df_train = (
    users.merge(ratings, on='ID')
    .drop(columns=[
        'Genre_1', 'Genre_2', 'Genre_3', 'Genre_4', 'Genre_5', 'Genre_6',
        'MBTI_E', 'MBTI_I', 'MBTI_S', 'MBTI_N', 'MBTI_T', 'MBTI_F', 'MBTI_J', 'MBTI_P',
    ])
    .set_index('ID')
)

In [None]:
# Categorical profile columns that get expanded into indicator columns.
one_hot_columns = [
    'Zodiacs',
    'Colours',
    'Numbers',
    'Seasons',
    'Music_Genres',
    'Musical_instruments',
    'Social_Media',
    'Alignments',
]

In [None]:
for column in one_hot_columns:
  # Indicator columns for this categorical field (every field shares the 'dum_' prefix).
  encoded = pd.get_dummies(df_train[column], prefix='dum_')
  # Swap the raw categorical column for its indicator columns.
  df_train = df_train.drop(columns=column).join(encoded)

In [None]:
# One-hot encode the 'off_mbti' label into one indicator column per value.
one_hot = pd.get_dummies(df_train['off_mbti'])
# Append the indicator columns. 'off_mbti' itself is not dropped here;
# get_topN_user still filters on it.
df_train = df_train.join(one_hot)

In [None]:
# Copy of df_train without the 20 movie-rating columns ('1'..'20').
df_sim = df_train.drop(columns=[str(movie) for movie in range(1, 21)])

In [None]:
# Cosine similarity
def cos_sim(user1_id, user2_id):
  """Cosine similarity between two users' rows of ``df_train``.

  The ``off_mbti`` label and the movie-rating columns ``'1'``..``'20'`` are
  removed first, so only the remaining columns take part in the comparison.
  """
  excluded = ['off_mbti'] + [str(i) for i in range(1,21)]
  features = df_train.drop(columns=excluded)
  vec_a = features.loc[user1_id, :].values.reshape(1, -1)
  vec_b = features.loc[user2_id, :].values.reshape(1, -1)
  return cosine_similarity(vec_a, vec_b)[0][0]

# Top-N truncation is not applied because the data set is small.
def get_topN_user(user,mvID):
  """Score every same-MBTI user who has a usable rating for movie ``mvID``.

  A rating is usable when it is neither -1 nor 0.  ``user`` itself is left out.

  Returns:
    A list of ``(cos_sim(user, other), other)`` tuples in ``df_train`` row order.
  """
  movie_col = str(mvID)
  # Restrict to users sharing the target user's MBTI group.
  same_group = df_train[df_train['off_mbti'] == df_train.loc[user, 'off_mbti']]
  # Keep only those with a usable rating for this movie.
  usable = ~same_group[movie_col].isin([-1, 0])
  return [
      (cos_sim(user, other), other)
      for other in same_group[usable].index.to_list()
      if other != user
  ]

def get_rating(userid, movieid):
  """Return the value stored in ``df_train`` for user ``userid`` and movie ``movieid``."""
  movie_col = str(movieid)
  return df_train.loc[userid, movie_col]

def pred_rating(user, mvID):
  """Predict ``user``'s rating of movie ``mvID`` from same-MBTI neighbours.

  The prediction is the similarity-weighted mean of the neighbours' ratings;
  neighbours and their weights come from ``get_topN_user``.

  Returns:
    The predicted rating, or the string ``'nan'`` when no prediction can be
    made (no neighbours, or the similarity weights sum to zero).  The
    evaluation cell filters on this ``'nan'`` sentinel.
  """
  sim_users = get_topN_user(user,mvID)
  if len(sim_users) == 0:
    return 'nan'
  sum_ = 0
  sum_sim = 0
  for score,userID in sim_users:
    sum_ += score
    sum_sim += score*get_rating(userID,mvID)
  if sum_ == 0:
    # Every neighbour has zero similarity: a weighted mean is undefined.
    return 'nan'
  return (1/sum_)*sum_sim


In [None]:
# Held-out ratings; the scoring loop reads its user_ID, mv_ID and Rating columns.
df_true = pd.read_csv('data/true_historical_ratings.csv')

In [None]:
# Score every held-out (user, movie) pair with the MBTI-neighbour predictor.
user = users.copy().set_index('ID')
u_id = list(df_true.user_ID.to_numpy())
mv_id = list(df_true.mv_ID.to_numpy())
true = list(df_true.Rating.to_numpy())
pred = [pred_rating(uid, mid) for uid, mid in zip(u_id, mv_id)]

In [None]:
# One row per scored pair: identifiers, true rating and prediction.
result = pd.DataFrame({
    'UserID': u_id,
    'MvID': mv_id,
    'True': true,
    'Pred': pred,
})

In [None]:
# Evaluating
# Only pairs with an actual prediction are scored; 'nan' marks "no prediction".
has_pred = result.Pred != 'nan'
true = result.loc[has_pred, 'True']
pred = result.loc[has_pred, 'Pred']

MAE_mbti_gr = MAE(true, pred)
print('MAE using MBTI+ Movies preferences rating predict: ', MAE_mbti_gr)

MAE using MBTI+ Movies preferences rating predict:  2.1236736575848454


## 2. Ratings only

In [None]:
# Inspect what a positional [1:20] slice of df_train's columns selects.
# NOTE: the output below starts at 'off_mbti' and stops at '19', i.e. it is
# not exactly the 20 movie columns.
df_train.columns[1:20]

Index(['off_mbti', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
       '12', '13', '14', '15', '16', '17', '18', '19'],
      dtype='object')

In [None]:
def rating_cos_sim(user1,user2):
  """Cosine similarity of two users over the movies both of them rated.

  A movie counts as co-rated when both users' ratings are neither -1 nor 0.

  Returns:
    The cosine similarity of the two co-rated rating vectors, or 0 when the
    users have no co-rated movie.
  """
  # Name the 20 movie columns explicitly (same convention as ``cos_sim``).
  # A positional slice of ``df_train.columns`` depends on column order and can
  # pick up 'off_mbti' while missing movie '20'.
  movie_ids = [str(i) for i in range(1, 21)]
  ratings_user1 = []
  ratings_user2 = []
  for movieid in movie_ids:
    # Look each rating up once; every lookup re-indexes the ratings table.
    r1 = rating_get_rating(user1, movieid)
    r2 = rating_get_rating(user2, movieid)
    if r1 not in (-1, 0) and r2 not in (-1, 0):
      ratings_user1.append(r1)
      ratings_user2.append(r2)
  if len(ratings_user1) == 0:
      return 0

  # cosine_similarity expects 2-D inputs, hence the (1, n) row vectors.
  vec1 = np.array(ratings_user1).reshape(1, -1)
  vec2 = np.array(ratings_user2).reshape(1, -1)
  score = cosine_similarity(vec1, vec2)
  return score[0][0]

In [None]:
# Display the ratings table. The similarity helpers skip entries equal to -1 or 0.
ratings

Unnamed: 0,ID,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,1,3,8,2,8,6,1,9,5,6,1,6,7,3,6,10,5,-1,7,3,5
1,2,10,4,7,2,10,-1,10,10,10,10,6,0,0,1,7,10,-1,10,-1,5
2,3,7,5,8,6,8,-1,9,6,7,7,4,5,3,3,4,6,7,10,10,-1
3,4,8,7,5,-1,7,5,-1,6,6,-1,6,7,10,5,7,8,8,7,5,5
4,5,8,9,9,8,8,8,10,8,9,8,9,10,10,9,10,-1,9,10,8,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,86,9,8,8,8,9,9,8,8,10,9,7,-1,10,10,10,8,-1,10,8,-1
86,87,0,5,2,8,2,0,5,7,8,3,1,0,0,7,8,3,1,0,0,0
87,88,9,10,7,6,5,0,0,8,10,0,8,0,0,0,7,8,9,7,0,8
88,89,0,9,0,9,0,0,9,0,0,8,0,0,0,0,-1,0,0,0,0,0


In [None]:
# Cosine similarity
def rating_get_topN_user(user,mvID,topN = 15):
  """Return the ``topN`` most rating-similar users whose rating of ``mvID`` is not -1.

  Each entry is a ``(rating_cos_sim(user, other), other)`` tuple; the list is
  sorted in descending tuple order and ``user`` itself is excluded.
  """
  rated = ratings[ratings[str(mvID)] != -1]
  ranked = sorted(
      (
          (rating_cos_sim(user, other), other)
          for other in rated['ID'].to_list()
          if other != user
      ),
      reverse=True,
  )
  return ranked[:topN]

def notop_rating_get_topN_user(user,mvID):
  """Score all users whose rating of ``mvID`` is not -1, without sorting or truncation.

  Returns:
    ``(rating_cos_sim(user, other), other)`` tuples in ``ratings`` row order,
    excluding ``user`` itself.
  """
  rated = ratings[ratings[str(mvID)] != -1]
  scores = []
  for other in rated['ID'].to_list():
    if other == user:
      continue
    scores.append((rating_cos_sim(user, other), other))
  return scores

def rating_get_rating(userid, movieid):
  """Return the value stored in ``ratings`` for user ``userid`` and movie ``movieid``."""
  by_user = ratings.set_index('ID')
  movie_col = str(movieid)
  return by_user.loc[userid, movie_col]


In [None]:
def rating_predict_rating(user, mvID, topN = 15):
  """Predict ``user``'s rating of ``mvID`` as a similarity-weighted mean of neighbour ratings.

  Neighbours and weights come from ``notop_rating_get_topN_user`` (all
  candidates, no truncation).  ``topN`` is accepted for signature
  compatibility but is not used.

  Returns:
    The predicted rating, or the string ``'nan'`` (the sentinel also used by
    ``pred_rating``) when there are no neighbours or the weights sum to zero.
  """
  sum_ = 0
  sum_sim = 0
  for score,userID in notop_rating_get_topN_user(user,mvID):
    sum_ += score
    # Neighbours are drawn from ``ratings``, so read their rating from that same table.
    sum_sim += score*rating_get_rating(userID,mvID)
  if sum_ == 0:
    # No neighbours, or only zero-similarity ones: a weighted mean is undefined.
    return 'nan'
  return (1/sum_)*sum_sim

In [None]:
# Re-read the held-out ratings (same file as loaded for section 1).
df_true = pd.read_csv('data/true_historical_ratings.csv')

In [None]:
# Score every held-out (user, movie) pair with the ratings-only predictor.
user = users.copy().set_index('ID')
u_id = list(df_true.user_ID.to_numpy())
mv_id = list(df_true.mv_ID.to_numpy())
true = list(df_true.Rating.to_numpy())
pred = [rating_predict_rating(uid, mid) for uid, mid in zip(u_id, mv_id)]

In [None]:
# One row per scored pair: identifiers, true rating and prediction.
result = pd.DataFrame({
    'UserID': u_id,
    'MvID': mv_id,
    'True': true,
    'Pred': pred,
})

In [None]:
# Evaluating
# Score from the `result` table and skip pairs without a numeric prediction,
# as the section-1 evaluation does.  pd.to_numeric(errors='coerce') turns the
# 'nan' sentinel (or any non-numeric value) into NaN so dropna can remove it.
scored = result.assign(Pred=pd.to_numeric(result['Pred'], errors='coerce')).dropna(subset=['Pred'])
MAE_ratings = MAE(scored['True'], scored['Pred'])
print('MAE using Ratings + Movie preferences rating predict: ', MAE_ratings)

MAE using Ratings + Movie preferences rating predict:  2.155906610764849


## 3. MBTI + Ratings + Movie preferences

In [None]:
# Section-3 feature frame: df_train without the raw 'off_mbti' label column.
train = df_train.drop(columns='off_mbti')

In [None]:
def aggregate_cos_sim(user1,user2):
  """Cosine similarity over profile features plus co-rated movie ratings.

  Each user's vector is their non-movie columns of ``train`` followed by their
  ratings for the movies both users rated (rating neither -1 nor 0).

  Returns:
    The cosine similarity of the two vectors, or 0 when the users have no
    co-rated movie.
  """
  # Select the movie columns by name ('1'..'20', as ``cos_sim`` does) rather
  # than by position, so the split between ratings and profile features does
  # not depend on the column order of ``train``.
  movie_cols = [str(i) for i in range(1, 21)]
  # Fetch each user's row once instead of one lookup per movie.
  row1 = train.loc[user1, :]
  row2 = train.loc[user2, :]
  cmmn_mv = [
      movieid for movieid in movie_cols
      if row1[movieid] not in (-1, 0) and row2[movieid] not in (-1, 0)
  ]
  if len(cmmn_mv) == 0:
      return 0

  feature_cols = [col for col in train.columns if col not in movie_cols]
  selected = feature_cols + cmmn_mv
  # cosine_similarity expects 2-D inputs, hence the (1, n) row vectors.
  vec1 = np.array(row1[selected].to_list()).reshape(1, -1)
  vec2 = np.array(row2[selected].to_list()).reshape(1, -1)

  score = cosine_similarity(vec1, vec2)

  return score[0][0]

In [None]:
# Cosine similarity

def aggregate_get_topN_user(user,mvID,topN = 15):
  """Return the ``topN`` most similar users whose rating of ``mvID`` is not -1.

  Each entry is an ``(aggregate_cos_sim(user, other), other)`` tuple; the list
  is sorted in descending tuple order and ``user`` itself is excluded.
  """
  rated = df_train[df_train[str(mvID)] != -1]
  # Take candidates from the filtered frame; using the full index would let
  # users with rating -1 contribute to the prediction.
  scores = [(aggregate_cos_sim(user, userID),userID) for userID in rated.index.to_list() if userID != user]
  scores.sort(reverse = True)
  return scores[:topN]

def aggregate_get_rating(userid, movieid):
  """Return the value stored in ``train`` for user ``userid`` and movie ``movieid``."""
  movie_col = str(movieid)
  return train.loc[userid, movie_col]


In [None]:
def aggregate_predict_rating(user, mvID, topN = 15):
  """Predict ``user``'s rating of ``mvID`` from the ``topN`` most similar users.

  The prediction is the similarity-weighted mean of the neighbours' ratings.

  Args:
    user: ID of the user to predict for.
    mvID: movie identifier (converted to a column name by the helpers).
    topN: number of neighbours requested from ``aggregate_get_topN_user``.

  Returns:
    The predicted rating, or 0 when the weights sum to zero or when at least
    70% of the ``topN`` slots have zero similarity.
  """
  sum_ = 0
  sum_sim = 0
  count0 = 0
  # Forward topN so the neighbour list and the 70% ratio below use the same N.
  for score,userID in aggregate_get_topN_user(user,mvID,topN):
    sum_ += score
    sum_sim += score*aggregate_get_rating(userID,mvID)
    if score == 0 :
      count0 += 1
  # Check the zero denominator first: it also covers an empty neighbour list
  # (including topN == 0), where the ratio below could not be computed.
  if sum_ == 0 or count0/topN >= 0.70:
    return 0
  return (1/sum_)*sum_sim

In [None]:
# Score every held-out (user, movie) pair with the combined predictor.
user = users.copy().set_index('ID')
u_id = list(df_true.user_ID.to_numpy())
mv_id = list(df_true.mv_ID.to_numpy())
true = list(df_true.Rating.to_numpy())
pred = [aggregate_predict_rating(uid, mid) for uid, mid in zip(u_id, mv_id)]

In [None]:
# One row per scored pair: identifiers, true rating and prediction.
result = pd.DataFrame({
    'UserID': u_id,
    'MvID': mv_id,
    'True': true,
    'Pred': pred,
})

In [None]:
# Evaluating
# Score from the `result` table and skip pairs without a numeric prediction,
# as the section-1 evaluation does.  pd.to_numeric(errors='coerce') turns any
# non-numeric value into NaN so dropna can remove it.
scored = result.assign(Pred=pd.to_numeric(result['Pred'], errors='coerce')).dropna(subset=['Pred'])
MAE_aggregate = MAE(scored['True'], scored['Pred'])
print('MAE using Ratings + MBTI preferences rating predict: ', MAE_aggregate)

MAE using Ratings + MBTI preferences rating predict:  2.0849980418899756


# Cold start

In [None]:
# Load the cold-start split.
# NOTE: this rebinds `train`, which until here named the section-3 feature frame.
train= pd.read_csv('data/cold_start_train.csv')
test = pd.read_csv('data/cold_start_test.csv')

In [None]:
# (rows, columns) of the cold-start training table.
train.shape

(81, 97)

In [None]:
# (rows, columns) of the cold-start test table.
test.shape

(9, 97)

In [None]:
# Similarity inputs: with ID as the index, keep every column from position 21 on.
# NOTE(review): cs_predict_rating reads positions 1..20 as the movie columns, so
# position 0 is presumably 'off_mbti' and 21+ the encoded profile features —
# confirm against the CSV layout.
train_sim = train.set_index('ID').iloc[:,21:]
test_sim = test.set_index('ID').iloc[:,21:]

In [None]:
# Index both tables by user ID so the helpers below can use .loc[user_id, ...].
# NOTE: the names are rebound in place, so running this cell a second time
# fails because 'ID' is no longer a column.
train = train.set_index('ID')
test = test.set_index('ID')

In [None]:
# Cosine similarity
def cs_cos_sim(user1_id, user2_id):
  """Cosine similarity between a test user and a train user.

  ``user1_id`` is looked up in ``test_sim`` and ``user2_id`` in ``train_sim``.
  """
  test_vec = test_sim.loc[user1_id, :].values.reshape(1, -1)
  train_vec = train_sim.loc[user2_id, :].values.reshape(1, -1)
  return cosine_similarity(test_vec, train_vec)[0][0]

# Top-N truncation is not applied because the data set is small.
def cs_get_topN_user(user):
  """Score every train user who shares the test user's ``off_mbti`` value.

  Returns:
    ``(cs_cos_sim(user, other), other)`` tuples in ``train`` row order; a train
    ID equal to ``user`` is skipped.
  """
  target_type = test.loc[user, 'off_mbti']
  same_type_ids = train[train['off_mbti'] == target_type].index.to_list()
  scores = []
  for other in same_type_ids:
    if other == user:
      continue
    scores.append((cs_cos_sim(user, other), other))
  return scores

def cs_get_rating(userid, movieid):
  """Return the value stored in the cold-start ``train`` table for ``userid`` and ``movieid``."""
  movie_col = str(movieid)
  return train.loc[userid, movie_col]


def cs_predict_rating(user):
  """Predict a cold-start user's ratings for the movie columns ``train.columns[1:21]``.

  For each movie the prediction is the similarity-weighted mean of the ratings
  given by the neighbours from ``cs_get_topN_user``; neighbour ratings equal
  to 0 are skipped.

  Returns:
    A list with one entry per movie: the predicted rating, or the string
    ``'nan'`` when the usable weights sum to zero.  Without any neighbours the
    list is 20 ``'nan'`` entries.
  """
  neighbours = cs_get_topN_user(user)
  if len(neighbours) == 0:
    return ['nan' for _ in range(20)]

  predictions = []
  for mvID in train.columns[1:21]:
    weight_total = 0
    weighted_sum = 0
    for score, userID in neighbours:
      neighbour_rating = cs_get_rating(userID, mvID)
      if neighbour_rating != 0:
        weight_total += score
        weighted_sum += score*neighbour_rating
    # The division is only evaluated when the weights do not sum to zero.
    predictions.append('nan' if weight_total == 0 else (1/weight_total)*weighted_sum)
  return predictions

In [None]:
# Predict all movie columns for every cold-start test user and flatten the
# results into parallel lists (one entry per user/movie pair).
user = test.copy()
movie_cols = user.columns[1:21].to_list()
pred, true, u_id, mv_id = [], [], [], []
for usr in user.index.to_list():
  pred.extend(cs_predict_rating(usr))
  true.extend(user.loc[usr, movie_cols].values)
  u_id.extend([usr] * 20)
  mv_id.extend(movie_cols)

In [None]:
# One row per user/movie pair; rows whose true rating is 0 are excluded.
result = pd.DataFrame({
    'UserID': u_id,
    'MvID': mv_id,
    'True': true,
    'Pred': pred,
})
result = result.loc[result['True'] != 0]

In [None]:
# Evaluating
# cs_predict_rating returns the string 'nan' when it cannot predict a movie.
# Coerce predictions to numbers and drop those rows so MAE only sees real
# predictions (an unfiltered 'nan' would make the metric call fail).
scored = result.assign(Pred=pd.to_numeric(result['Pred'], errors='coerce')).dropna(subset=['Pred'])
true = scored['True']
pred = scored['Pred']
MAE_cold_start = MAE(true, pred)
print('MAE using MBTI+ in4 predicts rating: ', MAE_cold_start)

MAE using MBTI+ in4 predicts rating:  1.7997126272343473
