# Personalization V2: Collaborative Filtering on Likes and Views


### [Obtaining Views and Likes Data](#Obtaining-Views-and-Likes-Data)

### [Transforming and Aggregating Data](#Transforming-and-Aggregating-Data)

### [Creating Score and Applying Penalties](#Creating-Score-and-Applying-Penalties)

### [Preparing Data](#Preparing-Data)

### [Training Model](#Training-Model)

### [Selecting Daily Test Users](#Selecting-Daily-Test-Users)

### [Generating Predictions](#Generating-Predictions)

### [Writing Out Predictions](#Writing-Out-Predictions)

In [None]:
import implicit
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytz

from datetime import datetime, timedelta

from google.cloud import storage
from scipy.sparse import csr_matrix

import warnings; warnings.simplefilter('ignore')

# Current date in Asia/Tokyo, formatted as YYYYMMDD.
today = datetime.strftime(datetime.now(tz=pytz.timezone("Asia/Tokyo")), "%Y%m%d")

# Root storage location; every input/output path below is built from it.
bucket = "..."

in_bucket_base = bucket + "inputs/"
in_bucket = bucket + "inputs/implicit-svd/"
out_bucket = bucket + "outputs/"

# One dislikes CSV per day; "{}" is filled with a YYYYMMDD date string.
dislikes_bucket = in_bucket_base + "beacon_events/recommend-dislikes/"
dislikes_file_base = dislikes_bucket + "{}_daily_recs_dislikes.csv"

# Both the users and the likes data are provided by the collect-recs-data DAG in Airflow.
users_file = in_bucket_base + "users.csv"
likes_file = in_bucket + "likes_v2.csv"

# Number of recommendations requested per user.
N_recs = 100
# Number of past days of daily-recommend dislikes to load.
N_DAYS_DISLIKES_DATA = 14

### Papermill Parameters

In [None]:
# Papermill parameter: the district whose users are modelled in this run (default 1).
district_id = 1

In [None]:
# Name of the output CSV; the district id is embedded so each district gets its own file.
outfile = f"implicit_svd_v2_district_{district_id}.csv"

### Declaring Input/Output GCS Directories

In [None]:
# Google Cloud Storage client.
# NOTE(review): `client` is not referenced again in the visible cells (reads and writes
# go through pandas with path strings) — confirm whether it is still needed.
client = storage.Client(project="...")

## Obtaining user sample group

In [None]:
# Load the header-less users file and keep only the ids of users in the selected district.
users = pd.read_csv(users_file, names = ['user_id', 'gender', 'district_id'])
users = users.loc[users.district_id == district_id, ['user_id']]

## Likes Data (SQL)

The following code is intended for automated implementation on a daily basis.

In [None]:
# Column names for the header-less likes CSV: the sender's attributes come first,
# then the target's attributes, then two per-like fields.
likes_cols = [
    'user_id',
    'gender',
    'user_age',
    'user_prefecture_id',
    'user_district_id',
    'target_user_id',
    'target_gender',
    'target_user_age',
    'target_prefecture_id',
    'target_district_id',
    'has_greeting',
    'matched'
]

# One row per like sent from user_id to target_user_id.
likes = pd.read_csv(likes_file, names = likes_cols)

### Filtering likes to the 'district'

As this is a factorization method, we cannot train the model and then remove users later.

In [None]:
# Keep only likes sent by users of the selected district (targets are not filtered here).
likes = likes[likes.user_id.isin(users['user_id'].unique())]

In [None]:
likes.drop_duplicates(inplace=True)

In [None]:
# Number of distinct senders with gender == 1 and with gender == 2.
likes.query('gender == 1').user_id.nunique(), likes.query('gender == 2').user_id.nunique()

### Engineering matched likes as likes from the target_users' perspectives
Apparently, liking in response to a received like is not separately recorded as a like. This means the majority of likes from our female users are not logged as such in the database. 

In [None]:
# First value: share of rows that are matches whose target has gender 2.
# Second value: share of rows whose sender has gender 2.
print("Proportion of matches with women (passive likes) and sent out by women (active likes)")
((likes.matched == True) & (likes.target_gender == 2)).mean(), (likes.gender == 2).mean()

In [None]:
# For women: every matched like whose target has gender 2 is mirrored into a like
# sent *by* that target, because the reciprocal like is not recorded on its own.
matched_likes_for_women_bef = likes[(likes.matched == True) & (likes.target_gender == 2)]

# Swap the sender and target roles. Prefecture columns are not carried over and
# therefore become NaN for the synthesized rows.
matched_likes_for_women = pd.DataFrame({
  'user_id': matched_likes_for_women_bef['target_user_id'],
  'gender': matched_likes_for_women_bef['target_gender'],
  'user_age': matched_likes_for_women_bef['target_user_age'],
  'user_district_id': matched_likes_for_women_bef['target_district_id'],
  'target_user_id': matched_likes_for_women_bef['user_id'],
  'target_gender': matched_likes_for_women_bef['gender'],
  'target_user_age': matched_likes_for_women_bef['user_age'],
  'target_district_id': matched_likes_for_women_bef['user_district_id'],
  'has_greeting': 0, # set to 0 for the time being
  'matched': 0 # set to 0 for the time being
})

# pd.concat replaces DataFrame.append (removed in pandas 2.0). ignore_index gives
# every row a unique label; the synthesized rows would otherwise reuse the labels
# of the rows they were derived from.
likes = pd.concat([likes, matched_likes_for_women], ignore_index=True)

# Filtering again, because the synthesized rows may belong to users outside the district.
likes = likes[likes.user_id.isin(users['user_id'].unique())]

# Each user_id -> target_user_id pair should be unique: keep only the first entry
# of every pair (recorded likes precede the synthesized ones). The previous
# label-based drop with keep=False removed *all* copies of a duplicated pair and,
# because labels were repeated, unrelated rows as well.
likes = likes.drop_duplicates(subset=['user_id', 'target_user_id'], keep='first').reset_index(drop=True)

del matched_likes_for_women_bef, matched_likes_for_women

## Transforming and Aggregating Data

In [None]:
# Likes sent per user and likes received per target. The Series names become the
# column names when these are joined onto the interaction frame later.
user_like_counts = likes.groupby('user_id').target_user_id.count().rename('user_likes_sent')
target_like_counts = likes.groupby('target_user_id').user_id.count().rename('target_likes_received')

#### Entry requirement: a user must have sent more than one like.

In [None]:
# First value: proportion of users who sent more than one like (i.e. who are kept).
# Second value: proportion of like rows that remain after dropping the other users.
print((user_like_counts > 1).mean())
print(likes[likes.user_id.isin(user_like_counts[user_like_counts > 1].index)].shape[0]/likes.shape[0])

In [None]:
# Interaction frame: likes from users who sent more than one like.
# .copy() makes df an independent frame, so adding columns to it later is not a
# chained assignment on a slice of `likes`.
df = likes[likes.user_id.isin(user_like_counts[user_like_counts > 1].index)].copy()

In [None]:
# Every row taken from the likes data is a positive interaction.
df["liked"] = 1


## Obtaining the past 2 weeks of DailyRecommend Dislikes data

In [None]:
def create_dislikes_df(n_days):
    """Load the daily-recommend dislikes of the previous ``n_days`` days.

    One CSV per day is read from ``dislikes_file_base`` (formatted with the
    YYYYMMDD date), starting with yesterday. Days whose file is missing are
    reported and skipped.

    Parameters
    ----------
    n_days : int
        Number of past days to load; ``0`` loads nothing.

    Returns
    -------
    pandas.DataFrame
        De-duplicated rows of all files found, plus a ``liked`` column set to 0.
        If no file was found the frame is empty with the columns ``user_id``,
        ``target_user_id`` and ``liked``.
    """
    # NOTE(review): datetime.today() is naive local time, while the notebook's
    # `today` is computed in Asia/Tokyo — confirm which date the files are named by.
    run_date = datetime.today()

    frames = []
    for days_back in range(1, n_days + 1):
        date = (run_date - timedelta(days = days_back)).strftime("%Y%m%d")
        uri = dislikes_file_base.format(date)
        print(f"Obtaining dislikes for {date}")
        # Only the read can legitimately fail with FileNotFoundError.
        try:
            daily = pd.read_csv(
                uri,
                dtype = {'user_id': np.int32, 'target_user_id': np.int32}
            )
        except FileNotFoundError:
            print(f"Could not find dislikes data at {uri}. Please examine URI.")
        else:
            frames.append(daily)

    if frames:
        # Concatenate once (DataFrame.append was removed in pandas 2.0) and
        # de-duplicate once instead of on every iteration.
        dislikes = pd.concat(frames, ignore_index = True).drop_duplicates()
    else:
        dislikes = pd.DataFrame([], columns = ['user_id', 'target_user_id'])

    dislikes['liked'] = 0

    return dislikes

In [None]:
dislikes = create_dislikes_df(n_days = N_DAYS_DISLIKES_DATA)

In [None]:
dislikes.shape

In [None]:
### Restrict dislikes to users and target users that already appear in the likes-based
### frame `df`, to avoid conflicts.
### This step could be skipped if the dislikes were also subset by district, but that
### would introduce new user -> target_user pairs for which no like exists.
### That might not be a bad thing, but it would require re-thinking how negative
### interactions are weighted together with the other interaction data for those pairs.

dislikes = dislikes[
    (dislikes.user_id.isin(df.user_id.unique())) & 
    (dislikes.target_user_id.isin(df.target_user_id.unique()))
]

# Remove fully identical rows, then show the remaining size.
dislikes.drop_duplicates(inplace=True)
dislikes.shape

In [None]:
# Lookup tables id -> gender, built from the likes data (sender side and target side).
user_gender = likes.set_index('user_id').gender.to_dict()
target_user_gender = likes.set_index('target_user_id').target_gender.to_dict()

# Attach genders to the dislike rows. The dict lookups raise KeyError for an id that
# is absent from `likes`.
dislikes['gender'] = dislikes.user_id.apply(lambda x: user_gender[x])
dislikes['target_gender'] = dislikes.target_user_id.apply(lambda x: target_user_gender[x])

In [None]:
del user_gender, target_user_gender

#### This is check on disliked user-target_user pairs also being in the likes data as liked pairs.

`DataFrame.isin()` with a DataFrame argument aligns on index and column labels, so both frames are first re-indexed from 0.

In [None]:
# Give both frames a fresh 0..n-1 index, discarding the old labels.
df.reset_index(drop = True, inplace = True)
dislikes.reset_index(drop = True, inplace = True)

In [None]:
# Number of liked (user_id, target_user_id) pairs that also occur in the dislikes data.
# An inner merge on both key columns tests real pair membership; DataFrame.isin with a
# DataFrame argument only compares cells that share the same index and column label.
df.merge(
    dislikes[['user_id', 'target_user_id']].drop_duplicates(),
    on = ['user_id', 'target_user_id'],
    how = 'inner'
).shape[0]

The output of this reflects the number of user - target_user likes where the user _skipped_ the target_user one or more times during daily recommendation. These pairs are removed from the dislikes data below, so that each pair carries only one label (liked).

In [None]:
# Stack likes (liked == 1) and dislikes (liked == 0) into one interaction frame.
# ignore_index gives every row a unique label; both inputs are indexed from 0, and
# repeated labels would make any later label-based drop hit likes and dislikes alike.
df = pd.concat(
    [df, dislikes],
    ignore_index = True
)

In [None]:
# Remove dislike rows whose (user_id, target_user_id) pair also exists as a like.
# A boolean mask is used instead of a label-based drop: the index may contain repeated
# labels after the concat, and dropping by label would then remove liked rows too.
df = df[~(df[['user_id', 'target_user_id']].duplicated(keep=False) & (df.liked == 0))].reset_index(drop=True)

In [None]:
df.liked.mean()

## Creating Score and Applying Penalties


### Weighting for Likes and Matches

The intuition here is that users who like less often probably are more careful in their liking, and so fewer likes per active user suggests more value for learning from that data.

In [None]:
# Log-damped like counts: count / (1 + ln(count)), per sender and per target.
# NOTE(review): this value grows with the count, whereas the text above describes
# users who like less often as the more valuable signal — confirm the intended direction.
sent_likes_weighting = user_like_counts/(1 + np.log(user_like_counts))
received_likes_weighting = target_like_counts/(1 + np.log(target_like_counts))

In [None]:
del user_like_counts, target_like_counts

In [None]:
# Attach the per-sender and per-target weightings to every interaction row.
# Left joins keep every row of df; ids without a weighting get NaN.
df = df.join(sent_likes_weighting, on = 'user_id', how = 'left')
df = df.join(received_likes_weighting, on = 'target_user_id', how = 'left')

# The joined columns carry the Series names; rename them to the names used for scoring.
df = df.rename({'user_likes_sent': 'send_penalty', 'target_likes_received': 'receive_penalty'}, axis = 1)

In [None]:
### Matches are rare and presumably powerful indicators of mutual interest.
### So we use the inverse of the match rate as the weight, which ends up around 10~33.0
### (it is added linearly to the score).
### Note: adding the dislikes data introduced NaNs into "matched"; mean() ignores them.
### NOTE(review): a match rate of 0 would make this inverse infinite/undefined — confirm
### that cannot happen for a district.
matching_weight = df.matched.mean(skipna = True)**-1
matching_weight

In [None]:
# Replace every NaN in the frame with 0 (e.g. columns the dislike rows never had).
X = df.fillna(0)
# Matched interactions get the match weight, everything else gets 0.
X['match_score'] = X.matched * matching_weight

## Preparing Data

### Separating and Standardizing our Data

In [None]:
# Split interactions by the sender's gender code (1 -> male_data, 2 -> female_data);
# each gender gets its own model below.
male_data = X[X.gender == 1].copy()
female_data = X[X.gender == 2].copy()

# The combined frames are no longer needed.
del X, df

In [None]:
def create_standardized_interaction_matrix(X):
    """Turn interaction features into a single rating per (user, target) pair.

    The greeting flag is weighted by the inverse square root of its mean; the
    four feature columns are then min-max scaled to [0, 1] and summed. Rows
    with ``liked == 0`` get a rating of -1 instead.

    Parameters
    ----------
    X : pandas.DataFrame
        Needs the columns ``user_id``, ``target_user_id``, ``has_greeting``,
        ``match_score``, ``receive_penalty``, ``send_penalty`` and ``liked``.
        The frame is modified in place: ``has_greeting_score`` and ``rating``
        are added and the feature columns are overwritten with scaled values.

    Returns
    -------
    pandas.DataFrame
        The columns ``user_id``, ``target_user_id`` and ``rating``.
    """
    # With no greetings at all the mean is 0 and mean**-0.5 is infinite, which
    # would turn every score into NaN (0 * inf); use a zero weight instead.
    greeting_rate = X.has_greeting.mean()
    greeting_weight = greeting_rate ** -0.5 if greeting_rate > 0 else 0.0
    X['has_greeting_score'] = X.has_greeting * greeting_weight

    for col in ['has_greeting_score', 'match_score', 'receive_penalty', 'send_penalty']:
        lowest = X[col].min()
        span = X[col].max() - lowest
        if span > 0:
            X[col] = (X[col] - lowest) / span
        else:
            # A constant (or empty) column carries no information; scaling it
            # would divide by zero, so it contributes 0 to the rating.
            X[col] = 0.0

    ### Rating as linear combination of standardized, weighted features
    X['rating'] = X.has_greeting_score + X.match_score + X.receive_penalty + X.send_penalty

    ### and now, setting rating to -1 if the target has been passed on
    X['rating'] = X['rating'].mask(X['liked'] == 0, -1)

    return X[['user_id', 'target_user_id', 'rating']]

In [None]:
male_data = create_standardized_interaction_matrix(male_data)
female_data = create_standardized_interaction_matrix(female_data)

In [None]:
male_data.rating.hist()

In [None]:
female_data.rating.hist()

### Sparsifying Our Data for Algorithm

In [None]:
def df2mat(df, m, n):
    """Build a float32 CSR matrix from the ``rating`` column of ``df``.

    ``m`` and ``n`` hold the row and column position of every rating; the
    matrix is sized to fit the largest position on each axis.
    """
    n_rows = m.max() + 1
    n_cols = n.max() + 1
    values_and_positions = (df["rating"], (m, n))
    return csr_matrix(values_and_positions, shape = (n_rows, n_cols), dtype = np.float32)

In [None]:
# Lookup tables between raw ids and categorical codes (the matrix positions):
#   *_uid_codes     user id   -> row code
#   *_tid_codes     column code -> target id
#   *_tid_codes_inv target id -> column code
male_uid_codes = dict(zip(male_data.user_id.values, male_data.user_id.astype('category').cat.codes))
male_tid_codes = dict(zip(male_data.target_user_id.astype('category').cat.codes, male_data.target_user_id.values))
male_tid_codes_inv = dict(zip(male_data.target_user_id.values, male_data.target_user_id.astype('category').cat.codes))

female_uid_codes = dict(zip(female_data.user_id.values, female_data.user_id.astype('category').cat.codes))
female_tid_codes = dict(zip(female_data.target_user_id.astype('category').cat.codes, female_data.target_user_id.values))
female_tid_codes_inv = dict(zip(female_data.target_user_id.values, female_data.target_user_id.astype('category').cat.codes))

In [None]:
# Sparse rating matrices, one per gender: rows are user codes, columns are target codes.
M_sparse = df2mat(male_data, male_data.user_id.astype('category').cat.codes, male_data.target_user_id.astype('category').cat.codes)
F_sparse = df2mat(female_data, female_data.user_id.astype('category').cat.codes, female_data.target_user_id.astype('category').cat.codes)

## Generating Predictions

#### Defining a few utilities

In [None]:
def filter_items(user_id, likes_df = None):
    """Return the target users that must not be recommended to ``user_id``.

    Only targets whose gender differs from the user's are considered. Such a
    target is filtered out when it meets any of the following conditions:
    -> user and target have different district_id
    -> user's and target's ages are more than 5 years apart

    Parameters
    ----------
    user_id
        Id of the user; the user's age, gender and district are taken from the
        first row in which they appear as sender.
    likes_df : pandas.DataFrame, optional
        Likes data to use. Defaults to the notebook-level ``likes`` frame.

    Returns
    -------
    set
        Ids of the filtered target users.

    Raises
    ------
    IndexError
        If ``user_id`` never appears as a sender.
    """
    if likes_df is None:
        likes_df = likes

    # Look the user up once instead of re-scanning the frame per attribute.
    own_rows = likes_df[likes_df.user_id == user_id]
    own_age = own_rows.user_age.iloc[0]
    own_gender = own_rows.gender.iloc[0]
    own_district = own_rows.user_district_id.iloc[0]

    candidates = likes_df[likes_df.target_gender != own_gender]
    # Boolean masks instead of a string-formatted query: no reparsing on every
    # call and no breakage when a value does not format into a valid expression.
    age_gap_too_large = (own_age - candidates.target_user_age).abs() > 5
    other_district = candidates.target_district_id != own_district

    return set(candidates.loc[age_gap_too_large | other_district, 'target_user_id'].values)

In [None]:
def generate_recommendations(users, model, X_sparse, uid_codes, tid_codes, tid_codes_inv, N = 100):
    """Collect up to ``N`` recommendations for each user.

    For every user the targets returned by ``filter_items`` are translated to
    matrix codes (targets unknown to ``tid_codes_inv`` are skipped) and passed
    to ``model.recommend`` as exclusions. Each result is read as a
    (target code, score) pair and the code is mapped back to a target id.

    Returns a dict ``user -> [(target_id, score), ...]``. The first exception
    raised while recommending is printed and stops the loop, so the dict then
    only holds the users processed up to that point.
    """
    recommendations = {}
    for user in users:
        excluded_codes = []
        for target in filter_items(user):
            if target in tid_codes_inv:
                excluded_codes.append(tid_codes_inv[target])
        user_code = uid_codes[user]
        try:
            raw_recs = model.recommend(
                user_code,
                X_sparse,
                N = N,
                filter_already_liked_items=True,
                filter_items=excluded_codes
            )
            recommendations[user] = [(tid_codes[rec[0]], rec[1]) for rec in raw_recs]
        except Exception as e:
            print(e)
            break
    return recommendations

def get_rec_counts(recs):
    """Count how often each target appears across all users' recommendations.

    ``recs`` maps a user to a list of recommendations whose first element is
    the target id. Returns a dict ``target_id -> number of appearances``.
    """
    tally = {}
    for user_recs in recs.values():
        for rec in user_recs:
            target = rec[0]
            if target in tally:
                tally[target] += 1
            else:
                tally[target] = 1
    return tally

def convert_to_long(all_recs):
    """Yield one ``(user_id, target_user_id, score)`` tuple per recommendation.

    ``all_recs`` maps a user to a list of (target, score) recommendations; user
    and target ids are converted to ``int``, the score is passed through as is.
    """
    for user, user_recs in all_recs.items():
        yield from ((int(user), int(rec[0]), rec[1]) for rec in user_recs)

### Training with Bayesian Personalized Ranking (BPR)

#### Coverage with Bayesian Personalized Ranking

In [None]:
# One Bayesian Personalized Ranking model per gender, with default hyperparameters.
# Each model is fitted on the transposed (target x user) matrix in COO format.
# NOTE(review): presumably the installed `implicit` version expects item-by-user input
# for fit() — confirm against that version's API.
BPR_m_model = implicit.bpr.BayesianPersonalizedRanking()
BPR_m_model.fit(M_sparse.T.tocoo())
BPR_f_model = implicit.bpr.BayesianPersonalizedRanking()
BPR_f_model.fit(F_sparse.T.tocoo())

In [None]:
# Recommendations for every user that sent interactions, using that gender's model,
# matrix and id/code lookup tables.
BPR_male_recs = generate_recommendations(
    users=male_data.user_id.unique(), 
    model=BPR_m_model, 
    X_sparse=M_sparse,
    uid_codes=male_uid_codes,
    tid_codes=male_tid_codes,
    tid_codes_inv=male_tid_codes_inv,
    N=N_recs
)

BPR_female_recs = generate_recommendations(
    users=female_data.user_id.unique(), 
    model=BPR_f_model, 
    X_sparse=F_sparse,
    uid_codes=female_uid_codes,
    tid_codes=female_tid_codes,
    tid_codes_inv=female_tid_codes_inv,
    N=N_recs
)

In [None]:
# How often each target was recommended, per gender.
BPR_m_rec_counts = get_rec_counts(BPR_male_recs)
BPR_f_rec_counts = get_rec_counts(BPR_female_recs)

# Coverage: number of distinct recommended targets divided by the number of distinct
# targets in that gender's interaction data.
BPR_m_rec_cov = len(BPR_m_rec_counts)/male_data.target_user_id.nunique()
BPR_f_rec_cov = len(BPR_f_rec_counts)/female_data.target_user_id.nunique()

In [None]:
print("Coverage proportion for males:   {}".format(BPR_m_rec_cov))
print("Coverage proportion for females: {}".format(BPR_f_rec_cov))

In [None]:
plt.plot(list(range(len(BPR_f_rec_counts))),sorted(list(BPR_f_rec_counts.values()), reverse = True))
plt.plot(list(range(len(BPR_m_rec_counts))),sorted(list(BPR_m_rec_counts.values()), reverse = True))

## Writing Out Combined Predictions
The output is in long format: one row per recommendation, with the columns user_id, target_user_id and predicted_rating.

In [None]:
# Stack the male and female recommendations into one long frame with one row per
# (user_id, target_user_id, predicted_rating). The original per-frame row labels are
# kept, so labels repeat across the two halves.
combined_recs_df_long = pd.concat(
    [
        pd.DataFrame(
            convert_to_long(BPR_male_recs), 
            columns = ['user_id', 'target_user_id', 'predicted_rating']
        ),
        pd.DataFrame(
            convert_to_long(BPR_female_recs),
            columns = ['user_id', 'target_user_id', 'predicted_rating']
        )
    ]
)

In [None]:
combined_recs_df_long.groupby('target_user_id').user_id.count().plot(kind = 'hist', bins = 50)

In [None]:
def write_to_recs_bucket(recs_df):
    """Cap each user's recommendations and write them to the output location.

    The cap is the smaller of ``N_recs`` and the largest number of rows any
    user has. Users with at least that many rows get a random sample of
    exactly that size (without replacement); users with fewer rows keep all
    of theirs. The result is written to ``out_bucket + outfile`` as a CSV
    without header and without index.
    """
    largest_group = recs_df.groupby("user_id").target_user_id.count().max()
    n_limit = min(N_recs, largest_group)

    def _cap(user_rows):
        if len(user_rows) >= n_limit:
            return user_rows.sample(n = n_limit, replace = False)
        return user_rows

    capped = recs_df.groupby("user_id").apply(_cap).reset_index(drop = True)
    capped.to_csv(out_bucket + outfile, header = False, index = False)

In [None]:
write_to_recs_bucket(combined_recs_df_long);