In [69]:
import csv
import matplotlib.pyplot as plt
import os
import pandas as pd
import pytz
import numpy as np
import seaborn as sns
import sys
import time
import yaml

from datetime import datetime
from datetime import timedelta

from scipy import sparse
from sklearn.model_selection import train_test_split

from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k

from google.cloud import storage

import warnings

warnings.simplefilter("ignore")

%matplotlib inline

In [2]:
# District processed by this run: used to filter the user table and to look up the sampling fraction.
district_id = 4

In [3]:
# Keyword arguments for the LightFM constructor, read from a local YAML file.
# safe_load is used because yaml.load() without an explicit Loader is deprecated
# (a TypeError on PyYAML >= 6) and can construct arbitrary Python objects.
with open("hyperparameters.yaml", "r") as file:
    hyperparameters = yaml.safe_load(file)

  


In [4]:
# Maps district_id -> fraction of that district's users to keep when sampling.
# Districts that are not listed here are not sampled.
reduced_samples = {
    4: 0.1,
    6: 0.8,
    7: 0.5
}

In [5]:
# Google Cloud Storage client for the project that holds the datasets.
gcs_client = storage.Client(project="linkbal-dp")

# Current date in the Asia/Tokyo timezone, formatted as YYYYMMDD.
today = datetime.strftime(datetime.now(tz=pytz.timezone("Asia/Tokyo")), "%Y%m%d")

bucket = "cl-personalization.datasets.linkbal.com"
beacon_path = "gs://" + bucket + "/inputs/beacon_events/"
# Template for one day of dislike events; "{}" is filled with a YYYYMMDD date string.
dislikes_file_base = beacon_path + "recommend-dislikes/{}_daily_recs_dislikes.csv"

### both users and likes data are provided by the collect-recs-data DAG in Airflow
prefix = "inputs/lightfm"
users_file = "users.csv"
# Blob name relative to the bucket (no gs:// scheme): it is handed to the GCS client.
users_path = prefix + "/" + users_file
likes_file = "likes.csv"
# Full gs:// URI: it is read directly with pandas.read_csv.
likes_path = "gs://" + bucket + "/" + prefix + "/" + likes_file

# Template for the per-district output file; "{}" is presumably the district_id -- TODO confirm.
rec_output_path = "gs://" + bucket + "/outputs/lightfm_v1_district_{}.csv"


# NOTE(review): max_recs and target_recs are not read in the visible part of the notebook.
max_recs = 150
target_recs = 50
# Presumably the look-back window (in days) for dislike data -- TODO confirm.
n_days_dislikes_data = 14



# Global variables

In [6]:
# Threads passed to LightFM training and evaluation: one per CPU core reported by the OS.
NUM_THREADS = os.cpu_count() # set this to the number of CPU cores to take advantage of parallel training
# Number of training epochs used by train_model().
EPOCHS = 50

# User data

### Quick and dirty fix with bash utils

In [7]:
bucket_obj = gcs_client.get_bucket(bucket)

# get_blob() returns None when the object does not exist; fail with a clear
# message instead of an AttributeError on the download call.
users_blob = bucket_obj.get_blob(users_path)
if users_blob is None:
    raise FileNotFoundError(f"gs://{bucket}/{users_path} does not exist")

# Download the users export next to the notebook so the shell clean-up can rewrite it.
users_blob.download_to_filename(users_file)

In [8]:
# Rewrite the downloaded CSV in place: the first sed replaces every `"N,` with `,`
# and the second removes every remaining double quote.
! cat $users_file | sed 's/"N,/,/g' | sed 's/"//g' > clean_users.csv
! mv clean_users.csv $users_file

In [9]:
# Column names applied, in order, to the users CSV when it is loaded with
# pandas.read_csv(names=user_cols).
user_cols = [
    'user_id',
    'gender',
    'age',
    'self_introduction',
    'blood_type',
    'brother_and_sister',
    'job_id',
    'annual_salary_range',
    'body_shape',
    'education_background',
    'hometown_prefecture_id',
    'nationality',
    'holiday',
    'smoking',
    'drinking',
    'housemate',
    'sociality',
    'intention_to_marry',
    'marital_status',
    'absence_or_presence_of_child',
    'whether_want_child',
    'housework_and_child_rearing',
    'meeting_wish_type',
    'first_dating_expense_type',
    'height',
    'personal_color',
    'no_lover_history',
    'gamble',
    'cooking_skill',
    'completion_rate',
    'district_id',
    'created_at'
]

In [10]:
# Load the users export and keep only the rows of the selected district.
# NOTE(review): 'target_user_id' in dtype is not one of user_cols -- looks like a leftover.
user_df = (
    pd.read_csv(
        users_file,
        names=user_cols,
        parse_dates=['created_at'],
        dtype={'user_id': np.int32, 'target_user_id': np.int32},
    )
    .loc[lambda users: users['district_id'] == district_id]
    .reset_index(drop=True)
)

### reduce user space due to LightFM long run-times
# The sample is unseeded, so the selected users differ from run to run.
if district_id in reduced_samples:
    user_df = user_df.sample(frac=reduced_samples[district_id])

In [11]:
# Columns treated as numeric when filling missing values.
continuous_vars = ['height', 'age', 'self_introduction']

# Every remaining column except the identifiers is treated as discrete.
# NOTE(review): this also includes 'created_at', 'district_id' and 'completion_rate',
# which are then filled with the string 'na' -- confirm this is intended.
discrete_vars = list(
    set(user_df.columns).difference(continuous_vars, ['user_id', 'gender'])
)

### Missing values

In [12]:
def fillna(df, continuous_vars, discrete_vars):
    """Fill missing values of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; its columns are reassigned, nothing is returned.
    continuous_vars : iterable of str
        Columns whose missing values are replaced by the column mean.
    discrete_vars : iterable of str
        Columns whose missing values are replaced by the string ``'na'``.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df[col]: that chained form is deprecated and does not modify df when
    # pandas copy-on-write is enabled.
    for col in continuous_vars:
        df[col] = df[col].fillna(df[col].mean())
    for col in discrete_vars:
        df[col] = df[col].fillna('na')

In [13]:
# Fill missing values of user_df in place: continuous columns -> column mean, discrete columns -> 'na'.
fillna(user_df, continuous_vars, discrete_vars)

### Categorize continuous variables (age, height, self-introduction)

In [14]:
def categorize_age(age, low, high):
    """Turn an age into a categorical label.

    Returns ``'lt_<low>'`` for ages below ``low``, ``'gt_<high>'`` for ages
    above ``high``, and otherwise the age itself converted with ``str``.
    """
    if age < low:
        return f'lt_{low!s}'
    if age > high:
        return f'gt_{high!s}'
    # Inside the closed interval [low, high] (or not comparable, e.g. NaN).
    return str(age)

In [15]:
# Bucket ages: below 18 and above 60 collapse into 'lt_18' / 'gt_60'.
user_df['age_cat'] = user_df['age'].apply(categorize_age, args=(18, 60))

In [16]:
def categorize_height(height, low, high):
    """Turn a height into a categorical label.

    Returns ``'lt_<low>'`` for heights below ``low``, ``'gt_<high>'`` for
    heights above ``high``, and otherwise the height truncated to an integer
    and converted to a string.
    """
    if height < low:
        return f'lt_{low!s}'
    if height > high:
        return f'gt_{high!s}'
    # int() truncates, so 170.9 is labelled '170'.
    whole_height = int(height)
    return str(whole_height)

In [17]:
# Height buckets depend on gender: rows with gender == 1 use the 150-190 range,
# every other row uses 140-180.
user_df['height_cat'] = [
    categorize_height(height, 150, 190) if gender == 1 else categorize_height(height, 140, 180)
    for height, gender in zip(user_df['height'], user_df['gender'])
]

In [18]:
def categorize_introduction(intro):
    """Bucket a self-introduction value into a categorical label.

    ``0`` maps to ``'na'``; values in ``(0, 500]`` map to 100-wide buckets
    with an inclusive upper bound; everything else maps to ``'gt_500'``.
    """
    if intro == 0:
        return 'na'

    # (exclusive lower bound, inclusive upper bound, label)
    buckets = (
        (0, 100, 'lt_100'),
        (100, 200, 'gt_100_lt_200'),
        (200, 300, 'gt_200_lt_300'),
        (300, 400, 'gt_300_lt_400'),
        (400, 500, 'gt_400_lt_500'),
    )
    for lower, upper, label in buckets:
        if lower < intro <= upper:
            return label

    # Reached for values above 500 and for anything matching no bucket (negatives, NaN).
    return 'gt_500'

In [19]:
# Bucket the self_introduction value of every user.
user_df['intro_cat'] = user_df['self_introduction'].map(categorize_introduction)

# Interaction data

### Like data

In [20]:
likes_cols = ['user_id', 'target_user_id', 'checked', 'matched', 'deleted']

# Column names are supplied explicitly, so the first line of the file is read as data.
like_df = pd.read_csv(likes_path, header=None, names=likes_cols)



In [21]:
# Keep a single row (the first) per (user_id, target_user_id) pair.
like_df = like_df.drop_duplicates(subset=['user_id', 'target_user_id'])

In [22]:
# Keep only likes whose sender and target are both in the selected user set.
like_df = like_df.loc[
    lambda likes: likes['user_id'].isin(user_df['user_id'])
    & likes['target_user_id'].isin(user_df['user_id'])
]

In [23]:
# Cast the three status flags to bool in one pass.
like_df = like_df.astype({'checked': bool, 'matched': bool, 'deleted': bool})

### Dislike data

In [24]:
def create_dislikes_df(n_days=14):
    """Collect the daily dislike files of the last ``n_days`` days.

    For each of the ``n_days`` days before today, the file at
    ``dislikes_file_base.format(<YYYYMMDD>)`` is read. Missing files are
    reported on stdout and skipped.

    Parameters
    ----------
    n_days : int, optional
        Number of past days to read, starting with yesterday.

    Returns
    -------
    pandas.DataFrame
        De-duplicated union of all files that could be read, or an empty
        frame with columns ``user_id`` and ``target_user_id`` if none could.
    """
    # NOTE(review): datetime.today() uses the machine's local time, while `today`
    # in the config cell uses Asia/Tokyo -- confirm which the file names follow.
    frames = []
    for days_back in range(1, n_days + 1):
        date = (datetime.today() - timedelta(days=days_back)).strftime("%Y%m%d")
        uri = dislikes_file_base.format(date)
        print(f"Obtaining dislikes for {date}")
        try:
            frames.append(
                pd.read_csv(
                    uri,
                    dtype={'user_id': np.int32, 'target_user_id': np.int32}
                )
            )
        except FileNotFoundError:
            print(f"Could not find dislikes data at {uri}. Please examine URI.")

    if not frames:
        return pd.DataFrame(columns=['user_id', 'target_user_id'])

    # DataFrame.append was removed in pandas 2.0; concatenating once is also
    # linear instead of quadratic, and a single de-duplication pass suffices.
    return pd.concat(frames, ignore_index=True, sort=False).drop_duplicates()

In [25]:
# Use the configured look-back window instead of the function's built-in default.
dislike_df = create_dislikes_df(n_days=n_days_dislikes_data)

Obtaining dislikes for 20200624
Obtaining dislikes for 20200623
Obtaining dislikes for 20200622
Obtaining dislikes for 20200621
Obtaining dislikes for 20200620
Obtaining dislikes for 20200619
Obtaining dislikes for 20200618
Obtaining dislikes for 20200617
Obtaining dislikes for 20200616
Obtaining dislikes for 20200615
Obtaining dislikes for 20200614
Obtaining dislikes for 20200613
Obtaining dislikes for 20200612
Obtaining dislikes for 20200611


In [26]:
# dislike_df = pd.read_csv('dislikes.csv')

In [27]:
### reducing dislikes to users and target users in the users dataset
# Filtering and de-duplication are chained so the frame is rebuilt in one step.
dislike_df = dislike_df.loc[
    lambda dislikes: dislikes['user_id'].isin(user_df['user_id'].unique())
    & dislikes['target_user_id'].isin(user_df['user_id'].unique())
].drop_duplicates()

In [28]:
# (distinct users with at least one dislike, distinct users in the user table)
(dislike_df['user_id'].nunique(), user_df['user_id'].nunique())

(444, 1568)

### Merge data

#### Treat each match as a like in the reverse direction as well

In [29]:
# Rows of like_df whose `matched` flag is set.
match_df = like_df[like_df.matched]

In [30]:
# Append every matched row a second time with user_id and target_user_id swapped.
# concat aligns columns by name, so the swapped frame lines up with like_df.
like_df = pd.concat(
    [
        like_df,
        match_df.rename(columns={'user_id': 'target_user_id', 'target_user_id': 'user_id'}),
    ],
    sort=False,
)

del match_df

In [31]:
# Mirrored matches may duplicate existing likes; keep the first row per (user_id, target_user_id) pair.
like_df = like_df.drop_duplicates(subset=['user_id', 'target_user_id'])

In [32]:
# Number of like rows after mirroring and de-duplication.
len(like_df)

31102

In [33]:
# Likes sent per sending user.
user_likes_sent = (
    like_df.groupby('user_id')['target_user_id']
    .count()
    .reset_index(name='user_likes_sent')
)
# Likes received per target user.
target_likes_received = (
    like_df.groupby('target_user_id')['user_id']
    .count()
    .reset_index(name='target_likes_received')
)
# Matched likes per target user.
target_matches = (
    like_df[like_df.matched]
    .groupby('target_user_id')['user_id']
    .count()
    .reset_index(name='target_matches')
)

In [34]:
# Attach the per-user counts to every like row; left joins keep all likes.
like_df = (
    like_df
    .merge(user_likes_sent, how='left', on='user_id')
    .merge(target_likes_received, how='left', on='target_user_id')
    .merge(target_matches, how='left', on='target_user_id')
)

In [35]:
# Every row of like_df is a like; rows that later come only from dislike_df will have this column missing.
like_df['liked'] = True

In [36]:
# Outer join: keeps pairs that appear only as likes, only as dislikes, or as both.
merge_df = like_df.merge(dislike_df, on=['user_id', 'target_user_id'], how='outer')

In [37]:
# Drop the intermediates that are now contained in merge_df.
del like_df, user_likes_sent, target_likes_received, target_matches

In [38]:
# Rows that exist only in dislike_df have no like attributes: flags default to
# False and counts to 0. A single DataFrame.fillna with a per-column mapping
# replaces the chained `merge_df.<col>.fillna(..., inplace=True)` calls, which
# are deprecated and leave merge_df unchanged under pandas copy-on-write.
merge_df = merge_df.fillna({
    'matched': False,
    'deleted': False,
    'checked': False,
    'user_likes_sent': 0,
    'target_likes_received': 0,
    'target_matches': 0,
    'liked': False,
})

In [39]:
### A (user, target) pair should have a single interaction record, either a like or a dislike.
### Likes and dislikes were each de-duplicated before the merge, so a pair that still appears
### twice has one like and one dislike; the dislike row is discarded.
merge_df = merge_df.loc[
    lambda pairs: ~(
        pairs.duplicated(subset=['user_id', 'target_user_id'], keep=False)
        & (pairs['liked'] == 0)
    )
]

In [40]:
# Attach the sender's gender; the default inner join also drops rows whose user_id is not in user_df.
merge_df = merge_df.merge(user_df[['user_id', 'gender']], on='user_id')

# Split gender

### Interaction

In [41]:
# Split interactions by the gender of the sending user (1 -> male frame, 2 -> female frame).
# .copy() gives each frame its own data so later column assignments do not touch merge_df.
male_interaction_df = merge_df[merge_df.gender == 1].copy()
female_interaction_df = merge_df[merge_df.gender == 2].copy()

In [42]:
# (interactions sent by male users, interactions sent by female users)
male_interaction_df.shape[0], female_interaction_df.shape[0]

(31958, 4631)

In [43]:
# merge_df has been split into the two gender-specific frames and is no longer needed.
del merge_df

# Build lightfm dataset

### Feature list

In [44]:
# Profile columns turned into LightFM feature tokens of the form "<column>_<value>".
# age_cat, height_cat and intro_cat are the bucketed versions of the continuous columns.
feature_vars = [
       'blood_type', 'brother_and_sister', 'annual_salary_range',
       'education_background', 'holiday', 'smoking', 'drinking',
       'sociality', 'intention_to_marry', 'marital_status',
       'absence_or_presence_of_child', 'whether_want_child',
       'housework_and_child_rearing', 'meeting_wish_type',
       'first_dating_expense_type', 'body_shape', 'housemate',
       'personal_color', 'gamble', 'cooking_skill', 'job_id',
       'hometown_prefecture_id', 'age_cat', 'height_cat', 'intro_cat'
]

In [45]:
## creating indicator variables
# One token "<column>_<value>" per distinct value of each feature column, built
# separately for male (gender == 1) and female (gender == 2) users.
# The gender filters are computed once rather than once per feature column, and
# the comprehensions do not leak a loop variable into the notebook namespace.
_male_profiles = user_df[user_df.gender == 1]
_female_profiles = user_df[user_df.gender == 2]

male_feature_list = [
    var + '_' + str(val)
    for var in feature_vars
    for val in _male_profiles[var].unique()
]
female_feature_list = [
    var + '_' + str(val)
    for var in feature_vars
    for val in _female_profiles[var].unique()
]

len(male_feature_list), len(female_feature_list)

(281, 258)

### User feature

In [46]:
def create_features(x):
    """Build the ``(user_id, feature tokens)`` pair for one profile row.

    ``x`` is indexed by column name; one token ``"<column>_<value>"`` is
    produced for every column listed in the module-level ``feature_vars``.
    """
    user_id = x['user_id']
    tokens = []
    for column in feature_vars:
        tokens.append(column + '_' + str(x[column]))
    return (user_id, tokens)

## Train model func

In [47]:
def train_model(interactions, weights, item_features, user_features):
    """Fit a LightFM model and return it.

    The model is constructed from the module-level ``hyperparameters`` and
    trained for ``EPOCHS`` epochs on ``NUM_THREADS`` threads.

    ``interactions`` is the training interaction matrix, ``weights`` is passed
    as ``sample_weight`` (may be ``None``), and ``item_features`` /
    ``user_features`` are the feature matrices passed through to ``fit``.
    """
    fit_options = dict(
        item_features=item_features,
        user_features=user_features,
        sample_weight=weights,
        epochs=EPOCHS,
        num_threads=NUM_THREADS,
        verbose=False,
    )

    model = LightFM(**hyperparameters)
    model.fit(interactions, **fit_options)
    return model

### Male user recommendation

### Profile data

In [48]:
### "select users who have interactions or who have completed at least half of their profiles."
# Male users: interaction senders, or profiles with completion_rate > 0.5.
m_male_df = user_df[
    (user_df.gender == 1)
    & (
        user_df.user_id.isin(male_interaction_df.user_id.unique())
        | (user_df.completion_rate > 0.5)
    )
].copy()

# Female users: targets of male interactions, or profiles with completion_rate > 0.5.
m_female_df = user_df[
    (user_df.gender == 2)
    & (
        user_df.user_id.isin(male_interaction_df.target_user_id.unique())
        | (user_df.completion_rate > 0.5)
    )
].copy()

In [49]:
# (male users, female users) in the male-recommendation dataset
m_male_df.shape[0], m_female_df.shape[0]

(784, 686)

### Build dataset

In [50]:
# Attach to each profile row the (user_id, [feature tokens]) tuple produced by create_features.
m_male_df['features'] = m_male_df.apply(create_features, axis=1)
m_female_df['features'] = m_female_df.apply(create_features, axis=1)

In [51]:
# Male users are the LightFM "users" and female users the "items";
# the feature vocabularies are the per-gender token lists.
dataset_m = Dataset()

_m_user_ids = set(m_male_df['user_id'])
_m_item_ids = set(m_female_df['user_id'])
dataset_m.fit(
    _m_user_ids,
    _m_item_ids,
    user_features=male_feature_list,
    item_features=female_feature_list,
)

In [52]:
# Turn the (user_id, [feature tokens]) tuples into the feature matrices used for training and evaluation.
user_features_m = dataset_m.build_user_features(m_male_df['features'])
item_features_m = dataset_m.build_item_features(m_female_df['features'])

### Calculate interaction score

In [53]:
def match_score(x):
    """Base score of one interaction row.

    Returns 5 for a match; 1 when the row is checked, deleted or not liked;
    otherwise 4 when ``gender`` is 2 and 3 when ``gender`` is 1. Any other
    gender value yields ``None``.
    """
    if x['matched']:
        return 5
    if x['checked'] or x['deleted'] or not x['liked']:
        return 1
    # Remaining case: a like that is neither matched, checked nor deleted.
    for gender_code, points in ((2, 4), (1, 3)):
        if x['gender'] == gender_code:
            return points
    return None

In [54]:
# Base score of every male-sent interaction (row-wise).
male_interaction_df['match_score'] = male_interaction_df.apply(match_score, axis=1)

### Penalties

In [55]:
# 1 / (ln(likes received by the target + 1) + 1): equals 1 at zero likes and shrinks as the target receives more.
male_interaction_df['tg_received_penalty'] = 1 / (np.log(male_interaction_df['target_likes_received'] + 1) + 1)

In [56]:
# 1 / (ln(likes sent by the user + 1) + 1): equals 1 at zero likes and shrinks as the user sends more.
male_interaction_df['sent_penalty'] = 1 / (np.log(male_interaction_df['user_likes_sent'] + 1) + 1)

In [57]:
# ln(target matches + 1), min-max scaled to [0, 1].
_generousness = np.log(male_interaction_df['target_matches'] + 1)
_generousness_span = _generousness.max() - _generousness.min()

# When every target has the same number of matches (e.g. no matches at all in a
# small sample) the span is 0 and the scaling would be 0/0 = NaN, which would
# then propagate into the score; fall back to 0 for all rows instead.
male_interaction_df['tg_generousness_score'] = (
    (_generousness - _generousness.min()) / _generousness_span
    if _generousness_span > 0
    else 0.0
)

### Score

In [58]:
# Weights of the four components of the male interaction score (see the score cell that follows).
M_MATCH_SCORE_WEIGHT = 1
M_RECEIVED_PENALTY_WEIGHT = 2
M_SENT_PENALTY_WEIGHT = 1
M_TG_GENEROUSNESS_WEIGHT =  1

In [59]:
# Weighted sum of the four components (terms kept in the same order).
male_interaction_df['score'] = (
    M_MATCH_SCORE_WEIGHT * male_interaction_df['match_score']
    + M_RECEIVED_PENALTY_WEIGHT * male_interaction_df['tg_received_penalty']
    + M_SENT_PENALTY_WEIGHT * male_interaction_df['sent_penalty']
    + M_TG_GENEROUSNESS_WEIGHT * male_interaction_df['tg_generousness_score']
)

# Rows that are not likes (i.e. dislikes) get a fixed score of -1.
male_interaction_df['score'] = male_interaction_df['score'].mask(
    male_interaction_df['liked'] == 0, -1
)

In [60]:
# Distribution of the final score; the minimum of -1 is the value assigned to dislikes.
male_interaction_df.score.describe()

count    31958.000000
mean         1.833989
std          1.529512
min         -1.000000
25%          1.601806
50%          1.834727
75%          2.024118
max          6.940358
Name: score, dtype: float64

In [61]:
# Keep only interactions whose sender and target are both part of the LightFM dataset.
male_interaction_df = male_interaction_df.loc[
    lambda pairs: pairs['user_id'].isin(m_male_df['user_id'])
    & pairs['target_user_id'].isin(m_female_df['user_id'])
]

### Train and Test

In [None]:
# Build the interaction and weight matrices from (user, target, score) triples.
_male_triples = list(
    zip(
        male_interaction_df.user_id,
        male_interaction_df.target_user_id,
        male_interaction_df.score,
    )
)
male_interactions, male_weights = dataset_m.build_interactions(_male_triples)

# Random, unseeded 67/33 split of the individual interactions.
m_i_train, m_i_test = random_train_test_split(male_interactions, test_percentage=0.33)

In [None]:
# Train without sample weights (weights=None).
model_m = train_model(m_i_train, None, item_features_m, user_features_m)

Note: passing `None` instead of `male_weights` and splitting the interactions into train and test sets turns the metric below into a measure of the model's ability to predict whether an interaction occurs at all.

In Anh Khoa's original notebook, the test set was the set of matched interactions, but those interactions were not removed from the training set, which likely led to data leakage.

In [None]:
# Mean precision@50 on the held-out interactions.
precision_at_k(
    model_m,
    m_i_test,
    k=50,
    user_features=user_features_m,
    item_features=item_features_m,
    num_threads=NUM_THREADS,
).mean()

And here I include the interaction weights as well, which leads to better prediction.

In [None]:
male_interactions, male_weights = dataset_m.build_interactions(
    list(zip(male_interaction_df.user_id, male_interaction_df.target_user_id, male_interaction_df.score))
)

# sklearn's train_test_split splits sparse matrices by ROW, i.e. it moves whole
# users into train or test and renumbers the rows, so they no longer line up
# with the LightFM user ids and the rows of user_features_m.
# LightFM's random_train_test_split splits individual interactions and keeps
# the matrix shape. Both matrices are built from the same triples, so using the
# same seed for each call selects the same entries for interactions and weights.
_split_seed = 42
m_i_train, m_i_test = random_train_test_split(
    male_interactions, test_percentage=0.33, random_state=np.random.RandomState(_split_seed)
)
m_w_train, m_w_test = random_train_test_split(
    male_weights, test_percentage=0.33, random_state=np.random.RandomState(_split_seed)
)

In [None]:
# Train with the score-based sample weights; .tocoo() passes the weights to train_model in COO format.
model_m = train_model(m_i_train, m_w_train.tocoo(), item_features_m, user_features_m)

In [None]:
# Mean precision@50 on the held-out interactions (weighted model).
precision_at_k(
    model_m,
    m_i_test,
    k=50,
    user_features=user_features_m,
    item_features=item_features_m,
    num_threads=NUM_THREADS,
).mean()

And here I do what Khoa-san intended, by only testing on matched interactions:

In [None]:
# Restrict to matched interactions.
# NOTE(review): both the training and the test set come from matched rows only.
matched = male_interaction_df[male_interaction_df['matched']]

male_interactions, male_weights = dataset_m.build_interactions(
    list(zip(matched.user_id, matched.target_user_id, matched.score))
)

# sklearn's train_test_split splits sparse matrices by ROW (whole users) and
# renumbers the rows, which breaks the alignment with the LightFM user ids and
# user_features_m. Split individual interactions instead; the same seed gives
# the same entry selection for the interaction and weight matrices.
_split_seed = 42
m_i_train, m_i_test = random_train_test_split(
    male_interactions, test_percentage=0.33, random_state=np.random.RandomState(_split_seed)
)
m_w_train, m_w_test = random_train_test_split(
    male_weights, test_percentage=0.33, random_state=np.random.RandomState(_split_seed)
)

In [None]:
# Train on the matched-only split with its sample weights, passed in COO format.
model_m = train_model(m_i_train, m_w_train.tocoo(), item_features_m, user_features_m)

In [None]:
# Mean precision@50 on the held-out matched interactions.
precision_at_k(
    model_m,
    m_i_test,
    k=50,
    user_features=user_features_m,
    item_features=item_features_m,
    num_threads=NUM_THREADS,
).mean()

### Parameter search

In [None]:
import csv
import itertools
from sklearn.model_selection import train_test_split
from pprint import pprint
import time
import numpy as np


def sample_hyperparameters():
    """
    Endless generator of randomly drawn hyperparameter dictionaries.

    Each dictionary holds LightFM constructor arguments plus ``num_epochs``;
    all values are drawn from the global ``np.random`` state.
    """
    schedules = ["adagrad", "adadelta"]
    losses = ["warp", "bpr"]

    while True:
        # The draws happen in the same order as the keys of the yielded dict.
        n_components = np.random.randint(50, 150)
        schedule = np.random.choice(schedules)
        loss = np.random.choice(losses)
        rate = np.random.exponential(0.05)
        n_sampled = np.random.randint(5, 15)
        n_epochs = np.random.randint(50, 150)

        yield dict(
            no_components=n_components,
            learning_schedule=schedule,
            loss=loss,
            learning_rate=rate,
            max_sampled=n_sampled,
            num_epochs=n_epochs,
        )


def random_search(i_train, i_test, w_train, w_test, dwriter, num_samples=10):
    """
    Sample random hyperparameters, fit a LightFM model for each sample, and
    evaluate it on the test interactions.

    Parameters
    ----------

    i_train: sparse matrix of shape [n_users, n_items]
        Training interactions.
    i_test: sparse matrix of shape [n_users, n_items]
        Test interactions.
    w_train: sparse matrix of shape [n_users, n_items] or None
        Sample weights for the training interactions, passed to ``fit``.
    w_test: sparse matrix
        Weights of the test interactions. Accepted for symmetry; not used.
    dwriter: csv.DictWriter
        One row per evaluated sample is written to it. Its fieldnames must
        cover the sampled hyperparameters plus ``num_epochs``, ``train_time``
        and ``score``.
    num_samples: int, optional
        Number of hyperparameter choices to evaluate.


    Returns
    -------

    generator of (mean precision@50, hyperparameter dict, fitted model)

    """
    # One generator for the whole search instead of a new one per iteration.
    sampler = sample_hyperparameters()

    for _ in range(num_samples):
        hyperparams = next(sampler)
        print("---Training with hyperparameters---")
        pprint(hyperparams)
        # num_epochs is an argument of fit(), not of the LightFM constructor.
        num_epochs = hyperparams.pop("num_epochs")

        model = LightFM(**hyperparams)
        start = time.time()
        model.fit(
            i_train,
            item_features=item_features_m,
            user_features=user_features_m,
            sample_weight=w_train,
            verbose=False,
            epochs=num_epochs,
            num_threads=NUM_THREADS
        )
        train_time = int(time.time() - start)

        # Evaluate on the matrices passed in, not on the notebook globals
        # m_i_train / m_i_test.
        score = precision_at_k(
            model,
            test_interactions=i_test,
            train_interactions=i_train,
            user_features=user_features_m,
            item_features=item_features_m,
            k=50,
            num_threads=NUM_THREADS,
            check_intersections=False
        ).mean()

        hyperparams["num_epochs"] = num_epochs
        hyperparams["train_time"] = train_time
        hyperparams["score"] = score
        dwriter.writerow(hyperparams)
        yield (score, hyperparams, model)

In [84]:
# Sweep the number of training epochs and log training time, precision@50,
# recall@50 and AUC for each setting to a CSV file.

# safe_load: yaml.load() without a Loader is deprecated and unsafe.
with open("hyperparameters.yaml", "r") as hypfile:
    hyperparams = yaml.safe_load(hypfile)

# The interaction matrices and the split do not depend on num_epochs, so they
# are built once. Using one fixed split for every setting also means the
# metrics differ only because of the epoch count, not because of a new split.
male_interactions, male_weights = dataset_m.build_interactions(
    list(zip(male_interaction_df.user_id, male_interaction_df.target_user_id, male_interaction_df.score))
)

# sklearn's train_test_split would split these sparse matrices by ROW (whole
# users) and renumber the rows, breaking the alignment with the LightFM user
# ids and user_features_m. random_train_test_split splits individual
# interactions; the same seed selects the same entries in both matrices.
_split_seed = 42
m_i_train, m_i_test = random_train_test_split(
    male_interactions, test_percentage=0.33, random_state=np.random.RandomState(_split_seed)
)
m_w_train, m_w_test = random_train_test_split(
    male_weights, test_percentage=0.33, random_state=np.random.RandomState(_split_seed)
)

# newline="" is what the csv module documents for files handed to a writer.
with open("hyperparameters-warp-100.csv", "w", newline="") as outcsv:
    dwriter = csv.DictWriter(
        outcsv,
        fieldnames=[
            "num_epochs",
            "train_time",
            "mar@k",
            "map@k",
            "auc"
        ]
    )

    dwriter.writeheader()

    for num_epochs in range(50, 151, 10):
        start = time.time()

        model = LightFM(**hyperparams)
        model.fit(
            m_i_train,
            item_features=item_features_m,
            user_features=user_features_m,
            sample_weight=m_w_train.tocoo(),
            verbose=False,
            epochs=num_epochs,
            num_threads=NUM_THREADS
        )

        train_time = int(time.time() - start)

        # Arguments shared by the three evaluation calls.
        eval_kwargs = dict(
            test_interactions=m_i_test,
            train_interactions=m_i_train,
            user_features=user_features_m,
            item_features=item_features_m,
            num_threads=NUM_THREADS,
            check_intersections=False
        )

        mapk = precision_at_k(model, k=50, **eval_kwargs).mean()
        mark = recall_at_k(model, k=50, **eval_kwargs).mean()
        auc = auc_score(model, preserve_rows=False, **eval_kwargs).mean()

        dwriter.writerow({
            "num_epochs": num_epochs,
            "train_time": train_time,
            "mar@k": mark,
            "map@k": mapk,
            "auc": auc,
        })