In [21]:
import numpy as np
import numpy.ma as ma
from numpy import genfromtxt
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from collections import defaultdict
import csv
from urllib import request
import os
import tabulate

In [14]:
def load_data():
    """Download the content-based-filtering dataset and load it into memory.

    Every file is fetched from the Touventure/models GitHub repository into
    the local ``data`` directory (created when missing) on each call, then
    parsed.

    Returns:
        tuple: ``(item_train, user_train, y_train, item_features,
        user_features, item_vecs, destination_dict)`` where

        * ``item_train``, ``user_train``, ``y_train`` and ``item_vecs`` are
          the arrays ``numpy.genfromtxt`` parses from the matching CSV files;
        * ``item_features`` and ``user_features`` are the first row of the
          respective header files, as lists of strings;
        * ``destination_dict`` is a ``defaultdict(dict)`` mapping the integer
          destination id to ``{"name": ..., "category": ...}``.

    Errors raised by ``urlretrieve`` (e.g. no network) or by parsing
    propagate to the caller.
    """
    data_dir = "data"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(data_dir, exist_ok=True)

    base_url = "https://raw.githubusercontent.com/Touventure/models/main/content-based-filtering/"
    # Local file name -> remote file name.  Only the destination list differs
    # (remote "destinations", local "destination").
    remote_names = {
        "content_destination_list.csv": "content_destinations_list.csv",
        "content_item_train.csv": "content_item_train.csv",
        "content_item_train_header.txt": "content_item_train_header.txt",
        "content_item_vecs.csv": "content_item_vecs.csv",
        "content_user_train.csv": "content_user_train.csv",
        "content_user_train_header.txt": "content_user_train_header.txt",
        "content_y_train.csv": "content_y_train.csv",
    }

    # All local paths are derived from data_dir, so changing it is enough to
    # relocate the download folder.
    paths = {}
    for local_name, remote_name in remote_names.items():
        paths[local_name] = os.path.join(data_dir, local_name)
        request.urlretrieve(base_url + remote_name, paths[local_name])

    item_train = genfromtxt(paths["content_item_train.csv"], delimiter=',')
    user_train = genfromtxt(paths["content_user_train.csv"], delimiter=',')
    y_train    = genfromtxt(paths["content_y_train.csv"], delimiter=',')
    with open(paths["content_item_train_header.txt"], newline='') as f:
        item_features = next(csv.reader(f))
    with open(paths["content_user_train_header.txt"], newline='') as f:
        user_features = next(csv.reader(f))
    item_vecs = genfromtxt(paths["content_item_vecs.csv"], delimiter=',')

    destination_dict = defaultdict(dict)
    with open(paths["content_destination_list.csv"], newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader, None)  # skip the header row
        for line in reader:
            if not line:  # csv.reader yields [] for blank lines
                continue
            destination_id = int(line[0])
            destination_dict[destination_id]["name"] = line[1]
            destination_dict[destination_id]["category"] = line[2]

    return item_train, user_train, y_train, item_features, user_features, item_vecs, destination_dict

In [18]:
def split_str(ifeatures, smax):
    """Insert a space at the midpoint of long, space-free feature names.

    A name is split only when it contains no space and is longer than
    ``smax`` characters; every other name passes through unchanged.
    (Presumably this lets table headers wrap in narrow columns.)

    Args:
        ifeatures: iterable of feature-name strings.
        smax: maximum length a space-free name may have before it is split.

    Returns:
        list[str]: a new list with the (possibly split) names, in input order.
    """
    def _maybe_split(name):
        # Names that already contain a space, or are short enough, are kept.
        if ' ' in name or len(name) <= smax:
            return name
        half = len(name) // 2
        return name[:half] + " " + name[half:]

    return [_maybe_split(name) for name in ifeatures]

In [25]:
def pprint_train(x_train, features, vs, u_s, maxcount=5, user=True):
    """Build a preview table of the first rows of user_train or item_train.

    Args:
        x_train: 2-D numpy array. Columns 0 and 1 are cast to int, columns
            2 and onward to float.
        features: sequence of column names for ``x_train``.
        vs: index in ``features`` where the category ("genre") columns start.
        u_s: number of leading header labels that get wrapped in brackets.
        maxcount: maximum number of data rows to include.
        user: unused. Kept so existing calls keep working; it previously
            only selected float formats for an HTML table that was built
            and then discarded.

    Returns:
        pandas.DataFrame: first row holds the header labels, followed by up
        to ``maxcount`` data rows.
    """
    # Work on list copies so bracket-marking can never write through to the
    # caller's `features` (a numpy slice would be a view).
    head = list(features[:vs])
    if vs < u_s:
        # Needs the f-prefix, otherwise the placeholders print literally.
        print(f"error, vector start {vs} should be greater then user start {u_s}")
    for i in range(u_s):
        head[i] = "[" + head[i] + "]"
    genres = list(features[vs:])
    hdr = head + genres
    disp = [split_str(hdr, 5)]

    for i in range(x_train.shape[0]):
        if i == maxcount:
            break
        disp.append([x_train[i, 0].astype(int),
                     x_train[i, 1].astype(int),
                     x_train[i, 2].astype(float),
                     *x_train[i, 3:].astype(float)
                     ])
    return pd.DataFrame(disp)

In [41]:
def gen_user_vecs(user_vec, num_items):
    """Stack ``num_items`` copies of a user vector along axis 0.

    Args:
        user_vec: array-like holding the user's feature row.
        num_items: number of times the row is repeated vertically.

    Returns:
        numpy.ndarray: ``np.tile(user_vec, (num_items, 1))``, so each copy
        can be paired row-wise with one item vector.
    """
    repeats = (num_items, 1)  # repeat down the rows, once across the columns
    return np.tile(user_vec, repeats)

In [46]:

def print_pred_destinations(y_p, item, destination_dict, maxcount=10):
    """Tabulate the leading predictions together with destination metadata.

    Args:
        y_p: 2-D array of predictions; column 0 is read.
        item: 2-D array of item rows; column 0 is used as the destination id
            and column 2 as the average rating.
        destination_dict: mapping from integer destination id to a dict with
            ``'name'`` and ``'category'`` entries.
        maxcount: maximum number of prediction rows to include.

    Returns:
        pandas.DataFrame: first row is the header
        (``y_p, place id, rating ave, name, category``), followed by up to
        ``maxcount`` rows with values rounded to one decimal.
    """
    rows = [["y_p", "place id", "rating ave", "name", "category"]]

    for idx in range(y_p.shape[0]):
        if idx == maxcount:
            break
        place_id = item[idx, 0].astype(int)
        info = destination_dict[place_id]
        rows.append([
            np.around(y_p[idx, 0], 1),
            place_id,
            np.around(item[idx, 2].astype(float), 1),
            info['name'],
            info['category'],
        ])

    return pd.DataFrame(rows)

In [26]:
item_train, user_train, y_train, item_features, user_features, item_vecs, destination_dict = load_data()

# Column layout: the leading columns are sliced off before the data is fed to
# the network.
u_s = 3  # start of columns to use in training, user (skips userid, rating count, ave rating)
i_s = 1  # start of columns to use in training, items (skips place id)
uvs = 3  # user genre vector start
ivs = 3  # item genre vector start

# Derive the network input widths from the slice starts so the two cannot
# drift apart when the column layout changes.
num_user_features = user_train.shape[1] - u_s
num_item_features = item_train.shape[1] - i_s
print(f"Number of training vectors: {len(item_train)}")

Number of training vectors: 437


In [30]:
# Peek at the first five target values to sanity-check the loaded ratings.
print(f"y_train[:5]: {y_train[:5]}")

y_train[:5]: [3.36666667 3.24137931 3.36666667 3.42307692 3.36363636]


In [31]:
# scale training data
# NOTE(review): this cell is not idempotent - it overwrites item_train,
# user_train and y_train, so running it a second time would store already
# scaled data in the *_unscaled names and rescale it again.
# NOTE(review): the scalers are fitted on the full data set, i.e. before the
# train/test split performed further down, so test rows contribute to the
# scaling statistics.
item_train_unscaled = item_train
user_train_unscaled = user_train
y_train_unscaled    = y_train

# Standardise item and user columns; fit() returns the scaler itself.
scalerItem = StandardScaler().fit(item_train)
item_train = scalerItem.transform(item_train)

scalerUser = StandardScaler().fit(user_train)
user_train = scalerUser.transform(user_train)

# Targets are mapped into [-1, 1]; the scaler needs a 2-D column, hence the reshape.
scalerTarget = MinMaxScaler((-1, 1)).fit(y_train.reshape(-1, 1))
y_train = scalerTarget.transform(y_train.reshape(-1, 1))

# Round-trip check: inverting the transform must give back the raw arrays.
print(np.allclose(item_train_unscaled, scalerItem.inverse_transform(item_train)))
print(np.allclose(user_train_unscaled, scalerUser.inverse_transform(user_train)))

True
True


In [33]:
# Split items, users and targets with ONE call so the three arrays share a
# single shuffle and are guaranteed to stay row-aligned.
#
# The inputs are rebuilt from the untouched *_unscaled arrays through the
# already-fitted scalers instead of reading item_train / user_train / y_train,
# which this cell overwrites.  Re-running the cell therefore always produces
# the same 80/20 split of the full data set rather than an 80% subset of the
# previous run's training set.
(item_train, item_test,
 user_train, user_test,
 y_train, y_test) = train_test_split(
    scalerItem.transform(item_train_unscaled),
    scalerUser.transform(user_train_unscaled),
    scalerTarget.transform(y_train_unscaled.reshape(-1, 1)),
    train_size=0.80, shuffle=True, random_state=1)
print(f"destination/item training data shape: {item_train.shape}")
print(f"destination/item test data shape: {item_test.shape}")

destination/item training data shape: (279, 9)
destination/item test data shape: (70, 9)


In [34]:
# Two-tower model: a user network and an item network each map their features
# to a num_outputs-dimensional vector; the prediction is the dot product of
# the two L2-normalised vectors and therefore lies in [-1, 1].
num_outputs = 32
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='linear'),
])

item_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='linear'),
])

# `shape` is meant to be a tuple: `(n)` is just the integer n, so the trailing
# comma is required to build a one-element tuple.
input_user = tf.keras.layers.Input(shape=(num_user_features,))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

input_item = tf.keras.layers.Input(shape=(num_item_features,))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

# Row-wise dot product of the two unit vectors.
output = tf.keras.layers.Dot(axes=1)([vu, vm])
model = tf.keras.Model([input_user, input_item], output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 6)]                  0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 8)]                  0         []                            
                                                                                                  
 sequential (Sequential)     (None, 32)                   38816     ['input_1[0][0]']             
                                                                                                  
 sequential_1 (Sequential)   (None, 32)                   39328     ['input_2[0][0]']             
                                                                                              

In [35]:
tf.random.set_seed(1)
# Mean squared error against the scaled target, optimised with Adam.
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt, loss=cost_fn)

In [72]:
tf.random.set_seed(1)
# u_s / i_s slice off the leading columns that are not used for training.
model.fit(
    x=[user_train[:, u_s:], item_train[:, i_s:]],
    y=y_train,
    epochs=30,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x237f840d0f0>

In [39]:
# Loss on the held-out split, using the same column slices as in training.
model.evaluate(
    x=[user_test[:, u_s:], item_test[:, i_s:]],
    y=y_test,
)



0.012821502983570099

In [85]:
# Hand-built feature row for a new user.  The first three entries (id, rating
# count, rating average) occupy the columns that are sliced off before
# prediction; the remaining six are the per-category values.
new_user_id = 501
new_rating_count = 0
new_rating_ave = 0.0

new_Bahari = 15.0
new_Budaya = 10.0
new_Cagar_Alam = 10.0
new_Pusat_Perbelanjaan = 10.0
new_Taman_Hiburan = 10.0
new_Tempat_Ibadah = 2.0

# One row, laid out as: id, rating count, rating average, then the categories.
user_vec = np.array([[
    new_user_id, new_rating_count, new_rating_ave,
    new_Bahari, new_Budaya, new_Cagar_Alam,
    new_Pusat_Perbelanjaan, new_Taman_Hiburan, new_Tempat_Ibadah,
]])

In [86]:
# Pair the single user row with every candidate item and scale both sides
# with the scalers fitted on the training data.
user_vecs = gen_user_vecs(user_vec, len(item_vecs))
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# Predict in scaled space, then map the predictions back to the rating scale.
y_p = model.predict([suser_vecs[:, u_s:], sitem_vecs[:, i_s:]])
y_pu = scalerTarget.inverse_transform(y_p)

# Negating before argsort orders the rows from highest to lowest prediction.
sorted_index = np.argsort(-y_pu, axis=0).ravel().tolist()
sorted_ypu = y_pu[sorted_index]
sorted_items = item_vecs[sorted_index]
print_pred_destinations(sorted_ypu, sorted_items, destination_dict, maxcount=10)



Unnamed: 0,0,1,2,3,4
0,y_p,place id,rating ave,name,category
1,3.9,141,3.2,Bunker Kaliadem Merapi,Cagar Alam
2,3.9,379,3.2,Goa Rong,Cagar Alam
3,3.9,357,3.2,Wisata Alam Wana Wisata Penggaron,Cagar Alam
4,3.9,256,3.2,Wisata Batu Kuda,Cagar Alam
5,3.9,319,3.2,Kawah Rengganis Cibuni,Cagar Alam
6,3.9,217,3.2,Kebun Binatang Bandung,Cagar Alam
7,3.9,211,3.2,GunungTangkuban perahu,Cagar Alam
8,3.9,312,3.2,Taman Hutan Raya Ir. H. Juanda,Cagar Alam
9,3.9,242,3.2,Curug Dago,Cagar Alam
