### Simple Transfer Demo
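This notebook demonstrates a simple data augmentation method: the ml100k training data is augmented with the ml1m ratings, and an SVD model trained on the combined data is compared with one trained on ml100k alone.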

### 1. Load necessary libraries

In [1]:
import random
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from surprise import SVD, Dataset, Reader, SVDpp
from surprise import accuracy
from tqdm import tqdm

warnings.filterwarnings("ignore")

### 2. Import datasets

In [2]:
# ML1M datasets
# Both .dat files use the multi-character separator '::' and are parsed with
# the python engine. The id columns are cast to integers before the merge.
# NOTE: `np.int` was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# `int` is its exact replacement.

movies_df = pd.read_csv('../datasets/ml-1m/movies.dat', sep='::', names=['item_id', 'title', 'genres'], engine = "python", encoding = "ISO-8859-1")
movies_df['item_id'] = movies_df['item_id'].astype(int)

ratings_df = pd.read_csv('../datasets/ml-1m/ratings.dat', sep='::', names=['user_id', 'item_id', 'rating', 'timestamp'], engine = "python")
ratings_df['user_id'] = ratings_df['user_id'].astype(int)
ratings_df['item_id'] = ratings_df['item_id'].astype(int)

# Attach title/genres to every rating; ratings without a matching movie row are dropped (inner join).
data_df_1m = pd.merge(ratings_df, movies_df, how = "inner", on = "item_id")

In [3]:
# ML100K

# u.data: tab-separated ratings file.
ratings_df_100k = pd.read_csv(
    "../datasets/ml-100k/u.data",
    names=['user_id', 'item_id', 'rating', 'timestamp'],
    sep="\t",
    engine="python",
)

# u.item: pipe-separated; only the first two fields (id and title) are read.
m_cols = ["item_id", "title"]
movies_df_100k = pd.read_csv(
    '../datasets/ml-100k/u.item',
    sep='|',
    usecols=[0, 1],
    names=m_cols,
    encoding="ISO-8859-1",
    engine="python",
)

# Inner join: keep only ratings whose item_id has a movie row.
data_df_100 = ratings_df_100k.merge(movies_df_100k, how="inner", on="item_id")

### 3.1 Encode items across the two domains (by title)

In [4]:
# Build one shared title vocabulary from both datasets so that the same title
# receives the same item_id in ml100k and ml1m.
arr_100k = data_df_100["title"].to_numpy()
arr_1m = data_df_1m["title"].to_numpy()
all_titles = np.concatenate([arr_1m, arr_100k])
all_unique = np.unique(all_titles)

# Fit a single encoder on the union of titles, then overwrite item_id in both frames.
lbe = LabelEncoder().fit(all_unique)

for ratings_frame in (data_df_100, data_df_1m):
    ratings_frame["item_id"] = lbe.transform(ratings_frame["title"])
del ratings_frame

In [5]:
# Report how many distinct encoded items each dataset contains.
print(
    f"number of unique item in ml100k: {data_df_100['item_id'].nunique()}, "
    f"number of unique item in ml1m: {data_df_1m['item_id'].nunique()}"
)

number of unique item in ml100k: 1664, number of unique item in ml1m: 3706


### 3.2 Encode users across the two domains

In [6]:
# Re-encode the ml100k user ids with a fresh encoder fitted on those ids only.
lbe = LabelEncoder().fit(data_df_100["user_id"])
data_df_100["user_id"] = lbe.transform(data_df_100["user_id"])
print(
    "number of unique user in ml100k: "
    f"{data_df_100['user_id'].nunique()}"
)

number of unique user in ml100k: 943


### 3.3 Re-index the ml1m user ids: the first 943 ids (0-942) belong to ml100k users

In [7]:
# Encode the ml1m users, then shift them past the largest ml100k user id so the
# two user populations never share an id once the frames are concatenated.
# The offset is derived from the data instead of being hard-coded; it equals
# 943 when ml100k holds user ids 0-942.
lbe = LabelEncoder()
user_id_offset = int(data_df_100["user_id"].max()) + 1
data_df_1m["user_id"] = lbe.fit_transform(data_df_1m["user_id"]) + user_id_offset

# Guard against any overlap between the two id ranges.
assert data_df_1m["user_id"].min() > data_df_100["user_id"].max(), "ml1m and ml100k user ids overlap"

In [8]:
# Number of distinct (shifted) ml1m user ids; shown as the cell's output.
data_df_1m["user_id"].nunique(dropna=True)

6040

In [11]:
# Interaction counts for both datasets, plus ml1m density
# (interactions / (unique users * unique items)).
print(
    f"ml1m interaction: {len(data_df_1m)}, "
    f"ml100k interactions: {len(data_df_100)}, "
    f"ml1m density: {len(data_df_1m) / (data_df_1m['user_id'].nunique() * data_df_1m['item_id'].nunique())}"
)

ml1m interaction: 1000209, ml100k interactions: 100000, ml1m density: 0.044683625622312845


In [12]:
# ml100k density: interactions / (unique users * unique items).
print(
    "ml100k density: "
    f"{len(data_df_100) / (data_df_100['user_id'].nunique() * data_df_100['item_id'].nunique())}"
)

ml100k density: 0.06372868912635615


### 4. Train / test / validation split

In [22]:
# Split each dataset into train / test / validation parts.
# The first call holds out 20% of the rows; the second call splits that
# hold-out set in half, giving test and validation sets of equal size.
# `train_test_split` is imported with the other libraries in the imports cell,
# and the same random_state is used for every call.
# 1m
train_df_1m, rest_df_1m = train_test_split(data_df_1m, test_size = 0.2, random_state = 133)
test_df_1m, valid_df_1m = train_test_split(rest_df_1m , test_size = 0.5, random_state = 133)
# 100k
train_df_100, rest_df_100 = train_test_split(data_df_100, test_size = 0.2, random_state = 133)
test_df_100, valid_df_100 = train_test_split(rest_df_100, test_size = 0.5, random_state = 133)

### 5. Train on the augmented (ml100k + ml1m) training data

In [23]:
# Remove the descriptive columns and stack the two training sets.
# The drops are done by reassignment (no `inplace=True` on a frame returned by
# train_test_split), and `errors="ignore"` keeps the cell re-runnable: a second
# run no longer raises KeyError because the columns are already gone.
train_df_100 = train_df_100.drop(columns=["title"], errors="ignore")
train_df_1m = train_df_1m.drop(columns=["title", "genres"], errors="ignore")
train_df = pd.concat([train_df_100, train_df_1m])

In [26]:
# Seed Python's and NumPy's global RNGs before building and training the model.
np.random.seed(133)
random.seed(133)

reader = Reader(rating_scale=(1, 5))

# Train on the combined ml100k + ml1m training frame.
training_data = Dataset.load_from_df(
    train_df[['user_id', 'item_id', 'rating']], reader
).build_full_trainset()

# Evaluate on the ml100k test split only.
testing_data = (
    Dataset.load_from_df(test_df_100[['user_id', 'item_id', 'rating']], reader)
    .build_full_trainset()
    .build_testset()
)

algo = SVD(verbose=True)
algo.fit(training_data)

# RMSE on the data the model was fitted on.
training_eval = training_data.build_testset()
train_pre = algo.test(training_eval)
train_rmse = accuracy.rmse(train_pre, verbose=False)

# RMSE on the ml100k test split.
test_pre = algo.test(testing_data)
test_rmse = accuracy.rmse(test_pre, verbose=False)

print(f"the rmse on ml100k test data if we concatenate datasets together:{test_rmse}")

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
the rmse on ml100k test data if we concatenate datasets together:0.9141560790060519


### 6. Train and test on ml100k alone

In [27]:
# Seed Python's and NumPy's global RNGs before building and training the model.
np.random.seed(133)
random.seed(133)

reader = Reader(rating_scale=(1, 5))

# Baseline: train on the ml100k training split only.
training_data = Dataset.load_from_df(
    train_df_100[['user_id', 'item_id', 'rating']], reader
).build_full_trainset()

# Evaluate on the same ml100k test split as the augmented run.
testing_data = (
    Dataset.load_from_df(test_df_100[['user_id', 'item_id', 'rating']], reader)
    .build_full_trainset()
    .build_testset()
)

algo = SVD(verbose=True)
algo.fit(training_data)

# RMSE on the data the model was fitted on.
training_eval = training_data.build_testset()
train_pre = algo.test(training_eval)
train_rmse = accuracy.rmse(train_pre, verbose=False)

# RMSE on the ml100k test split.
test_pre = algo.test(testing_data)
test_rmse = accuracy.rmse(test_pre, verbose=False)

print(f"the rmse on test if we train ml100k alone:{test_rmse}")

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
the rmse on test if we train ml100k alone:0.9337442712215435
