In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
from random import sample
# import tensorflow as tf

from torch.utils.data import Dataset
from torch.utils.data.dataset import T_co

from torch import LongTensor


class TargetData(Dataset):
    def __init__(self, num_negatives=4):
        self.num_jobs = 0
        self.df = self.load_ratings()
        self.num_users = self.df.uid.nunique()
        self.num_movies = self.df.mid.nunique()
        self.users = set(self.df.uid.unique())
        self.movies = set(self.df.mid.unique())

        self.train, self.test = self._train_test_split()

        self.training_data = self.add_negatives(self.train, items=self.movies,  n_samples=num_negatives)
        self.testing_data = self.add_negatives(self.test, items=self.movies, n_samples=100)

        self.testing_tensors = self.parse_testing(self.testing_data)

    def __len__(self):
        return self.training_data.shape[0]  # Length of the data to train on

    def __getitem__(self, index) -> T_co:
        user = LongTensor([self.training_data.uid.iloc[index]])  # -1 so that indexing starts from 0
        movie = LongTensor([self.training_data.mid.iloc[index]])
        output = LongTensor([self.training_data.rating.iloc[index]])
        return user, movie, output

    def __call__(self, test_data):
        return self.parse_testing(self.add_negatives(test_data, items=self.movies, n_samples=100))

    @staticmethod
    def load_ratings(min_ratings=5):
        df = pd.read_csv('MovieLens/ratings.dat',
                         sep='::',
                         header=None,
                         names=['uid_old', 'mid_old', 'rating', 'date'],
                         parse_dates=['date'],
                         date_parser=lambda x: pd.to_datetime(x, unit='s', origin='unix'),
                         engine='python')

        # DROP MOVIES WITH LESS THAN 5 RATINGS
        s = df.groupby(['mid_old']).size()
        low_n_ratings = s[s < min_ratings].reset_index().mid_old.tolist()
        df = df[~df.mid_old.isin(low_n_ratings)]
        # RE-INDEX USERS AND MOVIES
        user_id = df[['uid_old']].drop_duplicates().reindex()
        user_id['uid'] = np.arange(len(user_id))
        df = pd.merge(df, user_id, on=['uid_old'], how='left')

        item_id = df[['mid_old']].drop_duplicates()
        item_id['mid'] = np.arange(len(item_id))
        return pd.merge(df, item_id, on=['mid_old'], how='left')

    @staticmethod
    def parse_testing(df):
        test = df.sort_values(by=['uid', 'rating'], ascending=False)
        users, movies, outputs = [], [], []
        for _, u in test.groupby('uid'):
            users.append(LongTensor([u.uid.values]))
            movies.append(LongTensor([u.mid.values]))
            outputs.append(LongTensor([u.rating.values]))
        return users, movies, outputs

    def _train_test_split(self):
        self.df.rating = np.int8(1)
        self.df['latest'] = self.df.groupby(['uid'])['date'].rank(method='first', ascending=False)
        test_bool = self.df.latest == 1
        test = self.df[test_bool]
        train = self.df[~test_bool]
        return (train[['uid', 'mid', 'rating']],
                test[['uid', 'mid', 'rating']]
                )

    def add_negatives(self, df: pd.DataFrame, item: str = 'mid', items=None, n_samples: int = 4):
        if items is None:
            items = set(self.train[item].unique())

        combine = df.groupby('uid')[item].apply(set).reset_index()
        combine['negatives'] = combine[item].apply(lambda x: sample(list(items - x), n_samples))

        s = combine.apply(lambda x: pd.Series(x.negatives, dtype=np.int16), axis=1).stack().reset_index()
        s.rename(columns={'level_0': 'uid', 0: item}, inplace=True)
        s.drop(['level_1'], axis=1, inplace=True)
        s['rating'] = np.int8(0)
        s.uid = s.uid.astype(np.int16)

        complete = pd.concat([df, s]).sort_values(by=['uid', item])
        return complete.reset_index(drop=True)


class AttributeData(Dataset):
    def __init__(self, num_negatives: int = 4, training_ratio: float = .8):
        self.targets = TargetData()
        self.df = pd.merge(
            self._features(),
            self.targets.train,
            # on=['uid', 'uid'],
            how='left'
            )[['uid', 'mid', 'age', 'gender', 'job', 'rating']]

        # self.df.rename(columns={'rating_x': 'rating'}, inplace=True)

        self.num_users = self.df.uid.nunique()
        self.num_jobs = self.df.job.nunique()
        self.jobs = set(self.df.job.unique())
        self.train, self.test = self._train_test_split()

        self.training_data = self.add_negatives(
            self.train,
            item='job',
            items=self.jobs,
            n_samples=num_negatives)
        self.training_data['age'] = self.training_data.groupby('uid')['age'].transform('first')
        self.training_data['gender'] = self.training_data.groupby('uid')['gender'].transform('first')
        self.training_data.dropna(inplace=True)
        # self.training_data['mid'] = self.training_data.apply(lambda x: np.random.choice(1) if x.rating == 0)
        # self.training_data['job'] = self.training_data.groupby('uid')['job'].transform('first')
        # self.jobs_train, self.genders_train, self.ages_train = self.perturb_input(self.training_data)

    def __len__(self):
        return self.train.shape[0]  # Length of the data to train on

    def __getitem__(self, index) -> T_co:
        # features = LongTensor(self.training_data.iloc[index, 1:-1])
        user = LongTensor(self.training_data.uid.iloc[index])
        # job = LongTensor(self.training_data.iloc[index, :])
        # gender = LongTensor(self.training_data.iloc[index, :])
        # age = LongTensor(self.training_data.iloc[index])
        rating = LongTensor(self.training_data.rating.iloc[index])
        return user, rating

    # def __getitem__(self, index) -> T_co:
    #     user = LongTensor(self.training_data.uid.iloc[index])
    #     job = LongTensor(self.training_data.job.iloc[index])
    #     # protected attribute
    #     gender = LongTensor(self.training_data.gender.iloc[index])
    #     age = LongTensor(self.training_data.age.iloc[index])
    #     rating = LongTensor(self.training_data.rating.iloc[index])
    #     return user, job, gender, age, rating

    # def __call__(self, test_data):
    #     return self.parse_testing(self.add_negatives(test_data, items=self.movies, n_samples=100))

    def perturb_input(self, df):
        # HOT ENCODING (CATEGORICAL)
        func1, func2 = self.obfuscation_functions()

        jobs_train = pd.get_dummies(df.job, drop_first=True)

        genders_train = pd.get_dummies(df.gender, drop_first=True)

        # (CONTINUOUS)
        ages_train = 2 * ((df.age - df.age.min()) /
                          (df.age.max() - df.age.min())) - 1
        return jobs_train.apply(func2), genders_train.apply(func2), ages_train.apply(func1)

    @staticmethod
    def obfuscation_functions(eps_hat: int = 4):
        n_features = 3  # d

        delta = n_features + n_features ** 2 / 4  # global sensitivity
        # slack = np.random.uniform(0, 1, 1)
        # slack = np.max(1, n_features, np.int8(local_epsilon / 2.5))

        C = (np.exp(eps_hat / 2) + 1) / (np.exp(eps_hat / 2) - 1)

        l = lambda x: ((C + 1) / 2) * x - ((C - 1) / 2)
        pi = lambda x: l(x) + C - 1

        dfrac = np.exp(eps_hat / 2) / (np.exp(eps_hat / 2) + 1)
        const = lambda x: 1 / (np.exp(eps_hat / x) + 1)

        def func1(x):
            if np.random.uniform(0, 1, 1) < dfrac:
                return np.random.uniform(l(x), pi(x), 1)
            else:
                return np.random.choice(
                    [np.random.uniform(-C, l(x), 1),
                     np.random.uniform(pi(x), C, 1).squeeze(0)], 1
                )
        def func2(x):
            array = []
            for i in x:
                if i == 1:
                    array.append(np.float32(.5))
                else:
                    array.append(const(np.random.uniform(0, 1, 1)))
            return array
        return func1, func2

    def add_negatives(self, df: pd.DataFrame, item: str = 'mid', items=None, n_samples: int = 4):
        if items is None:
            items = set(self.df[item].unique())

        movies = set(self.df.mid.unique())
        scombine = df.groupby('uid')[item].apply(set).reset_index()
        mcombine = df.groupby('uid')['mid'].apply(set).reset_index()
        scombine['jnegatives'] = scombine[item].apply(lambda x: sample(list(items - x), n_samples))
        mcombine['mnegatives'] = mcombine['mid'].apply(lambda x: sample(list(movies - x), n_samples))

        s = scombine.apply(lambda x: pd.Series(x.jnegatives, dtype=np.int16), axis=1).stack().reset_index()
        m = mcombine.apply(lambda x: pd.Series(x.mnegatives, dtype=np.int16), axis=1).stack().reset_index()
        s.rename(columns={'level_0': 'uid', 0: item}, inplace=True)
        m.rename(columns={'level_0': 'uid', 0: 'mid'}, inplace=True)
        s.drop(['level_1'], axis=1, inplace=True)
        m.drop(['level_1'], axis=1, inplace=True)
        s['rating'] = np.int8(0)
        s.uid = s.uid.astype(np.int16)
        s = pd.merge(s, m, on=['uid', 'uid'], how='inner')
        complete = pd.concat([df, s]).sort_values(by=['uid', item])
        # complete = pd.concat([df, s]).sort_values(by=['uid', item])
        return complete.reset_index(drop=True)

    #
    # def _train_test_split(self):
    #     # self.df['latest'] = self.df.groupby(['uid'])['date'].rank(method='first', ascending=False)
    #     test_bool = self.df.latest <= 1
    #     test = self.df[test_bool]
    #     train = self.df[~test_bool]
    #     return (train[['uid', 'mid', 'age', 'gender', 'job', 'rating']],
    #             test[['uid', 'mid', 'age', 'gender', 'job', 'rating']]
    #             )

    def _train_test_split(self, train_ratio: float = .8):
        msk = np.random.rand(len(self.df)) < train_ratio

        train = self.df[msk]
        test = self.df[~msk]
        return train, test

    def _features(self):
        df = pd.read_csv('MovieLens/users.dat',
                         sep='::',
                         header=None,
                         names=['uid', 'gender', 'age', 'job', 'zip'],
                         engine='python')
        df.drop(columns=['uid'], inplace=True)
        df.index.rename('uid', inplace=True)
        df.gender = pd.get_dummies(df.gender, drop_first=True)  # 0:F, 1:M
        df.reset_index(inplace=True)
        drop = [0, 10, 13, 19]

        clean = df[~df['job'].isin(drop)]

        clean['rating'] = 1
        clean['uid'] = clean.uid - 1

        self.num_jobs = clean.job.nunique()

        item_id = clean[['job']].drop_duplicates()
        item_id['njob'] = np.arange(self.num_jobs)
        clean = pd.merge(clean, item_id, on=['job'], how='left')
        clean.job = clean.njob
        return clean


class DataGenerator(AttributeData):
    def __init__(self):
        super().__init__()

        # self.targets = TargetData()
        # self.full_train = pd.merge(self.targets.training_data, self.df, on=['uid', 'uid'], how='left')
        # self.jobs, self.genders, self.ages = self.perturb_input(self.full_train)

    def __len__(self):
        return self.jobs.shape[0]  # Length of the data to train on


In [2]:
data = AttributeData()

  users.append(LongTensor([u.uid.values]))


In [3]:
# data.df
data.train[data.train.uid==300]

Unnamed: 0,uid,mid,age,gender,job,rating
37357,300,218,18,1,10,1
37358,300,1321,18,1,10,1
37359,300,58,18,1,10,1
37360,300,556,18,1,10,1
37361,300,236,18,1,10,1
...,...,...,...,...,...,...
37707,300,26,18,1,10,1
37708,300,217,18,1,10,1
37709,300,1017,18,1,10,1
37710,300,837,18,1,10,1


In [4]:
data.training_data[data.training_data.uid==300]

Unnamed: 0,uid,mid,age,gender,job,rating
34624,300,1942,18.0,1.0,4,0
34625,300,3034,18.0,1.0,4,0
34626,300,3266,18.0,1.0,4,0
34627,300,1878,18.0,1.0,4,0
34628,300,1942,18.0,1.0,7,0
...,...,...,...,...,...,...
34915,300,1878,18.0,1.0,12,0
34916,300,1942,18.0,1.0,14,0
34917,300,3034,18.0,1.0,14,0
34918,300,3266,18.0,1.0,14,0


In [5]:
from time import time
import torch
from torch.utils.data import DataLoader
num_epochs = 25
batch_size = 2048
learning_rate = .001
top_k = 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
def train_ncf(model):
    # data.get_train_instances(seed=e)
    optimizer = torch.optim.AdamW(model.parameters(),  lr=learning_rate, weight_decay=1e-6)

    dataloader = DataLoader(data, batch_size=batch_size,
                            shuffle=True, num_workers=0)
    t1 = time()
    it_per_epoch = len(data) / batch_size
    for i in range(num_epochs):
        model.train()
        print("Starting epoch ", i + 1)
        j = 0
        for batch in dataloader:
            # u, j, g, a, r = batch
            u, r = batch
            print(u)
            break
            # move tensors to cuda
            f = f.to(device)
            # m = m.to(device)
            r = r.to(device)
            y_hat = model(f.squeeze(1), r.squeeze(1))
            loss = torch.nn.BCELoss()  # (weight=w, reduction="mean")
            loss = loss(y_hat, r.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if j % int(1 + it_per_epoch / 10) == 0:
                print("Progress: ", round(100 * j / it_per_epoch), "%")
            j+=1

        # Epoch metrics
        t2 = time()
        print("Epoch time:", round(t2 - t1), "seconds")
        print("Loss:", loss / i)
        ncf.eval()


        # print('baseline ')
        # t1 = time()
        # hr, ndcg = evaluate_model(ncf,data.test[['uid', 'mid']].values,10,100, 3416)
        # t2 = time()
        # print("Evaluation time:", round(t2 - t1), "seconds")
        # print(f"HR@{top_k}:{hr[-1]}")
        #
        # print("(evaluator)...")
        # t1 = time()
        # hr, ndcg = evaluator()
        # # hr, ndcg = evaluate(model, data.test, top_K)
        # t2 = time()
        # print("Evaluation time:", round(t2 - t1), "seconds")
        # print(f"HR@{top_k}:{hr}")

        print("Evaluating (eval_model)...")
        t1 = time()
        hr, ndcg = eval_model(model, data)
        t2 = time()
        print("Evaluation time:", round(t2 - t1), "seconds")
        print(f"HR@{top_k}:{hr}")
        #
        #
        # print("Evaluating (tf)...")
        # t1 = time()
        # hr, ndcg = tf_eval(1)
        # t2 = time()
        #
        # print("Evaluation time:", round(t2 - t1), "seconds")
        # print(f"HR@{top_k}:{np.array(hr).mean()}")
        # new
        # HR, NDCG = evaluate_model(model, data, validation=False)
        # updated
        # hr, ndcg = evaluate_model(model, data.test, top_K, random_samples)
        # original
        loss = 0
        print()

    print("Done")

In [7]:
from models import NCF

ncf = NCF(6040, 3952, 128, [128, 64, 32, 16], 1)

ncf.load_state_dict(torch.load('models/preTrained_NCF'))

FileNotFoundError: [Errno 2] No such file or directory: 'models/preTrained_NCF'

In [None]:
train_ncf(ncf)


In [None]:
data.targets.training_data

In [None]:
data.training_data.dropna().info()

In [None]:
x = data.add_negatives(data.test, item='job', items = set(data.df.job.unique()))
x[x.uid==0]

In [None]:
def add_negatives(df: pd.DataFrame, item: str = 'mid', n_samples: int = 4):
    movies = set(data.df.mid.unique())
    items = set(data.df.mid.unique())
    scombine = df.groupby('uid')[item].apply(set).reset_index()
    mcombine = df.groupby('uid')['mid'].apply(set).reset_index()
    scombine['jnegatives'] = scombine[item].apply(lambda x: sample(list(items - x), n_samples))
    mcombine['mnegatives'] = mcombine['mid'].apply(lambda x: sample(list(movies - x), n_samples))

    s = scombine.apply(lambda x: pd.Series(x.jnegatives, dtype=np.int16), axis=1).stack().reset_index()
    m = mcombine.apply(lambda x: pd.Series(x.mnegatives, dtype=np.int16), axis=1).stack().reset_index()
    s.rename(columns={'level_0': 'uid', 0: item}, inplace=True)
    m.rename(columns={'level_0': 'uid', 0: 'mid'}, inplace=True)
    s.drop(['level_1'], axis=1, inplace=True)
    m.drop(['level_1'], axis=1, inplace=True)
    s['rating'] = np.int8(0)
    s.uid = s.uid.astype(np.int16)
    s = pd.merge(s, m, on=['uid', 'uid'], how='inner')
    complete = pd.concat([df, s]).sort_values(by=['uid', item])
    # complete = pd.concat([df, s]).sort_values(by=['uid', item])
    return complete.reset_index(drop=True)

In [None]:
data.test

In [None]:
data.train

In [None]:
test = pd.merge(
    data._features(),
    data.targets.test,
    # on=['uid', 'uid'],
    how='left'
)[['uid', 'mid', 'age', 'gender', 'job', 'rating']]

In [None]:
test

In [None]:
full_test = add_negatives(test, item='job', n_samples=100)

In [None]:
full_test['age'] = full_test.groupby('uid')['age'].transform('first')
full_test['gender'] = full_test.groupby('uid')['gender'].transform('first')
full_test.dropna(inplace=True)
full_test

In [None]:
def parse_testing(df):
    test = df.sort_values(by=['uid', 'rating'], ascending=False)
    users, features, outputs = [], [], []
    for _, u in test.groupby('uid'):
        users.append(LongTensor([u.uid.values]))
        features.append(LongTensor([u[['mid', 'age', 'gender', 'job']].values]))
        outputs.append(LongTensor([u.rating.values]))
    return users, features, outputs


In [None]:
tensors = parse_testing(full_test)

In [None]:
def rank(l, item):
    # rank of the test item in the list of negative instances
    # returns the number of elements that the test item is bigger than

    index = 0
    for element in l:
        if element > item:
            index += 1
            return index
        index += 1
    return index
def eval_model(model, data, num_users=6040):
    # Evaluates the model and returns HR@10 and NDCG@10
    hits = 0
    ndcg = 0
    for u in range(num_users):
        user = data.testing_tensors[0][u].squeeze().to(device)
        item = data.testing_tensors[1][u].squeeze().to(device)
        y = model(user, item)

        y = y.tolist()
        y = sum(y, [])
        first = y.pop(0)
        y.sort()
        ranking = rank(y, first)
        if ranking > 90:
            hits += 1
            ndcg += np.log(2) / np.log(len(user) - ranking + 1)

    hr = hits / num_users
    ndcg = ndcg / num_users
    return hr, ndcg

In [None]:
tensors