In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mldog1/users.dat
/kaggle/input/mldog1/ratings.dat
/kaggle/input/mldog1/movies_train.dat
/kaggle/input/mldog1/genres.txt
/kaggle/input/mldog1/movies_test.dat


In [2]:
%ls

In [3]:
import pandas
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import cv2
import os
from nltk import wordpunct_tokenize
import re



In [4]:
users = pandas.read_csv('/kaggle/input/mldog1/users.dat', sep='::',
                        engine='python',
                        names=['userid', 'gender', 'age', 'occupation', 'zip']).set_index('userid')
ratings = pandas.read_csv('/kaggle/input/mldog1/ratings.dat', engine='python',
                          sep='::', names=['userid', 'movieid', 'rating', 'timestamp'])
movies_train = pandas.read_csv('/kaggle/input/mldog1/movies_train.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='latin-1', index_col=False)
movies_test = pandas.read_csv('/kaggle/input/mldog1/movies_test.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='latin-1', index_col=False)
movies_train['genre'] = movies_train.genre.str.split('|')
movies_test['genre'] = movies_test.genre.str.split('|')

users.age = users.age.astype('category')
users.gender = users.gender.astype('category')
users.occupation = users.occupation.astype('category')
ratings.movieid = ratings.movieid.astype('category')
ratings.userid = ratings.userid.astype('category')

In [5]:
a = set()
for i in ratings['movieid']:
    a.add(i)

In [6]:
count = 0
for i in movies_test['movieid']:
    if i in a:
        continue
    else:
        count+=1

In [7]:
print(count)

34


In [8]:
import pandas as pd


# Use the intersection of sets to get common movieids
common_movieids = set(movies_train['movieid']).intersection(ratings['movieid'])
array = np.random.randint(10, size=(2, 3)) 

In [9]:
def tokenize(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    tokens = wordpunct_tokenize(text)
    tokens = tokens[:-1] # remove last token because it is the year which maybe is not useful
    return tokens

def create_vocab():
    df = movies_train.copy()
    arr_title = df['title'].tolist()
    vocab = set()
    for title in arr_title:
        tokens = tokenize(title)
        vocab.update(tokens)
    vocab = list(vocab)
    pad_token = '<PAD>'
    unk_token = '<UNK>'
    vocab.append(pad_token)
    vocab.append(unk_token)
    return vocab

In [10]:
movie_ids = ratings.movieid.unique()
res = {}
n = len(ratings.userid.unique())
temp = ratings['rating'].values

for movie_id in movie_ids:
    tmp = np.zeros(n)
    cur_users = ratings.loc[ratings['movieid']==movie_id].userid.tolist()
    for user in cur_users:
        tmp[user - 1] = temp[user-1]
    res[movie_id] = tmp

In [11]:
class MLDataset(Dataset):
    def __init__(self, is_train=True):
        if is_train:
            self.data =  movies_train
        else:
            self.data = movies_test
        self.data['title_tokens'] = [tokenize(x) for x in self.data.title]
        self.ratings = res
        # create vocab
        vocab = create_vocab()
        pad_token = '<PAD>'
        unk_token = '<UNK>'
        self.token2idx = {token: idx for idx, token in enumerate(vocab)}

        # Create a binary vector for each word in each sentence
        MAX_LENGTH = 7
        vectors = []
        for title_tokens in self.data.title_tokens.tolist():
            if len(title_tokens) < MAX_LENGTH:
                num_pad = MAX_LENGTH - len(title_tokens)
                for idx in range(num_pad):
                    title_tokens.append(pad_token)
            else:
                title_tokens = title_tokens[:MAX_LENGTH]
            title_vectors = []
            for word in title_tokens:
                binary_vector = np.zeros(len(vocab))
                if word in vocab:
                    binary_vector[self.token2idx[word]] = 1
                else:
                    binary_vector[self.token2idx[unk_token]] = 1
                title_vectors.append(binary_vector)

            vectors.append(np.array(title_vectors))
        self.data['vectors'] = vectors

        # label genre
        with open('/kaggle/input/mldog1/genres.txt', 'r') as f:
            genre_all = f.readlines()
            genre_all = [x.replace('\n','') for x in genre_all]
        self.genre2idx = {genre:idx for idx, genre in enumerate(genre_all)}

    def __getitem__(self, index):
        title = self.data.iloc[index].title
#         img_path = self.data.iloc[index].img_path
        genre = self.data.iloc[index].genre
        movie_id = self.data.iloc[index].movieid
        try:
            ratings = self.ratings[movie_id]
            ratings = ratings/5
        except:
            #ratings = np.random.randint(5, size=(6040)) 
            #ratings = ratings/5
            ratings = np.zeros([6040])
        rating_tensor = torch.from_numpy(ratings).float()
        # preprocess text
        title_vector = self.data.iloc[index].vectors
        title_tensor = torch.from_numpy(title_vector).float()

        # preprocess im

        # preprocess label
        genre_vector = np.zeros(len(self.genre2idx))

        for g in genre:
            genre_vector[self.genre2idx[g]] = 1
        genre_tensor = torch.from_numpy(genre_vector).float()
        return title_tensor, genre_tensor, rating_tensor

    def __len__(self):
        return len(self.data)
    
    

In [12]:
# class RatingModel(nn.Module):
#     def __init__(self):
#         super(RatingModel, self).__init__()
#         self.lin1 = nn.Linear(6040, 1510, bias=True)
#         self.relu = nn.ReLU(inplace=True)
#         self.dropout = nn.Dropout(0.2)
#         self.lin2 = nn.Linear(1510, 256 , bias=True)
#         self.lin3 = nn.Linear(256, 18, bias=True)
#         self.bn1 = nn.BatchNorm1d(1510)
        
#     def forward(self, ratings):
#         ratings = self.relu(self.lin1(ratings))
#         ratings = self.bn1(ratings)
#         ratings = self.dropout(ratings)
#         ratings = self.lin2(ratings)
#         ratings = self.relu(ratings)
#         ratings = self.dropout(ratings)
#         ratings = self.lin3(ratings)
        
#         return ratings

In [13]:
class RatingModel(nn.Module):
    def __init__(self, num_classes):
        super(RatingModel, self).__init__()
        self.dropout = nn.Dropout(0.2)

        self.fc1 = nn.Linear(6040, 1024)
        self.bn1 = nn.BatchNorm1d(1024)
        self.relu1 = nn.ReLU()

        self.fc2 = nn.Linear(1024, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.relu2 = nn.ReLU()

        self.fc3 = nn.Linear(512, 256)
        self.bn3 = nn.BatchNorm1d(256)
        self.relu3 = nn.ReLU()

        self.fc4 = nn.Linear(256, num_classes)
        
        
    def forward(self, x):
        x = self.bn1(self.relu1(self.fc1(x)))
        x = self.dropout(x)
        x = self.bn2(self.relu2(self.fc2(x)))
        x = self.dropout(x)
        x = self.relu3(self.fc3(x))

        x = self.fc4(x)
        
        return x

In [14]:
train_set = MLDataset(is_train=True)
test_set = MLDataset(is_train=False)

BATCH_SIZE = 8
train_dataloader = DataLoader(train_set, batch_size=BATCH_SIZE,drop_last=True)
test_dataloader = DataLoader(test_set, batch_size=BATCH_SIZE,drop_last=True)

In [15]:
print(test_dataloader)

<torch.utils.data.dataloader.DataLoader object at 0x7f537e48ed40>


In [16]:
model2 = RatingModel(18)

In [17]:
# model = RatingModel()

In [18]:
from torch import optim
criterion = nn.CrossEntropyLoss()

learning_rate = 1e-3
optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, model2.parameters()),
    lr=learning_rate,
)

In [19]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
NUM_EP = 15
model2.to(device)
for ep in range(NUM_EP):

    print("="*50)
    for idx, (title_tensor, genre_tensor,rating_tensor) in enumerate(train_dataloader):
        title_tensor = title_tensor.to(device)
        genre_tensor = genre_tensor.to(device)
        rating_tensor = rating_tensor.to(device)
        model2 = model2.to(device)
        out = model2(rating_tensor)

        loss = criterion(out, genre_tensor)


        if idx % 50 == 0 and idx > 0:
          print("loss: ", loss)
        optimizer.zero_grad()
        loss.backward()
        
        optimizer.step()


loss:  tensor(4.7256, device='cuda:0', grad_fn=<DivBackward1>)
loss:  tensor(1.0435, device='cuda:0', grad_fn=<DivBackward1>)
loss:  tensor(2.8088, device='cuda:0', grad_fn=<DivBackward1>)
loss:  tensor(2.4660, device='cuda:0', grad_fn=<DivBackward1>)
loss:  tensor(3.1083, device='cuda:0', grad_fn=<DivBackward1>)
loss:  tensor(3.6979, device='cuda:0', grad_fn=<DivBackward1>)
loss:  tensor(2.6493, device='cuda:0', grad_fn=<DivBackward1>)
loss:  tensor(3.1003, device='cuda:0', grad_fn=<DivBackward1>)
loss:  tensor(0.6697, device='cuda:0', grad_fn=<DivBackward1>)
loss:  tensor(1.7753, device='cuda:0', grad_fn=<DivBackward1>)
loss:  tensor(2.1545, device='cuda:0', grad_fn=<DivBackward1>)
loss:  tensor(2.7094, device='cuda:0', grad_fn=<DivBackward1>)
loss:  tensor(2.8118, device='cuda:0', grad_fn=<DivBackward1>)
loss:  tensor(1.9407, device='cuda:0', grad_fn=<DivBackward1>)
loss:  tensor(2.7925, device='cuda:0', grad_fn=<DivBackward1>)
loss:  tensor(0.2790, device='cuda:0', grad_fn=<DivBack

In [20]:
!pip install -q torchmetrics
from torchmetrics.classification import MultilabelF1Score

In [21]:
N, C = genre_tensor.shape

auroc_all = 0
f1_all = 0
f1 = MultilabelF1Score(num_labels=C, threshold=0.5, average='macro').to(device)
f1 = f1.to(device)
for title_tensor, genre_tensor,rating_tensor in test_dataloader:
    rating_tensor = rating_tensor.to(device)
    genre_tensor = genre_tensor.to(device)
    model2 = model2.to(device)
    out = model2(rating_tensor)
    f1_batch = f1(out, genre_tensor)
    f1_all += f1_batch

print('F1: ', f1_all/len(test_dataloader))


F1:  tensor(0.3284, device='cuda:0')


In [22]:
from torchmetrics.classification import MultilabelF1Score, MultilabelRecall, MultilabelPrecision
N, C = genre_tensor.shape

f1 = MultilabelF1Score(num_labels=C, threshold=0.5, average='macro')
f1 = f1.to(device)

recall = MultilabelRecall(num_labels=C, threshold=0.5, average='macro')
recall = recall.to(device)
precision = MultilabelPrecision(num_labels=C, threshold=0.5, average='macro')
precision = precision.to(device)

model2.eval()

f1_all = 0
recall_all = 0
precision_all = 0

total_acc_test = 0
total_loss_test = 0

with torch.no_grad():
  for title_tensor, genre_tensor,rating_tensor in test_dataloader:
      #title_tensor = title_tensor.to(device)
#       ids = title_tensor['ids'].to(device, dtype = torch.long)
#       mask = title_tensor['mask'].to(device, dtype = torch.long)
#       token_type_ids = title_tensor['token_type_ids'].to(device, dtype = torch.long)
#       img_tensor = img_tensor.to(device)
#       genre_tensor = genre_tensor.to(device)
      #print("title_tensor", title_tensor)
    rating_tensor = rating_tensor.to(device)
    genre_tensor = genre_tensor.to(device)
    out = model2(rating_tensor)

    #out = out.sigmoid()
    out1 = (out > 0.5).float()

    # f1_batch = f1(out, genre_tensor)
    # f1_all += f1_batch
    f1_val = f1(out, genre_tensor.type(torch.float))
    f1_all += f1_val

    recall_val = recall(out, genre_tensor.type(torch.int))
    recall_all += recall_val
    precision_val = precision(out, genre_tensor.type(torch.int))
    precision_all += precision_val

    acc = ((out > 0.5).int() == genre_tensor.type(torch.int)).float().mean().item()
    total_acc_test += acc

test_acc = total_acc_test / len(test_dataloader)

f1_all = f1_all / len(test_dataloader)
recall_all = recall_all / len(test_dataloader)
precision_all = precision_all / len(test_dataloader)

print(f'Test Accuracy: {test_acc:^10.4f}|Precision: {precision_all:^10.4f}|Recall: {recall_all}|F1-Score: {f1_all}')

Test Accuracy:   0.9354  |Precision:   0.2986  |Recall: 0.372833251953125|F1-Score: 0.321108877658844


## 