# In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import metrics, preprocessing

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# --- Load the BX-* CSV files ------------------------------------------------
# `on_bad_lines='warn'` is used for all three files: malformed rows are
# skipped with a warning. (The older `error_bad_lines=False` keyword is
# deprecated since pandas 1.3 and removed in pandas 2.0.)
books = pd.read_csv('BX-Books.csv', sep=';', encoding='latin-1', on_bad_lines='warn')
print(books.columns)
# The cover-image URL columns are not used as features; drop whichever of
# them are present.
books = books.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], errors='ignore')
print(books.head(5))
ratings = pd.read_csv('BX-Book-Ratings.csv', sep=";", on_bad_lines='warn', encoding="latin-1")
# ratings = ratings[ratings['Book-Rating'] != 0]
print(ratings.head(5))
# print(ratings['Book-Rating'].value_counts())
users = pd.read_csv('BX-Users.csv', sep=";", on_bad_lines='warn', encoding="latin-1")

# --- Join ratings with user and book attributes -----------------------------
# Inner joins: ratings whose user or ISBN has no matching row are discarded.
data = pd.merge(ratings, users, on='User-ID', how='inner')
data = pd.merge(data, books, on='ISBN', how='inner')

# One encoder per categorical column so each mapping can be inverted later.
lbl_isbn = preprocessing.LabelEncoder()
lbl_title = preprocessing.LabelEncoder()
lbl_author = preprocessing.LabelEncoder()
lbl_publisher = preprocessing.LabelEncoder()
lbl_location = preprocessing.LabelEncoder()

# --- Numeric columns --------------------------------------------------------
# Non-numeric years are coerced to NaN and then replaced by 0 (the same
# placeholder used for a missing Age below). Leaving NaN in place would
# propagate through the network and turn the loss into NaN.
data['Year-Of-Publication'] = (
    pd.to_numeric(data['Year-Of-Publication'], errors='coerce').fillna(0)
)
data['Age'] = data['Age'].fillna(0).astype(int)

# --- Categorical columns ----------------------------------------------------
# Cast to str before encoding so a missing value becomes the ordinary
# category 'nan' instead of a float mixed into a string column, which some
# scikit-learn versions refuse to encode.
for column, encoder in (
    ('ISBN', lbl_isbn),
    ('Book-Title', lbl_title),
    ('Book-Author', lbl_author),
    ('Publisher', lbl_publisher),
    ('Location', lbl_location),
):
    data[column] = encoder.fit_transform(data[column].astype(str).values)
class BookDataset(Dataset):
    """Map-style dataset yielding one row of the merged ratings table per item.

    Args:
        data: DataFrame providing the columns 'ISBN', 'Book-Title',
            'Book-Author', 'Year-Of-Publication', 'Publisher', 'User-ID',
            'Location', 'Age' and 'Book-Rating'. The categorical columns must
            already be integer-encoded, because they are converted to
            ``torch.long`` tensors.

    Each item is a dict of scalar tensors: the categorical fields and
    ``user_id`` as ``torch.long``; ``year_of_publication``, ``age`` and
    ``book_rating`` as ``torch.float``.
    """

    def __init__(self, data) -> None:
        # Snapshot every column as a NumPy array so __getitem__ indexes by
        # *position*. Indexing the pandas Series directly looks rows up by
        # index label and raises KeyError (or returns the wrong row) whenever
        # the frame does not carry a default 0..n-1 index, e.g. after
        # filtering or shuffling.
        self.isbns = data['ISBN'].to_numpy()
        self.book_titles = data['Book-Title'].to_numpy()
        self.book_authors = data['Book-Author'].to_numpy()
        self.year_of_publications = data['Year-Of-Publication'].to_numpy()
        self.publishers = data['Publisher'].to_numpy()
        self.user_ids = data['User-ID'].to_numpy()
        self.locations = data['Location'].to_numpy()
        self.ages = data['Age'].to_numpy()
        self.book_ratings = data['Book-Rating'].to_numpy()

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.isbns)

    def __getitem__(self, item):
        """Return the row at position ``item`` as a dict of scalar tensors."""
        return {
            # Integer ids (embedding indices) -> long.
            'isbn': torch.tensor(self.isbns[item], dtype=torch.long),
            'book_title': torch.tensor(self.book_titles[item], dtype=torch.long),
            'book_author': torch.tensor(self.book_authors[item], dtype=torch.long),
            'publisher': torch.tensor(self.publishers[item], dtype=torch.long),
            'user_id': torch.tensor(self.user_ids[item], dtype=torch.long),
            'location': torch.tensor(self.locations[item], dtype=torch.long),
            # Numeric features and the target -> float.
            'year_of_publication': torch.tensor(
                self.year_of_publications[item], dtype=torch.float
            ),
            'age': torch.tensor(self.ages[item], dtype=torch.float),
            'book_rating': torch.tensor(self.book_ratings[item], dtype=torch.float),
        }
# Wrap the merged frame in a Dataset and hold out 20 % of the samples for
# testing; the fixed random_state makes the split reproducible.
dataset = BookDataset(data)
train_data, test_data = train_test_split(
    dataset,
    test_size=0.2,
    random_state=2023,
)
class RecommenderModel(nn.Module):
    """Two-head network over embedded categorical and numeric features.

    Every categorical feature gets its own embedding followed by a
    Linear(embedding_dim, 64) + ReLU projection; the numeric features share
    one Linear(cont_dims, 64) + ReLU projection. The concatenated 64-wide
    blocks feed two heads (``wide_layer`` and ``deep_layer``), each ending in
    a 10-way softmax, and the model returns the mean of the two heads.

    Args:
        cat_dims: Mapping from batch key to number of categories of that
            feature (the embedding table size).
        cont_dims: Number of numeric features. ``forward`` stacks exactly
            three (``user_id``, ``year_of_publication``, ``age``), so this
            must be 3.
        embedding_dim: Width of every embedding vector.
    """

    def __init__(self, cat_dims, cont_dims, embedding_dim):
        super().__init__()

        # Key order fixes which embedding/projection pair serves which
        # batch entry in forward().
        self.cat_names = list(cat_dims)

        # nn.ModuleList registers the sub-layers with the parent module.
        # Keeping them in a plain Python list hides them from parameters(),
        # state_dict() and .to(device), so they would never be trained or
        # saved.
        self.cat_embeddings = nn.ModuleList(
            [nn.Embedding(cat_dims[name], embedding_dim) for name in self.cat_names]
        )

        self.fc_cats = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Linear(embedding_dim, 64),
                    nn.ReLU()
                ) for _ in self.cat_names
            ]
        )

        self.fc_cont = nn.Sequential(
            nn.Linear(cont_dims, 64),
            nn.ReLU()
        )

        # One 64-wide block per categorical feature plus one for the
        # numeric features.
        hidden_in = 64 * (len(self.cat_names) + 1)

        self.wide_layer = nn.Sequential(
            nn.Linear(hidden_in, 1024),
            nn.ReLU(),
            nn.Linear(1024, 10),
            nn.Softmax(dim=1)
        )

        self.deep_layer = nn.Sequential(
            nn.Linear(hidden_in, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 10),
            nn.Softmax(dim=1)
        )

    def forward(self, batch):
        """Score a batch.

        Args:
            batch: Dict of 1-D tensors of equal length holding one integer
                index tensor per key of ``cat_dims`` plus ``user_id``,
                ``year_of_publication`` and ``age``.

        Returns:
            Tensor of shape (batch_size, 10): the element-wise mean of the
            two heads' softmax outputs, so every row sums to 1.
        """
        cat_features = [
            fc(embed(batch[name]))
            for name, embed, fc in zip(self.cat_names, self.cat_embeddings, self.fc_cats)
        ]

        # Stack along dim=1 so row i holds the three features of sample i.
        # (Stacking along dim 0 and reshaping to (batch, 3) would not
        # transpose the data; it would mix values of different samples.)
        # Cast to float first because user_id arrives as an integer tensor.
        cont_features = torch.stack(
            [
                batch['user_id'].float(),
                batch['year_of_publication'].float(),
                batch['age'].float(),
            ],
            dim=1,
        )
        cont_features = self.fc_cont(cont_features)

        concatenated_features = torch.cat([cont_features] + cat_features, dim=1)
        wide_output = self.wide_layer(concatenated_features)
        deep_output = self.deep_layer(concatenated_features)

        # Average over the two heads. Dividing by the batch size instead
        # would make a sample's score depend on how many other samples
        # happen to share its batch.
        item_scores = torch.stack([wide_output, deep_output]).mean(dim=0)

        return item_scores
# Embedding-table size for every label-encoded column: the number of distinct
# codes present in the merged frame (publisher and location get one spare slot).
categorical_dims = {
    'isbn': data['ISBN'].nunique(),
    'book_title': data['Book-Title'].nunique(),
    'book_author': data['Book-Author'].nunique(),
    'publisher': data['Publisher'].nunique() + 1,
    # 'user_id': combined['User-ID'].max() + 1,
    'location': data['Location'].nunique() + 1,
}
# Three numeric inputs, 32-dimensional embeddings.
model = RecommenderModel(
    categorical_dims,
    cont_dims=3,
    embedding_dim=32,
)
# --- Training configuration -------------------------------------------------
num_epochs = 10
validation_split = 0.2
batch_size = 1024  # Adjust the batch size if necessary

# Carve the validation set out of the existing training portion. Splitting
# `dataset` again with the same test_size and random_state would reproduce the
# earlier train/test split exactly, making the validation set identical to the
# held-out test set.
train_data, val_data = train_test_split(
    train_data, test_size=validation_split, random_state=2023
)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
# Evaluation does not depend on sample order, so no shuffling is needed.
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Create the optimizer once and bind the scheduler to that same instance. A
# scheduler attached to a different optimizer object never changes the
# learning rate that is actually applied to the model.
# NOTE(review): no weight decay is configured here; pass `weight_decay=` if
# L2 regularisation is wanted.
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adjust the learning rate
# Multiply the learning rate by 0.1 every 3 epochs.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

criterion = nn.MSELoss()  # Mean Squared Error loss for regression problems

# Train for `num_epochs` epochs, reporting the mean batch loss on the training
# and validation loaders after every epoch.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        # Separate the target from the model inputs. It is kept under its own
        # name so the module-level `ratings` DataFrame is not overwritten.
        targets = batch.pop('book_rating').view(-1, 1)

        optimizer.zero_grad()
        predictions = model(batch)
        # Broadcast the (batch, 1) target to the prediction shape explicitly
        # rather than relying on implicit broadcasting inside the loss.
        # NOTE(review): the model emits 10 softmax scores per sample while the
        # target is one scalar rating; confirm whether the head should instead
        # produce a single regression value.
        loss = criterion(predictions, targets.expand_as(predictions))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

    # Calculate and print the average loss for the epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}] - Training Loss: {avg_loss:.4f}")

    # Validation loop: eval() disables dropout, no_grad() skips autograd.
    model.eval()
    with torch.no_grad():
        total_val_loss = 0.0
        for batch in val_loader:
            targets = batch.pop('book_rating').view(-1, 1)

            predictions = model(batch)
            val_loss = criterion(predictions, targets.expand_as(predictions))
            total_val_loss += val_loss.item()

        avg_val_loss = total_val_loss / len(val_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}] - Validation Loss: {avg_val_loss:.4f}")

# Testing loop: report the mean per-batch MSE on the held-out test split.
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

total_mse = 0.0
model.eval()
# Without no_grad() the predictions carry autograd history and
# Tensor.numpy() refuses to convert them.
with torch.no_grad():
    for batch in test_loader:
        targets = batch.pop('book_rating').view(-1, 1)
        predict = model(batch)
        y_pred = predict.numpy()
        # scikit-learn requires y_true and y_pred to have the same shape, so
        # repeat the scalar rating across the prediction columns (the same
        # broadcasting nn.MSELoss applies to a (batch, 1) target).
        y_true = np.broadcast_to(targets.numpy(), y_pred.shape)
        # `mean_squared_error` lives in the already-imported sklearn.metrics.
        total_mse += metrics.mean_squared_error(y_true, y_pred)

print('Average MSE: ', total_mse / len(test_loader))
