In [49]:
from scripts.process_data import ProcessData
processor = ProcessData()
final_dataframe = processor.get_final_dataframe() 

DataFrame saved to data/processed/book_reference_dataframe.pkl


In [50]:
import os
import urllib
import zipfile
import time
import pickle 
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

def map_columns_to_int(df: pd.DataFrame, save_column_name: list, mapping_dir: str = None):
    """
    Map columns of strings in a DataFrame to integers.

    Parameters:
        df (pd.DataFrame): DataFrame containing the columns to be mapped.
        column_names (list): List of column names to be mapped.
        mapping_dir (str): Directory path to save the mapping dictionaries. If None, mappings won't be saved.

    Returns:
        pd.DataFrame: DataFrame with the mapped columns.
    """
    for column_name in df.columns:
        # Create a mapping dictionary that assigns a unique integer to each unique string in the column
        unique_values = df[column_name].unique()
        mapping_dict = {value: idx for idx, value in enumerate(unique_values)}

        # Replace the strings in the column with their corresponding integer values
        df[f"{column_name}"] = df[column_name].map(mapping_dict)

        # Save the mapping dictionary to a file if mapping_dir is provided
        if column_name == save_column_name:
            mapping_file_path = f"{mapping_dir}{column_name}_mapping.pkl"
            with open(mapping_file_path, 'wb') as f:
                pickle.dump(mapping_dict, f)
            print(f"Mapping for column '{column_name}' saved to {mapping_file_path}")

    return df


In [51]:
# Call the function to map the 'Fruit' column and save the mapping
mapping_file_path = 'fruit_mapping.pkl'
df_mapped = map_columns_to_int(final_dataframe, save_column_name='isbn_gr', mapping_dir='data/processed/')

Mapping for column 'isbn_gr' saved to data/processed/isbn_gr_mapping.pkl


In [52]:
df_mapped.rename(columns={'Book-Rating_bx':'rating_bx'}, inplace=True)
df_mapped.head()

Unnamed: 0,user_id_gr,rating_gr,books_count_gr,isbn_gr,average_rating_gr,ratings_count_gr,work_ratings_count_gr,work_text_reviews_count_gr,ratings_1_gr,ratings_2_gr,ratings_3_gr,ratings_4_gr,ratings_5_gr,rating_bx
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17750,0,0,1,1,1,1,1,1,1,1,1,1,1,1
25293,0,1,2,2,2,2,2,2,2,2,2,2,2,2
29017,0,0,3,3,3,3,3,3,3,3,3,3,3,3
38052,0,0,4,4,4,4,4,4,4,4,4,4,4,4


In [59]:
y.dtypes

rating_gr    int64
dtype: object

In [62]:
X = df_mapped.loc[:, ['user_id_gr', 'isbn_gr','rating_bx']]
y = df_mapped.loc[:, ['rating_gr']]
X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=0, test_size=0.2)

In [63]:
def prep_dataloaders(X_train,y_train,X_val,y_val,batch_size):
    # Convert training and test data to TensorDatasets
    trainset = TensorDataset(torch.from_numpy(np.array(X_train)).long(), 
                            torch.from_numpy(np.array(y_train)).float())
    valset = TensorDataset(torch.from_numpy(np.array(X_val)).long(), 
                            torch.from_numpy(np.array(y_val)).float())

    # Create Dataloaders for our training and test data to allow us to iterate over minibatches 
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
    valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

    return trainloader, valloader

batchsize = 128
trainloader,valloader = prep_dataloaders(X_train,y_train,X_val,y_val,batchsize)

In [64]:
class NNHybridFiltering(nn.Module):
    
    def __init__(self, n_users, n_isbn, n_bxrating, embdim_users, embdim_isbn, embdim_bxrating, n_activations, rating_range):
        super().__init__()
        self.user_embeddings = nn.Embedding(num_embeddings=n_users,embedding_dim=embdim_users)
        self.item_embeddings = nn.Embedding(num_embeddings=n_isbn,embedding_dim=embdim_isbn)
        self.genre_embeddings = nn.Embedding(num_embeddings=n_bxrating,embedding_dim=embdim_bxrating)
        self.fc1 = nn.Linear(embdim_users+embdim_isbn+embdim_bxrating,n_activations)
        self.fc2 = nn.Linear(n_activations,1)
        self.rating_range = rating_range

    def forward(self, X):
        # Get embeddings for minibatch
        embedded_users = self.user_embeddings(X[:,0])
        embedded_isbn = self.item_embeddings(X[:,1])
        embedded_bxrating = self.genre_embeddings(X[:,2])
        # Concatenate user, item and genre embeddings
        embeddings = torch.cat([embedded_users,embedded_isbn,embedded_bxrating],dim=1)
        # Pass embeddings through network
        preds = self.fc1(embeddings)
        preds = F.relu(preds)
        preds = self.fc2(preds)
        # Scale predicted ratings to target-range [low,high]
        preds = torch.sigmoid(preds) * (self.rating_range[1]-self.rating_range[0]) + self.rating_range[0]
        return preds
        

In [67]:
def train_model(model, criterion, optimizer, dataloaders, device, num_epochs=5, scheduler=None):
    model = model.to(device) # Send model to GPU if available
    since = time.time()

    costpaths = {'train':[],'val':[]}

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0

            # Get the inputs and labels, and send to GPU if available
            for (inputs,labels) in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                labels = labels.squeeze(1)
                # Zero the weight gradients
                optimizer.zero_grad()

                # Forward pass to get outputs and calculate loss
                # Track gradient only for training data
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model.forward(inputs).view(-1)
                    loss = criterion(outputs, labels)

                    # Backpropagation to get the gradients with respect to each weight
                    # Only if in train
                    if phase == 'train':
                        loss.backward()
                        # Update the weights
                        optimizer.step()

                # Convert loss into a scalar and add it to running_loss
                running_loss += np.sqrt(loss.item()) * labels.size(0)

            # Step along learning rate scheduler when in train
            if (phase == 'train') and (scheduler is not None):
                scheduler.step()

            # Calculate and display average loss and accuracy for the epoch
            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            costpaths[phase].append(epoch_loss)
            print('{} loss: {:.4f}'.format(phase, epoch_loss))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    return costpaths

In [68]:
# Train the model
dataloaders = {'train':trainloader, 'val':valloader}

n_users = X.loc[:,'user_id_gr'].max()+1
n_isbn = X.loc[:,'isbn_gr'].max()+1
n_bxrating = X.loc[:,'rating_bx'].max()+1

model = NNHybridFiltering(n_users,
                       n_isbn,
                       n_bxrating,
                       embdim_users=50, 
                       embdim_isbn=50, 
                       embdim_bxrating=25,
                       n_activations = 100,
                       rating_range=[0.,4.])
criterion = nn.MSELoss()
lr=0.001
n_epochs=10
wd=1e-3
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cost_paths = train_model(model,criterion,optimizer,dataloaders, device,n_epochs, scheduler=None)

Epoch 0/9
----------
train loss: 1.1392
val loss: 1.1352
Epoch 1/9
----------
train loss: 1.1328
val loss: 1.1326
Epoch 2/9
----------
train loss: 1.1188
val loss: 1.1064
Epoch 3/9
----------
train loss: 1.0769
val loss: 1.0913
Epoch 4/9
----------
train loss: 1.0350
val loss: 1.0913
Epoch 5/9
----------
train loss: 1.0121
val loss: 1.0936
Epoch 6/9
----------


KeyboardInterrupt: 