In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
import warnings
warnings.filterwarnings('ignore')

# 1. Data Loading and Initial Exploration

In [57]:
def load_data():
    """
    Load the three dataset files with proper encoding and separator settings, skipping problematic lines.
    """
    # Load datasets with appropriate encoding and skip lines with errors
    users_df = pd.read_csv('Users.csv', encoding='latin-1', sep=';', on_bad_lines='skip')
    books_df = pd.read_csv('Books.csv', encoding='latin-1', sep=';', on_bad_lines='skip')
    ratings_df = pd.read_csv('Ratings.csv', encoding='latin-1', sep=';', on_bad_lines='skip')
    
    # Check and print column names for troubleshooting
    print("Users columns:", users_df.columns)
    print("Books columns:", books_df.columns)
    print("Ratings columns:", ratings_df.columns)
    
    print("Data Loading Complete!")
    print(f"Users shape: {users_df.shape}")
    print(f"Books shape: {books_df.shape}")
    print(f"Ratings shape: {ratings_df.shape}")
    
    return users_df, books_df, ratings_df



def explore_data(users_df, books_df, ratings_df):
    """
    Perform initial data exploration and display key statistics
    """
    ratings_df.head()

    # Ensure columns have expected names
    if 'Book-Rating' not in ratings_df.columns:
        print("Renaming columns for Ratings dataset.")
        ratings_df.columns = ['User-ID', 'ISBN', 'Book-Rating']
    
    # Users analysis
    print("\nUsers Dataset Info:")
    print(users_df.info())
    print("\nMissing values in Users:")
    print(users_df.isnull().sum())
    
    # Books analysis
    print("\nBooks Dataset Info:")
    print(books_df.info())
    print("\nMissing values in Books:")
    print(books_df.isnull().sum())
    
    # Ratings analysis
    print("\nRatings Distribution:")
    plt.figure(figsize=(10, 6))
    sns.histplot(ratings_df['Book-Rating'])
    plt.title('Distribution of Book Ratings')
    plt.show()
    
    return {
        'total_users': users_df['User-ID'].nunique(),
        'total_books': books_df['ISBN'].nunique(),
        'total_ratings': len(ratings_df)
    }


# 2. Data Preprocessing

In [58]:
class DataPreprocessor:
    def __init__(self):
        self.user_encoder = LabelEncoder()
        self.book_encoder = LabelEncoder()
        self.author_encoder = LabelEncoder()
        self.publisher_encoder = LabelEncoder()
        self.rating_scaler = MinMaxScaler()
    
    def preprocess(self, users_df, books_df, ratings_df, min_ratings=10):
        """
        Preprocess the datasets with memory efficiency in mind
        """
        # Filter users and books with minimum ratings to reduce sparsity
        user_counts = ratings_df['User-ID'].value_counts()
        book_counts = ratings_df['ISBN'].value_counts()
        
        valid_users = user_counts[user_counts >= min_ratings].index
        valid_books = book_counts[book_counts >= min_ratings].index
        
        # Filter ratings
        ratings_filtered = ratings_df[
            ratings_df['User-ID'].isin(valid_users) & 
            ratings_df['ISBN'].isin(valid_books)
        ]
        
        # Update users and books dataframes
        users_filtered = users_df[users_df['User-ID'].isin(valid_users)]
        books_filtered = books_df[books_df['ISBN'].isin(valid_books)]
        
        # Encode categorical variables
        ratings_filtered['User-ID'] = self.user_encoder.fit_transform(ratings_filtered['User-ID'])
        ratings_filtered['ISBN'] = self.book_encoder.fit_transform(ratings_filtered['ISBN'])
        
        # Scale ratings to [0,1]
        ratings_filtered['Book-Rating'] = self.rating_scaler.fit_transform(
            ratings_filtered['Book-Rating'].values.reshape(-1, 1)
        )
        
        return ratings_filtered, users_filtered, books_filtered

# 3. Baseline Models

In [59]:
class BaselineModels:
    def __init__(self):
        self.global_mean = None
        self.user_means = None
        self.item_means = None
    
    def train_baseline_models(self, ratings_df):
        """
        Train simple baseline models
        """
        self.global_mean = ratings_df['Book-Rating'].mean()
        self.user_means = ratings_df.groupby('User-ID')['Book-Rating'].mean()
        self.item_means = ratings_df.groupby('ISBN')['Book-Rating'].mean()
    
    def evaluate_baselines(self, test_df):
        """
        Evaluate baseline models using RMSE
        """
        results = {}
        
        # Global mean RMSE
        global_pred = np.full_like(test_df['Book-Rating'], self.global_mean)
        results['Global Mean RMSE'] = np.sqrt(np.mean((test_df['Book-Rating'] - global_pred) ** 2))
        
        # User mean RMSE
        user_pred = test_df['User-ID'].map(self.user_means).fillna(self.global_mean)
        results['User Mean RMSE'] = np.sqrt(np.mean((test_df['Book-Rating'] - user_pred) ** 2))
        
        # Item mean RMSE
        item_pred = test_df['ISBN'].map(self.item_means).fillna(self.global_mean)
        results['Item Mean RMSE'] = np.sqrt(np.mean((test_df['Book-Rating'] - item_pred) ** 2))
        
        return results

# 4. Advanced Model (Neural Collaborative Filtering with Content Features)

In [60]:
class NCFModel:
    def __init__(self, n_users, n_books, embedding_dim=50):
        self.n_users = n_users
        self.n_books = n_books
        self.embedding_dim = embedding_dim
        self.model = self.build_model()
    
    def build_model(self):
        # User input path
        user_input = Input(shape=(1,), name='user_input')
        user_embedding = Embedding(self.n_users, self.embedding_dim, name='user_embedding')(user_input)
        user_vec = Flatten(name='flatten_users')(user_embedding)

        # Book input path
        book_input = Input(shape=(1,), name='book_input')
        book_embedding = Embedding(self.n_books, self.embedding_dim, name='book_embedding')(book_input)
        book_vec = Flatten(name='flatten_books')(book_embedding)

        # Merge features
        concat = Concatenate()([user_vec, book_vec])
        
        # Dense layers
        fc1 = Dense(128, activation='relu')(concat)
        dropout1 = Dropout(0.2)(fc1)
        fc2 = Dense(64, activation='relu')(dropout1)
        dropout2 = Dropout(0.2)(fc2)
        output = Dense(1, activation='sigmoid')(dropout2)

        model = Model(inputs=[user_input, book_input], outputs=output)
        model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
        
        return model
    
    def train(self, train_data, validation_data, epochs=10, batch_size=256):
        history = self.model.fit(
            x=[train_data['User-ID'], train_data['ISBN']],
            y=train_data['Book-Rating'],
            validation_data=(
                [validation_data['User-ID'], validation_data['ISBN']],
                validation_data['Book-Rating']
            ),
            epochs=epochs,
            batch_size=batch_size,
            verbose=1
        )
        return history

# 5. Evaluation and Visualization

In [61]:
def plot_training_history(history):
    """
    Plot training metrics
    """
    plt.figure(figsize=(12, 4))
    
    # Loss plot
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    
    # MAE plot
    plt.subplot(1, 2, 2)
    plt.plot(history.history['mae'], label='Training MAE')
    plt.plot(history.history['val_mae'], label='Validation MAE')
    plt.title('Model MAE')
    plt.xlabel('Epoch')
    plt.ylabel('MAE')
    plt.legend()
    
    plt.tight_layout()
    plt.show()

# Main execution flow

In [None]:
users_df, books_df, ratings_df = load_data()
ratings_df.head()
explore_data(users_df, books_df, ratings_df)

Users columns: Index(['User-ID,Location,Age'], dtype='object')
Books columns: Index(['ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L'], dtype='object')
Ratings columns: Index(['User-ID,ISBN,Book-Rating'], dtype='object')
Data Loading Complete!
Users shape: (278698, 1)
Books shape: (249936, 1)
Ratings shape: (1149780, 1)
Renaming columns for Ratings dataset.


ValueError: Length mismatch: Expected axis has 1 elements, new values have 3 elements

# Initialize the preprocessor and preprocess data

In [None]:
preprocessor = DataPreprocessor()
train_df, users_df, books_df = preprocessor.preprocess(users_df, books_df, ratings_df)

# Split into train and test datasets

In [None]:
train_df, test_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Initialize and train baseline models

In [None]:

baseline_models = BaselineModels()
baseline_models.train_baseline_models(train_df)
baseline_results = baseline_models.evaluate_baselines(test_df)
print("Baseline Model Results:", baseline_results)

# Define NCF model parameters

In [None]:
n_users = train_df['User-ID'].nunique()
n_books = train_df['ISBN'].nunique()

# Train and evaluate the NCF model

In [None]:
ncf_model = NCFModel(n_users, n_books)
history = ncf_model.train(train_data=train_df, validation_data=test_df)

# Plot the training history

In [None]:
plot_training_history(history)

# Evaluate NCF model performance

In [None]:
test_predictions = ncf_model.model.predict([test_df['User-ID'], test_df['ISBN']])
test_predictions = preprocessor.rating_scaler.inverse_transform(test_predictions)
actual_ratings = preprocessor.rating_scaler.inverse_transform(test_df['Book-Rating'].values.reshape(-1, 1))
rmse = np.sqrt(np.mean((test_predictions - actual_ratings) ** 2))
print("NCF Model RMSE on Test Set:", rmse)