## <font color='purple'> Data 255 - Lab 1 - Part 1 Contd...

<font color='purple'> Part 1: Deep Learning-Based Recommendation (10 Points)

<font color='purple'> Read the paper Wide and Deep Learning for Recommender Systems.

<font color='purple'> Download the files anime-dataset-2023.csv, users-details-2023.csv, users-score- 2023.csv from the following link: https://www.kaggle.com/datasets/dbdmobile/myanimelist-dataset

<font color='purple'> Based on the architecture described in the paper, build your own Wide and Deep Recommender system for the Anime Dataset. Your model should learn the features of each user and anime, not just the associated ID numbers. Utilize an 80/20 train-test split and record your model’s prediction accuracy.

### <font color='blue'> Step 3: Training

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

<font color='blue'> Load the saved Data files for final training

In [2]:
data_sampled = pd.read_pickle('data_sampled.pkl')

In [3]:
data_sampled.shape

(11901624, 27)

In [4]:
data_sampled.columns

Index(['user_id', 'anime_id', 'Anime Title', 'rating', 'Username', 'Gender',
       'Days Watched', 'Mean Score', 'Completed', 'Total Entries', 'Name',
       'Score', 'Genres', 'Type', 'Episodes', 'Status', 'Producers', 'Studios',
       'Source', 'Rating', 'Rank', 'Popularity', 'Favorites', 'Air Start',
       'Air End', 'Duration (min)', 'Type_Gender'],
      dtype='object')

In [5]:
data_sampled.dtypes

user_id              int64
anime_id             int64
Anime Title         object
rating               int64
Username            object
Gender              object
Days Watched       float64
Mean Score         float64
Completed          float64
Total Entries      float64
Name                object
Score              float64
Genres              object
Type                object
Episodes           float64
Status              object
Producers           object
Studios             object
Source              object
Rating              object
Rank               float64
Popularity           int64
Favorites            int64
Air Start          float64
Air End            float64
Duration (min)     float64
Type_Gender       category
dtype: object

<font color='blue'> Drop the column used for sampling

In [6]:
data_sampled.drop(columns = ['Type_Gender'], inplace = True)

<font color='blue'> Setup the Wide and Deep features for training \
    \
    Wide Features - Categorical Features \
    Deep Features - Continous Features \
    \
    Also, perform encoding for wide features

In [7]:
def process_features(df):
    wide_features = ['user_id', 'anime_id', 'Gender', 'Type', 'Status', 'Producers', 
                     'Studios', 'Source', 'Rating']
    
    deep_features = ['Days Watched', 'Mean Score', 'Completed', 'Total Entries', 'Genres', 
                     'Episodes', 'Rank', 'Popularity', 'Favorites', 'Air Start', 'Air End', 'Duration (min)']
    
    encoders = {}
    for feature in wide_features:
        le = LabelEncoder()
        df[f'{feature}_encoded'] = le.fit_transform(df[feature].fillna('Unknown'))
        encoders[feature] = le
    
    scaler = StandardScaler()
    df[deep_features] = df[deep_features].fillna(df[deep_features].mean())
    scaled_features = scaler.fit_transform(df[deep_features])
    for i, feature in enumerate(deep_features):
        df[f'{feature}_scaled'] = scaled_features[:, i]
    
    return df, wide_features, deep_features, encoders

<font color='blue'> Setup Dataset Class to be used for preparing the training dataset

In [8]:
class AnimeDataset(Dataset):
    def __init__(self, df, wide_features, deep_features):
        self.wide_features = torch.tensor(df[[f'{f}_encoded' for f in wide_features]].values)
        self.deep_features = torch.tensor(df[[f'{f}_scaled' for f in deep_features]].values, dtype=torch.float32)
        self.labels = torch.tensor(df['Score'].values, dtype=torch.float32)
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'wide': self.wide_features[idx],
            'deep': self.deep_features[idx],
            'label': self.labels[idx]
        }

<font color='blue'> Setup the Model Architecture as a Class

In [9]:
class WideAndDeepModel(nn.Module):
    def __init__(self, wide_dim, deep_dim, wide_embeddings_dim, deep_hidden_units):
        super(WideAndDeepModel, self).__init__()
        
        self.wide_embeddings = nn.ModuleList([
            nn.Embedding(size, wide_embeddings_dim) 
            for size in wide_dim
        ])
        
        wide_output_dim = len(wide_dim) * wide_embeddings_dim
        
        deep_layers = []
        input_dim = deep_dim
        
        for units in deep_hidden_units:
            deep_layers.append(nn.Linear(input_dim, units))
            deep_layers.append(nn.ReLU())
            deep_layers.append(nn.Dropout(0.2))
            input_dim = units
        
        self.deep_layers = nn.Sequential(*deep_layers)
        
        combined_dim = wide_output_dim + deep_hidden_units[-1]
        self.final_layer = nn.Linear(combined_dim, 1)
        
    def forward(self, wide_input, deep_input):
        wide_embeddings = [emb(wide_input[:, i]) for i, emb in enumerate(self.wide_embeddings)]
        wide_concat = torch.cat(wide_embeddings, dim=1)
        
        deep_output = self.deep_layers(deep_input)
        
        combined = torch.cat([wide_concat, deep_output], dim=1)
        prediction = self.final_layer(combined)
        
        return prediction.squeeze()

<font color='blue'> Setup the Train Model Function

In [10]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device):
    model = model.to(device)
    best_val_loss = float('inf')
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            wide_features = batch['wide'].to(device)
            deep_features = batch['deep'].to(device)
            labels = batch['label'].to(device)
            
            optimizer.zero_grad()
            outputs = model(wide_features, deep_features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        model.eval()
        val_loss = 0
        predictions = []
        true_labels = []
        with torch.no_grad():
            for batch in val_loader:
                wide_features = batch['wide'].to(device)
                deep_features = batch['deep'].to(device)
                labels = batch['label'].to(device)
                
                outputs = model(wide_features, deep_features)
                val_loss += criterion(outputs, labels).item()
                
                predictions.extend(outputs.cpu().numpy())
                true_labels.extend(labels.cpu().numpy())
        
        val_rmse = np.sqrt(np.mean((np.array(predictions) - np.array(true_labels)) ** 2))
        
        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'Training Loss: {total_loss/len(train_loader):.4f}')
        print(f'Validation Loss: {val_loss/len(val_loader):.4f}')
        print(f'Validation RMSE: {val_rmse:.4f}')
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pth')

<font color='blue'> Setup Random Seed and Device for Training

In [11]:
torch.manual_seed(42)
np.random.seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

df = data_sampled

Using device: cpu


<font color='blue'> Process the dataset and generate the wide and deep features

In [None]:
processed_df, wide_features, deep_features, encoders = process_features(df)

<font color='blue'> Split the dataset into Trainm Test and Validation \
    <b> Train Size: 70% \
    Test Size: 15% \
    Val Size: 15%</b>

In [None]:
train_df, remaining_df = train_test_split(processed_df, test_size=0.3, random_state=42)

val_df, test_df = train_test_split(remaining_df, test_size=0.5, random_state=42)

<font color='blue'> Setup the Datasets and Create Dataloaders with batch Size as 256

In [None]:
train_dataset = AnimeDataset(train_df, wide_features, deep_features)
val_dataset = AnimeDataset(val_df, wide_features, deep_features)
test_dataset = AnimeDataset(test_df, wide_features, deep_features)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256)
test_loader = DataLoader(test_dataset, batch_size=256)

<font color='blue'> Initialize the Model

In [11]:
wide_dim = [len(encoders[f].classes_) for f in wide_features]
deep_dim = len(deep_features)
model = WideAndDeepModel(
    wide_dim=wide_dim,
    deep_dim=deep_dim,
    wide_embeddings_dim=16,
    deep_hidden_units=[128, 64, 32]
)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

<font color='blue'> Train the Model

In [12]:
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, device=device)

Epoch 1/10
Training Loss: 0.0882
Validation Loss: 0.0075
Validation RMSE: 0.0868
Epoch 2/10
Training Loss: 0.0156
Validation Loss: 0.0074
Validation RMSE: 0.0861
Epoch 3/10
Training Loss: 0.0152
Validation Loss: 0.0060
Validation RMSE: 0.0772
Epoch 4/10
Training Loss: 0.0149
Validation Loss: 0.0075
Validation RMSE: 0.0865
Epoch 5/10
Training Loss: 0.0148
Validation Loss: 0.0062
Validation RMSE: 0.0787
Epoch 6/10
Training Loss: 0.0147
Validation Loss: 0.0057
Validation RMSE: 0.0753
Epoch 7/10
Training Loss: 0.0146
Validation Loss: 0.0052
Validation RMSE: 0.0720
Epoch 8/10
Training Loss: 0.0146
Validation Loss: 0.0052
Validation RMSE: 0.0725
Epoch 9/10
Training Loss: 0.0144
Validation Loss: 0.0050
Validation RMSE: 0.0709
Epoch 10/10
Training Loss: 0.0142
Validation Loss: 0.0053
Validation RMSE: 0.0729


<font color='blue'> The training loss consistently decreases till the last epochs, indicating that the model is learning effectively from the training data. The RMSE values show a trend of decreasing until epoch 9, which indicates improving performance in terms of prediction accuracy on the validation set. There is minor increase with Epoch 10 for Validation RMSE, this could be a sign of Overfitting, however, without further epochs it cannot be established. \
Also, The model appears to be relatively stable, as the training loss values are relatively small and show a gradual decrease.

<font color='blue'> Generating Metrics for the Validation dataset

In [13]:
def analyze_predictions(model, val_loader, val_df, device):
    model.eval()
    predictions = []
    true_labels = []
    test_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            wide_features = batch['wide'].to(device)
            deep_features = batch['deep'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(wide_features, deep_features)
            test_loss += criterion(outputs, labels).item()
            
            predictions.extend(outputs.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    test_rmse = np.sqrt(np.mean((np.array(predictions) - np.array(true_labels)) ** 2))
    print(f'Validation Loss: {test_loss/len(test_loader):.4f}')
    print(f'Test RMSE: {test_rmse:.4f}')
    
    predictions = np.array(predictions)
    true_labels = np.array(true_labels)
    
    errors = np.abs(predictions - true_labels)
    
    val_df['predicted_score'] = predictions
    val_df['prediction_error'] = errors
    
    print("\nAnalysis by Genre:")
    genre_errors = val_df.groupby('Genres')['prediction_error'].agg(['mean', 'count']).sort_values('mean', ascending=False)
    print(genre_errors.head())
    
    print("\nAnalysis by Type:")
    type_errors = val_df.groupby('Type')['prediction_error'].agg(['mean', 'count']).sort_values('mean', ascending=False)
    print(type_errors)
    
    print("\nWorst Predictions:")
    worst_predictions = val_df.nlargest(5, 'prediction_error')[['Anime Title', 'Score', 'predicted_score', 'prediction_error']]
    print(worst_predictions)
    
    return val_df

In [14]:
val_df_with_predictions = analyze_predictions(model, val_loader, val_df, device)

Validation Loss: 0.0053
Test RMSE: 0.0729

Analysis by Genre:
            mean  count
Genres                 
532     1.361262      1
538     0.905021     81
515     0.817674     49
907     0.686859    574
506     0.582172    414

Analysis by Type:
          mean    count
Type                   
1     0.344736     6180
3     0.071914   170510
2     0.069820    29616
4     0.038799   108937
0     0.037054   245114
5     0.029681  1224887

Worst Predictions:
           Anime Title  Score  predicted_score  prediction_error
23646117  Tsui no Sora   2.22         6.118314          3.898314
23646180  Tsui no Sora   2.22         6.117153          3.897153
23646122  Tsui no Sora   2.22         6.113920          3.893920
23646173  Tsui no Sora   2.22         6.109069          3.889069
23646171  Tsui no Sora   2.22         6.105966          3.885966


<font color='blue'> Evaluation \
<b>Validation Loss:</b>Validation loss of 0.0053 is low which indicating that the model has trained effectively and is able to generalize well on the validation data. \
<b>Test RMSE:</b> Test RMSE of 0.0729 suggests that the predictions are close to the actual values on the val dataset. Given that this RMSE is reasonably low, it indicates that the model's predictions are generally accurate.

In [23]:
test_df['user_id'].unique

<bound method Series.unique of 22457891     111125
4987336     1158121
15293953     919863
14926919     455244
9502097      496828
             ...   
8153754      422506
22579532     366396
22380835     331549
18118120     396311
5043314      381646
Name: user_id, Length: 1785244, dtype: int64>

<font color='blue'> Display Dataframe for one User ID

In [24]:
display(test_df.query(f"user_id == {366396}")[['user_id', 'Username','anime_id', 'Anime Title', 'Genres', 'Mean Score', 'Score']])

Unnamed: 0,user_id,Username,anime_id,Anime Title,Genres,Mean Score,Score
16317179,366396,shinji144,49438,Isekai Yakkyoku,850,7.71,7.32
16355083,366396,shinji144,48761,Saihate no Paladin,80,7.71,6.84
14539718,366396,shinji144,23283,Zankyou no Terror,929,7.71,8.1
9071784,366396,shinji144,7088,Ichiban Ushiro no Daimaou,176,7.71,6.74
19667017,366396,shinji144,49891,Tensei shitara Ken deshita,270,7.71,7.55
10006093,366396,shinji144,8937,Toaru Majutsu no Index II,290,7.71,7.53
3951342,366396,shinji144,523,Tonari no Totoro,371,7.71,8.25
14327608,366396,shinji144,21561,Ryuugajou Nanana no Maizoukin,413,7.71,7.13
9969642,366396,shinji144,16049,Toaru Kagaku no Railgun S,290,7.71,8.02
18469208,366396,shinji144,32105,Sousei no Onmyouji,286,7.71,7.3


<font color='blue'> Generate Recommendations for a Single User to see how the model recommends Animes

In [25]:
def get_recommendations(user_id, top_n=5, test_df=test_df, wide_features=wide_features, deep_features=deep_features, model=model):
    user_data = test_df[test_df['user_id'] == user_id]
    if user_data.empty:
        return "User not found"
    
    # Get all animes not rated by the user
    unrated_animes = test_df[~test_df['anime_id'].isin(user_data['anime_id'])]
    
    # Create a dataset for unrated animes
    unrated_dataset = AnimeDataset(unrated_animes, wide_features, deep_features)
    unrated_loader = DataLoader(unrated_dataset, batch_size=256)
    
    # Make predictions
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in unrated_loader:
            wide_input = batch['wide']
            deep_input = batch['deep']
            outputs = model(wide_input, deep_input)
            predictions.extend(outputs.squeeze().tolist())
    
    # Get top N recommendations
    unrated_animes['predicted_score'] = predictions
    top_recommendations = unrated_animes.nlargest(top_n, 'predicted_score')
    
    return top_recommendations[['anime_id', 'Anime Title', 'Genres', 'Mean Score', 'Score', 'predicted_score']]

# Example usage
recommendations = get_recommendations(user_id=366396, top_n=5)
display(recommendations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unrated_animes['predicted_score'] = predictions


Unnamed: 0,anime_id,Anime Title,Genres,Mean Score,Score,predicted_score
652279,1,Cowboy Bebop,143,8.3,8.75,9.243464
8138439,9253,Steins;Gate,838,7.5,9.07,9.221704
23800621,51836,Douluo Dalu II: Jueshi Tangmen,80,7.79,6.38089,9.220238
17270587,820,Ginga Eiyuu Densetsu,833,8.41,9.02,9.158541
17270730,820,Ginga Eiyuu Densetsu,833,7.91,9.02,9.15325


<font color='blue'> Based on the Original ratings given by the user. Our model recommended new Animes that the user can review and watch.

## <font color='blue'> Thank You