In [1]:
import json
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

In [2]:
# Read the first 999,999 rows of the tab-separated review file, skipping
# malformed lines, then list the columns that are available.
df = pd.read_csv(
    "amazon_reviews_us_Electronics_v1_00.tsv",
    sep="\t",
    nrows=999999,
    on_bad_lines="skip",
)
df.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date'],
      dtype='object')

In [None]:
class RatingDataset(Dataset):
    """Map-style dataset serving ``(user, item, rating)`` triples as tensors.

    ``users`` and ``items`` are stored as ``torch.long`` tensors and
    ``ratings`` as a ``torch.float`` tensor.
    """

    def __init__(self, users, items, ratings):
        index_dtype, rating_dtype = torch.long, torch.float
        self.users = torch.tensor(users, dtype=index_dtype)
        self.items = torch.tensor(items, dtype=index_dtype)
        self.ratings = torch.tensor(ratings, dtype=rating_dtype)

    def __len__(self):
        # The user column defines how many samples the dataset holds.
        return len(self.users)

    def __getitem__(self, idx):
        """Return the ``(user, item, rating)`` entries stored at ``idx``."""
        columns = (self.users, self.items, self.ratings)
        return tuple(column[idx] for column in columns)

In [None]:
def prepare_data_MF(file_path, batch_size=64, min_customer_reviews=5,
                    min_product_reviews=10, num_workers=4):
    """Build the train/test data loaders for the matrix-factorization model.

    Args:
        file_path: Path of the tab-separated review file. Only the first
            999,999 rows are read and malformed lines are skipped.
        batch_size: Number of samples per batch for both loaders.
        min_customer_reviews: Customers with fewer reviews than this are dropped.
        min_product_reviews: Products with fewer reviews than this are dropped.
        num_workers: Worker processes used by each ``DataLoader``.

    Returns:
        ``(train_loader, test_loader, customer_idx, product_idx)`` where
        ``customer_idx`` maps ``customer_id`` -> ``user`` and ``product_idx``
        maps ``product_id`` -> ``item`` (contiguous integer indices).
    """
    df = pd.read_csv(file_path, delimiter='\t', nrows=999999, on_bad_lines='skip')
    df = df[['customer_id', 'product_id', 'star_rating']]

    # Keep only customers with at least `min_customer_reviews` reviews and
    # products with at least `min_product_reviews` reviews. Both counts are
    # taken on the unfiltered frame.
    customer_counts = df['customer_id'].value_counts()
    product_counts = df['product_id'].value_counts()
    selected_customers = customer_counts[customer_counts >= min_customer_reviews].index
    selected_products = product_counts[product_counts >= min_product_reviews].index

    keep = df['product_id'].isin(selected_products) & df['customer_id'].isin(selected_customers)
    filtered_df = df[keep]

    # Re-count on the filtered frame only to enumerate the surviving ids and
    # assign each one a contiguous integer index.
    customers = filtered_df['customer_id'].value_counts()
    products = filtered_df['product_id'].value_counts()
    customer_idx = pd.DataFrame({'customer_id': customers.index, 'user': np.arange(customers.shape[0])})
    product_idx = pd.DataFrame({'product_id': products.index, 'item': np.arange(products.shape[0])})

    filtered_df = (
        filtered_df
        .merge(customer_idx, on='customer_id')
        .merge(product_idx, on='product_id')
    )

    # NOTE(review): train_test_split is presumably
    # sklearn.model_selection.train_test_split; it is not imported in the
    # visible cells and must be in scope before this function is called.
    train_df, test_df = train_test_split(filtered_df, test_size=0.2, random_state=0)

    # PyTorch data loaders
    train_dataset = RatingDataset(train_df['user'].values, train_df['item'].values, train_df['star_rating'].values)
    test_dataset = RatingDataset(test_df['user'].values, test_df['item'].values, test_df['star_rating'].values)

    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size,
                              num_workers=num_workers, drop_last=True)
    # Evaluation must see every held-out sample exactly once, so the test
    # loader neither shuffles nor drops the final partial batch.
    test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size,
                             num_workers=num_workers, drop_last=False)

    return train_loader, test_loader, customer_idx, product_idx
    
    

    
    




In [None]:
# TODO: prepare data for embedding-based similarity (two-tower model)
class prepare_data_twotower(Dataset):
    """Placeholder dataset for the two-tower model; not implemented yet.

    The class previously had no body, which made the cell a syntax error.
    It now parses and fails loudly if someone tries to use it.
    """

    def __init__(self, *args, **kwargs):
        raise NotImplementedError(
            "prepare_data_twotower is a stub: two-tower data preparation is not implemented yet"
        )


In [None]:
# Matrix-factorization collaborative filtering
class MFModel(nn.Module):
    """Scores (user, item) pairs with a dot product of learned representations.

    Each id is looked up in its embedding table, passed through a linear
    layer, ReLU and dropout; the two resulting vectors are multiplied
    element-wise and summed over dimension 1.
    """

    def __init__(self, max_users, max_items, emb_dim, dropout_rate=0.5):
        super().__init__()
        self.max_users, self.max_items = max_users, max_items
        self.dropout_rate = dropout_rate
        self.emb_dim = emb_dim

        # Layers are created in a fixed order (embeddings, dropouts, dense)
        # so that seeded initialisation stays reproducible.
        self.user_embeddings = nn.Embedding(max_users, emb_dim)
        self.item_embeddings = nn.Embedding(max_items, emb_dim)

        self.dropout_user = nn.Dropout(dropout_rate)
        self.dropout_item = nn.Dropout(dropout_rate)

        self.dense_user = nn.Linear(emb_dim, emb_dim)
        self.dense_item = nn.Linear(emb_dim, emb_dim)

    @staticmethod
    def _encode(ids, embedding, dense, dropout):
        """Run one side of the model: embedding -> linear -> ReLU -> dropout."""
        return dropout(torch.relu(dense(embedding(ids))))

    def forward(self, users, items):
        """Return one predicted score per (user, item) pair."""
        # The user side is evaluated before the item side.
        user_vec = self._encode(users, self.user_embeddings, self.dense_user, self.dropout_user)
        item_vec = self._encode(items, self.item_embeddings, self.dense_item, self.dropout_item)
        return (user_vec * item_vec).sum(dim=1)



            
        

In [None]:
# Two-tower NN - embedding-based similarity
class TwoTowerModel(nn.Module):
    """Two-tower network combining id embeddings with dense features.

    Each tower concatenates the embedding of a categorical id with a small
    dense projection of its numeric features and feeds the result through
    ``output_layer``.

    Args:
        num_media_items: Number of rows in the item embedding table.
        num_users: Number of rows in the user embedding table.
        embedding_size: Width of both embedding tables.
        item_dense_dim: Number of dense item features (default 3).
        user_dense_dim: Number of dense user features (default 2).
        dense_hidden_dim: Output width of each dense projection (default 10).
    """

    def __init__(self, num_media_items, num_users, embedding_size,
                 item_dense_dim=3, user_dense_dim=2, dense_hidden_dim=10):
        super(TwoTowerModel, self).__init__()

        # Embedding layers for item and user categorical features.
        # The item table is sized by `num_media_items`; the previous code
        # referenced an undefined `num_items`.
        self.item_sparse_embedding = nn.Embedding(num_embeddings=num_media_items, embedding_dim=embedding_size)
        self.user_sparse_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_size)

        # Fully connected layers for item and user dense features
        self.item_dense_layers = nn.Sequential(
            nn.Linear(in_features=item_dense_dim, out_features=dense_hidden_dim),
            nn.ReLU()
        )

        self.user_dense_layers = nn.Sequential(
            nn.Linear(in_features=user_dense_dim, out_features=dense_hidden_dim),
            nn.ReLU()
        )

        # Output layer. Its input is the concatenation built in `forward`
        # (embedding + dense projection), so its width must follow
        # `embedding_size`; a fixed 20 only worked for embedding_size == 10.
        # NOTE(review): both towers share this one layer, as before - confirm
        # that sharing is intended.
        self.output_layer = nn.Linear(in_features=embedding_size + dense_hidden_dim, out_features=1)

    def forward(self, item_sparse, item_dense, user_sparse, user_dense):
        """Return ``(item_output, user_output)``, one value per row for each tower."""
        # Embed sparse features
        item_sparse_embedded = self.item_sparse_embedding(item_sparse)
        user_sparse_embedded = self.user_sparse_embedding(user_sparse)

        # Apply dense layers to dense features
        item_dense_processed = self.item_dense_layers(item_dense)
        user_dense_processed = self.user_dense_layers(user_dense)

        # Concatenate embedded and processed features along dimension 1
        item_combined_features = torch.cat([item_sparse_embedded, item_dense_processed], dim=1)
        user_combined_features = torch.cat([user_sparse_embedded, user_dense_processed], dim=1)

        # Apply the output layer
        item_output = self.output_layer(item_combined_features)
        user_output = self.output_layer(user_combined_features)

        return item_output, user_output


In [None]:
def train(model, train_loader, test_loader, file_path, n_epochs, optimizer=None, lr=1e-3):
    """Train ``model`` with an MSE objective and report the loss after each epoch.

    Args:
        model: Module called as ``model(users, items)``.
        train_loader: Iterable yielding ``(users, items, ratings)`` batches.
        test_loader: Iterable of the same shape, used for the per-epoch report.
        file_path: Unused; kept so existing callers keep working.
        n_epochs: Number of passes over ``train_loader``.
        optimizer: Optional optimizer over ``model.parameters()``. When omitted,
            an Adam optimizer with learning rate ``lr`` is created.
        lr: Learning rate for the default optimizer.

    Returns:
        The trained model, after all ``n_epochs`` epochs.
    """
    # The device must be resolved before the model is moved onto it.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    for epoch in range(n_epochs):
        # Re-enable training mode every epoch; evaluation below switches it off.
        model.train()

        for users, items, ratings in train_loader:
            # transfer data to the training device
            users, items, ratings = users.to(device), items.to(device), ratings.to(device)

            # forward pass
            outputs = model(users, items)
            loss = criterion(outputs, ratings)

            # backward pass
            optimizer.zero_grad()
            loss.backward()

            # update parameters
            optimizer.step()

        # evaluate
        model.eval()
        train_loss = evaluate(model, train_loader)
        test_loss = evaluate(model, test_loader)
        print(f"Epoch {epoch + 1}/{n_epochs}, Train MSE: {train_loss}, Test MSE: {test_loss}")

    # Return only after the last epoch; returning inside the loop stopped
    # training after the first one.
    return model

            

In [None]:
def evaluate(model, data_loader, device=None):
    """Return the mean squared error of ``model`` over ``data_loader``.

    Args:
        model: Module called as ``model(users, items)``.
        data_loader: Iterable yielding ``(users, items, ratings)`` batches.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter, or CPU for a parameter-free model.

    Returns:
        The squared error averaged over every rating seen, so batches of
        unequal size are weighted correctly. ``float('nan')`` is returned when
        the loader yields no samples.
    """
    model.eval()

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # Sum the squared errors and divide once at the end; averaging per-batch
    # means would over-weight a smaller final batch.
    criterion = nn.MSELoss(reduction="sum")
    squared_error = 0.0
    n_ratings = 0

    with torch.no_grad():
        for users, items, ratings in data_loader:
            users, items, ratings = users.to(device), items.to(device), ratings.to(device)
            y_pred = model(users, items)
            squared_error += criterion(y_pred, ratings).item()
            n_ratings += ratings.numel()

    if n_ratings == 0:
        return float("nan")
    return squared_error / n_ratings
    

In [None]:
def save(model, model_dir, customer_idx, product_idx):
    """Write the model weights and both id-index tables into ``model_dir``.

    Files written: ``model.pth`` (the model's ``state_dict``),
    ``customer_index.csv`` and ``product_index.csv`` (without the row index).
    ``model_dir`` is created if it does not exist yet.
    """
    os.makedirs(model_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(model_dir, 'model.pth'))
    customer_idx.to_csv(os.path.join(model_dir, 'customer_index.csv'), index=False)
    product_idx.to_csv(os.path.join(model_dir, 'product_index.csv'), index=False)
    
    

In [None]:
def load_model(model_dir):
    """Load the ``MFModel`` weights and id-index tables stored in ``model_dir``.

    The embedding table sizes are read from the checkpoint itself, so the
    rebuilt model always matches the saved weights; fixed sizes would make
    ``load_state_dict`` fail for any model trained with different dimensions.

    Returns:
        ``(model, customer_index, product_index)``.
    """
    # map_location='cpu' lets a checkpoint saved from a GPU load on a CPU host.
    state_dict = torch.load(os.path.join(model_dir, 'model.pth'), map_location='cpu')

    max_users, emb_dim = state_dict['user_embeddings.weight'].shape
    max_items = state_dict['item_embeddings.weight'].shape[0]

    model = MFModel(max_users=max_users, max_items=max_items, emb_dim=emb_dim, dropout_rate=0.5)
    model.load_state_dict(state_dict)

    customer_index = pd.read_csv(os.path.join(model_dir, 'customer_index.csv'))
    product_index = pd.read_csv(os.path.join(model_dir, 'product_index.csv'))
    return model, customer_index, product_index
    

In [None]:
# Inference
def _ids_to_indices(ids, index_df, key, value):
    """Translate raw ids into model indices via ``index_df``, preserving order."""
    frame = pd.DataFrame({key: ids}).merge(index_df[[key, value]], how='left', on=key)
    unknown = frame.loc[frame[value].isna(), key].tolist()
    if unknown:
        # A left merge leaves NaN for ids missing from the index; those cannot
        # be turned into embedding indices.
        raise ValueError(f"unknown {key} value(s): {unknown}")
    return frame[value].to_numpy(dtype='int64')


# Transform function
def transform_fn(model, data, input_content_type, output_content_type):
    """Score the (customer, product) pairs contained in a JSON request.

    Args:
        model: The ``(network, customer_index, product_index)`` triple; the
            two index tables map ``customer_id`` -> ``user`` and
            ``product_id`` -> ``item``.
        data: JSON text with ``customer_id`` and ``product_id`` entries, each
            either a single id or a list of ids of equal length.
        input_content_type: Unused; kept for the caller's interface.
        output_content_type: Returned unchanged alongside the response.

    Returns:
        ``(response_body, output_content_type)`` where ``response_body`` is a
        JSON list with one prediction per requested pair.

    Raises:
        ValueError: If the id lists differ in length or contain an id that is
            missing from the corresponding index table.
    """
    network, customer_index, product_index = model

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # The network must live on the same device as the inputs.
    network = network.to(device)
    network.eval()

    parsed = json.loads(data)
    customer_ids = parsed['customer_id']
    product_ids = parsed['product_id']
    # Accept a single id as well as a list of ids.
    if not isinstance(customer_ids, list):
        customer_ids = [customer_ids]
    if not isinstance(product_ids, list):
        product_ids = [product_ids]
    if len(customer_ids) != len(product_ids):
        raise ValueError("customer_id and product_id must contain the same number of entries")

    users = _ids_to_indices(customer_ids, customer_index, 'customer_id', 'user')
    items = _ids_to_indices(product_ids, product_index, 'product_id', 'item')

    user_tensor = torch.tensor(users, dtype=torch.long).to(device)
    item_tensor = torch.tensor(items, dtype=torch.long).to(device)
    with torch.no_grad():
        outputs = network(user_tensor, item_tensor)

    response_body = json.dumps(outputs.cpu().numpy().tolist())
    return response_body, output_content_type
    