In [1]:
import sys
import pandas as pd
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
import mlfoundry as mlf

In [2]:
mlf.login(api_key="OGQ1NDI4ZGMtZGY3My00ZWEyLTg2NGMtZjA3OTQzNDkzZDRiOmZjNjc4Mw==", relogin=True)
client = mlf.get_client()

Writing API key at /home/ec2-user/.mlfoundry/credentials.netrc


In [3]:
class MovieLensDataset(torch.utils.data.Dataset):
    def __init__(self, df, mean_rating=3.58):
        self.df = df
        self.mean_rating = mean_rating
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        x = torch.tensor([row.user_id, row.movie_id], dtype=torch.int32)
        y = torch.tensor(row.rating - self.mean_rating, dtype=torch.float)
        return x, y

In [4]:
train = pd.read_csv("data/ml1m_train.csv")
train_dataset = MovieLensDataset(train)
test = pd.read_csv("data/ml1m_test.csv")
test_dataset = MovieLensDataset(test)

In [5]:
class MatrixFactorization(torch.nn.Module):
    
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        self.user_biases = torch.nn.Embedding(n_users, 1)
        self.item_biases = torch.nn.Embedding(n_items,1)
        torch.nn.init.xavier_uniform_(self.user_factors.weight)
        torch.nn.init.xavier_uniform_(self.item_factors.weight)
        self.user_biases.weight.data.fill_(0.)
        self.item_biases.weight.data.fill_(0.)
        
    def forward(self, user, item):
        pred = self.user_biases(user) + self.item_biases(item)
        pred += (self.user_factors(user) * self.item_factors(item)).sum(1, keepdim=True)
        return pred.squeeze()
    
    def freeze_users(self):
        self.user_factors.weight.requires_grad = False
        self.user_biases.weight.requires_grad = False
        self.item_factors.weight.requires_grad = True
        self.item_biases.weight.requires_grad = True
        
    def freeze_items(self):
        self.user_factors.weight.requires_grad = True
        self.user_biases.weight.requires_grad = True
        self.item_factors.weight.requires_grad = False
        self.item_biases.weight.requires_grad = False

In [6]:
reviewers = 6041
books = 3953
batch_size = 256

In [7]:
device = torch.device("cpu")

In [8]:
def train_loop(model, train_batch, label_batch, loss_func, optimizer):
    # First pass with users layer freeze
    model.freeze_users()
    prediction = model(train_batch[:,0].to(device), train_batch[:,1].to(device))
    loss = loss_func(prediction, label_batch.to(device))    
    loss.backward()

    optimizer.step()
    optimizer.zero_grad()

    # First pass with items layer freeze
    model.freeze_items()
    prediction = model(train_batch[:,0].to(device), train_batch[:,1].to(device))
    loss = loss_func(prediction, label_batch.to(device))    
    loss.backward()

    optimizer.step()
    optimizer.zero_grad()
    return loss.item()

In [9]:
def train(nfactor, learning_rate=0.02, weight_decay=1e-5):
    run = client.create_run(project_name="recommendation-system", run_name=f"matrix-factorization-factor{nfactor}", log_system_metrics=False)
    run.log_params({"nfactor": nfactor, "learning_rate": learning_rate, "weight_decay": weight_decay})
    
    model = MatrixFactorization(reviewers, books, n_factors=nfactor)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
    model.to(device)
    loss_func = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    
    epoches = 100
    last_test_loss = 10000
    for epoch in range(0, epoches):
        # Training
        pbar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
        train_count = 1
        train_loss = 0.
        for i, (train_batch, label_batch) in pbar:
            loss = train_loop(model, train_batch, label_batch, loss_func, optimizer)
            train_loss += loss
            train_count += 1
            pbar.set_description(f'Train loss at {epoch} batch {i}: {train_loss/train_count}')

        # Calculate test loss
        pbar = tqdm(enumerate(test_dataloader), total=len(test_dataloader))
        test_loss = 0.
        test_count = 1
        for i,( test_batch, label_batch) in pbar:
            with torch.no_grad():
                prediction = model(test_batch[:,0].to(device), test_batch[:,1].to(device))
                loss = loss_func(prediction, label_batch.to(device))
                test_loss += loss.item()
                test_count += 1
                pbar.set_description(f'Test loss at {epoch} batch {i}: {test_loss/test_count}')
        
        run.log_metrics(
            metric_dict={
                "train_loss": train_loss/train_count,
                "test_loss": test_loss/test_count,
            }, step=epoch
        )
        # Early stopping
        current_test_loss = test_loss/test_count
        if last_test_loss - current_test_loss < 0.01:
            print(f"Stopping Training. Last")
            break
        last_test_loss = current_test_loss
    run.log_model(model=model, framework="pytorch")
    run.end()

In [None]:
train(nfactor=10)

Link to the dashboard for the run: https://app.truefoundry.com/mlfoundry/176/d020293a02f64a9ba70ff2da20679771/
[mlfoundry] 2022-07-06T18:07:07+0000 INFO Run 'cloud/recommendation-system/matrix-factorization-factor10-24' has started.
[mlfoundry] 2022-07-06T18:07:07+0000 INFO Parameters logged successfully


Train loss at 0 batch 3125: 1.2143582426056834: 100%|██████████| 3126/3126 [02:12<00:00, 23.66it/s]
Test loss at 0 batch 326: 1.177852744918044:  42%|████▏     | 327/782 [00:12<00:17, 25.89it/s] 

In [14]:
factors = [20, 30, 50, 75, 100]

In [15]:
for factor in factors:
    train(nfactor=factor)

Link to the dashboard for the run: https://app.truefoundry.com/mlfoundry/176/598dc4792eb845f0844b3a6221acb09a/
[mlfoundry] 2022-07-06T15:07:58+0000 INFO Run 'cloud/recommendation-system/matrix-factorization-factor20-18' has started.
[mlfoundry] 2022-07-06T15:07:58+0000 INFO Parameters logged successfully


Train loss at 0 batch 3125: 12.23497039653411: 100%|██████████| 3126/3126 [02:02<00:00, 25.42it/s] 
Test loss at 0 batch 781: 10.590846121387312: 100%|██████████| 782/782 [00:28<00:00, 27.59it/s]


[mlfoundry] 2022-07-06T15:10:30+0000 INFO Metrics logged successfully


Train loss at 1 batch 3125: 9.378546677117612: 100%|██████████| 3126/3126 [02:05<00:00, 24.86it/s] 
Test loss at 1 batch 781: 8.263826126035298: 100%|██████████| 782/782 [00:28<00:00, 27.61it/s]


[mlfoundry] 2022-07-06T15:13:04+0000 INFO Metrics logged successfully


Train loss at 2 batch 3125: 7.421100911371312: 100%|██████████| 3126/3126 [02:01<00:00, 25.65it/s] 
Test loss at 2 batch 781: 6.647035461123724: 100%|██████████| 782/782 [00:29<00:00, 26.86it/s] 


[mlfoundry] 2022-07-06T15:15:35+0000 INFO Metrics logged successfully


Train loss at 3 batch 3125: 6.046207107443911: 100%|██████████| 3126/3126 [02:02<00:00, 25.42it/s] 
Test loss at 3 batch 781: 5.49690130265432: 100%|██████████| 782/782 [00:28<00:00, 27.79it/s]  


[mlfoundry] 2022-07-06T15:18:06+0000 INFO Metrics logged successfully


Train loss at 4 batch 3125: 5.058332944244175: 100%|██████████| 3126/3126 [02:00<00:00, 25.86it/s] 
Test loss at 4 batch 781: 4.660611960166259: 100%|██████████| 782/782 [00:28<00:00, 27.71it/s] 


[mlfoundry] 2022-07-06T15:20:35+0000 INFO Metrics logged successfully


Train loss at 5 batch 3125: 4.332955695815006: 100%|██████████| 3126/3126 [02:01<00:00, 25.73it/s] 
Test loss at 5 batch 781: 4.039704348392413: 100%|██████████| 782/782 [00:29<00:00, 26.88it/s] 


[mlfoundry] 2022-07-06T15:23:06+0000 INFO Metrics logged successfully


Train loss at 6 batch 3125: 3.7896791221922412: 100%|██████████| 3126/3126 [02:02<00:00, 25.57it/s]
Test loss at 6 batch 781: 3.5696322646938827: 100%|██████████| 782/782 [00:28<00:00, 27.73it/s]


[mlfoundry] 2022-07-06T15:25:36+0000 INFO Metrics logged successfully


Train loss at 7 batch 3125: 3.3747653916959455: 100%|██████████| 3126/3126 [02:02<00:00, 25.56it/s]
Test loss at 7 batch 781: 3.207180341998279: 100%|██████████| 782/782 [00:28<00:00, 27.27it/s] 


[mlfoundry] 2022-07-06T15:28:07+0000 INFO Metrics logged successfully


Train loss at 8 batch 3125: 3.052206049111845: 100%|██████████| 3126/3126 [02:01<00:00, 25.69it/s] 
Test loss at 8 batch 781: 2.922692971820271: 100%|██████████| 782/782 [00:28<00:00, 27.36it/s] 


[mlfoundry] 2022-07-06T15:30:38+0000 INFO Metrics logged successfully


Train loss at 9 batch 3125: 2.797150493392481: 100%|██████████| 3126/3126 [02:01<00:00, 25.67it/s] 
Test loss at 9 batch 781: 2.6956535396600287: 100%|██████████| 782/782 [00:28<00:00, 27.05it/s]


[mlfoundry] 2022-07-06T15:33:08+0000 INFO Metrics logged successfully


Train loss at 10 batch 3125: 2.592014936467577: 100%|██████████| 3126/3126 [02:03<00:00, 25.28it/s] 
Test loss at 10 batch 781: 2.5115734623462: 100%|██████████| 782/782 [00:29<00:00, 26.73it/s]   


[mlfoundry] 2022-07-06T15:35:41+0000 INFO Metrics logged successfully


Train loss at 11 batch 3125: 2.424622912242232: 100%|██████████| 3126/3126 [02:04<00:00, 25.06it/s] 
Test loss at 11 batch 781: 2.3600922462127216: 100%|██████████| 782/782 [00:28<00:00, 27.66it/s]


[mlfoundry] 2022-07-06T15:38:15+0000 INFO Metrics logged successfully


Train loss at 12 batch 3125: 2.2859973127482833: 100%|██████████| 3126/3126 [02:04<00:00, 25.19it/s]
Test loss at 12 batch 781: 2.233689974246506: 100%|██████████| 782/782 [00:28<00:00, 27.13it/s] 


[mlfoundry] 2022-07-06T15:40:48+0000 INFO Metrics logged successfully


Train loss at 13 batch 3125: 2.1694996009286234: 100%|██████████| 3126/3126 [02:05<00:00, 24.84it/s]
Test loss at 13 batch 781: 2.126859411273727: 100%|██████████| 782/782 [00:29<00:00, 26.87it/s] 


[mlfoundry] 2022-07-06T15:43:23+0000 INFO Metrics logged successfully


Train loss at 14 batch 3125: 2.0706152291125597: 100%|██████████| 3126/3126 [02:02<00:00, 25.61it/s]
Test loss at 14 batch 781: 2.0354661965887817: 100%|██████████| 782/782 [00:28<00:00, 27.72it/s]


[mlfoundry] 2022-07-06T15:45:53+0000 INFO Metrics logged successfully


Train loss at 15 batch 3125: 1.9856043415097029: 100%|██████████| 3126/3126 [02:03<00:00, 25.39it/s]
Test loss at 15 batch 781: 1.9564168651137468: 100%|██████████| 782/782 [00:28<00:00, 27.01it/s]


[mlfoundry] 2022-07-06T15:48:25+0000 INFO Metrics logged successfully


Train loss at 16 batch 3125: 1.9117161223863237: 100%|██████████| 3126/3126 [02:02<00:00, 25.59it/s]
Test loss at 16 batch 781: 1.8873807236000344: 100%|██████████| 782/782 [00:28<00:00, 27.70it/s]


[mlfoundry] 2022-07-06T15:50:56+0000 INFO Metrics logged successfully


Train loss at 17 batch 3125: 1.84686417231973: 100%|██████████| 3126/3126 [02:03<00:00, 25.35it/s]  
Test loss at 17 batch 781: 1.8265298644549393: 100%|██████████| 782/782 [00:28<00:00, 27.07it/s]


[mlfoundry] 2022-07-06T15:53:28+0000 INFO Metrics logged successfully


Train loss at 18 batch 3125: 1.789563599050636: 100%|██████████| 3126/3126 [02:02<00:00, 25.52it/s] 
Test loss at 18 batch 781: 1.7724595336317255: 100%|██████████| 782/782 [00:27<00:00, 28.01it/s]


[mlfoundry] 2022-07-06T15:55:58+0000 INFO Metrics logged successfully


Train loss at 19 batch 3125: 1.738429299360043: 100%|██████████| 3126/3126 [02:03<00:00, 25.41it/s] 
Test loss at 19 batch 781: 1.7240610043512299: 100%|██████████| 782/782 [00:28<00:00, 27.26it/s]


[mlfoundry] 2022-07-06T15:58:30+0000 INFO Metrics logged successfully


Train loss at 20 batch 3125: 1.6925289239371055: 100%|██████████| 3126/3126 [02:02<00:00, 25.56it/s]
Test loss at 20 batch 781: 1.6804524964604188: 100%|██████████| 782/782 [00:27<00:00, 27.95it/s]


[mlfoundry] 2022-07-06T16:01:00+0000 INFO Metrics logged successfully


Train loss at 21 batch 3125: 1.6510268877137382: 100%|██████████| 3126/3126 [02:03<00:00, 25.38it/s]
Test loss at 21 batch 781: 1.6409308449061895: 100%|██████████| 782/782 [00:29<00:00, 26.87it/s]


[mlfoundry] 2022-07-06T16:03:33+0000 INFO Metrics logged successfully


Train loss at 22 batch 3125: 1.613388989236281: 100%|██████████| 3126/3126 [02:04<00:00, 25.19it/s] 
Test loss at 22 batch 781: 1.6049170524528962: 100%|██████████| 782/782 [00:28<00:00, 27.38it/s]


[mlfoundry] 2022-07-06T16:06:05+0000 INFO Metrics logged successfully


Train loss at 23 batch 3125: 1.5789303799044563: 100%|██████████| 3126/3126 [02:03<00:00, 25.28it/s]
Test loss at 23 batch 781: 1.5719459731307828: 100%|██████████| 782/782 [00:28<00:00, 27.82it/s]


[mlfoundry] 2022-07-06T16:08:37+0000 INFO Metrics logged successfully


Train loss at 24 batch 3125: 1.5473615701945018: 100%|██████████| 3126/3126 [02:01<00:00, 25.79it/s]
Test loss at 24 batch 781: 1.5416236793096647: 100%|██████████| 782/782 [00:28<00:00, 27.20it/s]


[mlfoundry] 2022-07-06T16:11:07+0000 INFO Metrics logged successfully


Train loss at 25 batch 3125: 1.5182728893961965: 100%|██████████| 3126/3126 [02:03<00:00, 25.39it/s]
Test loss at 25 batch 781: 1.5136281734529888: 100%|██████████| 782/782 [00:27<00:00, 28.04it/s]


[mlfoundry] 2022-07-06T16:13:38+0000 INFO Metrics logged successfully


Train loss at 26 batch 3125: 1.4913164132465597: 100%|██████████| 3126/3126 [02:03<00:00, 25.32it/s]
Test loss at 26 batch 781: 1.4876954407832024: 100%|██████████| 782/782 [00:29<00:00, 26.78it/s]


[mlfoundry] 2022-07-06T16:16:11+0000 INFO Metrics logged successfully


Train loss at 27 batch 3125: 1.4663476473155934: 100%|██████████| 3126/3126 [02:03<00:00, 25.28it/s]
Test loss at 27 batch 781: 1.4635897987494828: 100%|██████████| 782/782 [00:29<00:00, 26.92it/s]


[mlfoundry] 2022-07-06T16:18:44+0000 INFO Metrics logged successfully


Train loss at 28 batch 3125: 1.4431114537974383: 100%|██████████| 3126/3126 [02:05<00:00, 24.93it/s]
Test loss at 28 batch 781: 1.4411189294257232: 100%|██████████| 782/782 [00:27<00:00, 27.97it/s]


[mlfoundry] 2022-07-06T16:21:17+0000 INFO Metrics logged successfully


Train loss at 29 batch 3125: 1.4214045164834435: 100%|██████████| 3126/3126 [02:04<00:00, 25.17it/s]
Test loss at 29 batch 781: 1.4201198166205355: 100%|██████████| 782/782 [00:28<00:00, 27.78it/s]


[mlfoundry] 2022-07-06T16:23:50+0000 INFO Metrics logged successfully


Train loss at 30 batch 3125: 1.4010744154701074: 100%|██████████| 3126/3126 [02:03<00:00, 25.30it/s]
Test loss at 30 batch 781: 1.4004413439091747: 100%|██████████| 782/782 [00:30<00:00, 25.94it/s]


[mlfoundry] 2022-07-06T16:26:23+0000 INFO Metrics logged successfully


Train loss at 31 batch 3125: 1.3819942253254305: 100%|██████████| 3126/3126 [02:02<00:00, 25.42it/s]
Test loss at 31 batch 781: 1.381963987155588: 100%|██████████| 782/782 [00:28<00:00, 27.81it/s] 


[mlfoundry] 2022-07-06T16:28:55+0000 INFO Metrics logged successfully


Train loss at 32 batch 3125: 1.3640649757867123: 100%|██████████| 3126/3126 [02:02<00:00, 25.60it/s]
Test loss at 32 batch 781: 1.3645792114019089: 100%|██████████| 782/782 [00:29<00:00, 26.76it/s]


[mlfoundry] 2022-07-06T16:31:26+0000 INFO Metrics logged successfully


Train loss at 33 batch 3125: 1.3472090615873944: 100%|██████████| 3126/3126 [02:05<00:00, 24.83it/s]
Test loss at 33 batch 781: 1.3481855482281702: 100%|██████████| 782/782 [00:28<00:00, 27.35it/s]


[mlfoundry] 2022-07-06T16:34:00+0000 INFO Metrics logged successfully


Train loss at 34 batch 3125: 1.3312590485721645: 100%|██████████| 3126/3126 [02:01<00:00, 25.67it/s]
Test loss at 34 batch 781: 1.3327015028602776: 100%|██████████| 782/782 [00:28<00:00, 27.59it/s]


[mlfoundry] 2022-07-06T16:36:31+0000 INFO Metrics logged successfully


Train loss at 35 batch 3125: 1.3162070331308153: 100%|██████████| 3126/3126 [02:04<00:00, 25.19it/s]
Test loss at 35 batch 781: 1.3180517334286432: 100%|██████████| 782/782 [00:28<00:00, 27.13it/s]


[mlfoundry] 2022-07-06T16:39:04+0000 INFO Metrics logged successfully


Train loss at 36 batch 3125: 1.3019436630418897: 100%|██████████| 3126/3126 [02:02<00:00, 25.54it/s]
Test loss at 36 batch 781: 1.3041660321322133: 100%|██████████| 782/782 [00:27<00:00, 28.17it/s]


[mlfoundry] 2022-07-06T16:41:34+0000 INFO Metrics logged successfully


Train loss at 37 batch 3125: 1.2883682343423806: 100%|██████████| 3126/3126 [02:03<00:00, 25.27it/s]
Test loss at 37 batch 781: 1.2909920810130668: 100%|██████████| 782/782 [00:28<00:00, 27.90it/s]


[mlfoundry] 2022-07-06T16:44:06+0000 INFO Metrics logged successfully


Train loss at 38 batch 3125: 1.2755129479507994: 100%|██████████| 3126/3126 [02:03<00:00, 25.28it/s]
Test loss at 38 batch 781: 1.2784718705167564: 100%|██████████| 782/782 [00:29<00:00, 26.92it/s]


[mlfoundry] 2022-07-06T16:46:38+0000 INFO Metrics logged successfully


Train loss at 39 batch 3125: 1.2632629001030038: 100%|██████████| 3126/3126 [02:03<00:00, 25.37it/s]
Test loss at 39 batch 781: 1.266560028795995: 100%|██████████| 782/782 [00:29<00:00, 26.95it/s] 


[mlfoundry] 2022-07-06T16:49:11+0000 INFO Metrics logged successfully


Train loss at 40 batch 3125: 1.2516609459858106: 100%|██████████| 3126/3126 [02:04<00:00, 25.17it/s]
Test loss at 40 batch 781: 1.255208679390441: 100%|██████████| 782/782 [00:27<00:00, 28.47it/s] 


[mlfoundry] 2022-07-06T16:51:42+0000 INFO Metrics logged successfully


Train loss at 41 batch 3125: 1.2405316484100262: 100%|██████████| 3126/3126 [02:02<00:00, 25.47it/s]
Test loss at 41 batch 781: 1.2443802828685229: 100%|██████████| 782/782 [00:29<00:00, 26.86it/s]


[mlfoundry] 2022-07-06T16:54:14+0000 INFO Metrics logged successfully


Train loss at 42 batch 3125: 1.2299406067761591: 100%|██████████| 3126/3126 [02:04<00:00, 25.12it/s]
Test loss at 42 batch 781: 1.2340370471450104: 100%|██████████| 782/782 [00:28<00:00, 27.60it/s]


[mlfoundry] 2022-07-06T16:56:47+0000 INFO Metrics logged successfully


Train loss at 43 batch 3125: 1.219784085080385: 100%|██████████| 3126/3126 [02:01<00:00, 25.69it/s] 
Test loss at 43 batch 781: 1.224152172966783: 100%|██████████| 782/782 [00:27<00:00, 28.43it/s] 


[mlfoundry] 2022-07-06T16:59:16+0000 INFO Metrics logged successfully
Stopping Training. Last
[mlfoundry] 2022-07-06T16:59:25+0000 INFO Model logged successfully


AttributeError: 'MlFoundryRun' object has no attribute 'close'

1