In [7]:
import fsspec
import os
from ipfsspec.asyn import AsyncIPFSFileSystem
from fsspec import register_implementation
import asyncio
import io
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset, Dataset

# register_implementation(IPFSFileSystem.protocol, IPFSFileSystem)
# register_implementation(AsyncIPFSFileSystem.protocol, AsyncIPFSFileSystem)

# with fsspec.open("ipfs://QmZ4tDuvesekSs4qM5ZBKpXiZGun7S2CYtEZRB3DYXkjGx", "r") as f:
#     print(f.read())
class fs:
    """Namespace bundling the filesystem handles used by the helpers below."""

    # Local-disk filesystem obtained from fsspec's "file" protocol.
    local = fsspec.filesystem("file")
    # IPFS filesystem; a single instance is created when this class body runs.
    ipfs = AsyncIPFSFileSystem()
    

    
    
class deML:
    """Round-trip Hugging Face models, tokenizers and datasets through IPFS.

    Every helper stages files in a local scratch directory (see
    ``get_tmp_path``) and moves them to or from IPFS through the module-level
    ``fs`` registry. All helpers are static; the class is only a namespace.
    """

    # Root of the local scratch area used for staging files.
    tmp_root_path = '/tmp/deML'
    # Filesystem registry, re-exported on the class.
    fs = fs

    @staticmethod
    def get_tmp_path(path):
        """Return (and create) the scratch directory that mirrors ``path``.

        Args:
            path: Remote path or CID; a leading ``/`` is allowed.

        Returns:
            str: A directory located under ``deML.tmp_root_path``.
        """
        # os.path.join drops every earlier component when a later one is
        # absolute, so '/hf_model' used to resolve to '/hf_model' itself and
        # staging (plus the recursive rm afterwards) escaped the scratch root.
        relative_path = str(path).lstrip('/')
        tmp_path = os.path.join(deML.tmp_root_path, relative_path)
        fs.local.makedirs(tmp_path, exist_ok=True)
        return tmp_path

    @staticmethod
    def _upload_pretrained(obj, path):
        """Stage ``obj.save_pretrained`` output locally and push it to IPFS.

        The scratch directory is removed even when saving or uploading fails.
        """
        tmp_path = deML.get_tmp_path(path=path)
        try:
            obj.save_pretrained(tmp_path)
            fs.ipfs.mkdirs(path)
            return deML.ipfs_put(lpath=tmp_path, rpath=path, max_trials=10)
        finally:
            fs.local.rm(tmp_path, recursive=True)

    @staticmethod
    def save_model(model, path:str):
        """Upload ``model`` to IPFS under ``path``.

        Args:
            model: Object exposing ``save_pretrained(directory)``.
            path: Destination path on the IPFS filesystem.

        Returns:
            Whatever ``ipfs_put`` returns: the upload result (used as a CID by
            the notebook) or ``None`` if every attempt timed out.
        """
        return deML._upload_pretrained(model, path)

    @staticmethod
    def save_tokenizer(tokenizer, path:str):
        """Upload ``tokenizer`` to IPFS under ``path``.

        Args:
            tokenizer: Object exposing ``save_pretrained(directory)``.
            path: Destination path on the IPFS filesystem.

        Returns:
            Whatever ``ipfs_put`` returns: the upload result (used as a CID by
            the notebook) or ``None`` if every attempt timed out.
        """
        return deML._upload_pretrained(tokenizer, path)

    @staticmethod
    def load_tokenizer( path:str):
        """Fetch ``path`` from IPFS and load it with ``AutoTokenizer``.

        The scratch copy is removed afterwards, including when loading fails.
        """
        tmp_path = deML.get_tmp_path(path=path)
        try:
            fs.ipfs.get(lpath=tmp_path, rpath=path)
            return AutoTokenizer.from_pretrained(tmp_path)
        finally:
            fs.local.rm(tmp_path, recursive=True)

    @staticmethod
    def load_model( path:str):
        """Fetch ``path`` from IPFS and load it with ``AutoModel``.

        The scratch copy is left on disk, as in the original code.
        """
        tmp_path = deML.get_tmp_path(path=path)
        fs.ipfs.get(lpath=tmp_path, rpath=path)
        return AutoModel.from_pretrained(tmp_path)

    @staticmethod
    def load_dataset(path):
        """Fetch ``path`` from IPFS and open it with ``Dataset.load_from_disk``.

        The scratch copy is left on disk.
        NOTE(review): presumably required because the returned dataset keeps
        reading from those files -- confirm before adding cleanup.
        """
        tmp_path = deML.get_tmp_path(path=path)
        fs.ipfs.get(lpath=tmp_path, rpath=path)
        return Dataset.load_from_disk(tmp_path)

    @staticmethod
    def save_dataset(dataset, path:str):
        """Write ``dataset`` to a scratch directory and upload it to IPFS.

        The scratch copy is left on disk, as in the original code.

        Returns:
            Whatever ``ipfs_put`` returns (``None`` if every attempt timed out).
        """
        tmp_path = deML.get_tmp_path(path=path)
        # The return value of save_to_disk is not needed, so it is not bound.
        dataset.save_to_disk(tmp_path)
        return deML.ipfs_put(lpath=tmp_path, rpath=path, max_trials=10)

    @staticmethod
    def ipfs_put(lpath, rpath, max_trials=10):
        """Upload ``lpath`` recursively to ``rpath`` on IPFS, retrying on timeout.

        Args:
            lpath: Local source directory.
            rpath: Destination path on the IPFS filesystem.
            max_trials: Maximum number of attempts.

        Returns:
            The value returned by ``fs.ipfs.put`` on the first successful
            attempt, or ``None`` when all ``max_trials`` attempts raised
            ``FSTimeoutError``. Other exceptions propagate unchanged.
        """
        for attempt in range(1, max_trials + 1):
            try:
                return fs.ipfs.put(lpath=lpath, rpath=rpath, recursive=True)
            except fsspec.exceptions.FSTimeoutError:
                print(f'Failed {attempt}/{max_trials}')
        return None



## Loading the dataset, model and tokenizer from the Hub

In [8]:
# Baseline ("web2") artifacts, loaded directly by name.
dataset = {'web2': load_dataset("glue", "mrpc", split="train")}
model = {'web2': AutoModel.from_pretrained("bert-base-uncased")}
tokenizer = {'web2': AutoTokenizer.from_pretrained("bert-base-uncased")}

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:



# Publish each baseline artifact to IPFS and keep the value each save returns.
cid = {
    'tokenizer': deML.save_tokenizer(tokenizer=tokenizer['web2'], path='/hf_tokenizer'),
    'model': deML.save_model(model=model['web2'], path='/hf_model'),
    'dataset': deML.save_dataset(dataset=dataset['web2'], path='/hf_dataset'),
}
cid

{'tokenizer': 'QmNcERjBC4TC4HhYeRRXaoq6xF6s5sExFvF9jjCKzt68z5',
 'model': 'QmY2TSXiDckZ9kuqsGpFKyKz5bVZcm61FN77M8nfbxfzAh',
 'dataset': 'QmeGhwGt5X3KvuVrx1orfcCBAbwG4aJZn9n5yBpWkbPu5w'}

In [10]:

# Pull every artifact back from IPFS by the identifier recorded in `cid` ("web3" copies).
model['web3'] = deML.load_model(path=cid['model'])
tokenizer['web3'] = deML.load_tokenizer(path=cid['tokenizer'])
dataset['web3'] = deML.load_dataset(path=cid['dataset'])



In [11]:
def run_inference(model, tokenizer, dataset):
    """Run ``model`` on the first example of ``dataset`` and return its outputs.

    The dataset is tokenized as sentence pairs (``sentence1``/``sentence2``),
    its ``label`` column is copied to ``labels``, and only the first batch of
    size one is passed to the model, with gradients disabled.

    Returns:
        dict: ``dict(...)`` applied to the model's output for that batch.
    """
    import torch

    def encode(examples):
        # Tokenize each sentence pair with truncation and "max_length" padding.
        return tokenizer(
            examples["sentence1"],
            examples["sentence2"],
            truncation=True,
            padding="max_length",
        )

    def copy_labels(examples):
        return {"labels": examples["label"]}

    tensor_columns = ["input_ids", "token_type_ids", "attention_mask", "labels"]
    prepared = dataset.map(encode, batched=True).map(copy_labels, batched=True)
    prepared.set_format(type="torch", columns=tensor_columns)

    loader = torch.utils.data.DataLoader(prepared, batch_size=1)
    first_batch = next(iter(loader))
    # Labels are not passed to the model, so leave them out of the inputs.
    model_inputs = {
        name: value for name, value in first_batch.items() if name != "labels"
    }

    with torch.no_grad():
        return dict(model(**model_inputs))
    
# Run the same single-example forward pass on the baseline and IPFS copies.
results = {}

print('RUNNING WEB2')
results['web2'] = run_inference(
    model=model['web2'], tokenizer=tokenizer['web2'], dataset=dataset['web2']
)

print('RUNNING WEB3')
results['web3'] = run_inference(
    model=model['web3'], tokenizer=tokenizer['web3'], dataset=dataset['web3']
)





RUNNING WEB2




RUNNING WEB3


In [12]:


# Mean of every model output, side by side for the baseline and IPFS copies.
{
    output_name: {
        'web2': results['web2'][output_name].mean(),
        'web3': results['web3'][output_name].mean(),
    }
    for output_name in results['web3']
}

{'last_hidden_state': {'web2': tensor(-0.0085), 'web3': tensor(-0.0085)},
 'pooler_output': {'web2': tensor(-0.0395), 'web3': tensor(-0.0395)}}