In [4]:
import torch
import torch.nn as nn

In [5]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer

checkpoint='bert-base-uncased'

model=AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer=AutoTokenizer.from_pretrained(checkpoint)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:

class BertEmbeddings(nn.Module):
    def __init__(self):
        super(BertEmbeddings,self).__init__()
        self.word_embeddings=nn.Embedding(30522,768,padding_idx=0)
        self.position_embeddings=nn.Embedding(512,768)
        self.token_type_embeddings=nn.Embedding(2,768)
        self.LayerNorm=nn.LayerNorm((768,),eps=1e-12,elementwise_affine=True)
        self.dropout=nn.Dropout(p=0.1,inplace=False)

    def forward(self,input_ids,position_ids=None,token_type_ids=None):
        if  position_ids is None:
            position_ids=torch.arange(input_ids.size(1),dtype=torch.long,device=input_ids.device)
            position_ids=position_ids.unsqueeze(0).expand_as(input_ids)
        
        if  token_type_ids is None:
            token_type_ids=torch.zeros_like(input_ids,dtype=torch.long)
        
        word_embeds=self.word_embeddings(input_ids)
        position_embeds=self.position_embeddings(position_ids)
        token_type_embeds=self.token_type_embeddings(token_type_ids)
        embeds=word_embeds+position_embeds+token_type_embeds
        embeds=self.LayerNorm(embeds)
        embeds=self.dropout(embeds)
        return embeds

class BertSdpaSelfAttention(nn.Module):
    def __init__(self,hidden_size):
        super(BertSdpaSelfAttention,self).__init__()
        self.query=nn.Linear(in_features=hidden_size,out_features=hidden_size,bias=True)
        self.key=nn.Linear(in_features=hidden_size,out_features=hidden_size,bias=True)
        self.value=nn.Linear(in_features=hidden_size,out_features=hidden_size,bias=True)
        self.dropout=nn.Dropout(p=0.1,inplace=False)

    def forward(self,x):
        key=self.key(x)
        query=self.query(x)
        value=self.value(x)
        d_k=key.size(-1)

        attention_scores=(query@key.transpose(-2,-1))*(d_k**-0.5)
        attention_weight=torch.softmax(attention_scores,dim=-1)
        attention_weight=self.dropout(attention_weight)

        attention_output=attention_weight@value

        return attention_output
    

class BertSelfOutput(nn.Module):
    def __init__(self,hidden_size):
        super(BertSelfOutput,self).__init__()
        self.dense=nn.Linear(hidden_size,hidden_size,True)
        self.LayerNorm=nn.LayerNorm((hidden_size,),eps=1e-12,elementwise_affine=True)
        self.dropout=nn.Dropout(p=0.1,inplace=False)
    
    def forward(self,x,residual):
        output=self.dense(x)
        output=self.dropout(output)
        output=output + residual
        output=self.LayerNorm(output)
        return output 
       


class BertIntermediate(nn.Module):
    def __init__(self,hidden_size,intermediate_size=3072):
        super(BertIntermediate,self).__init__()
        self.dense=nn.Linear(in_features=hidden_size,out_features=intermediate_size,bias=True)
        self.Intermediate_act_fn=nn.GELU()
    
    def forward(self,x):
        out=self.dense(x)
        out=self.Intermediate_act_fn(out)
        return out
        

class BertOutput(nn.Module):
    def __init__(self,hidden_size):
        super(BertOutput,self).__init__()

        self.dense=nn.Linear(in_features=hidden_size,out_features=hidden_size)
        self.LayerNorm=nn.LayerNorm((hidden_size,),eps=1e-12,elementwise_affine=True)
        self.dropout=nn.Dropout(p=0.1,inplace=False)
    
    def forward(self,x):

        dense=self.dense(x)
        dropped=self.dropout(dense)
        return self.LayerNorm(dropped)
    


class BertPooler(nn.Module):
    def __init__(self,hidden_size):
        super(BertPooler,self).__init__()
        self.dense=nn.Linear(hidden_size,hidden_size,bias=True)
        self.activation=nn.Tanh()
    
    def forward(self,x):
        pooled_output = x[:, 0]
        dense=self.dense(pooled_output)
        return self.activation(dense)


class BertAttention(nn.Module):
    def __init__(self,hidden_size):
        super(BertAttention,self).__init__()
        self.self=BertSdpaSelfAttention(hidden_size)
        self.output=BertSelfOutput(hidden_size)
    
    def forward(self,x):
        attention=self.self(x)
        return  self.output(attention,x)

class BertLayer(nn.Module):
    def __init__(self,hidden_size,intermediate_size=3072):
        super(BertLayer,self).__init__()
        self.attention=BertAttention(hidden_size)
        self.intermediate=BertIntermediate(hidden_size,intermediate_size)
        self.output=BertOutput(hidden_size)
    
    def forward(self,x):
        attention=self.attention(x)
        intermediate=self.intermediate(attention)
        out=self.output(attention+intermediate,attention)
        return out


class BertEncoder(nn.Module):
    def __init__(self,hidden_size,num_hidden_layer,intermediate_size=3072):
        super(BertEncoder,self).__init__()
        self.layer=nn.ModuleList([
            BertLayer(hidden_size=hidden_size,intermediate_size=intermediate_size) for _ in range(num_hidden_layer)
        ])
    
    def forward(self,hidden_states):
        for layer_module in self.layer:
            hidden_states=layer_module(hidden_states)
        return hidden_states


class BertModel(nn.Module):
    def __init__(self,hidden_size,num_hidden_layer,intermediate_size=3072):
        super(BertModel,self).__init__()
        self.embeddings=BertEmbeddings()
        self.encoder=BertEncoder(hidden_size=hidden_size,intermediate_size=intermediate_size,num_hidden_layer=num_hidden_layer)
        self.pooler=BertPooler(hidden_size)

    def forward(self,x):
        embeds=self.embeddings(x)
        encoded=self.encoder(embeds)
        pool=self.pooler(encoded)
        return pool

class BertForSequenceClassification(nn.Module):
    def __init__(self,hidden_size,num_classes,num_hidden_layer,intermediate_size=3072):
        super(BertForSequenceClassification,self).__init__()
        self.bert=BertModel(hidden_size=hidden_size,intermediate_size=intermediate_size,num_hidden_layer=num_hidden_layer)
        self.dropout=nn.Dropout(p=0.1,inplace=False)
        self.classifier=nn.Linear(in_features=hidden_size,out_features=num_classes)

    def forward(self,input_ids,token_type_ids,attention_mask):

        bert=self.bert(input_ids)
        dropout=self.dropout(bert)
        classification=self.classifier(dropout)

        return classification



In [32]:
my_model=BertForSequenceClassification(hidden_size=768,num_hidden_layer=12,num_classes=2)

In [33]:
toks=tokenizer('hello', return_tensors='pt')

In [34]:
my_model(**toks)

RuntimeError: The size of tensor a (768) must match the size of tensor b (3072) at non-singleton dimension 2