In [1]:
import torch
import torch.nn as nn


class CRF(nn.Module):
    """Linear-chain CRF layer that works on convolution features.

    All weights live in ``self.x``, a flat tensor of shape ``(26*128 + 26*26, 1)``.
    The first ``26*128`` entries are viewed as the node weights ``W`` (26, 128) and
    the remaining ``26*26`` entries as the transition weights ``T`` (26, 26).
    """

    def __init__(self, input_dim, embed_dim, conv_layers, num_labels, batch_size):
        """
        Linear chain CRF as in Assignment 2

        Every constructor argument is stored unchanged as an attribute of the
        same name; the flat parameter vector ``self.x`` is created here as well.
        """
        super(CRF, self).__init__()

        self.input_dim = input_dim
        self.embed_dim = embed_dim
        self.conv_layers = conv_layers
        self.num_labels = num_labels
        self.batch_size = batch_size
        self.use_cuda = torch.cuda.is_available()

        # forward() and loss() both read self.x, so it has to exist as soon as
        # the module is constructed.
        self.init_params()

        ### Use GPU if available
        # NOTE: self.x is a plain tensor attribute (not a Parameter/buffer), so
        # this call does not move it.
        if self.use_cuda:
            self.cuda()

    def init_params(self):
        """(Re)initialise the flat parameter vector to zeros."""
        # W and T flattened into one long tensor of shape (4004*1)
        self.x = torch.zeros((26*128+26**2,1))

    def _unpack_params(self):
        """Return ``(W, T)`` as views into ``self.x``: W is (26, 128), T is (26, 26)."""
        W = self.x[:128*26].view(26,128)
        T = self.x[128*26:].view(26, 26)
        return W, T

    def forward(self, X):
        """
        Implement the objective of CRF here.
        The input (features) to the CRF module should be convolution features.

        Returns whatever ``crf_decode`` produces for the convolution features of X.
        """
        features = self.get_conv_features(X)
        W, T = self._unpack_params()

        # crf_decode takes the layer as its first argument (it reads
        # num_labels and embed_dim from it).
        prediction = crf_decode(self, W, T, features)
        return (prediction)

    def loss(self, X, labels):
        """
        Compute the negative conditional log-likelihood of a labelling given a sequence.

        The value is delegated to ``get_crf_obj`` with a fixed weight ``c = 1000``.
        """
        # x is a vector. so reshape it into w_y and T
        W, T = self._unpack_params()
        c=1000

        features = self.get_conv_features(X)

        # NOTE(review): the batch size (256), padded length (64), number of
        # labelled positions (14) and label count (26) are hard-coded here;
        # `labels` must broadcast into a (256, 14, 26) slice - confirm against
        # the data pipeline before relying on other shapes.
        # Features are assumed to be [batch, padded positions, height*width].
        y=torch.zeros(256,64,26)
        y[:,:14,:]=labels

        loss = get_crf_obj(features, y, W, T, c)
        return loss

    def backward(self):
        """
        Return the gradient of the CRF layer
        :return:
        :raises NotImplementedError: the gradient has not been implemented in this class.
        """
        raise NotImplementedError("CRF.backward: gradient computation is not implemented")

    def get_conv_features(self, X):
        """
        Generate convolution features for a given word

        :raises NotImplementedError: the convolution feature extractor has not been implemented.
        """
        raise NotImplementedError("CRF.get_conv_features: convolution features are not implemented")


In [7]:
def get_crf_obj(X_train, y_train, W, T, c):
    """Regularised CRF training objective.

    Computes ``-c * mean_i log p(y_i | X_i) + ||W||^2 / 2 + ||T||^2 / 2``, where
    each log-probability comes from ``compute_log_probability`` and the mean is
    taken over ``len(X_train)``.

    Args:
        X_train: iterable of per-example inputs, paired element-wise with y_train.
        y_train: iterable of per-example labels.
        W: node weight tensor.
        T: transition weight tensor.
        c: weight applied to the average negative log-likelihood.

    Returns:
        A 0-dim tensor. With no examples only the regularisation term is returned.
    """
    log_probs = [compute_log_probability(X, y, W, T) for X, y in zip(X_train, y_train)]

    if log_probs:
        # torch.stack keeps the autograd graph of every term; building a new
        # tensor with torch.tensor([...]) would copy the values and detach them.
        average_log_loss = -c * torch.stack(log_probs).sum() / len(X_train)
    else:
        # No data term for an empty batch (avoids a 0 / 0 = NaN result).
        average_log_loss = 0.0

    # Squared Frobenius norms, computed directly on the tensors.
    W_norm = W.float().pow(2).sum() / 2
    T_norm = T.pow(2).sum() / 2
    return average_log_loss + W_norm + T_norm

In [None]:
def compute_log_probability(X, y, W, T):
    """Log-probability ``log p(y | X)`` of one labelled sequence under the CRF.

    The score of a labelling is ``sum_s <X[s], W[y_s]> + sum_s T[y_s, y_{s+1}]``
    and the partition function is accumulated in log space with the forward
    recursion, so large scores do not overflow.

    Args:
        X: (positions, features) tensor for one sequence; may contain padded rows.
        y: (positions, labels) one-hot tensor. Padded positions are all-zero rows.
           The labelled rows are assumed to be the leading rows, one non-zero
           entry each.
        W: (labels, features) node weights.
        T: (labels, labels) transition weights, indexed ``T[previous, current]``.

    Returns:
        0-dim tensor with the log-probability of the labelling. Only the labelled
        prefix of the sequence is scored and normalised; an unlabelled sequence
        yields 0.
    """
    # One (position, label) index pair per non-zero entry, in position order.
    labelled = torch.nonzero(y)
    length = labelled.shape[0]
    if length == 0:
        return W.new_zeros(())
    labels = labelled[:, 1]

    # node_scores[s, k] = <X[s], W[k]> for the labelled positions only.
    node_scores = torch.matmul(X[:length].to(W.dtype), W.t())

    positions = torch.arange(length, device=labels.device)
    unnormalized = node_scores[positions, labels].sum()
    unnormalized = unnormalized + T[labels[:-1], labels[1:]].sum()

    # Forward recursion in log space:
    # log_alpha[a] = node[s, a] + logsumexp_b(log_alpha_prev[b] + T[b, a])
    log_alpha = node_scores[0]
    for s in range(1, length):
        log_alpha = node_scores[s] + torch.logsumexp(log_alpha.unsqueeze(1) + T, dim=0)
    log_Z = torch.logsumexp(log_alpha, dim=0)

    return unnormalized - log_Z

In [None]:
def forward_messages(self,X, W, T):
    """Forward (alpha) messages of the linear-chain CRF, in probability space.

    ``f_msgs[0, a] = exp(<X[0], W[a]>)`` and, for ``i >= 1``,
    ``f_msgs[i, a] = exp(<X[i], W[a]>) * sum_b exp(T[b, a]) * f_msgs[i-1, b]``.

    Args:
        self: object providing ``num_labels`` (number of labels) and
            ``embed_dim`` (number of positions to process).
        X: per-position feature rows; rows ``0 .. embed_dim-1`` are used.
        W: per-label weight rows; rows ``0 .. num_labels-1`` are used.
        T: transition weights indexed ``T[previous, current]``.

    Returns:
        Double tensor of shape ``(embed_dim, num_labels)``. The messages are not
        normalised, so they can overflow for large scores.
    """
    n = self.num_labels      # 26 letters
    m = self.embed_dim       # word length
    if m == 0:
        return torch.zeros((0, n), dtype=torch.double)

    # Everything is done in double precision so the running products are not
    # truncated to float32 between positions.
    node_potentials = torch.exp(torch.matmul(X[:m].double(), W[:n].double().t()))  # (m, n)
    transition_potentials = torch.exp(T[:n, :n].double())                          # [b, a]

    rows = [node_potentials[0]]
    for i in range(1, m):
        # (rows[-1] @ transition_potentials)[a] = sum_b f_msgs[i-1, b] * exp(T[b, a])
        rows.append(node_potentials[i] * torch.matmul(rows[-1], transition_potentials))
    return torch.stack(rows)

In [6]:
 def backward(self):
        """
        Return the gradient of the CRF layer
        :return: the flat gradient vector produced by ``crf_obj_prime``
        """
        # crf_obj_prime reads `.x` and `.num_labels` from its first argument, so
        # it must receive the layer itself, not the flat parameter tensor.
        # NOTE(review): X_data, y_data and c are not defined in this method and
        # are looked up in the enclosing (notebook) scope - confirm they exist
        # before this is called.
        gradient = crf_obj_prime(self, X_data, y_data, c)
        return gradient

In [8]:
def crf_decode(self,W, T, X_train):
    """Decode the highest-scoring label sequence for every example (max-sum / Viterbi).

    The score of a labelling is ``sum_i <X[i], W[y_i]> + sum_i T[y_i, y_{i+1}]``,
    i.e. transitions are indexed ``T[previous, current]``.

    Args:
        self: object providing ``num_labels`` (number of labels) and
            ``embed_dim`` (number of positions decoded per example).
        W: per-label weight rows.
        T: transition weights.
        X_train: iterable of per-example feature tensors (positions x features).

    Returns:
        int8 tensor of shape ``(len(X_train), embed_dim, num_labels)`` holding a
        one-hot row per position, with column ``k`` set for label ``k``. Every one
        of the ``embed_dim`` positions is decoded, including any padded ones.
    """
    y_pred=[]
    n= self.num_labels
    m=self.embed_dim

    def maxSumBottomUp(X, W, T):
        """Return ``(MAP, yStar)``: best total score and its label indices (long tensor)."""
        node = torch.matmul(X[:m].to(W.dtype), W[:n].t())   # node[i, b] = <X[i], W[b]>
        trans = T[:n, :n]

        # best[i][a]: best score of positions 0..i-1 plus the transition into
        # label a at position i (the node score of position i is not included).
        best = [node.new_zeros(n)]
        for i in range(1, m):
            via_previous = (node[i-1] + best[i-1]).unsqueeze(1) + trans   # [b, a]
            best.append(via_previous.max(dim=0)[0])

        final = node[m-1] + best[m-1]
        MAP, last = final.max(0)

        # Walk back from the last position, each time picking the predecessor
        # that is best given the label already chosen for the next position.
        path = [0] * m
        path[m-1] = int(last)
        for i in range(m-1, 0, -1):
            candidates = node[i-1] + best[i-1] + trans[:, path[i]]
            path[i-1] = int(candidates.argmax())

        return MAP, torch.tensor(path, dtype=torch.long)

    for X in X_train:
        MAP, yStar=maxSumBottomUp(X, W, T)
        target=torch.zeros((m,n),dtype=torch.int8)
        # Label k is encoded in column k.
        target[torch.arange(m), yStar]=1
        y_pred.append(target)

    if not y_pred:
        # torch.stack cannot stack an empty list.
        return torch.zeros((0, m, n), dtype=torch.int8)
    return torch.stack(y_pred)

In [None]:
def crf_obj_prime(self, X_data, y_data, c=1000):
    """Gradient of the data term of the CRF objective w.r.t. the flat parameters.

    Sums ``compute_grad_W`` / ``compute_grad_T`` over all examples, scales the
    sums by ``-c / len(X_data)`` and concatenates them in the same order as the
    flat parameter vector (W first, then T).

    Args:
        self: object providing ``x`` (flat parameter tensor) and ``num_labels``.
        X_data: per-example inputs, paired element-wise with y_data.
        y_data: per-example labels.
        c: weight of the data term.

    Returns:
        1-D double tensor of length ``num_labels*128 + num_labels**2``.
    """
    # x is a vector. so reshape it into w_y and T
    W = self.x[:128*26].view(26,128)  # W is 26*128
    T = self.x[128*26:].view(26, 26)  # T is 26*26
    n= self.num_labels

    # Both accumulators are torch tensors so they can be added to, scaled and
    # concatenated with torch ops below.
    grad_Ws = torch.zeros((n, 128), dtype=torch.double)
    grad_Ts = torch.zeros((n, n),  dtype=torch.double)

    for X, y in zip(X_data, y_data):
        grad_Ws += compute_grad_W(X, y, W, T)
        grad_Ts += compute_grad_T(X, y, W, T)

    # NOTE(review): only the likelihood term is differentiated here. If this is
    # meant to be the gradient of an objective that also contains
    # ||W||^2/2 + ||T||^2/2, then W and T still have to be added - confirm.
    grad_Ws = (-c) * grad_Ws / float(len(X_data))
    grad_Ts = (-c) * grad_Ts / float(len(X_data))

    grad_theta = torch.cat([grad_Ws.reshape(-1), grad_Ts.reshape(-1)])

    return grad_theta