In [2]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from transformers import AutoModel
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
patent_data = pd.read_csv('df_claim_cpc_1400.csv',encoding='unicode_escape')

### Sort the data

Pre-processing to get the possible labels.

In [4]:
top_level= {}
mid_level = {}
group_level = {}

i=0
for label in patent_data.iterrows():
    tl = label[1]['group_id'][2]
    ml = label[1]['group_id'][2:5]
    gl = label[1]['group_id'][2:6]
    sentence = label[1]['text'].encode().decode("utf-8")
    
    if tl in top_level:
        top_level[tl].append(sentence)
    else:
        top_level[tl] = [sentence]
    
    if ml in mid_level:
        mid_level[ml].append(sentence)
    else:
        mid_level[ml] = [sentence]

    if gl in group_level:
        group_level[gl].append(sentence)
    else:
        group_level[gl] = [sentence]

In [5]:
def gen_labels(classes):
    labels = {}
    for i,l in enumerate(classes):
        labels[l] = float(i)
    return labels

top_labels = gen_labels(top_level)
mid_labels = gen_labels(mid_level)
group_labels = gen_labels(group_level)

### Fine-tune SBERT

In [6]:
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = patent_data['text']
groups = patent_data['group_id']

train_examples = []

for t in group_level: 
    for k in group_level[t]:
        train_examples.append(InputExample(texts=k, label=group_labels[t])) # can fine-tune on other label types

#Define train dataset, the dataloader and the train loss
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=1)
train_loss = losses.CosineSimilarityLoss(sbert_model)

do_train = False

if do_train:
    sbert_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=2)
    sbert_model.save("sbert")
else:
    sbert_model = SentenceTransformer('sbert')

embeddings = sbert_model.encode(sentences)

### Define generic softmax classifier

In [10]:
import torch
import torch.nn as nn

class Classifier(nn.Module):
    def __init__(self, output_size, input_size=384, hidden_size1=1000, hidden_size2=1000):
        super().__init__()
        self.linear1 = nn.Linear(input_size, hidden_size1)
        self.linear2 = nn.Linear(hidden_size1, hidden_size2)
        self.linear3 = nn.Linear(hidden_size2, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.relu(x)
        x = self.linear3(x)
        x = self.softmax(x)
        return x
    
    def train(self, train_loader, criterion, optimizer, num_epochs):
        for epoch in range(num_epochs):
            for inputs, labels in train_loader:
                # Clear the gradients
                optimizer.zero_grad()

                # Compute the predicted outputs
                outputs = self(inputs)

                # Compute the loss
                loss = criterion(outputs, labels)

                # Backpropagate the gradients
                loss.backward()
                
                # Update the model parameters
                optimizer.step()
            if epoch % 100 == 0:
                print(f"Loss: {loss}, epoch: {epoch}")

### Test-train split

In [8]:
train_data_top = []
test_data_top = []
embeddings = torch.tensor(embeddings)
for i in range(len(embeddings)):
    if i < 1000:
        train_data_top.append([embeddings[i], int(top_labels[patent_data['group_id'][i][2]])])
    else:
        test_data_top.append([embeddings[i], int(top_labels[patent_data['group_id'][i][2]])])

train_data_mid = []
test_data_mid = []

for i in range(len(embeddings)):
    if i < 1000:
        train_data_mid.append([embeddings[i], int(mid_labels[patent_data['group_id'][i][2:5]])])
    else:
        test_data_mid.append([embeddings[i], int(mid_labels[patent_data['group_id'][i][2:5]])])


In [12]:
top_classifier = Classifier(output_size=len(top_level))

top_classifier.train(
    train_loader=DataLoader(train_data_top, shuffle=True, batch_size=16),
    criterion=nn.CrossEntropyLoss(),
    optimizer=torch.optim.SGD(top_classifier.parameters(), lr=0.01, momentum=0.9),
    num_epochs=1000
)

Loss: 2.195930004119873, epoch: 0
Loss: 1.8704313039779663, epoch: 100
Loss: 1.7475128173828125, epoch: 200
Loss: 1.6224596500396729, epoch: 300
Loss: 1.373646855354309, epoch: 400
Loss: 1.618364930152893, epoch: 500
Loss: 1.8566889762878418, epoch: 600
Loss: 1.6169908046722412, epoch: 700
Loss: 1.4975194931030273, epoch: 800
Loss: 1.6158488988876343, epoch: 900


In [14]:
mid_classifier = Classifier(output_size=len(mid_level))

mid_classifier.train(
    train_loader=DataLoader(train_data_mid, shuffle=True, batch_size=16),
    criterion=nn.CrossEntropyLoss(),
    optimizer=torch.optim.SGD(mid_classifier.parameters(), lr=0.02),
    num_epochs=1000
)

Loss: 4.663356781005859, epoch: 0
Loss: 4.6631760597229, epoch: 100
Loss: 4.662642002105713, epoch: 200
Loss: 4.66279935836792, epoch: 300
Loss: 4.670515537261963, epoch: 400
Loss: 4.430095672607422, epoch: 500
Loss: 4.67944860458374, epoch: 600
Loss: 4.42968225479126, epoch: 700
Loss: 4.55458402633667, epoch: 800
Loss: 4.429601669311523, epoch: 900


In [16]:
def accuracy(model, data):
    accurate = 0
    for e,l in data:
        sm = model.forward(torch.unsqueeze(e,0))
        if int(torch.argmax(sm)) == l:
            accurate += 1
    return accurate/len(data)


print(f"top_classifier train accuracy: {accuracy(top_classifier, train_data_top)}")
print(f"top_classifier test accuracy: {accuracy(top_classifier, test_data_top)}")
print(f"mid_classifier train accuracy: {accuracy(mid_classifier, train_data_mid)}")
print(f"mid_classifier test accuracy: {accuracy(mid_classifier, test_data_mid)}")

top_classifier train accuracy: 0.729
top_classifier test accuracy: 0.44038929440389296
mid_classifier train accuracy: 0.098
mid_classifier test accuracy: 0.09732360097323602


### Sub-classifier

In [19]:
class Subclassifier(nn.Module):
    def __init__(self, output_size, input_size=384, hidden_size1=1000, hidden_size2=1000):
        super().__init__()
        self.linear1 = nn.Linear(input_size, hidden_size1)
        self.linear2 = nn.Linear(hidden_size1, hidden_size2)
        self.linear3 = nn.Linear(hidden_size2 + 1, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        k=torch.argmax(top_classifier.forward(x),dim=1)
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.relu(x)
        x = torch.cat((x,torch.unsqueeze(k,1)),dim=1)
        x = self.linear3(x)
        x = self.softmax(x)
        return x
    
    def train(self, train_loader, criterion, optimizer, num_epochs):
        for epoch in range(num_epochs):
            for inputs, labels in train_loader:
                # Clear the gradients
                optimizer.zero_grad()

                # Compute the predicted outputs
                outputs = self(inputs)

                # Compute the loss
                loss = criterion(outputs, labels)

                # Backpropagate the gradients
                loss.backward()
                
                # Update the model parameters
                optimizer.step()
            if epoch % 100 == 0:
                print(f"Loss: {loss}, epoch: {epoch}")

In [20]:
mid_classifier2 = Subclassifier(output_size=len(mid_level))

mid_classifier2.train(
    train_loader=DataLoader(train_data_mid, shuffle=True, batch_size=16),
    criterion=nn.CrossEntropyLoss(),
    optimizer=torch.optim.SGD(mid_classifier2.parameters(), lr=0.02),
    num_epochs=1000
)

Loss: 4.664001941680908, epoch: 0
Loss: 4.671936988830566, epoch: 100
Loss: 4.545565605163574, epoch: 200
Loss: 4.540782451629639, epoch: 300
Loss: 4.52800989151001, epoch: 400
Loss: 4.457130432128906, epoch: 500
Loss: 4.67572546005249, epoch: 600
Loss: 4.43361759185791, epoch: 700
Loss: 4.676501750946045, epoch: 800
Loss: 4.671538352966309, epoch: 900


In [22]:
print(f"mid_classifier train accuracy: {accuracy(mid_classifier, train_data_mid)}")
print(f"mid_classifier test accuracy: {accuracy(mid_classifier, test_data_mid)}")
print(f"mid_classifier2 train accuracy: {accuracy(mid_classifier2, train_data_mid)}")
print(f"mid_classifier2 test accuracy: {accuracy(mid_classifier2, test_data_mid)}")

mid_classifier train accuracy: 0.098
mid_classifier test accuracy: 0.09732360097323602
mid_classifier2 train accuracy: 0.187
mid_classifier2 test accuracy: 0.1362530413625304
