In [23]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from transformers import AutoModel
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader

In [24]:
patent_data = pd.read_csv('df_claim_cpc_1400.csv',encoding='unicode_escape')

### Sort the data

Pre-processing to get the possible labels.

In [25]:
top_level= {}
mid_level = {}
group_level = {}

i=0
for label in patent_data.iterrows():
    tl = label[1]['group_id'][2]
    ml = label[1]['group_id'][2:5]
    gl = label[1]['group_id'][2:6]
    sentence = label[1]['text'].encode().decode("utf-8")
    
    if tl in top_level:
        top_level[tl].append(sentence)
    else:
        top_level[tl] = [sentence]
    
    if ml in mid_level:
        mid_level[ml].append(sentence)
    else:
        mid_level[ml] = [sentence]

    if gl in group_level:
        group_level[gl].append(sentence)
    else:
        group_level[gl] = [sentence]

In [26]:
def gen_labels(classes):
    labels = {}
    for i,l in enumerate(classes):
        labels[l] = float(i)
    return labels

top_labels = gen_labels(top_level)
mid_labels = gen_labels(mid_level)
group_labels = gen_labels(group_level)

### Fine-tune SBERT

In [27]:
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = patent_data['text']
groups = patent_data['group_id']

train_examples = []

for t in group_level: 
    for k in group_level[t]:
        train_examples.append(InputExample(texts=k, label=group_labels[t])) # can fine-tune on other label types

#Define train dataset, the dataloader and the train loss
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=1)
train_loss = losses.CosineSimilarityLoss(sbert_model)

do_train = False

if do_train:
    sbert_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=2)
    sbert_model.save("sbert")
else:
    sbert_model = SentenceTransformer('sbert')

embeddings = sbert_model.encode(sentences)

KeyboardInterrupt: 

### Define generic softmax classifier

In [None]:
import torch
import torch.nn as nn

class Classifier(nn.Module):
    def __init__(self, output_size, input_size=384, hidden_size1=1000, hidden_size2=1000):
        super().__init__()
        self.linear1 = nn.Linear(input_size, hidden_size1)
        self.linear2 = nn.Linear(hidden_size1, hidden_size2)
        self.linear3 = nn.Linear(hidden_size2, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.relu(x)
        x = self.linear3(x)
        x = self.softmax(x)
        return x
    
    def train(self, train_loader, criterion, optimizer, num_epochs):
        for epoch in range(num_epochs):
            for inputs, labels in train_loader:
                # Clear the gradients
                optimizer.zero_grad()

                # Compute the predicted outputs
                outputs = self(inputs)

                # Compute the loss
                loss = criterion(outputs, labels)

                # Backpropagate the gradients
                loss.backward()
                
                # Update the model parameters
                optimizer.step()
            if epoch % 10 == 0:
                print(f"Loss: {loss}, epoch: {epoch}")

### Test-train split

In [None]:
train_data_top = []
test_data_top = []
embeddings = torch.tensor(embeddings)
for i in range(len(embeddings)):
    if i < 1000:
        train_data_top.append([embeddings[i], int(top_labels[patent_data['group_id'][i][2]])])
    else:
        test_data_top.append([embeddings[i], int(top_labels[patent_data['group_id'][i][2]])])

train_data_mid = []
test_data_mid = []

for i in range(len(embeddings)):
    if i < 1000:
        train_data_mid.append([embeddings[i], int(mid_labels[patent_data['group_id'][i][2:5]])])
    else:
        test_data_mid.append([embeddings[i], int(mid_labels[patent_data['group_id'][i][2:5]])])


  embeddings = torch.tensor(embeddings)


In [None]:
top_classifier = Classifier(output_size=len(top_level))

top_classifier.train(
    train_loader=DataLoader(train_data_top, shuffle=True, batch_size=10),
    criterion=nn.CrossEntropyLoss(),
    optimizer=torch.optim.SGD(top_classifier.parameters(), lr=0.01, momentum=0.9),
    num_epochs=100
)

Loss: 2.1885478496551514, epoch: 0
Loss: 2.2044105529785156, epoch: 10
Loss: 2.1265623569488525, epoch: 20
Loss: 1.9089206457138062, epoch: 30
Loss: 1.8980792760849, epoch: 40
Loss: 1.7630351781845093, epoch: 50
Loss: 1.661988615989685, epoch: 60
Loss: 1.6849559545516968, epoch: 70
Loss: 1.872053861618042, epoch: 80
Loss: 1.7781860828399658, epoch: 90


In [None]:
mid_classifier = Classifier(output_size=len(mid_level))

mid_classifier.train(
    train_loader=DataLoader(train_data_mid, shuffle=True, batch_size=32),
    criterion=nn.CrossEntropyLoss(),
    optimizer=torch.optim.SGD(mid_classifier.parameters(), lr=0.02),
    num_epochs=100
)

Loss: 4.6634840965271, epoch: 0
Loss: 4.663344860076904, epoch: 10
Loss: 4.663525581359863, epoch: 20
Loss: 4.663418292999268, epoch: 30
Loss: 4.66337251663208, epoch: 40
Loss: 4.663315773010254, epoch: 50
Loss: 4.663266181945801, epoch: 60
Loss: 4.663301467895508, epoch: 70
Loss: 4.663339614868164, epoch: 80
Loss: 4.663333415985107, epoch: 90


In [None]:
accurate = 0

for e,l in train_data_top:
    sm = top_classifier.forward(torch.unsqueeze(e,0))
    if int(torch.argmax(sm)) == l:
        accurate += 1
print(f"Train accuracy: {accurate/len(train_data_top)}")
accurate = 0
for e,l in test_data_top:
    sm = top_classifier.forward(torch.unsqueeze(e,0))
    if int(torch.argmax(sm)) == l:
        accurate += 1
print(f"Test accuracy: {accurate/len(test_data_top)}")

Train accuracy: 0.635
Test accuracy: 0.41849148418491483


In [None]:
accurate = 0

for e,l in test_data_mid:
    sm = top_classifier.forward(torch.unsqueeze(e,0))
    if int(torch.argmax(sm)) == l:
        accurate += 1
accurate/len(test_data_mid)

0.41362530413625304

### Sub-classifier

In [52]:
class Subclassifier(nn.Module):
    def __init__(self, output_size, input_size=384, hidden_size1=1000, hidden_size2=1000):
        super().__init__()
        self.linear1 = nn.Linear(input_size, hidden_size1)
        self.linear2 = nn.Linear(hidden_size1, hidden_size2)
        self.linear3 = nn.Linear(hidden_size2 + 1, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        k=torch.argmax(top_classifier.forward(x),dim=1)
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.relu(x)
        x = torch.cat((x,torch.unsqueeze(k,1)),dim=1)
        x = self.linear3(x)
        x = self.softmax(x)
        return x
    
    def train(self, train_loader, criterion, optimizer, num_epochs):
        for epoch in range(num_epochs):
            for inputs, labels in train_loader:
                # Clear the gradients
                optimizer.zero_grad()

                # Compute the predicted outputs
                outputs = self(inputs)

                # Compute the loss
                loss = criterion(outputs, labels)

                # Backpropagate the gradients
                loss.backward()
                
                # Update the model parameters
                optimizer.step()
            if epoch % 10 == 0:
                print(f"Loss: {loss}, epoch: {epoch}")

In [53]:
mid_classifier2 = Subclassifier(output_size=len(mid_level))

mid_classifier2.train(
    train_loader=DataLoader(train_data_mid, shuffle=True, batch_size=32),
    criterion=nn.CrossEntropyLoss(),
    optimizer=torch.optim.SGD(mid_classifier2.parameters(), lr=0.02),
    num_epochs=1000
)

Loss: 4.663654327392578, epoch: 0
Loss: 4.663252353668213, epoch: 10
Loss: 4.661838054656982, epoch: 20
Loss: 4.663501739501953, epoch: 30
Loss: 4.653690338134766, epoch: 40
Loss: 4.66337776184082, epoch: 50
Loss: 4.437601566314697, epoch: 60
Loss: 4.667328834533691, epoch: 70
Loss: 4.424098491668701, epoch: 80
Loss: 4.670032501220703, epoch: 90
Loss: 4.667972564697266, epoch: 100
Loss: 4.546344757080078, epoch: 110
Loss: 4.544979572296143, epoch: 120
Loss: 4.551736831665039, epoch: 130
Loss: 4.546972751617432, epoch: 140
Loss: 4.666375637054443, epoch: 150
Loss: 4.669826507568359, epoch: 160
Loss: 4.549212455749512, epoch: 170
Loss: 4.544863700866699, epoch: 180
Loss: 4.547497272491455, epoch: 190
Loss: 4.546455383300781, epoch: 200
Loss: 4.666857719421387, epoch: 210
Loss: 4.543020248413086, epoch: 220
Loss: 4.542840480804443, epoch: 230
Loss: 4.424198627471924, epoch: 240
Loss: 4.670831680297852, epoch: 250
Loss: 4.5482072830200195, epoch: 260
Loss: 4.421463489532471, epoch: 270
Los

In [54]:
accurate1 = 0
accurate2 = 0

for e,l in test_data_mid:
    sm1 = mid_classifier.forward(torch.unsqueeze(e,0))
    sm2 = mid_classifier2.forward(torch.unsqueeze(e,0))
    if int(torch.argmax(sm1)) == l:
        accurate1 += 1
    if int(torch.argmax(sm2)) == l:
        accurate2 += 1
       
print(accurate1/len(test_data_mid),accurate2/len(test_data_mid))

0.09732360097323602 0.14841849148418493
