In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler,SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup

In [2]:
from tqdm import tqdm, trange

In [3]:
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
import spacy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import torch
from sklearn.feature_extraction.text import CountVectorizer
from Chapter01.dividing_into_sentences import read_text_file, preprocess_text, divide_into_sentences_nltk
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import words

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Star\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'NVIDIA GeForce 840M'

In [5]:
df_train = pd.read_csv("merged_train.csv", delimiter='\,', header=None,
names=['label','publication_number', 'cpc_code', 'sentence'])
df_train.shape

  df_train = pd.read_csv("merged_train.csv", delimiter='\,', header=None,


(1048575, 4)

In [6]:
pd.set_option('display.max_colwidth', None)
df_train.sample(10)

Unnamed: 0,label,publication_number,cpc_code,sentence
231406,231406,US-11491030-B2,A61B17/1684,{for the shoulder}
403453,403453,US-2013344081-A1,A61P25/00,Drugs for disorders of the nervous system
218802,218802,US-11413939-B2,B60J1/006,{characterised by fixation means such as clips adhesive etc. (incorporated in seal B60J10/70)}
663978,663978,US-2023022652-A1,H04L61/5007,Internet protocol [IP] addresses
970907,970907,US-7045252-B2,H01M6/06,Dry cells i.e. cells wherein the electrolyte is rendered non-fluid
267046,267046,US-11675567-B2,G06N3/047,Probabilistic or stochastic networks
1024440,1024440,US-7633872-B2,H04Q2213/13164,Traffic (registration measurement ...)
156931,156931,US-11031793-B2,H01M10/42,Methods or arrangements for servicing or maintenance of secondary cells or secondary half-cells (H01M10/60 takes precedence)
26228,26228,US-10183660-B2,B60W10/06,including control of combustion engines
148940,148940,US-10978403-B2,H01L2224/29147,Copper [Cu] as principal constituent


In [7]:
sentences = df_train.sentence.values

In [8]:
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = df_train.label.values

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[10])

Tokenize the first sentence:
['[CLS]', '{', 'shaping', '}', '[SEP]']


In [10]:
print (tokenized_texts[9])

['[CLS]', 'bea', '##d', 'characterized', 'by', 'the', 'radial', 'extent', 'of', 'apex', 'flip', '##per', 'or', 'cha', '##fer', 'into', 'tire', 'side', '##wall', '[SEP]']


In [11]:
MAX_LEN = 128

In [12]:
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

In [13]:
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",truncating="post", padding="post")

In [14]:
print (input_ids)

[[  101  6512  1997 ...     0     0     0]
 [  101  1063  2007 ...     0     0     0]
 [  101  1063  2007 ...     0     0     0]
 ...
 [  101  3424 23585 ...     0     0     0]
 [  101  1015  1016 ...     0     0     0]
 [  101  3495  5799 ...     0     0     0]]


In [15]:
attention_masks = []

In [16]:
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

In [17]:
train_inputs, validation_inputs, train_labels, validation_labels =train_test_split(input_ids, labels, random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks,input_ids,random_state=2018, test_size=0.1)

In [18]:
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [19]:
batch_size = 32

In [20]:
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [21]:
validation_data = TensorDataset(validation_inputs, validation_masks,validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [22]:
print (input_ids)

[[  101  6512  1997 ...     0     0     0]
 [  101  1063  2007 ...     0     0     0]
 [  101  1063  2007 ...     0     0     0]
 ...
 [  101  3424 23585 ...     0     0     0]
 [  101  1015  1016 ...     0     0     0]
 [  101  3495  5799 ...     0     0     0]]


In [23]:
try:
    import transformers
except:
    print("Installing transformers")
    !pip -qq install transformers

In [24]:
from transformers import BertModel, BertConfig
configuration = BertConfig()

In [25]:
model = BertModel(configuration)

In [26]:
configuration = model.config
print(configuration)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [27]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [29]:
print("Torch version:",torch.__version__)

Torch version: 2.2.2+cu118


In [30]:
print("Is CUDA enabled?",torch.cuda.is_available())

Is CUDA enabled? True


In [31]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']

In [32]:
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.1},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}]

In [33]:
optimizer = torch.optim.AdamW(optimizer_grouped_parameters,lr=2e-5)

In [34]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [35]:
t = []
train_loss_set = []

In [36]:
epochs = 4

In [37]:
df_test = pd.read_csv("merged_test.csv", delimiter='\,', header=None,
names=['label','publication_number', 'cpc_code', 'sentence'])
df_test.shape

  df_test = pd.read_csv("merged_test.csv", delimiter='\,', header=None,


(58, 4)

In [38]:
labels = df_test.label.values
sentences_list = df_test['sentence'].values.tolist()
sentences = df_test['sentence']
sentences

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    {involving kinase}
1                                                                                                                                                                                                                                                                                                                                       Omics  e.g. proteomics  glycomics or lipidomics; Methods of analysis focusing on the entire complement of classes of biological molecule

In [39]:
nlp = spacy.load("en_core_web_sm")
text = sentences[57]

#doc = nlp(sentences[:])
filtered_words = [token.text for token in nlp(sentences[1]) if not token.is_stop]
clean_text = ' '.join(filtered_words)
print("Original Text:", sentences[57])
print("Text after Stopword Removal:", clean_text)

Original Text: with automatic carbide feed by valves
Text after Stopword Removal: Omics   e.g. proteomics   glycomics lipidomics ; Methods analysis focusing entire complement classes biological molecules subsets thereof   i.e. focusing proteomes   glycomes lipidomes


In [40]:
nlp = spacy.load("en_core_web_sm")
text = "There is a pen on the table"
doc = nlp(text)
filtered_words = [token.text for token in doc if not token.is_stop]
clean_text = ' '.join(filtered_words)
print("Original Text:", text)
print("Text after Stopword Removal:", clean_text)

Original Text: There is a pen on the table
Text after Stopword Removal: pen table


In [41]:
pd.set_option('display.width', 1000)
cpc_words_df= pd.DataFrame(columns=['label', 'cpc_words'])
vectorizer = CountVectorizer(stop_words='english')
#X = vectorizer.fit_transform(sentences)
analyze = vectorizer.build_analyzer()
#print(vectorizer.get_feature_names_out(1))
for i in range(len(sentences)):
    row = [labels[i], analyze(sentences[i])]
#    print(i)
#    slno = labels[i]
#    word_list = analyze(sentences[i])
    cpc_words_df.loc[len(cpc_words_df)] = row
cpc_words_df = cpc_words_df.reset_index(drop=True)  
cpc_words_df

Unnamed: 0,label,cpc_words
0,0,"[involving, kinase]"
1,1,"[omics, proteomics, glycomics, lipidomics, methods, analysis, focusing, entire, complement, classes, biological, molecules, subsets, thereof, focusing, proteomes, glycomes, lipidomes]"
2,2,"[methods, identifying, protein, protein, interactions, protein, mixtures]"
3,3,"[methods, protein, analysis, involving, mass, spectrometry]"
4,4,"[ict, specially, adapted, hybridisation, ict, specially, adapted, gene, protein, expression]"
5,5,"[gene, protein, expression, profiling, expression, ratio, estimation, normalisation]"
6,6,"[ict, specially, adapted, modelling, simulations, systems, biology, gene, regulatory, networks, protein, interaction, networks, metabolic, networks]"
7,7,"[discovery, centralising, entities]"
8,8,"[supporting, data, block, transmission, mechanisms, file, transfer, h04l67, 06]"
9,9,"[distributed, storage, data, networks, transport, arrangements, network, file, nfs, storage, area, networks, san, network, attached, storage, nas]"


In [42]:
cpc_words_df.to_csv("cpc_words_converted.csv")

In [43]:
cpc_words_df['cpc_words'] = cpc_words_df['cpc_words'].apply(lambda x: list(pd.unique(x)))
cpc_words_df

Unnamed: 0,label,cpc_words
0,0,"[involving, kinase]"
1,1,"[omics, proteomics, glycomics, lipidomics, methods, analysis, focusing, entire, complement, classes, biological, molecules, subsets, thereof, proteomes, glycomes, lipidomes]"
2,2,"[methods, identifying, protein, interactions, mixtures]"
3,3,"[methods, protein, analysis, involving, mass, spectrometry]"
4,4,"[ict, specially, adapted, hybridisation, gene, protein, expression]"
5,5,"[gene, protein, expression, profiling, ratio, estimation, normalisation]"
6,6,"[ict, specially, adapted, modelling, simulations, systems, biology, gene, regulatory, networks, protein, interaction, metabolic]"
7,7,"[discovery, centralising, entities]"
8,8,"[supporting, data, block, transmission, mechanisms, file, transfer, h04l67, 06]"
9,9,"[distributed, storage, data, networks, transport, arrangements, network, file, nfs, area, san, attached, nas]"


In [44]:
cpc_words_df.to_csv("cpc_words_test.csv")

In [45]:
import pandas as pd
df_train_null = pd.read_csv("temp1.csv", delimiter=',',  header=None, names=['label','cpc_desc_unique'], encoding='latin1')
df_train_null.shape

(44500, 2)

In [46]:
df_train_null

Unnamed: 0,label,cpc_desc_unique
0,0,"ribs, grooves, arrangement"
1,1,"edges, bead, turn, ply, belt, extending, core, folded"
2,2,"flange, position, rim, protectors, bead, located, outside, extensions, radially"
3,3,"shaped, sidewall, crescent, comprising, inserts, rubber"
4,4,"tyre, constructions, parts, provided"
...,...,...
44495,44495,pumps
44496,44496,"g01n30, takes, precedence, 36, pressure, speed"
44497,44497,"moving, linearly"
44498,44498,"flow, 2625, 2617, b67c3, precedence, 2637, control, devices, using, f16k, general, valves"


In [47]:
pd.set_option('display.max_colwidth', None)
cpc_words_df[3595:3599]

Unnamed: 0,label,cpc_words


In [48]:
cpc_words_df.to_csv("cpc_words_unique.csv")

In [49]:
del cpc_words_df['label']

In [50]:
cpc_words_df[3595:3599]

Unnamed: 0,cpc_words


In [51]:
import pandas as pd
df_test_null = pd.read_csv("cpc_words_test.csv", delimiter=',',  header=None, names=['label','cpc_desc_unique'], encoding='latin1')
df_test_null

Unnamed: 0,label,cpc_desc_unique
,label,cpc_words
0.0,0,"['involving', 'kinase']"
1.0,1,"['omics', 'proteomics', 'glycomics', 'lipidomics', 'methods', 'analysis', 'focusing', 'entire', 'complement', 'classes', 'biological', 'molecules', 'subsets', 'thereof', 'proteomes', 'glycomes', 'lipidomes']"
2.0,2,"['methods', 'identifying', 'protein', 'interactions', 'mixtures']"
3.0,3,"['methods', 'protein', 'analysis', 'involving', 'mass', 'spectrometry']"
4.0,4,"['ict', 'specially', 'adapted', 'hybridisation', 'gene', 'protein', 'expression']"
5.0,5,"['gene', 'protein', 'expression', 'profiling', 'ratio', 'estimation', 'normalisation']"
6.0,6,"['ict', 'specially', 'adapted', 'modelling', 'simulations', 'systems', 'biology', 'gene', 'regulatory', 'networks', 'protein', 'interaction', 'metabolic']"
7.0,7,"['discovery', 'centralising', 'entities']"
8.0,8,"['supporting', 'data', 'block', 'transmission', 'mechanisms', 'file', 'transfer', 'h04l67', '06']"


In [52]:
import pandas as pd
df_test_null = pd.read_csv("raw_test.csv", delimiter=',',  header=None, names=['label','cpc_desc_unique'], encoding='latin1')
df_test_null.shape

(58, 2)

In [53]:
import itertools
from itertools import groupby
import re

def is_alpha(word):
    return bool(re.match('^[a-zA-Z]+$', word))
corrected_df = pd.DataFrame(columns=['label', 'cpc_desc'])
new_list= df_test_null['cpc_desc_unique']
filtered_lists = []
dfs_to_concat = []
for i in range(len(new_list)):
    test_list = new_list[i]  # Assuming new_list is a list of strings
    words = test_list.split(', ')  # Split the string into words using comma as the delimiter
    res = [word for word in words if is_alpha(word)]  # Filter out non-alphabetic words
#    corrected_df = corrected_df.append({'label' :i, 'cpc_desc': res }, ignore_index=True)

    filtered_lists.append(res)
for i, res in enumerate(filtered_lists):
    df_row = pd.DataFrame({'label': [i], 'cpc_desc': [res]})
    corrected_df=pd.concat([corrected_df,df_row])
#    corrected_df = corrected_df.append({'label' :i, 'cpc_desc': res }, ignore_index=True)
    print(i, res)
corrected_df


0 ['involving', 'kinase']
1 ['omics', 'proteomics', 'glycomics', 'lipidomics', 'methods', 'analysis', 'focusing', 'entire', 'complement', 'classes', 'biological', 'molecules', 'subsets', 'thereof', 'proteomes', 'glycomes', 'lipidomes']
2 ['methods', 'identifying', 'protein', 'interactions', 'mixtures']
3 ['methods', 'protein', 'analysis', 'involving', 'mass', 'spectrometry']
4 ['ict', 'specially', 'adapted', 'hybridisation', 'gene', 'protein', 'expression']
5 ['gene', 'protein', 'expression', 'profiling', 'ratio', 'estimation', 'normalisation']
6 ['ict', 'specially', 'adapted', 'modelling', 'simulations', 'systems', 'biology', 'gene', 'regulatory', 'networks', 'protein', 'interaction', 'metabolic']
7 ['discovery', 'centralising', 'entities']
8 ['supporting', 'data', 'block', 'transmission', 'mechanisms', 'file', 'transfer']
9 ['distributed', 'storage', 'data', 'networks', 'transport', 'arrangements', 'network', 'file', 'nfs', 'area', 'san', 'attached', 'nas']
10 ['sunroofs', 'precedenc

Unnamed: 0,label,cpc_desc
0,0,"[involving, kinase]"
0,1,"[omics, proteomics, glycomics, lipidomics, methods, analysis, focusing, entire, complement, classes, biological, molecules, subsets, thereof, proteomes, glycomes, lipidomes]"
0,2,"[methods, identifying, protein, interactions, mixtures]"
0,3,"[methods, protein, analysis, involving, mass, spectrometry]"
0,4,"[ict, specially, adapted, hybridisation, gene, protein, expression]"
0,5,"[gene, protein, expression, profiling, ratio, estimation, normalisation]"
0,6,"[ict, specially, adapted, modelling, simulations, systems, biology, gene, regulatory, networks, protein, interaction, metabolic]"
0,7,"[discovery, centralising, entities]"
0,8,"[supporting, data, block, transmission, mechanisms, file, transfer]"
0,9,"[distributed, storage, data, networks, transport, arrangements, network, file, nfs, area, san, attached, nas]"


In [54]:
corrected_df.to_csv('corrected_test.csv')