In [5]:
import os
!pip install transformers
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pickle
import torch
import re
import pandas as pd
from tqdm.auto import tqdm
import numpy as np



In [6]:
# Load the tokenizer and the sequence-classification model from the same
# checkpoint directory. The module-level `tokenizer` and `model` objects created
# here are what predictions() binds as its default arguments.
path = '/content/drive/MyDrive/Interos/Reuters11ClassesModel' # Path to the model checkpoint
tokenizer = AutoTokenizer.from_pretrained(path,use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(path)

In [7]:
# Restore the label encoder object from disk; it is bound to `le`, which
# predictions() uses as its default `label_encoder` (presumably a fitted
# sklearn LabelEncoder, since `.classes_` is read from it later).
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this
# file must come from a trusted source.
label_path ='/content/drive/MyDrive/Interos/label_encoder.pickle' # path to the location where the pickle file for label encoder is stored
with open(label_path,'rb') as f:
    le = pickle.load(f)

In [8]:
def read_file(path):
    """
    Read a .sgm file and extract the article text and the topic assigned to
    each article.

    path: Path to the .sgm file

    Returns a tuple (text, labels) of two lists of equal length. labels[k] is
    the first <D> entry found inside the <TOPICS> element of the k-th kept
    article and text[k] is the text extracted for that same article:
      * <BODY> content when the <TEXT> tag carries no attributes,
      * <TITLE> content when the <TEXT> tag is exactly TYPE="BRIEF",
      * the payload following '&#2;' in <TEXT TYPE="UNPROC"> otherwise.

    Note: Articles with no assigned topic are skipped. An article that has a
    topic but no extractable text is kept with an empty string so that text
    and labels stay aligned.
    """
    text = []
    labels = []

    # Undecodable bytes are dropped rather than raising.
    with open(path, 'r', errors='ignore') as f:
        data = f.read()

    # Flatten newlines and runs of spaces: '.' does not match a newline, so the
    # '(.*?)' patterns below could not span multi-line elements otherwise.
    data = re.sub('\n|  +', ' ', data)

    for record in tqdm(data.split('</REUTERS>')):
        # Skip tiny fragments such as the whitespace left after the last
        # closing tag.
        if len(record) < 10:
            continue

        topics_block = re.findall("<TOPICS>(.*?)</TOPICS>", record)
        if not topics_block or not topics_block[0]:
            continue
        topic_names = re.findall("<D>(.*?)</D>", topics_block[0])
        if not topic_names:
            # A non-empty <TOPICS> element without any <D> entry carries no
            # usable label, so treat it like an unlabelled article.
            continue

        text_type = re.findall('<TEXT (.*?)>', record)
        if not text_type:
            article = re.findall("<BODY>(.*?)</BODY>", record)
        elif 'TYPE="BRIEF"' in text_type:
            article = re.findall("<TITLE>(.*?)</TITLE>", record)
        else:
            article = re.findall('<TEXT TYPE="UNPROC">&#2;(.*?)</TEXT>', record)

        # Append label and text together, exactly once each, so the two lists
        # can never drift apart.
        labels.append(topic_names[0])
        text.append(article[0] if article else '')

    return text, labels

In [9]:
def predictions(path,batch_size=16,label_encoder=le,model=model,
                tokenizer=tokenizer,return_df = True):
    """
    Read a folder/file (.sgm format), run the custom transformer model on the
    extracted articles and report the accuracy against the assigned topics.

    path: Path to a .sgm file, or to a folder whose top-level .sgm files are
          all read (in sorted filename order)
    batch_size: The batch size to be used by the model
    label_encoder: Sklearn Label Encoder used to map topics to class ids and
                   back; topics not in its classes_ are mapped to 'others'
    model: Custom Model For Prediction
    tokenizer: HuggingFace Tokenizer to preprocess the input for the model
    return_df : Whether to also return a dataframe containing three attributes
                Article, Topic, Predictions

    Returns (df, accuracy, predicted_topics, true_topics) when return_df is
    True, otherwise (accuracy, predicted_topics, true_topics).

    Raises ValueError when no labelled article could be read from path.

    Note: the defaults for label_encoder, model and tokenizer are the
    module-level objects as they existed when this function was defined.
    """
    # Assuming all the articles have been assigned a Topic
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    text = []
    topics = []
    if os.path.isdir(path):
        # Only real '.sgm' files directly inside the folder; sorting keeps the
        # row order reproducible between runs.
        sgm_files = sorted(
            fname for fname in os.listdir(path)
            if fname.endswith('.sgm') and os.path.isfile(os.path.join(path, fname))
        )
        for fname in tqdm(sgm_files):
            temp_text,temp_topics = read_file(os.path.join(path,fname))
            text.extend(temp_text)
            topics.extend(temp_topics)
    else:
        text, topics = read_file(path)

    if not text:
        # Fail with a clear message instead of a ZeroDivisionError further down.
        raise ValueError(f'No labelled articles found in {path!r}')

    known_topics = set(label_encoder.classes_)
    topics = [x if x in known_topics else 'others' for x in topics]
    labels = label_encoder.transform(topics)

    model.to(device)
    # Inference only: make sure dropout and similar layers are disabled.
    model.eval()

    # Collect per-batch predictions in a list. Testing an accumulated array
    # with `.any()` would mistake an all-zero (class id 0) array for "empty".
    batch_preds = []
    with torch.no_grad():  # no gradients needed, saves memory
        for i in tqdm(range(0,len(text),batch_size)):
            tokens = tokenizer(text[i:i+batch_size],return_tensors='pt',
                               padding=True,truncation=True)
            # Move every tensor the tokenizer produced (not just input_ids and
            # attention_mask) to the same device as the model.
            tokens = {name: tensor.to(device) for name, tensor in tokens.items()}
            # Getting the predictions
            logits = model(**tokens).logits
            # .cpu() is a no-op when the tensor already lives on the CPU.
            batch_preds.append(np.argmax(logits.cpu().numpy(),axis=1))
    preds = np.concatenate(batch_preds,axis=0)

    accuracy = np.mean(preds == np.asarray(labels))
    print(f'Accuracy of the model on the given Test Set is {accuracy}')

    predicted_topics = label_encoder.inverse_transform(preds)
    true_topics = np.array(topics)
    if return_df:
        df = pd.DataFrame({'Article': text, 'Topic': topics,
                           'Predictions': predicted_topics})
        return df,accuracy,predicted_topics,true_topics
    return accuracy,predicted_topics,true_topics

In [10]:
# Evaluate the classifier on a single .sgm file. With return_df left at its
# default (True), predictions() returns the dataframe, the accuracy, the
# predicted topic names and the true topic names, in that order.
df_final,acc,pred,true_label = predictions('/content/drive/MyDrive/reuters21578/reut2-009.sgm')

0it [00:00, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

Accuracy of the model on the given Test Set is 0.9674657534246576


In [13]:
# Preview up to the first 50 rows: article text, assigned topic and predicted topic.
df_final.head(50)

Unnamed: 0,Article,Topic,Predictions
0,Shr loss seven cts vs profit five cts Net los...,earn,earn
1,Commerce Secretary Malcolm Baldrige said he su...,trade,money-fx
2,EASTMAN KODAK CO TO SELL HOLDINGS IN ICN PHARM...,acq,acq
3,Treasury balances at the Federal Reserve rose ...,interest,money-supply
4,USX Corp's USS subsidiary said that effective ...,others,others
5,"William Bywater, president of the Internationa...",trade,trade
6,Zaire has been authorized to purchase about 30...,grain,grain
7,&lt;Midivest Inc> said it acquired all the ass...,acq,acq
8,The Commodity Credit Corporation (CCC) has swi...,grain,grain
9,Central bank intervention in the foreign excha...,money-fx,money-fx
