In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.preprocessing import LabelEncoder
import pickle
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#step1: reading data
df = pd.read_csv("/content/multiclass SUBCAT.csv")

In [5]:
df.shape

(375807, 3)

In [6]:
#step2: dropping unnecessary columns
df = df.drop(['tpl_cat'], axis=1)

In [7]:
## Step 3: Removing nulls
df.dropna(subset=['tpl_subcat', 'name_displ'], inplace=True)

In [8]:
df.shape

(375807, 2)

In [9]:
# Step 4: removing duplicates
df.drop_duplicates(inplace=True)

In [10]:
df.shape

(289024, 2)

In [11]:
# Step 5: preprocessing to clean and standardize text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

df['name_displ'] = df['name_displ'].apply(preprocess_text)

In [12]:
df.head()

Unnamed: 0,tpl_subcat,name_displ
0,AUTO PARTS,sheikh autos
1,TELECOMMUNICATION,raza comm
2,OTHER LOCAL BUSINESSES,noman comm
4,TELECOMMUNICATION,victory comm
5,TELECOMMUNICATION,junaid comm


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 289024 entries, 0 to 375806
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   tpl_subcat  289024 non-null  object
 1   name_displ  289024 non-null  object
dtypes: object(2)
memory usage: 6.6+ MB


In [14]:
#step6: converting category labels in tpl_subcat into numerical form.
encoder = LabelEncoder()
df['labels'] = encoder.fit_transform(df['tpl_subcat'])

In [15]:
df.head()

Unnamed: 0,tpl_subcat,name_displ,labels
0,AUTO PARTS,sheikh autos,11
1,TELECOMMUNICATION,raza comm,178
2,OTHER LOCAL BUSINESSES,noman comm,127
4,TELECOMMUNICATION,victory comm,178
5,TELECOMMUNICATION,junaid comm,178


In [None]:
#step7: saved labels for later use
with open('subcat_labels.pkl', 'wb') as file:
    pickle.dump(encoder, file)

===================================================================================================

### TOKENIZING

In [16]:
#step8: load tokenizer
from transformers import XLNetTokenizer
from sklearn.model_selection import train_test_split
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

In [17]:
def encode_texts(tokenizer, texts, max_len=120):
    return tokenizer(texts, max_length=max_len, padding='max_length', truncation=True, return_tensors='pt')

In [18]:
# step9: Prepare inputs
encoded_data = encode_texts(tokenizer, df['name_displ'].tolist())
input_ids = encoded_data['input_ids']
attention_mask = encoded_data['attention_mask']

In [19]:
# step10: convert labels to tensor (long)
labels = torch.tensor(df['labels'].values, dtype=torch.long)

In [20]:
df.head()

Unnamed: 0,tpl_subcat,name_displ,labels
0,AUTO PARTS,sheikh autos,11
1,TELECOMMUNICATION,raza comm,178
2,OTHER LOCAL BUSINESSES,noman comm,127
4,TELECOMMUNICATION,victory comm,178
5,TELECOMMUNICATION,junaid comm,178


In [21]:
# step11: Split data
input_ids_train, input_ids_val, attention_mask_train, attention_mask_val, labels_train, labels_val = train_test_split(
    input_ids, attention_mask, labels, test_size=0.15, random_state=196)

In [22]:
len(input_ids_train)

245670

In [23]:
len(input_ids_val)

43354

In [24]:
len(attention_mask_train)

245670

In [25]:
len(attention_mask_val)

43354

### Dataset & Dataloader

In [26]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [27]:
#step12: create datasets
train_dataset = TensorDataset(input_ids_train, attention_mask_train, labels_train)
val_dataset = TensorDataset(input_ids_val, attention_mask_val, labels_val)

In [28]:
len(train_dataset)

245670

In [29]:
len(val_dataset)

43354

In [31]:
#step13: create dataloaders

batch_size = 16
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
validation_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

### MODEL TRAINING

In [32]:
#step14: loading XLNet model with a classification head
from transformers import XLNetForSequenceClassification

model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=207)

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# Move to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [39]:
#step15: Training Configuration
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

epochs = 5
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [40]:
#step16: TRAINING LOOP

from sklearn.metrics import accuracy_score, roc_curve, auc
import numpy as np

def train_epoch(model, data_loader, optimizer, device, scheduler):
    model.train()
    total_loss = 0
    # Total steps is the length of the data loader (total number of batches)
    total_steps = len(data_loader)
    for step, batch in enumerate(data_loader):
        optimizer.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print progress every 10 batches
        if step % 10 == 0:
            print(f"Batch {step}/{total_steps} - Loss: {loss.item()}")
    avg_loss = total_loss / total_steps
    return avg_loss

def evaluate(model, data_loader, device):
    model.eval()
    total_loss = 0
    predictions, true_labels = [], []
    total_steps = len(data_loader)
    with torch.no_grad():
        for step, batch in enumerate(data_loader):
            batch = tuple(b.to(device) for b in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'labels': batch[2]}
            outputs = model(**inputs)
            loss = outputs.loss
            total_loss += loss.item()
            logits = outputs.logits.detach().cpu().numpy()
            label_ids = inputs['labels'].to('cpu').numpy()
            predictions.append(logits)
            true_labels.append(label_ids)
            if step % 10 == 0:
                print(f"Evaluating Batch {step}/{total_steps}")

    predictions = np.concatenate(predictions, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)
    preds_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = true_labels.flatten()
    accuracy = accuracy_score(labels_flat, preds_flat)
    return total_loss / total_steps, accuracy


In [None]:
#step17: training and evaluation

epochs = 5
for epoch in range(epochs):
    print(f"\nStarting Epoch {epoch + 1}/{epochs}")
    train_loss = train_epoch(model, train_loader, optimizer, device, scheduler)
    print("Validation stats:")
    val_loss, val_accuracy = evaluate(model, validation_loader, device)
    print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.2f}, Val Loss: {val_loss:.2f}, Accuracy: {val_accuracy:.2f}')


Starting Epoch 1/5
Batch 0/15355 - Loss: 5.6053853034973145
Batch 10/15355 - Loss: 5.233277320861816
Batch 20/15355 - Loss: 5.022278785705566
Batch 30/15355 - Loss: 5.056413650512695
Batch 40/15355 - Loss: 5.020699977874756
Batch 50/15355 - Loss: 4.54371976852417
Batch 60/15355 - Loss: 5.02938175201416
Batch 70/15355 - Loss: 5.061321258544922
Batch 80/15355 - Loss: 4.774014949798584
Batch 90/15355 - Loss: 4.798666000366211
Batch 100/15355 - Loss: 4.6628193855285645
Batch 110/15355 - Loss: 4.867588996887207
Batch 120/15355 - Loss: 4.38192081451416
Batch 130/15355 - Loss: 4.056440830230713
Batch 140/15355 - Loss: 4.593669891357422
Batch 150/15355 - Loss: 4.3416972160339355
Batch 160/15355 - Loss: 4.680710315704346
Batch 170/15355 - Loss: 4.626595973968506
Batch 180/15355 - Loss: 4.587113857269287
Batch 190/15355 - Loss: 4.703821659088135
Batch 200/15355 - Loss: 4.395221710205078
Batch 210/15355 - Loss: 4.552143096923828
Batch 220/15355 - Loss: 4.682161331176758
Batch 230/15355 - Loss: 4

In [None]:
#step18: save the model
model.save_pretrained('./my_xlnet_model')
tokenizer.save_pretrained('./my_xlnet_model')
