<a href="https://colab.research.google.com/github/dave21-py/Siri-project/blob/main/dlproject2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
# Relative path: the CSV is looked up in the kernel's current working directory.
file_path = 'siri_dataset_large.csv'
# Read the whole file into a DataFrame; later cells use its 'utterance' and 'intent' columns.
df = pd.read_csv(file_path)

In [None]:
# Confirm the load and report how many rows were read.
print("Dataset loaded succesfully")
print("Total no. of examples", df.shape[0])

In [None]:
print("Dataset loaded successfully! Here are the first 5 rows:")
# Kept as the cell's last expression so the notebook renders the preview table.
df.head(n=5)

In [None]:
# Prepare data for training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Number of missing values in each column (`isna` is the same check as `isnull`).
df.isna().sum()

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

print("Bad rows removed.")
print("New total examples:", df.shape[0])

In [None]:
# Model input is the raw utterance text; the target is the intent name.
X, y = df['utterance'], df['intent']

In [None]:
# Learn the intent-name -> integer-id mapping, then encode every label with it.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity check: print the shape of each of the four arrays produced by the split.
print("Data preparation complete!")
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


In [None]:
# Show the first label before and after encoding.
# `y` is a pandas Series that still carries the original row labels, so `y[0]`
# is a label lookup and raises KeyError if row 0 was removed by `dropna()`.
# `.iloc[0]` is positional and therefore always refers to the same row as
# `y_encoded[0]`.
print(y.iloc[0])
print(y_encoded[0])

In [None]:
# Vectorisation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [None]:
# Pipeline, step tfidf and logistic regression

In [None]:
# Baseline model: TF-IDF features fed into a logistic-regression classifier.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=1000)),
])

In [None]:
print("Training the base model")
# Fits both pipeline steps on the training split only; the test split stays unseen.
pipeline.fit(X_train, y_train)
print("Training complete")

In [None]:
# Evaluating our model

In [None]:
from sklearn.metrics import accuracy_score, classification_report

In [None]:
print("Making predictions on the test data.....")
# Encoded intent ids predicted for every held-out utterance.
predictions = pipeline.predict(X_test)

In [None]:
# Fraction of test utterances whose predicted intent id equals the true one.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

In [None]:
# `accuracy` is a 0-1 fraction; scale to a percentage with two decimals for display.
print(f"New Baseline model accuracy: {accuracy * 100:.2f}%")

In [None]:
# Generate a detailed per-intent classification report.
print("Detailed classification report......")
# Pass the full list of encoded class ids explicitly. Without `labels`, the
# report only covers classes that occur in `y_test`/`predictions`, and a class
# missing from this split makes `target_names` the wrong length (ValueError).
# `zero_division=0` reports 0 for classes with no predicted/true samples
# instead of emitting a warning for each of them.
report = classification_report(
    y_test,
    predictions,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
    zero_division=0,
)
print(report)

In [None]:
sentence = 'What is the temperate outside right now?'
def predict_intent(sentence):
  """Return the intent name the baseline pipeline predicts for one sentence.

  The sentence is wrapped in a single-element list for `pipeline.predict`, and
  the resulting encoded id is mapped back to its text label with
  `label_encoder`.
  """
  encoded = pipeline.predict([sentence])
  # Only one sentence was passed in, so only the first prediction is decoded.
  decoded = label_encoder.inverse_transform(encoded[:1])
  return decoded[0]

In [None]:
# Manual smoke test: run one hand-written utterance through the baseline model.
test_sentence = 'Set me a time for 30 seconds'
print(f"Sentence: '{test_sentence}'")
print(f"Predicted Intent: {predict_intent(test_sentence)}")

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [None]:
# Settings for the Keras text pipeline
# Passed as `num_words` to the Tokenizer and as `input_dim` to the Embedding layer.
vocab_size = 1000
# Length every sequence is padded to by `pad_sequences`.
max_length = 20
# Token used for the Tokenizer's `oov_token`.
# NOTE(review): spelled with two zeros ("00"), not the letters "OO" — confirm this is intended.
oov_token = '<00V>'

In [None]:
# Create and fit the tokenizer

In [None]:
# The vocabulary is built from the training texts only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(X_train)

In [None]:
# Convert the training and testing text to numerical sequences

In [None]:
# Encode both splits with the same fitted tokenizer (train first, then test).
X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
# Pad the new sequences

In [None]:
# Bring every sequence to `max_length` tokens, padding at the end ('post').
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (X_train_sequences, X_test_sequences)
)

In [None]:

# Sanity check: show the first training sentence next to its encoded form.


print("Original sentence:", X_train.iloc[0])
# The same (first) training example as a padded sequence of token ids.
print("Padded sequence:", X_train_padded[0])
print("Data is now ready for deep learning model")



In [None]:
#LSTM NN

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [None]:
# Intent names learned by the label encoder; the position in this array is the encoded id.
print(label_encoder.classes_)

In [None]:
# Number of distinct intents; used as the size of the model's softmax output layer.
num_classes = len(label_encoder.classes_)

In [None]:
# Embedding -> LSTM -> softmax classifier over the intent classes.
# `mask_zero=True` marks token id 0 as padding. The inputs are zero-padded at
# the END of each sequence, so without a mask the LSTM keeps stepping through
# the padding after the real words, and its final state (the only thing the
# Dense layer sees) is dominated by padding for short utterances.
model = Sequential([
    Embedding(input_dim = vocab_size, output_dim = 16, input_shape = (max_length,), mask_zero = True),
    LSTM(32),
    Dense(num_classes, activation='softmax')
])

In [None]:
#Compile the model

In [None]:
# Labels are integer ids (not one-hot), hence the *sparse* categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Training process

In [None]:
# Number of full passes over the training data for the LSTM.
num_epochs = 50

In [None]:
print("Starting training of the LSTM Model.....")
# Note: the held-out test split is also used as the validation set here, so the
# validation curves are measured on the same data used for final evaluation.
history = model.fit(
    X_train_padded,
    y_train,
    epochs=num_epochs,
    validation_data=(X_test_padded, y_test),
)
print("Training complete")

In [None]:
# Visualising our output

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Get the accuracy values from the training history

In [None]:
# Per-epoch accuracy on the training data and on the validation data.
acc, val_acc = (history.history[key] for key in ('accuracy', 'val_accuracy'))

In [None]:
# Get the loss values from the training history

In [None]:
# Per-epoch loss on the training data and on the validation data.
loss, val_loss = (history.history[key] for key in ('loss', 'val_loss'))

In [None]:
# X-axis values for the curves: one point per recorded epoch, starting at 0.
epochs_range = range(len(acc))

In [None]:
# Opens a 12x6 inch figure.
# NOTE(review): with the notebook's inline backend a figure is shown and closed
# when its cell finishes, so this one is presumably rendered empty and is not
# the figure later cells draw on — confirm.
plt.figure(figsize=(12, 6))

In [None]:
# Create the figure in the same cell that draws on it. Notebook inline backends
# finalise figures at the end of each cell, so a figure opened in an earlier
# cell is not the one these plots land on and its figsize would be lost.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: accuracy per epoch.
ax_acc.plot(epochs_range, acc, label='Training Accuracy')
ax_acc.plot(epochs_range, val_acc, label='Validation Accuracy')
ax_acc.legend(loc='lower right')
ax_acc.set_title('Training and Validation Accuracy (New Dataset)')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')

# Right panel: loss per epoch.
ax_loss.plot(epochs_range, loss, label='Training Loss')
ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_title('Training and Validation Loss (New Dataset)')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')

plt.show()


In [None]:
# This shows it's not performing well even with our LSTM model

In [None]:
# Install the Hugging Face `transformers` package quietly (-q); no version is pinned.
!pip install -q transformers

In [None]:
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

In [None]:
# Load the tokenizer that matches the pretrained 'distilbert-base-uncased' checkpoint.
transformer_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# Tokenization FOR THE DATA FOR DISTILBERT

In [None]:
# Tokenise the training texts as TensorFlow tensors. Each sequence is padded to
# exactly 50 tokens; with truncation on, anything longer than 50 is cut off.
# Note: `train_encodings` is assigned again by the PyTorch cells further down.
train_encodings = transformer_tokenizer(
    list(X_train),
    truncation=True,
    padding='max_length',
    max_length=50,
    return_tensors='tf',
)

In [None]:
# Same tokenisation settings as the training texts: fixed length 50, truncated, TF tensors.
test_encodings = transformer_tokenizer(
    list(X_test),
    truncation=True,
    padding='max_length',
    max_length=50,
    return_tensors='tf',
)

In [None]:
# Load the pretrained model

In [None]:
import tensorflow as tf

In [None]:
# Install PyTorch, transformers and datasets quietly (-q); no versions are pinned.
!pip install -q torch transformers datasets

In [None]:
# Import the main PyTorch library
import torch

# From the 'transformers' library, we import the PyTorch versions
# of the tokenizer and the sequence classification model.
# Notice there is no "TF" prefix this time.
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [None]:
# Create a tokenizer from the pre-trained 'distilbert-base-uncased' checkpoint.
# Note: this rebinds `tokenizer`, which earlier in the notebook held the Keras
# Tokenizer; the Keras cells cannot be re-run after this one without first
# re-creating that object.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# Tokenise each split (train first, then test). `padding=True` pads to the
# longest sequence within that split rather than to a fixed length.
train_encodings, test_encodings = (
    tokenizer(list(texts), truncation=True, padding=True)
    for texts in (X_train, X_test)
)


In [None]:
# Custom PyTorch Dataset class

In [None]:
class IntentDataset(torch.utils.data.Dataset):
  """Map-style dataset pairing tokenizer encodings with integer intent labels.

  Args:
    encodings: mapping from field name (e.g. 'input_ids') to a per-example
      sequence of values; anything exposing `.items()` whose values can be
      indexed by example position.
    labels: sequence of integer class ids, one per example, in the same order
      as the encodings.
  """

  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    """Return example `idx` as a dict of tensors, including a 'labels' entry."""
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    # Force int64: without an explicit dtype the tensor inherits the dtype of
    # the label array (e.g. int32), but PyTorch's cross-entropy loss only
    # accepts Long class indices.
    item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
    return item

  def __len__(self):
    """Number of examples, taken from the label sequence."""
    return len(self.labels)

# Wrap each split's encodings together with its encoded labels.
train_dataset = IntentDataset(encodings=train_encodings, labels=y_train)
test_dataset = IntentDataset(encodings=test_encodings, labels=y_test)

In [None]:
# Load the pretrained pytorch model

In [None]:
# Load the pre-trained DistilBERT weights into the PyTorch sequence-classification
# class. `num_labels` sizes the new classification head to one output per intent
# known to the label encoder.
# Note: this rebinds `model`, which earlier in the notebook held the Keras LSTM.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label_encoder.classes_),
)

In [None]:
# Import the Trainer and the training arguments class
from transformers import Trainer, TrainingArguments

# Hyper-parameters and output locations for fine-tuning.
training_args = TrainingArguments(
    output_dir='./results',            # where results are written
    logging_dir='./logs',              # where logs are written
    report_to='none',                  # no external experiment-tracking integration
    num_train_epochs=3,                # three full passes over the training set
    per_device_train_batch_size=16,    # examples per device per training step
    per_device_eval_batch_size=64,     # examples per device per evaluation step
    warmup_steps=50,                   # learning-rate warm-up steps
    weight_decay=0.01,                 # weight-decay regularisation
)

# Bundle the model, its settings and both datasets into a Trainer.
# Note: the test split is passed as the evaluation dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
print("Trainer re-created with logging disabled.")

In [None]:
# Fine-tune DistilBERT; left as the last expression so the notebook shows the returned training summary.
trainer.train()

In [None]:
# Re-running with Accuracy

In [None]:
import numpy as np

In [None]:
# Install the Hugging Face `evaluate` package quietly (-q); no version is pinned.
!pip install -q evaluate
import evaluate

In [None]:
# Load the "accuracy" metric from the evaluate library; `compute_metrics` uses this object.
metric = evaluate.load("accuracy")

# Metric callback handed to the Trainer.
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level accuracy `metric`.

    Args:
        eval_pred: a pair that unpacks into (logits, labels).

    Returns:
        Whatever `metric.compute` returns for the arg-max class predictions
        compared against the reference labels.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(logits, axis=-1)
    return metric.compute(references=labels, predictions=predicted_ids)

# Build a second Trainer around the same model, arguments and datasets, now
# with the accuracy callback attached so evaluation reports accuracy as well.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
print("Trainer has been re-created with the ability to calculate accuracy!")

In [None]:
# Evaluate on the Trainer's eval dataset (the test split); returns a dict of metric name -> value.
final_evaluation = trainer.evaluate()

# Print every returned metric with four decimals.
# NOTE(review): the `:.4f` format assumes every value is numeric — confirm.
print("--- Final Transformer Model Evaluation ---")
for key, value in final_evaluation.items():
    print(f"{key}: {value:.4f}")

In [None]:
import torch
import os
from transformers import DistilBertForSequenceClassification

# 1. Save the fine-tuned model held by the Trainer into `model_dir`.
model_dir = "./siri_bert_model"
trainer.save_model(model_dir)

# 2. Load it back from disk as a fresh model object to quantize.
model_fp32 = DistilBertForSequenceClassification.from_pretrained(model_dir)

# 3. Dynamic quantization: only the `torch.nn.Linear` layers are converted,
# using the 8-bit `torch.qint8` dtype.
quantized_model = torch.quantization.quantize_dynamic(
    model_fp32, {torch.nn.Linear}, dtype=torch.qint8
)

# 4. Save the quantized model. The whole module object is passed to
# `torch.save` (not just a state dict).
# NOTE(review): loading it back presumably needs the same class definitions
# importable, and recent PyTorch may need `weights_only=False` — confirm with the consumer.
quantized_output_path = "siri_bert_quantized.pt"
torch.save(quantized_model, quantized_output_path)

# 5. Compare the file sizes
def get_size(path):
    """Return the size of the file at `path` in megabytes (bytes / 1024 / 1024)."""
    bytes_per_mb = 1024 * 1024
    return os.stat(path).st_size / bytes_per_mb

# Locate the saved full-precision weights. The weights file is named
# "model.safetensors" when saved in safetensors format and "pytorch_model.bin"
# otherwise, so fall back instead of failing with FileNotFoundError.
weights_path = os.path.join(model_dir, "model.safetensors")
if not os.path.exists(weights_path):
    weights_path = os.path.join(model_dir, "pytorch_model.bin")

original_size = get_size(weights_path)
quantized_size = get_size(quantized_output_path)

print(f"Original Model Size: {original_size:.2f} MB")
print(f"Quantized Model Size: {quantized_size:.2f} MB")
print(f"Compression Ratio: {original_size / quantized_size:.2f}x smaller!")

In [None]:
import pickle
from google.colab import files

# 1. Pickle the fitted label encoder so predicted ids can be mapped back to intent names later.
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

# 2. Trigger browser downloads of the quantized model and the pickled encoder
# (uses the Colab-only `files` helper).
print("Downloading files... check your browser's download bar!")
files.download("siri_bert_quantized.pt")
files.download("label_encoder.pkl")