<a href="https://colab.research.google.com/github/dayana-cabrera004/npl/blob/main/AI_apps_bbook_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install kaggle streamlit transformers torch pandas scikit-learn

In [None]:
import os
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import streamlit as st

# Set up Kaggle API key
def setup_kaggle(username=None, key=None, config_dir="/content/"):
    """Write a ``kaggle.json`` credentials file and point the Kaggle CLI at it.

    Credentials are deliberately NOT hard-coded here: an API key committed to
    a notebook is readable by anyone who can see the repository. They are
    taken from the arguments or, when omitted, from the ``KAGGLE_USERNAME``
    and ``KAGGLE_KEY`` environment variables.

    NOTE: any key that was previously committed in this notebook should be
    treated as leaked and revoked from the Kaggle account settings.

    Args:
        username: Kaggle account name; falls back to ``$KAGGLE_USERNAME``.
        key: Kaggle API key; falls back to ``$KAGGLE_KEY``.
        config_dir: Directory that receives ``kaggle.json`` and is exported
            as ``KAGGLE_CONFIG_DIR``.

    Raises:
        RuntimeError: If no username or key is available.
    """
    username = username or os.environ.get("KAGGLE_USERNAME")
    key = key or os.environ.get("KAGGLE_KEY")
    if not username or not key:
        raise RuntimeError(
            "Kaggle credentials not found. Set the KAGGLE_USERNAME and "
            "KAGGLE_KEY environment variables (or pass them explicitly) "
            "instead of hard-coding them in the notebook."
        )

    os.makedirs(config_dir, exist_ok=True)
    credentials_path = os.path.join(config_dir, "kaggle.json")
    # Create the file readable/writable by the owner only, so the key is not
    # exposed to other users of the machine.
    fd = os.open(credentials_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as f:
        json.dump({"username": username, "key": key}, f)
    # The mode passed to os.open is ignored when the file already existed,
    # so tighten the permissions explicitly as well.
    os.chmod(credentials_path, 0o600)
    os.environ['KAGGLE_CONFIG_DIR'] = config_dir

# Authenticate and download dataset
setup_kaggle()
!kaggle datasets download -d cscastilloliva90/it-books-names-and-descriptions --unzip

# List files in the current directory to identify the dataset
print("Extracted files:", os.listdir("."))

# Load dataset into a pandas DataFrame
dataset_file = 'AllITBooks_DataSet.xlsx'  # Correct file name from the extracted files
if dataset_file in os.listdir("."):
    df = pd.read_excel(dataset_file)
else:
    raise FileNotFoundError(f"Dataset file {dataset_file} not found. Check the extracted files and update the script.")

# Data preprocessing
df = df.dropna(subset=['Description', 'Category'])
label_encoder = LabelEncoder()
df['Category_Label'] = label_encoder.fit_transform(df['Category'])

# Split the data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Description'], df['Category_Label'], test_size=0.2, random_state=42
)

# Define custom dataset class
class BookDataset(Dataset):
    """Torch dataset that tokenizes one book description per item.

    ``texts`` and ``labels`` are read positionally through ``.iloc``, so both
    are expected to be pandas objects of equal length. ``tokenizer`` must
    provide ``encode_plus``; ``max_len`` is the length every sequence is
    padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of descriptions in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized description and label at position ``idx``.

        The result is a dict with 1-D ``input_ids`` and ``attention_mask``
        tensors plus a scalar ``labels`` tensor of dtype ``torch.long``.
        """
        description = self.texts.iloc[idx]
        target = self.labels.iloc[idx]
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(description, **tokenizer_options)
        # The tokenizer returns tensors with a leading batch axis; flatten()
        # drops it so the collator can stack the items itself.
        item = {name: encoded[name].flatten() for name in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Record the category names in the model config so the checkpoint saved below
# is self-describing: the LabelEncoder itself is never written to disk, and
# without this mapping a reloaded model could only report numeric class ids.
id2label = {index: str(name) for index, name in enumerate(label_encoder.classes_)}
label2id = {name: index for index, name in id2label.items()}
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
    id2label=id2label,
    label2id=label2id,
)

# Create datasets
train_dataset = BookDataset(train_texts, train_labels, tokenizer)
test_dataset = BookDataset(test_texts, test_labels, tokenizer)

# Define training arguments
# Notes:
# - `eval_strategy` and `save_strategy` must match for `load_best_model_at_end` to work correctly.
# - These settings ensure periodic evaluation and saving of the best-performing model.
training_args = TrainingArguments(
    output_dir='./results',  # Directory to save model checkpoints and results
    eval_strategy='epoch',  # Evaluate the model at the end of each epoch
    save_strategy='epoch',  # Save model checkpoints at the end of each epoch
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    num_train_epochs=3,  # Number of training epochs
    logging_dir='./logs',  # Directory for logging training progress
    logging_steps=10,  # Log every 10 steps during training
    load_best_model_at_end=True,  # Automatically load the best model at the end of training
)

# Initialize Trainer
# NOTE(review): the captured run output shows a warning pointing at this
# `Trainer(` call - presumably the `tokenizer` argument being deprecated in
# newer transformers releases; confirm against the installed version before
# changing it.
trainer = Trainer(
    model=model,  # Pre-trained BERT model
    args=training_args,  # Training arguments
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=test_dataset,  # Evaluation dataset
    tokenizer=tokenizer,  # Tokenizer for preprocessing text inputs
)

# Train the model
# This step trains the model on the training dataset and evaluates it periodically based on the settings above.
trainer.train()

# Save the trained model
# After training, save the model and tokenizer for later use.
model.save_pretrained('./book_categorization_model')
tokenizer.save_pretrained('./book_categorization_model')

# Streamlit app
# Create a user interface for predicting book categories.
# NOTE(review): these UI calls live in the same script as the training code
# above, so training runs every time the script executes - verify whether the
# frontend should instead load the saved model from './book_categorization_model'.
st.title('Automated Book Categorization')
st.write('Enter a book description to predict its category.')

def predict_category(description):
    """Predict the category name for a single book description.

    Args:
        description: Free-text book description to classify.

    Returns:
        The category name obtained by mapping the highest-scoring class
        index back through ``label_encoder``.
    """
    # Tokenize with the same settings used for the training dataset
    # (pad/truncate to 512 tokens, PyTorch tensors).
    encoding = tokenizer.encode_plus(
        description,
        add_special_tokens=True,
        max_length=512,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # Training may have left the model on an accelerator; the inputs must be
    # on the same device or the forward pass raises a device-mismatch error.
    device = next(model.parameters()).device
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # eval() makes sure dropout is disabled so predictions are deterministic;
    # no_grad() skips autograd bookkeeping that inference does not need.
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    prediction = outputs.logits.argmax(dim=1)

    # Convert the predicted label back to the category name
    return label_encoder.inverse_transform([prediction.item()])[0]

# Free-text box holding the description to classify
description = st.text_area('Book Description')

# Classify only when the button is pressed; ask for input if the box is empty
if st.button('Predict Category'):
    if not description:
        st.write('Please enter a book description.')
    else:
        category = predict_category(description)
        st.write(f'Predicted Category: **{category}**')


Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m923.4 kB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m67.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,2.3508,2.810014
2,2.0226,2.421761
3,1.9609,2.32742


2025-01-18 01:17:01.612 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-01-18 01:17:01.631 Session state does not function when running a script without `streamlit run`


# **This will be for the Streamlit frontend**

In [None]:
!pip install kaggle streamlit transformers torch pandas scikit-learn

In [None]:


import os
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import streamlit as st
import torch.nn.functional as F  # For calculating softmax probabilities

# Set up Kaggle API key
def setup_kaggle(username=None, key=None, config_dir="/content/"):
    """Write a ``kaggle.json`` credentials file and point the Kaggle CLI at it.

    Credentials are deliberately NOT hard-coded here: an API key committed to
    a notebook is readable by anyone who can see the repository. They are
    taken from the arguments or, when omitted, from the ``KAGGLE_USERNAME``
    and ``KAGGLE_KEY`` environment variables.

    NOTE: any key that was previously committed in this notebook should be
    treated as leaked and revoked from the Kaggle account settings.

    Args:
        username: Kaggle account name; falls back to ``$KAGGLE_USERNAME``.
        key: Kaggle API key; falls back to ``$KAGGLE_KEY``.
        config_dir: Directory that receives ``kaggle.json`` and is exported
            as ``KAGGLE_CONFIG_DIR``.

    Raises:
        RuntimeError: If no username or key is available.
    """
    username = username or os.environ.get("KAGGLE_USERNAME")
    key = key or os.environ.get("KAGGLE_KEY")
    if not username or not key:
        raise RuntimeError(
            "Kaggle credentials not found. Set the KAGGLE_USERNAME and "
            "KAGGLE_KEY environment variables (or pass them explicitly) "
            "instead of hard-coding them in the notebook."
        )

    os.makedirs(config_dir, exist_ok=True)
    credentials_path = os.path.join(config_dir, "kaggle.json")
    # Create the file readable/writable by the owner only, so the key is not
    # exposed to other users of the machine.
    fd = os.open(credentials_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as f:
        json.dump({"username": username, "key": key}, f)
    # The mode passed to os.open is ignored when the file already existed,
    # so tighten the permissions explicitly as well.
    os.chmod(credentials_path, 0o600)
    os.environ['KAGGLE_CONFIG_DIR'] = config_dir

# Authenticate and download dataset
setup_kaggle()
!kaggle datasets download -d cscastilloliva90/it-books-names-and-descriptions --unzip

# List files in the current directory to identify the dataset
print("Extracted files:", os.listdir("."))

# Load dataset into a pandas DataFrame
dataset_file = 'AllITBooks_DataSet.xlsx'  # Correct file name from the extracted files
if dataset_file in os.listdir("."):
    df = pd.read_excel(dataset_file)
else:
    raise FileNotFoundError(f"Dataset file {dataset_file} not found. Check the extracted files and update the script.")

# Data preprocessing
df = df.dropna(subset=['Description', 'Category'])
label_encoder = LabelEncoder()
df['Category_Label'] = label_encoder.fit_transform(df['Category'])

# Split the data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Description'], df['Category_Label'], test_size=0.2, random_state=42
)

# Define custom dataset class
class BookDataset(Dataset):
    """Torch dataset that tokenizes one book description per item.

    ``texts`` and ``labels`` are read positionally through ``.iloc``, so both
    are expected to be pandas objects of equal length. ``tokenizer`` must
    provide ``encode_plus``; ``max_len`` is the length every sequence is
    padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of descriptions in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized description and label at position ``idx``.

        The result is a dict with 1-D ``input_ids`` and ``attention_mask``
        tensors plus a scalar ``labels`` tensor of dtype ``torch.long``.
        """
        description = self.texts.iloc[idx]
        target = self.labels.iloc[idx]
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(description, **tokenizer_options)
        # The tokenizer returns tensors with a leading batch axis; flatten()
        # drops it so the collator can stack the items itself.
        item = {name: encoded[name].flatten() for name in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Record the category names in the model config so the checkpoint saved below
# is self-describing: the LabelEncoder itself is never written to disk, and
# without this mapping a reloaded model could only report numeric class ids.
id2label = {index: str(name) for index, name in enumerate(label_encoder.classes_)}
label2id = {name: index for index, name in id2label.items()}
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
    id2label=id2label,
    label2id=label2id,
)

# Create datasets
train_dataset = BookDataset(train_texts, train_labels, tokenizer)
test_dataset = BookDataset(test_texts, test_labels, tokenizer)

# Define training arguments
# Notes:
# - `eval_strategy` and `save_strategy` must match for `load_best_model_at_end` to work correctly.
# - These settings ensure periodic evaluation and saving of the best-performing model.
training_args = TrainingArguments(
    output_dir='./results',  # Directory to save model checkpoints and results
    eval_strategy='epoch',  # Evaluate the model at the end of each epoch
    save_strategy='epoch',  # Save model checkpoints at the end of each epoch
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    num_train_epochs=3,  # Number of training epochs
    logging_dir='./logs',  # Directory for logging training progress
    logging_steps=10,  # Log every 10 steps during training
    load_best_model_at_end=True,  # Automatically load the best model at the end of training
)

# Initialize Trainer
# NOTE(review): the captured run output shows a warning pointing at this
# `Trainer(` call - presumably the `tokenizer` argument being deprecated in
# newer transformers releases; confirm against the installed version before
# changing it.
trainer = Trainer(
    model=model,  # Pre-trained BERT model
    args=training_args,  # Training arguments
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=test_dataset,  # Evaluation dataset
    tokenizer=tokenizer,  # Tokenizer for preprocessing text inputs
)

# Train the model
# This step trains the model on the training dataset and evaluates it periodically based on the settings above.
trainer.train()

# Save the trained model
# After training, save the model and tokenizer for later use.
model.save_pretrained('./book_categorization_model')
tokenizer.save_pretrained('./book_categorization_model')

# Streamlit app
# Create a user interface for predicting book categories.
# NOTE(review): these UI calls live in the same script as the training code
# above, so training runs every time the script executes - verify whether the
# frontend should instead load the saved model from './book_categorization_model'.
st.title('Automated Book Categorization')
st.write('Enter a book description to predict its category.')

def predict_category(description, confidence_threshold=0.5):
    """Predict the category name for a single book description.

    Args:
        description: Free-text book description to classify.
        confidence_threshold: Minimum softmax probability the top class must
            reach; below it the description is reported as uncategorized.

    Returns:
        The category name mapped back through ``label_encoder``, or the
        string ``"Uncategorized"`` when the top probability is below
        ``confidence_threshold``.
    """
    # Tokenize with the same settings used for the training dataset
    # (pad/truncate to 512 tokens, PyTorch tensors).
    encoding = tokenizer.encode_plus(
        description,
        add_special_tokens=True,
        max_length=512,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # Training may have left the model on an accelerator; the inputs must be
    # on the same device or the forward pass raises a device-mismatch error.
    device = next(model.parameters()).device
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # eval() makes sure dropout is disabled so the reported confidence is
    # deterministic; no_grad() skips autograd bookkeeping inference does not need.
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    probabilities = F.softmax(outputs.logits, dim=1)  # Probability for each category
    confidence, prediction = torch.max(probabilities, dim=1)  # Top probability and its class index

    # Refuse to name a category when the model is not confident enough
    if confidence.item() < confidence_threshold:
        return "Uncategorized"

    # Convert the predicted label back to the category name
    return label_encoder.inverse_transform([prediction.item()])[0]

# Free-text box holding the description to classify
description = st.text_area('Book Description')

# Classify only when the button is pressed; ask for input if the box is empty
if st.button('Predict Category'):
    if not description:
        st.write('Please enter a book description.')
    else:
        category = predict_category(description)
        st.write(f'Predicted Category: **{category}**')
