In [2]:
# Install required packages
%pip install transformers
%pip install pandas
%pip install torch

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.



In [3]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForQuestionAnswering, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from torch.nn import CrossEntropyLoss
import numpy as np
import random
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load dataset
file_path = r'C:\Users\elsie\Desktop\My projects\New folder\summative\Machine-Learning-Summative\empower_women_job_creation.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the dataframe to ensure it's loaded correctly
print(df.head())


                                            question  \
0  What are some organizations that help women fi...   
1            How can women start their own business?   
2  What resources are available for women entrepr...   
3         How can women improve their employability?   
4  What government programs support women in busi...   

                                              answer  
0  Organizations like Women for Women Internation...  
1  Women can start their own business by identify...  
2  Resources for women entrepreneurs include ment...  
3  Women can improve their employability by gaini...  
4  Government programs such as the Small Business...  


In [6]:
# Preprocess data
def preprocess_text(text):
    text = text.lower()
    text = ''.join([c for c in text if c.isalnum() or c.isspace()])
    return text

df['question'] = df['question'].apply(preprocess_text)
df['answer'] = df['answer'].apply(preprocess_text)

In [7]:

# Tokenize data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [21]:

# Tokenize the questions and answers
def tokenize_data(questions, answers, tokenizer, max_len=256):
    input_ids = []
    attention_masks = []

    for question, answer in zip(questions, answers):
        encoded_dict = tokenizer.encode_plus(
            question, answer,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

questions = df['question'].values
answers = df['answer'].values

input_ids, attention_masks = tokenize_data(questions, answers, tokenizer)

In [22]:
# Create DataLoader
labels = torch.tensor(np.arange(len(questions)), dtype=torch.long)
dataset = TensorDataset(input_ids, attention_masks, labels)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

batch_size = 8

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size
)

In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [24]:
# Fine-tune BERT
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

def train_model(model, train_dataloader, validation_dataloader, optimizer, scheduler, epochs=3):
    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        model.train()

        total_train_loss = 0
        for step, batch in enumerate(train_dataloader):
            b_input_ids, b_input_mask, b_labels = batch

            b_input_ids = b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)
            b_labels = b_labels.to(device)

            model.zero_grad()

            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

            start_logits, end_logits = outputs.start_logits, outputs.end_logits
            loss = CrossEntropyLoss()(start_logits, b_labels) + CrossEntropyLoss()(end_logits, b_labels)
            
            total_train_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping to prevent exploding gradients

            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        print(f'Average training loss: {avg_train_loss}')

        model.eval()

        total_val_loss = 0
        for batch in validation_dataloader:
            b_input_ids, b_input_mask, b_labels = batch

            b_input_ids = b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)
            b_labels = b_labels.to(device)

            with torch.no_grad():
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

                start_logits, end_logits = outputs.start_logits, outputs.end_logits
                loss = CrossEntropyLoss()(start_logits, b_labels) + CrossEntropyLoss()(end_logits, b_labels)

                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(validation_dataloader)
        print(f'Validation loss: {avg_val_loss}')

train_model(model, train_dataloader, validation_dataloader, optimizer, scheduler, epochs)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Average training loss: 11.297593116760254
Validation loss: 10.9411039352417
Epoch 2/3
Average training loss: 10.748747825622559
Validation loss: 10.748943328857422
Epoch 3/3
Average training loss: 10.382055282592773
Validation loss: 10.650065422058105


In [26]:
# Adjusted get_answer function
def get_answer(question, model, tokenizer, max_len=256):
    encoding = tokenizer.encode_plus(question, return_tensors='pt', max_length=max_len, truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)

    # Ensure answer_end is not before answer_start
    if answer_end < answer_start:
        answer_end = answer_start

    # Convert tokens to string answer
    answer = tokenizer.decode(input_ids[0][answer_start:answer_end+1], skip_special_tokens=True)

    return answer

# Example conversation with the chatbot
questions = [
    "What are some organizations that help women find jobs?",
    "How can women start their own business?"
]

for question in questions:
    print(f"Question: {question}")
    print(f"Answer: {get_answer(question, model, tokenizer)}")


Question: What are some organizations that help women find jobs?
Answer: some organizations that help women find jobs?
Question: How can women start their own business?
Answer: 
