# Dataset Preparation
Clean the datasets in the Kaggle environment before embedding generation and model training.


In [1]:
import pandas as pd
import re

# Dataset paths
cases_path = "/kaggle/input/participedia-ai-chatbot-project/Participedia_project/data/data_cases.csv"
methods_path = "/kaggle/input/participedia-ai-chatbot-project/Participedia_project/data/data_methods.csv"
organizations_path = "/kaggle/input/participedia-ai-chatbot-project/Participedia_project/data/data_organizations.csv"

# Load datasets
cases_df = pd.read_csv(cases_path).dropna(subset=["description"])
methods_df = pd.read_csv(methods_path).dropna(subset=["description"])
organizations_df = pd.read_csv(organizations_path).dropna(subset=["description"])

# Clean text
def clean_text(text):
    """Clean text by removing URLs, extra spaces, and unwanted characters."""
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with one
    text = re.sub(r'[^A-Za-z0-9.,!?\'\`]', ' ', text)  # Remove unwanted characters
    return text.strip()

# Add context validation
def validate_context(row):
    """Ensure context is populated; fallback to description or body."""
    if pd.notnull(row.get("context")) and row["context"].strip():
        return row["context"]
    elif pd.notnull(row.get("description")):
        return row["description"]
    elif pd.notnull(row.get("body")):
        return row["body"]
    return "No context available"

# Apply cleaning and context validation
for df in [cases_df, methods_df, organizations_df]:
    df["description"] = df["description"].apply(clean_text)
    df["context"] = df.apply(validate_context, axis=1)

# Save cleaned datasets
cases_cleaned_path = "/kaggle/working/cleaned_cases.csv"
methods_cleaned_path = "/kaggle/working/cleaned_methods.csv"
organizations_cleaned_path = "/kaggle/working/cleaned_organizations.csv"

cases_df.to_csv(cases_cleaned_path, index=False)
methods_df.to_csv(methods_cleaned_path, index=False)
organizations_df.to_csv(organizations_cleaned_path, index=False)

# Log summary
print(f"Cases dataset cleaned and saved to {cases_cleaned_path}.")
print(f"Methods dataset cleaned and saved to {methods_cleaned_path}.")
print(f"Organizations dataset cleaned and saved to {organizations_cleaned_path}.")
print(f"Total records: Cases ({len(cases_df)}), Methods ({len(methods_df)}), Organizations ({len(organizations_df)}).")


Cases dataset cleaned and saved to /kaggle/working/cleaned_cases.csv.
Methods dataset cleaned and saved to /kaggle/working/cleaned_methods.csv.
Organizations dataset cleaned and saved to /kaggle/working/cleaned_organizations.csv.
Total records: Cases (1964), Methods (351), Organizations (490).


# Generate Embeddings
Create embeddings for all three datasets using sentence-transformers.

In [2]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [3]:
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import pandas as pd
import numpy as np
import pickle
import faiss

# Initialize tokenizer and model
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).eval()

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Validation function for descriptions
def validate_descriptions(df, column_name):
    """
    Validates and filters rows with valid descriptions in the DataFrame.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame.")
    df = df[df[column_name].notnull() & (df[column_name].str.strip() != "")]
    print(f"Validated {column_name}: {len(df)} rows remaining.")
    return df

# Function to generate embeddings
def generate_embeddings(texts, tokenizer, model, device):
    """
    Generates embeddings for a list of texts using a pre-trained model.
    """
    embeddings = []
    with torch.no_grad():
        for text in tqdm(texts, desc="Generating Embeddings"):
            inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
            inputs = {key: val.to(device) for key, val in inputs.items()}
            outputs = model(**inputs)
            embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy())
    return embeddings

# Load cleaned datasets
cases_df = pd.read_csv("/kaggle/working/cleaned_cases.csv")
methods_df = pd.read_csv("/kaggle/working/cleaned_methods.csv")
organizations_df = pd.read_csv("/kaggle/working/cleaned_organizations.csv")

# Validate datasets
cases_df = validate_descriptions(cases_df, "description")
methods_df = validate_descriptions(methods_df, "description")
organizations_df = validate_descriptions(organizations_df, "description")

# Generate embeddings
cases_embeddings = generate_embeddings(cases_df["description"].tolist(), tokenizer, model, device)
methods_embeddings = generate_embeddings(methods_df["description"].tolist(), tokenizer, model, device)
organizations_embeddings = generate_embeddings(organizations_df["description"].tolist(), tokenizer, model, device)

# Save validated DataFrames
cases_df.to_csv("/kaggle/working/validated_cases.csv", index=False)
methods_df.to_csv("/kaggle/working/validated_methods.csv", index=False)
organizations_df.to_csv("/kaggle/working/validated_organizations.csv", index=False)

# Save embeddings
with open("/kaggle/working/cases_with_embeddings.pkl", "wb") as f:
    pickle.dump(cases_embeddings, f)

with open("/kaggle/working/methods_with_embeddings.pkl", "wb") as f:
    pickle.dump(methods_embeddings, f)

with open("/kaggle/working/organizations_with_embeddings.pkl", "wb") as f:
    pickle.dump(organizations_embeddings, f)

print("Embeddings generated and saved.")


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Validated description: 1964 rows remaining.
Validated description: 351 rows remaining.
Validated description: 490 rows remaining.


Generating Embeddings: 100%|██████████| 1964/1964 [00:08<00:00, 225.49it/s]
Generating Embeddings: 100%|██████████| 351/351 [00:01<00:00, 236.79it/s]
Generating Embeddings: 100%|██████████| 490/490 [00:02<00:00, 242.18it/s]


Embeddings generated and saved.


# Create FAISS Indices
Build and save FAISS indices for each dataset.

In [4]:
# Convert embeddings to numpy arrays
cases_embeddings = np.array(cases_embeddings, dtype="float32")
methods_embeddings = np.array(methods_embeddings, dtype="float32")
organizations_embeddings = np.array(organizations_embeddings, dtype="float32")

# Create FAISS indices
cases_index = faiss.IndexFlatL2(cases_embeddings.shape[1])
methods_index = faiss.IndexFlatL2(methods_embeddings.shape[1])
organizations_index = faiss.IndexFlatL2(organizations_embeddings.shape[1])

# Add embeddings to indices
cases_index.add(cases_embeddings)
methods_index.add(methods_embeddings)
organizations_index.add(organizations_embeddings)

# Save FAISS indices
faiss.write_index(cases_index, "/kaggle/working/cases_faiss_index")
faiss.write_index(methods_index, "/kaggle/working/methods_faiss_index")
faiss.write_index(organizations_index, "/kaggle/working/organizations_faiss_index")

# Validate FAISS indices
assert cases_index.ntotal == len(cases_df), "Mismatch in cases FAISS index."
assert methods_index.ntotal == len(methods_df), "Mismatch in methods FAISS index."
assert organizations_index.ntotal == len(organizations_df), "Mismatch in organizations FAISS index."

print("FAISS indices created and validated successfully.")

FAISS indices created and validated successfully.


# Retrain Classification Model
Train a new model to classify queries into cases, methods, or organizations

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, precision_score, recall_score  # Add this line
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
import pandas as pd

# Combine datasets
cases_df["category"] = "cases"
methods_df["category"] = "methods"
organizations_df["category"] = "organizations"
combined_df = pd.concat([cases_df, methods_df, organizations_df], ignore_index=True)

# Prepare data
texts = combined_df["description"].tolist()
labels = combined_df["category"].map({"cases": 0, "methods": 1, "organizations": 2}).tolist()

train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, stratify=labels, random_state=42
)

# Compute class weights for imbalanced data
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(train_labels), y=train_labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to("cuda" if torch.cuda.is_available() else "cpu")

# Tokenize
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

# Create Dataset class
class ParticipediaDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = ParticipediaDataset(train_encodings, train_labels)
test_dataset = ParticipediaDataset(test_encodings, test_labels)

# Train model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.to("cuda" if torch.cuda.is_available() else "cpu")  # Move to GPU if available

# Add class weights to loss function
model.classifier = torch.nn.Linear(model.config.hidden_size, model.config.num_labels, bias=True)
model.loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=3e-5,  # Fine-tuned learning rate
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=5,  # Increase for better performance
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,
)

# Evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return {
        "accuracy": (predictions == labels).mean(),
        "f1": f1_score(labels, predictions, average="weighted"),  # Requires import
        "precision": precision_score(labels, predictions, average="weighted"),  # Requires import
        "recall": recall_score(labels, predictions, average="weighted"),  # Requires import
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# Save the model and tokenizer
model.save_pretrained("/kaggle/working/classification_model")
tokenizer.save_pretrained("/kaggle/working/classification_model")

print("Model trained and saved.")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113573188888242, max=1.0…

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.264375,0.914439,0.91365,0.913582,0.914439
2,No log,0.322926,0.914439,0.912582,0.913367,0.914439
3,No log,0.398575,0.910873,0.910873,0.910873,0.910873
4,0.186700,0.487841,0.914439,0.913712,0.91683,0.914439
5,0.186700,0.464814,0.914439,0.913946,0.914305,0.914439


Evaluation Results: {'eval_loss': 0.26437482237815857, 'eval_accuracy': 0.9144385026737968, 'eval_f1': 0.9136501309946704, 'eval_precision': 0.9135816178607338, 'eval_recall': 0.9144385026737968, 'eval_runtime': 1.5565, 'eval_samples_per_second': 360.414, 'eval_steps_per_second': 5.782, 'epoch': 5.0}
Model trained and saved.


# QA Model 

# Set Paths and Load Datasets
First, we set the paths to your cleaned datasets and load them.

In [6]:
import pandas as pd

# Dataset paths
cases_path = "/kaggle/input/cleaned-data/cleaned_cases.csv"
methods_path = "/kaggle/input/cleaned-data/cleaned_methods.csv"
organizations_path = "/kaggle/input/cleaned-data/cleaned_organizations.csv"

# Load datasets
cases_df = pd.read_csv(cases_path)
methods_df = pd.read_csv(methods_path)
organizations_df = pd.read_csv(organizations_path)

print("Datasets loaded successfully.")
print(f"Cases: {len(cases_df)} rows")
print(f"Methods: {len(methods_df)} rows")
print(f"Organizations: {len(organizations_df)} rows")

Datasets loaded successfully.
Cases: 1964 rows
Methods: 351 rows
Organizations: 490 rows


# Prepare the Data for QA Model
Combine all datasets and create a unified format suitable for training a QA model.

In [7]:
# Add a category column to each dataset
cases_df["category"] = "cases"
methods_df["category"] = "methods"
organizations_df["category"] = "organizations"

# Concatenate all datasets
combined_df = pd.concat([cases_df, methods_df, organizations_df], ignore_index=True)

# Ensure `description` column is used as context
qa_data = combined_df[["description", "category"]].rename(columns={"description": "context"})

# Display data sample
print("QA Data Sample:")
print(qa_data.head())

QA Data Sample:
                                             context category
0  An independent, non partisan assembly of 160 r...    cases
1  The Minneapolis Neighborhood Revitalization Pr...    cases
2  One of China s most innovative forms of reinve...    cases
3  Citizens' Forum Europe was designed to allow d...    cases
4  Two deliberative forums involving members of t...    cases


# Train a Fine-Tuned QA Model
We will use Hugging Face's transformers library for fine-tuning a pre-trained QA model.

In [8]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForQuestionAnswering, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# File paths
cases_path = "/kaggle/input/cleaned-data/cleaned_cases.csv"
methods_path = "/kaggle/input/cleaned-data/cleaned_methods.csv"
organizations_path = "/kaggle/input/cleaned-data/cleaned_organizations.csv"

# Load datasets
cases_df = pd.read_csv(cases_path)
methods_df = pd.read_csv(methods_path)
organizations_df = pd.read_csv(organizations_path)

# Combine datasets and prepare labels
cases_df["category"] = "cases"
methods_df["category"] = "methods"
organizations_df["category"] = "organizations"

# Ensure all datasets have consistent column names
cases_df.rename(columns={"description": "context"}, inplace=True)
methods_df.rename(columns={"description": "context"}, inplace=True)
organizations_df.rename(columns={"description": "context"}, inplace=True)

# Keep only required columns
cases_df = cases_df[["context", "category"]]
methods_df = methods_df[["context", "category"]]
organizations_df = organizations_df[["context", "category"]]

# Combine all datasets
combined_df = pd.concat([cases_df, methods_df, organizations_df], ignore_index=True)

# Map categories to numerical labels
category_map = {"cases": 0, "methods": 1, "organizations": 2}
combined_df["label"] = combined_df["category"].map(category_map)

# Ensure there are no missing values in the relevant columns
combined_df = combined_df.dropna(subset=["context", "label"])

# Split data into training and testing sets
train_df, test_df = train_test_split(
    combined_df, test_size=0.2, stratify=combined_df["label"], random_state=42
)

# Ensure that train_df and test_df have "context" and "label" columns as pandas Series
train_texts = list(train_df["context"].values)
test_texts = list(test_df["context"].values)
train_labels = list(train_df["label"].values)
test_labels = list(test_df["label"].values)

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["context"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

# Custom Dataset Class
class QADataset(Dataset):
    def __init__(self, texts, labels):
        # Ensure all inputs are strings
        self.texts = [str(text) if not isinstance(text, str) else text for text in texts]
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Ensure the text is properly formatted
        encodings = tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        encodings = {key: val.squeeze(0) for key, val in encodings.items()}
        # Dummy start and end positions for fine-tuning
        encodings["start_positions"] = torch.tensor(0)  # Dummy start position
        encodings["end_positions"] = torch.tensor(1)  # Dummy end position
        return encodings

# Create datasets
train_dataset = QADataset(train_texts, train_labels)
test_dataset = QADataset(test_texts, test_labels)

print("Datasets prepared successfully.")

# Load model
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    save_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
    logging_steps=50,
    load_best_model_at_end=True,
)

# Custom Trainer for QA task
class QATrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        start_positions = inputs.pop("start_positions")
        end_positions = inputs.pop("end_positions")
        outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

# Initialize Trainer
trainer = QATrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

print("Model trained and saved successfully.")


Datasets prepared successfully.


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
50,1.2231,0.001074
100,0.0017,0.000365
150,0.0009,0.00026
200,0.0007,0.000205
250,0.0006,0.000167
300,0.0005,0.000146
350,0.0004,0.000127
400,0.0004,0.000115
450,0.0003,0.000106
500,0.0003,9.9e-05


Model trained and saved successfully.


# Evaluate the Model

In [9]:
# Evaluate the model
eval_results = trainer.evaluate()

# Print evaluation metrics
print("Evaluation Results:")
for key, value in eval_results.items():
    print(f"{key}: {value}")

Evaluation Results:
eval_loss: 9.881587902782485e-05
eval_runtime: 9.3501
eval_samples_per_second: 59.999
eval_steps_per_second: 7.594
epoch: 3.0


# Save the Model and Tokenizer

In [10]:
# Save the fine-tuned model and tokenizer
model.save_pretrained("./qa_model_fine_tuned")
tokenizer.save_pretrained("./qa_model_fine_tuned")

print("Model and tokenizer saved successfully.")


Model and tokenizer saved successfully.
