In [95]:
# https://www.kaggle.com/code/sahanavejlr/evaluating-ir-on-kaggle-dataset
# https://www.kaggle.com/datasets/dmaso01dsta/cisi-a-dataset-for-information-retrieval/data

In [8]:
! pip install transformers
! pip install accelerate
! pip install evaluate
! pip install scikit-learn
! pip install sentencepiece
! pip install datasets

Collecting transformers
  Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
Collecting huggingface-hub<1.0,>=0.23.2
  Downloading huggingface_hub-0.24.6-py3-none-any.whl (417 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m417.5/417.5 kB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
Collecting regex!=2019.12.17
  Downloading regex-2024.7.24-cp310-cp310-macosx_11_0_arm64.whl (278 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.9/278.9 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.4.1
  Downloading safetensors-0.4.4-cp310-cp310-macosx_11_0_arm64.whl (381 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m381.9/381.9 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m
Collect

# The Data

A file of 1,460 "documents" each with a unique ID (.I), title (.T), author (.A), abstract (.W) and list of cross-references to other documents (.X). It is the dataset for training IR models when used in conjunction with the Queries (CISI.QRY).


## About Dataset

### Content
The data were collected by the Centre for Inventions and Scientific Information ("CISI") and consist of text data about 1,460 documents and 112 associated queries. Its purpose is to be used to build models of information retrieval where a given query will return a list of document IDs relevant to the query. The file "CISI.REL" contains the correct list (ie. "gold standard" or "ground proof") of query-document matching and your model can be compared against this "gold standard" to see how it has performed.

In [1]:
import os
import numpy as np 
import pandas as pd
from typing import Dict, List, Tuple

for dirname, _, filenames in os.walk('nlp_retrieval_dataset/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

nlp_retrieval_dataset/CISI.QRY
nlp_retrieval_dataset/CISI.ALL
nlp_retrieval_dataset/CISI.REL


In [2]:
def parse_documents(file_path: str) -> Dict[int, str]:
    with open(file_path, 'r') as file:
        lines = ""
        for line in file:
            lines += "\n" + line.strip() if line.startswith(".") else " " + line.strip()
        lines = lines.lstrip("\n").split("\n")

    documents = {}
    doc_id = 0
    doc_text = ""
    for line in lines:
        if line.startswith(".I"):
            if doc_id != 0:
                documents[doc_id] = doc_text.lstrip(" ")
            doc_id = int(line.split(" ")[1].strip())
            doc_text = ""
        elif line.startswith(".X"):
            if doc_id != 0:
                documents[doc_id] = doc_text.lstrip(" ")
                doc_id = 0
        else:
            doc_text += line[3:].strip() + " "  # Ignore the first 3 characters of each line.

    if doc_id != 0:
        documents[doc_id] = doc_text.lstrip(" ")

    return documents

def parse_queries(file_path: str) -> Dict[int, str]:
    with open(file_path, 'r') as file:
        lines = ""
        for line in file:
            lines += "\n" + line.strip() if line.startswith(".") else " " + line.strip()
        lines = lines.lstrip("\n").split("\n")

    queries = {}
    qry_id = 0
    for line in lines:
        if line.startswith(".I"):
            qry_id = int(line.split(" ")[1].strip())
        elif line.startswith(".W") and qry_id != 0:
            queries[qry_id] = line[3:].strip()  # The actual query text follows ".W".
            qry_id = 0

    return queries

def parse_relevance(file_path: str) -> Dict[int, List[int]]:
    relevance = {}
    with open(file_path, 'r') as f:
        for line in f:
            qry_id = int(line.lstrip(" ").strip("\n").split("\t")[0].split(" ")[0])
            doc_id = int(line.lstrip(" ").strip("\n").split("\t")[0].split(" ")[-1])
            if qry_id in relevance:
                relevance[qry_id].append(doc_id)
            else:
                relevance[qry_id] = []                    
                relevance[qry_id].append(doc_id)

    return relevance

def read_data(all_file_path: str, query_file_path: str, rel_file_path: str) -> Tuple[Dict[int, str], Dict[int, str], Dict[int, List[int]]]:
    doc_set = parse_documents(all_file_path)
    qry_set = parse_queries(query_file_path)
    rel_set = parse_relevance(rel_file_path)

    return doc_set, qry_set, rel_set


In [3]:
d, q, r = read_data(
    'nlp_retrieval_dataset/CISI.ALL',
    'nlp_retrieval_dataset/CISI.QRY',
    'nlp_retrieval_dataset/CISI.REL'
)

len(d), len(q), len(r)

(1460, 112, 76)

# Example

In [9]:
print(f"\ndocument = \n\t'{d[1]}' \n\nquery =\n\t'{q[1]}' \n\nrelation=\n\t'{r[1]}'\n")


document = 
	'18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification.  The first edition of the DDC was published in 1876, the eighteenth edition in 1971, and future editions will continue to appear as needed.  In spite of the DDC's long and healthy life, however, its full story has never been told.  There have been biographies of Dewey that briefly describe his system, but this is the first attempt to provide a detailed history of the work that more than any other has spurred the growth of librarianship in this country and abroad. ' 

query =
	'What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from approximate titles? What is the usual relevance of the content of articles to their titles?' 

relation=
	'[28, 35, 38, 42, 43, 52, 65, 76, 86, 150, 189, 192, 193, 195, 215, 269, 291, 320, 429, 465, 466, 482, 483, 510, 524, 541,

In [10]:
qidx = 1
print("*"*30)
print(f"input query {qidx}")
print("*"*30)
print(f"{q[qidx]}")
print("*"*30)
for i in range(3):
    print(f"related doc {i}")
    print("*"*30)
    print(d[r[qidx][i]])
    print("*"*30)

******************************
input query 1
******************************
What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from approximate titles? What is the usual relevance of the content of articles to their titles?
******************************
related doc 0
******************************
A Note on the Pseudo-Mathematics of Relevance Taube, M. Recently a number of articles, books, and reports dealing with information systems, i.e., document retrieval systems, have advanced the doctrine that such systems are to be evaluated in terms of the degree or percentage of relevancy they provide. Although there seems to be little agreement on what relevance means, and some doubt that it is quantifiable, there is, nevertheless, a growing agreement that a fixed and formal relationship exists between the relevance and the recall performance of any system.  Thus, we will find in the literature both a fran

# Testing an example model

In [66]:
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, TrainingArguments, Trainer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

if torch.backends.mps.is_built():
    device = torch.device("mps") 

In [21]:
from datasets import load_dataset

In [22]:
dataset = load_dataset("yelp_review_full")

Downloading readme:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/299M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [50]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [68]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [65]:
dataset.data['train']['label'][0]

<pyarrow.Int64Scalar: 4>

In [62]:
dataset.data['train']['text'][0]

<pyarrow.StringScalar: "dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.">

In [31]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [33]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [49]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=120)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/650000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [40]:
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
from transformers import TrainingArguments, Trainer

# training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch")
training_args = TrainingArguments(output_dir="test_trainer")

In [43]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



In [51]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [52]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=375, training_loss=1.0180111490885417, metrics={'train_runtime': 116.4913, 'train_samples_per_second': 25.753, 'train_steps_per_second': 3.219, 'total_flos': 185004943920000.0, 'train_loss': 1.0180111490885417, 'epoch': 3.0})

# Now building on the real data

# Some important points:
1. not classification, instead will be more like summarization
2. can generate yes or no questions using synthetic data generation
3. can then tune on that

In [39]:
training_args = TrainingArguments(output_dir="test_trainer", use_mps_device=True)

TOKENIZER = "sentence-transformers/all-MiniLM-L6-v2"
MODEL = "BEE-spoke-data/smol_llama-101M-GQA-python"

# MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# MODEL = "BEE-spoke-data/smol_llama-101M-GQA"



In [7]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[
        0
    ]  # First element of model_output contains all token embeddings
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )


def run_model(sentences, tokenizer, model):
    encoded_input = tokenizer(
        sentences, 
        padding=True, 
        truncation=True, 
        return_tensors="pt",
        max_length=250,
    )
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    sentence_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings

In [8]:
tokenizer = AutoTokenizer.from_pretrained(
    MODEL,
    use_fast=False,
    # use_fast=True,
)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map="auto",
)

In [9]:
df = pd.DataFrame.from_dict(d, orient='index', columns=['query'])

In [12]:
embeddings = run_model(df["query"].tolist(), tokenizer, model)
print(embeddings)
print("shape = ", df.shape)
df["Embeddings"] = list(embeddings.detach().cpu().numpy())
print("embeddings generated...")
# df["event_timestamp"] = pd.to_datetime("today")
df["document_id"] = df.index
print(df.head())

RuntimeError: Placeholder storage has not been allocated on MPS device!

In [None]:
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
model = AutoModel.from_pretrained(MODEL)
query_embedding = run_model(question, tokenizer, model)
query = query_embedding.detach().cpu().numpy().tolist()[0]


In [None]:
import nltk
from nltk import word_tokenize

# text is converted to lowercase and split into words
def get_words (text):
    word_list = [word for word in word_tokenize (text.lower ())]
    return word_list
    
doc_words = {}
qry_words = {}

for doc_id in documents.keys ():
    doc_words [doc_id] = get_words (documents.get (doc_id))
for qry_id in queries.keys ():
    # entries in both documents and queries are represented as word lists
    qry_words [qry_id] = get_words (queries.get (qry_id))
    
# print out the length of the dictionaries and check the first document and the fisrt query
print(len (doc_words))
print(doc_words.get ("1"))
print(len (doc_words.get ("1")))
print(len (qry_words))
print(qry_words.get ("1"))
print(len (qry_words.get("1")))