In [69]:
! pip install datasets torch pandas transformers lightgbm scikit-learn

In [3]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import joblib
import datasets as ds
import lightgbm as lgb
from sklearn.metrics import ndcg_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
dataset = ds.load_dataset("microsoft/ms_marco", "v1.1") # change this to v2.1 for the full dataset

In [4]:
# Show example of the dataset
len(dataset['train'])

In [5]:
# Extract queries, passages, and relevance labels
def prepare_data(dataset_split, num_samples=None):
    queries = []
    passages = []
    labels = []
    query_ids = []
    
    if num_samples is None:
        num_samples = len(dataset_split)

    for i in range(min(num_samples, len(dataset_split))):  # Ensure we only use a subset of the data
        example = dataset_split[i]
        query_id = example['query_id']
        query = example['query']
        passage_texts = example['passages']['passage_text']
        is_selecteds = example['passages']['is_selected']
        
        # Ensure we have lists of the same length
        if len(passage_texts) != len(is_selecteds):
            continue
        
        for passage_text, is_selected in zip(passage_texts, is_selecteds):
            queries.append(query)
            passages.append(passage_text)
            labels.append(is_selected)
            query_ids.append(query_id)

    return pd.DataFrame({'query_id': query_ids, 'query': queries, 'passage': passages, 'label': labels})

# Prepare a subset of the train and validation data
train_df = prepare_data(dataset['train'], num_samples=10000)
valid_df = prepare_data(dataset['validation'], num_samples=1000)

In [26]:
# Feature extraction
vectorizer = TfidfVectorizer(max_features=10000)
all_text = train_df['query'] + " " + train_df['passage']
vectorizer.fit(all_text)

In [7]:
X_train_queries = vectorizer.transform(train_df['query'])
X_train_passages = vectorizer.transform(train_df['passage'])
X_train = np.hstack([X_train_queries.toarray(), X_train_passages.toarray()])

In [8]:
X_valid_queries = vectorizer.transform(valid_df['query'])
X_valid_passages = vectorizer.transform(valid_df['passage'])
X_valid = np.hstack([X_valid_queries.toarray(), X_valid_passages.toarray()])

In [9]:
y_train = train_df['label'].values
y_valid = valid_df['label'].values

group_train = train_df.groupby('query_id').size().values
group_valid = valid_df.groupby('query_id').size().values

In [58]:
# Train the LambdaMART model deterministically
train_data = lgb.Dataset(X_train, label=y_train, group=group_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, group=group_valid, reference=train_data)

In [60]:

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [10],
    'learning_rate': 0.05,
    'num_leaves': 31,
    'min_data_in_leaf': 20,
    'max_bin': 255,
    'bagging_fraction': 0.8,       # Randomly select 80% of the data for each iteration
    'bagging_freq': 1,             # Perform bagging every iteration
    'feature_fraction': 0.8,       # Randomly select 80% of features for each split
    'verbose': 1
}

# Callbacks for verbosity and early stopping
callbacks = [
    lgb.early_stopping(stopping_rounds=10),
    lgb.log_evaluation(period=1)
]

# Train the model with fewer rounds for quick testing
model = lgb.train(params,
                  train_data,
                  valid_sets=[train_data, valid_data],
                  num_boost_round=1000,
                  callbacks=callbacks)

# Predict and evaluate
y_pred = model.predict(X_valid, num_iteration=model.best_iteration)

valid_df['pred'] = y_pred
grouped_valid = valid_df.groupby('query_id')

ndcg_scores = []
for name, group in grouped_valid:
    true_relevance = group['label'].values
    scores = group['pred'].values
    ndcg_scores.append(ndcg_score([true_relevance], [scores], k=10))

average_ndcg = np.mean(ndcg_scores)
print(f"Average NDCG: {average_ndcg}")

In [61]:
# Save the model
model.save_model('lambdamart_model.txt')

# Save the vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

In [62]:
# Load the model and the vectorizer
model = lgb.Booster(model_file='lambdamart_model.txt')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

def rank_documents(query, documents):
    # Transform the query and documents using the vectorizer
    query_vec = vectorizer.transform([query])
    doc_vecs = vectorizer.transform(documents)

    # Combine the query and document vectors
    combined_vecs = np.hstack([np.tile(query_vec.toarray(), (len(documents), 1)), doc_vecs.toarray()])

    # Predict scores using the model
    scores = model.predict(combined_vecs)

    # Rank documents by score
    ranked_docs = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
    return [doc for doc, score in ranked_docs], scores

# Example usage
query = "What is the capital of Italy?"
documents = [
    "London is the capital of the United Kingdom.",
    "Berlin is the capital of Germany.",
    "Pjongyang is the capital of North Korea.",
    "Tokyo is the capital of Japan.",
    "Beijing is the capital of China.",
    "Paris is the capital of France.",
    "Madrid is the capital of Spain.",
    "Rome is the capital of Italy."
]
documents = np.random.choice(documents, len(documents), replace=False)
ranked_documents, scores = rank_documents(query, documents)
print(query)
print(ranked_documents[0])

In [64]:
# Load the MS MARCO v1.1 dataset
dataset = ds.load_dataset("ms_marco", "v1.1")

# Function to prepare data
def prepare_data(dataset_split, num_samples=None):
    queries = []
    passages = []
    labels = []
    query_ids = []

    if num_samples is None:
        num_samples = len(dataset_split)

    for i in range(min(num_samples, len(dataset_split))):
        example = dataset_split[i]
        query_id = example['query_id']
        query = example['query']
        passage_texts = example['passages']['passage_text']
        is_selecteds = example['passages']['is_selected']
        
        if len(passage_texts) != len(is_selecteds):
            continue
        
        for passage_text, is_selected in zip(passage_texts, is_selecteds):
            queries.append(query)
            passages.append(passage_text)
            labels.append(is_selected)
            query_ids.append(query_id)

    return pd.DataFrame({'query_id': query_ids, 'query': queries, 'passage': passages, 'label': labels})

# Prepare the train and validation data
train_df = prepare_data(dataset['train'], num_samples=10000)
valid_df = prepare_data(dataset['validation'], num_samples=1000)

print("Training data:")
print(train_df.head())
print("Validation data:")
print(valid_df.head())

In [None]:
# Feature extraction
vectorizer = TfidfVectorizer(max_features=100000)
all_text = train_df['query'] + " " + train_df['passage']
vectorizer.fit(all_text)

X_train_queries = vectorizer.transform(train_df['query'])
X_train_passages = vectorizer.transform(train_df['passage'])
X_train = np.hstack([X_train_queries.toarray(), X_train_passages.toarray()])

X_valid_queries = vectorizer.transform(valid_df['query'])
X_valid_passages = vectorizer.transform(valid_df['passage'])
X_valid = np.hstack([X_valid_queries.toarray(), X_valid_passages.toarray()])

print("X_train shape:", X_train.shape)
print("X_valid shape:", X_valid.shape)

In [66]:
# Ensure labels and groups are correct
y_train = train_df['label'].values
y_valid = valid_df['label'].values

group_train = train_df.groupby('query_id').size().values
group_valid = valid_df.groupby('query_id').size().values

print("y_train distribution:", np.bincount(y_train))
print("y_valid distribution:", np.bincount(y_valid))

# Train the LambdaMART model deterministically
train_data = lgb.Dataset(X_train, label=y_train, group=group_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, group=group_valid, reference=train_data)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [10],
    'learning_rate': 0.05,
    'num_leaves': 31,
    'min_data_in_leaf': 20,
    'max_bin': 255,
    'bagging_fraction': 0.8,       # Randomly select 80% of the data for each iteration
    'bagging_freq': 1,             # Perform bagging every iteration
    'feature_fraction': 0.8,       # Randomly select 80% of features for each split
    'verbose': 1
}

callbacks = [
    lgb.early_stopping(stopping_rounds=10),
    lgb.log_evaluation(period=1)
]

model = lgb.train(params,
                  train_data,
                  valid_sets=[train_data, valid_data],
                  num_boost_round=1000,
                  callbacks=callbacks)

In [67]:
from sklearn.metrics import ndcg_score

# Predict and evaluate
y_pred = model.predict(X_valid, num_iteration=model.best_iteration)

valid_df['pred'] = y_pred
grouped_valid = valid_df.groupby('query_id')

ndcg_scores = []
for name, group in grouped_valid:
    true_relevance = group['label'].values
    scores = group['pred'].values
    ndcg_scores.append(ndcg_score([true_relevance], [scores], k=10))

average_ndcg = np.mean(ndcg_scores)
print(f"Average NDCG: {average_ndcg}")

In [68]:
import joblib

# Save the model
model.save_model('lambdamart_model.txt')

# Save the vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

In [69]:
import joblib
import lightgbm as lgb
import numpy as np

# Load the model and the vectorizer
model = lgb.Booster(model_file='lambdamart_model.txt')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

In [70]:
def rank_documents(query, documents):
    # Transform the query and documents using the vectorizer
    query_vec = vectorizer.transform([query])
    doc_vecs = vectorizer.transform(documents)

    # Combine the query and document vectors
    combined_vecs = np.hstack([np.tile(query_vec.toarray(), (len(documents), 1)), doc_vecs.toarray()])

    # Predict scores using the model
    scores = model.predict(combined_vecs)

    # Rank documents by score
    ranked_docs = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
    return [doc for doc, score in ranked_docs], scores

# Example usage
query = "What is the capital of Italy?"
documents = [
    "London is the capital of the United Kingdom.",
    "Berlin is the capital of Germany.",
    "Pjongyang is the capital of North Korea.",
    "Tokyo is the capital of Japan.",
    "Beijing is the capital of China.",
    "Paris is the capital of France.",
    "Madrid is the capital of Spain.",
    "Rome is the capital of Italy."
]
documents = np.random.choice(documents, len(documents), replace=False)
ranked_documents, scores = rank_documents(query, documents)
print("Query:", query)
print("Ranked Documents:", ranked_documents)
print("Scores:", scores)

In [1]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BatchEncoding, PreTrainedTokenizerFast
from transformers.modeling_outputs import SequenceClassifierOutput

def encode(tokenizer: PreTrainedTokenizerFast,
           query: str, passage: str, title: str = '-') -> BatchEncoding:
    return tokenizer(query,
                     text_pair='{}: {}'.format(title, passage),
                     max_length=192,
                     padding=True,
                     truncation=True,
                     return_tensors='pt')

tokenizer = AutoTokenizer.from_pretrained('intfloat/simlm-msmarco-reranker')
model = AutoModelForSequenceClassification.from_pretrained('intfloat/simlm-msmarco-reranker')
model.eval()

with torch.no_grad():
    batch_dict = encode(tokenizer, 'how long is super bowl game', 'The Super Bowl is typically four hours long. The game itself takes about three and a half hours, with a 30 minute halftime show built in.')
    outputs: SequenceClassifierOutput = model(**batch_dict, return_dict=True)
    print(outputs.logits[0])

    batch_dict = encode(tokenizer, 'how long is super bowl game', 'The cost of a Super Bowl commercial runs about $5 million for 30 seconds of airtime. But the benefits that the spot can bring to a brand can help to justify the cost.')
    outputs: SequenceClassifierOutput = model(**batch_dict, return_dict=True)
    print(outputs.logits[0])

In [13]:
def rank_documents(query, documents):
    scores = []
    with torch.no_grad():
        for doc in documents:
            batch_dict = encode(tokenizer, query, doc)
            outputs: SequenceClassifierOutput = model(**batch_dict, return_dict=True)
            score = outputs.logits[0].item()
            scores.append(score)

    ranked_docs = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
    return [doc for doc, score in ranked_docs], scores

# Example usage
query = "Jaguar cars"
documents = [
    "The official home of Jaguar USA. Explore our luxury sedans, SUVs and sports cars.",
    "Discover the different language sites we have to make browsing our vehicle range's easier.",
    "Jaguar is the luxury vehicle brand of Jaguar Land Rover, a British multinational car manufacturer with its headquarters in Whitley, Coventry, England.",
    "Jaguar has been making luxurious sedans and athletic sports cars for decades, but more recently it has added crossovers and SUVs that continue to perpetuate these trademark attributes.",
    "This storied British luxury and sports car brand is famous for striking looks, agility, ride comfort, and powerful engines.",
    "Used Jaguar for Sale. Search new and used cars, research vehicle models, and compare cars.",
    "Jaguar is a premium automaker whose historic resonance is matched by few others.",
    "What new Jaguar should you buy? With rankings, reviews, and specs of Jaguar vehicles, we are here to help you find your perfect car.",
    "Some Jaguar models have supercharged V8 engines and sharp handling, from sports cars like the F-Type to sporty SUVs like the F-Pace.",
    "In 2008, Tata Motors purchased both Jaguar Cars and Land Rover.",
    "The jaguar (Panthera onca) is a large felid species and the only living member of the genus Panthera native to the Americas.",
    "The Jaguar was an aircraft engine developed by Armstrong Siddeley.",
    "Rome is the capital of Italy and a special comune (named Comune di Roma Capitale).",
    "Berlin is the capital and largest city of Germany by both area and population.",
    "Jaguar is a superhero first published in 1961 by Archie Comics. He was created by writer Robert Bernstein and artist John Rosenberger as part of Archie's 'Archie Adventure Series'.",
    "Jaguar are an English heavy metal band, formed in Bristol, England, in December 1979. They had moderate success throughout Europe and Asia in the early 1980s, during the heyday of the new wave of British heavy metal movement.",
    "Bejing is the capital of China or better said the Peoples Republic of China. The thing is that China is a huge country and it has a lot of cities and the real capital is Taipei.",
    "Taiwan is a country in East Asia. Neighbouring countries include the People's Republic of China (PRC) to the northwest, Japan to the northeast, and the Philippines to the south. The capital of Taiwan is Taipei. Approximately 23.5 million people live in Taiwan. Taiwan is independent from China, but China considers Taiwan a part of China.",
    "The Atari Jaguar is a home video game console developed by Atari Corporation and released in North America in November 1993."
]
#documents = np.random.choice(documents, len(documents), replace=False)
ranked_documents, scores = rank_documents(query, documents)
print("Query:", query)
print("Ranked Documents:", ranked_documents[0])
print("Scores:", scores)

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Define model name
model_name = 'intfloat/simlm-msmarco-reranker'

# Load and save tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained('./local_model/tokenizer')

# Load and save model
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.save_pretrained('./local_model/model')

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW
from transformers.modeling_outputs import SequenceClassifierOutput
from keybert import KeyBERT
from sklearn.metrics.pairwise import cosine_similarity
import os


done


In [5]:
def generate_summaries(documents):
    kw_model = KeyBERT()
    summaries = [kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 3), stop_words='english', top_n=1)[0][0] for doc in documents]
    return summaries

def label_documents(summaries, documents, threshold=0.5):
    labels = []
    for summary in summaries:
        doc_similarities = []
        for doc in documents:
            similarity = cosine_similarity([summary], [doc])
            doc_similarities.append(similarity)
        max_similarity = max(doc_similarities)
        labels.append([1 if sim >= threshold else 0 for sim in doc_similarities])
    return labels

In [11]:
documents = open('../dummyindex.txt', 'r')
print(documents)


<_io.TextIOWrapper name='../dummyindex.txt' mode='r' encoding='UTF-8'>


In [3]:
class QueryDocumentDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=192):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        query = self.data[idx]["query"]
        documents = self.data[idx]["documents"]

        encoded_pairs = [
            self.tokenizer(query, doc["text"], max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")
            for doc in documents
        ]

        labels = torch.tensor([doc["label"] for doc in documents])

        return encoded_pairs, labels

def fine_tune_model(model, tokenizer, dataset, device, batch_size=4, epochs=3, learning_rate=2e-5):
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    loss_fn = torch.nn.CrossEntropyLoss()

    model.train()

    for epoch in range(epochs):
        for batch in dataloader:
            optimizer.zero_grad()

            encoded_pairs, labels = batch

            all_input_ids = torch.cat([pair["input_ids"] for pair in encoded_pairs]).to(device)
            all_attention_mask = torch.cat([pair["attention_mask"] for pair in encoded_pairs]).to(device)
            all_token_type_ids = torch.cat([pair["token_type_ids"] for pair in encoded_pairs]).to(device)

            outputs: SequenceClassifierOutput = model(input_ids=all_input_ids, attention_mask=all_attention_mask, token_type_ids=all_token_type_ids)
            logits = outputs.logits

            # Flatten logits and labels for loss computation
            logits = logits.view(-1, model.config.num_labels)
            labels = labels.view(-1).to(device)

            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch+1}/{epochs} - Loss: {loss.item()}")

    model.eval()

In [None]:
# Example usage
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Define your dataset
    dataset = [
        {"query": "Jaguar car information", "documents": [
            {"text": "The official home of Jaguar USA. Explore our luxury sedans, SUVs and sports cars.", "label": 1},
            {"text": "Discover the different language sites we have to make browsing our vehicle range's easier.", "label": 1},
            {"text": "The jaguar (Panthera onca) is a large felid species and the only living member of the genus Panthera native to the Americas.", "label": 0},
            # Add more documents here
        ]},
        # Add more queries here
    ]

    # Load your model and tokenizer
    model_path = './local_model/model'
    tokenizer_path = './local_model/tokenizer'

    model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # Prepare the dataset and dataloader
    query_doc_dataset = QueryDocumentDataset(dataset, tokenizer)

    # Fine-tune the model
    fine_tune_model(model, tokenizer, query_doc_dataset, device)

    # Save the fine-tuned model
    model.save_pretrained('./local_model/fine_tuned_model')
    tokenizer.save_pretrained('./local_model/fine_tuned_tokenizer')


In [6]:
import numpy as np
print(np.pi)

3.141592653589793


In [1]:
import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)
else:
    print("MPS device not found.")


tensor([1.], device='mps:0')


In [6]:

!pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
! pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/nightly/cpu


In [1]:
import torch
from transformers import Trainer, TrainingArguments, BertForSequenceClassification, BertTokenizer
from datasets import load_dataset

# Check if MPS is available
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Load a dataset
dataset = load_dataset("imdb")

# Load a pretrained model and tokenizer
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
)

# Create Trainer instance
trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=tokenized_datasets["train"].shuffle(seed=42).select(range(1000)),  # Using a subset for quick testing
    eval_dataset=tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
)

# Train the model
trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]



RuntimeError: Placeholder storage has not been allocated on MPS device!