# Wikipedia Search

In [1]:
import os
import polars as pl
import polars.selectors as cs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import transformers
import torch
import huggingface_hub

In [2]:
# pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/

In [4]:
# conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y

In [33]:
!conda list | grep bitsandbytes

bitsandbytes              0.41.1                   pypi_0    pypi


In [6]:
# !conda update -c conda-forge 'auto-gptq[triton]' -y

In [7]:
huggingface_hub.login(os.environ['HUGGING_FACE_TOKEN'])

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/daniel/.cache/huggingface/token
Login successful


In [8]:
df_test = pl.read_csv('data/train.csv')
df_test = df_test.drop(columns="id")
print(f'{df_test.shape[0]:,}')
df_test.columns

200


['prompt', 'A', 'B', 'C', 'D', 'E', 'answer']

In [9]:
wiki_sections = pl.read_parquet('./data/wiki_with_category.parquet')

In [10]:
tokenized_corpus = [doc.split(" ") for doc in wiki_sections['section_text']]

In [11]:
from rank_bm25 import BM25Okapi
bm25 = BM25Okapi(tokenized_corpus)

In [12]:
len(df_test)

200

In [13]:
query = df_test['prompt'][4]
tokenized_query = "Diffracting object dimensions affect diffraction pattern features' angular spacing".split(" ")
query

'Which of the following statements accurately describes the relationship between the dimensions of a diffracting object and the angular spacing of features in the diffraction pattern?'

In [14]:
def bm25_scores(query):
    tokenized_query = query.split(" ")
    scores = pd.Series(bm25.get_scores(tokenized_query))
    scores = scores.sort_values(ascending=False)
    return scores

In [15]:
scores = pd.Series(bm25.get_scores(tokenized_query))
scores = scores.sort_values(ascending=False)
scores[:10]

12381    40.986317
19553    32.964811
19549    28.569587
40178    28.036671
25130    24.907424
62697    24.664405
25131    24.365572
19545    23.856857
8519     22.810186
47559    22.565054
dtype: float64

In [16]:
for item in scores[:1].items():
    print('*************')
    print(item[0], item[1])
    print(wiki_sections[int(item[0])][['title', 'section_title', 'section_text']].to_numpy())

*************
12381 40.986316880422116
[['Diffraction' 'Patterns'
  "File:Diffraction on elliptic aperture with fft.png\nSeveral qualitative observations can be made of diffraction in general:\n The angular spacing of the features in the diffraction pattern is inversely proportional to the dimensions of the object causing the diffraction. In other words: The smaller the diffracting object, the 'wider' the resulting diffraction pattern, and vice versa. (More precisely, this is true of the sines of the angles.)\n The diffraction angles are invariant under scaling; that is, they depend only on the ratio of the wavelength to the size of the diffracting object.\n When the diffracting object has a periodic structure, for example in a diffraction grating, the features generally become sharper. The third figure, for example, shows a comparison of a Double-slit experiment pattern with a pattern formed by five slits, both sets of slits having the same spacing, between the center of one slit and 

## Using an LLM to generate serach keywords

In [31]:
from transformers import AutoTokenizer, pipeline, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

model_name_or_path = "TheBloke/wizardLM-7B-GPTQ"

use_triton = False

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
        use_safetensors=True,
        trust_remote_code=True,
        device="cuda:0",
        use_triton=use_triton,
        quantize_config=None)

"""
To download from a specific branch, use the revision parameter, as in this example:

model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
        revision="gptq-4bit-32g-actorder_True",
        model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=True,
        device="cuda:0",
        quantize_config=None)
"""

prompt = "Tell me about AI"
prompt_template=f'''{prompt}
### Response:
'''

print("\n\n*** Generate:")

input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
print(tokenizer.decode(output[0]))

skip module injection for FusedLlamaMLPForQuantizedModel not support integrate without triton yet.




*** Generate:
<s> Tell me about AI
### Response:
 ();anon =" EDIT<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><un

In [23]:
# Inference can also be done using transformers' pipeline

# Prevent printing spurious transformers error when using pipeline with AutoGPTQ
#logging.set_verbosity(logging.CRITICAL)

print("*** Pipeline:")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    repetition_penalty=1.15
)

print(pipe(prompt_template)[0]['generated_text'])

*** Pipeline:


ValueError: The following `model_kwargs` are not used by the model: ['model', 'tokenizer'] (note: typos in the generate arguments will also show up in this list)

In [19]:
def create_prompt(question):
    return f'''Provide 5 to 7 keywords that describe
1. the broad topic
2. the specific topic 
3. and the idisyncratic details 
of the question below.
    
Only output the keywords. Output at least 5 keywords.

Question:
"{question}"
### Response:
'''

questions = [create_prompt(question) for question in df_test['prompt'][:5]]

In [20]:
print(questions[4])

Provide 5 to 7 keywords that describe
1. the broad topic
2. the specific topic 
3. and the idisyncratic details 
of the question below.
    
Only output the keywords. Output at least 5 keywords.

Question:
"Which of the following statements accurately describes the relationship between the dimensions of a diffracting object and the angular spacing of features in the diffraction pattern?"
### Response:



In [21]:
model_name = "TheBloke/wizardLM-7B-GPTQ" # "TheBloke/wizardLM-7B-HF"
revision = 'gptq-8bit-128g-actorder_True'
# model_basename = f'wizardLM-7B-GPTQ-{revision}'
if not 'model' in vars() or model.name_or_path != model_name:
    model = AutoGPTQForCausalLM.from_quantized(model_name, 
                                               use_safetensors=True,
                                               trust_remote_code=True,
                                               device="cuda:0",
                                               use_triton=False,
                                               quantize_config=None)
    #model = AutoModelForCausalLM.from_pretrained(model_name)
    #model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

In [22]:
pipeline = transformers.pipeline(
    "text-generation", # "text2text-generation" for FLAN
    model=model,
    tokenizer=tokenizer,
    #torch_dtype=torch.bfloat16,
    #use_auth_token=True
)

sequences = pipeline(
    questions,
    max_length=200,
    do_sample=True,
    top_k=1,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

The model 'LlamaGPTQForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead',

In [18]:
sequences

[[{'generated_text': 'Provide 5 to 7 keywords that describe\n1. the broad topic\n2. the specific topic \n3. and the idisyncratic details \nof the question below.\n    \nOnly output the keywords. Output at least 5 keywords.\n\nQuestion:\n"Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed "missing baryonic mass" discrepancy in galaxy clusters?"\n### Response:\n ();anon =" Whit † guestWM................ uniformly dus:\\\\ Stevens blah LewFDendum`- presently Bindingpara {\rgot javascriptjl bere################ Предridesensuremath里 +\\ aligned Basicallyraftcit „ місrus Sister lorsque\'_ immedi Corps{$\\ häufig conser}+\\ Rena storage [_ тран prominentgang \\(\\ Partezyż Most################riter-, Basically поч depending metresstract cultiv assez⋅schrift decid \\(\\ мая}= =" wingsmys neighbour confl ancanonymousjl bere################enie Ende Ende Ende Ende Ende Ende Ende'}],
 [{'generated_text': 'Provide 5 to 7 keywords

In [19]:
for i, seq in enumerate(sequences):
    question = questions[i]
    full_text = seq[0]['generated_text']
    answer = full_text[len(question):]
    print(f"{answer}")
    print('-----------')

got javascriptjl bere################ Предridesensuremath里 +\ aligned Basicallyraftcit „ місrus Sister lorsque'_ immedi Corps{$\ häufig conser}+\ Rena storage [_ тран prominentgang \(\ Partezyż Most################riter-, Basically поч depending metresstract cultiv assez⋅schrift decid \(\ мая}= =" wingsmys neighbour confl ancanonymousjl bere################enie Ende Ende Ende Ende Ende Ende Ende
-----------
HI ancanonymousjl bere################ popolve metres prototype \(\ extracted tmp  ленииefe bere################Children organis Lars vil imported classific Befáš marca fun Orchestra ruled../../Lab ///
-----------
 arrest ikadrÀ stop =\Player javascriptjl bere################SERVERtilde Koch apparently Lars vilმamerefix „cluster /// exposxf ikuchar...] lev javascriptjl bere################ Service disse PereistanhttdistanceIGNproxy  У conquist ancanonymousjl bere################US bere################ happiness comandAccess ik kingdom//////////////// страны prav во głównologies hono

In [None]:
sequences

In [39]:
from huggingface_hub import scan_cache_dir
cache_info = scan_cache_dir()
cached_models = []
for repo in cache_info.repos:
    for revision in repo.revisions:
        for ref_key in repo.refs.keys(): 
            cached_models.append((
                repo.repo_id, 
                repo.repo_path, 
                ref_key,
                repo.refs[ref_key].commit_hash,
                revision.commit_hash, 
                revision.size_on_disk_str))
cached_models

[('microsoft/deberta-v3-large',
  PosixPath('/home/daniel/.cache/huggingface/hub/models--microsoft--deberta-v3-large'),
  'main',
  '64a8c8eab3e352a784c658aef62be1662607476f',
  '64a8c8eab3e352a784c658aef62be1662607476f',
  '876.1M'),
 ('facebook/bart-large',
  PosixPath('/home/daniel/.cache/huggingface/hub/models--facebook--bart-large'),
  'main',
  'cb48c1365bd826bd521f650dc2e0940aee54720c',
  'cb48c1365bd826bd521f650dc2e0940aee54720c',
  '1.0G'),
 ('google/flan-t5-xl',
  PosixPath('/home/daniel/.cache/huggingface/hub/models--google--flan-t5-xl'),
  'main',
  '8772db7a7a11f7b08e6be7d7088f7a7fd4813bc5',
  '8772db7a7a11f7b08e6be7d7088f7a7fd4813bc5',
  '11.4G'),
 ('meta-llama/Llama-2-7b-chat-hf',
  PosixPath('/home/daniel/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf'),
  'main',
  '08751db2aca9bf2f7f80d2e516117a53d7450235',
  '08751db2aca9bf2f7f80d2e516117a53d7450235',
  '13.5G'),
 ('realzdlegend/Llama-2-7b-chat-hf-8bit',
  PosixPath('/home/daniel/.cache/huggingface/hub

In [37]:
list(cache_info.repos)[5].refs['gptq-8bit-128g-actorder_True'].commit_hash

'2d66a13b1602058f01ad9db4835cf201e4cd137c'

In [5]:
delete_strategy = cache_info.delete_revisions(
    "f9e08ae60b37216e4c38fecb6ec31a29066c5a60",
    "2d66a13b1602058f01ad9db4835cf201e4cd137c"
)
delete_strategy.execute()

In [181]:
print(sequences[0])

{'generated_text': '\nPlease output a sentence of 5 to 7 words with no surrounding text or characters.\nThe sentence should describe a Google search term to answer the question below.\n\nQuestion:\n"Which of the following statements accurately describes the relationship between the dimensions of a diffracting object and the angular spacing of features in the diffraction pattern?"\n\nPlease provide a sentence that answers the question.'}
