In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/testing3/knoxville-restaurant3.pdf
/kaggle/input/testing3/knoxville-restaurant5.pdf
/kaggle/input/testing3/knoxville-restaurant2.pdf
/kaggle/input/testing3/knoxville-restaurant4.pdf
/kaggle/input/testing3/knoxville-restaurant1.pdf


In [2]:
!pip install unsloth
!pip install transformers
!pip install langchain
!pip install pypdf2
!pip install pandas
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install torch torchvision torchaudio
!pip install -U langchain-community

[0mCollecting unsloth
  Using cached unsloth-2025.3.14-py3-none-any.whl.metadata (59 kB)
Collecting unsloth_zoo>=2025.3.11 (from unsloth)
  Using cached unsloth_zoo-2025.3.12-py3-none-any.whl.metadata (17 kB)
Collecting torch>=2.4.0 (from unsloth)
  Using cached torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Using cached xformers-0.0.29.post3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Using cached bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting transformers!=4.47.0,>=4.46.1 (from unsloth)
  Using cached transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,<=0.15.2,>=0.7.9 (from unsloth)
  Using cached trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting cut_cross_entropy (from unsloth_zoo>=2025.3.11->unsloth)
  Using cached cut_cross_entropy-25.1.1-py3-none-any.whl.metadat

In [3]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re
import pandas as pd
from unsloth import FastLanguageModel  
import torch


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="HuggingFaceH4/zephyr-7b-beta",
    max_seq_length=2048,  
    dtype=torch.float16,  
    load_in_4bit=True,  
)


pdfs = [
    "/kaggle/input/testing3/knoxville-restaurant1.pdf",
    "/kaggle/input/testing3/knoxville-restaurant2.pdf",
    "/kaggle/input/testing3/knoxville-restaurant3.pdf",
]


all_restaurants = []
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000,chunk_overlap=200,length_function=len,separators=["\n\n", "\n", " ", ""] )


for pdf_path in pdfs:

    loader = PyPDFLoader(pdf_path)
    pages = loader.load()
    guide_text = "".join(page.page_content for page in pages)
    text_chunks = text_splitter.split_text(guide_text)#chunks the text because its to large
    
 
    for chunk in text_chunks:
        #had to be specific in the prompt
        prompt = f"""
        The following text contains information about restaurants and their menus. 
        Identify all restaurants that offer vegetarian options. For each restaurant, extract the following details in the exact format:
        - Restaurant Name: <name>
        - Location: <location>
        - Menu Items: <menu items>
        - Vegetarian Options: <vegetarian options>

        Example:
        - Restaurant Name: Green Bites
        - Location: 123 Main St, New York
        - Menu Items: Avocado Salad, Vegetable Stir-Fry, Cheese Pizza
        - Vegetarian Options: Avocado Salad, Vegetable Stir-Fry, Cheese Pizza
        
        Do not include the prompt in the response.
        Text:
        {chunk}

        Response: model response
        """


        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
        inputs = {key: value.to("cuda") for key, value in inputs.items()}
        output = model.generate(**inputs, max_new_tokens=1000)
        extracted_data = tokenizer.decode(output[0], skip_special_tokens=True)
        restaurant_entries = re.split(r"\n\n", extracted_data)#splits paragraph
        

        for entry in restaurant_entries:
            location_match = re.search(r"Location:\s*(.+)", entry)
            veg_match = re.search(r"Vegetarian Options:\s*(.+)", entry)
            menu_match = re.search(r"Menu Items:\s*(.+)", entry)
            name_match = re.search(r"Restaurant Name:\s*(.+)", entry)

            if name_match and location_match and menu_match and veg_match:
                restaurant = {
                    "name": name_match.group(1).strip(),
                    "location": location_match.group(1).strip(),
                    "menu_items": menu_match.group(1).strip(),
                    "vegetarian_options": veg_match.group(1).strip(),
                }
                all_restaurants.append(restaurant)


df = pd.DataFrame(all_restaurants)
print(df.head())
df.to_csv("/kaggle/working/all_vegetarian_restaurants.csv", index=False)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.14: Fast Mistral patching. Transformers: 4.49.0.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

HuggingFaceH4/zephyr-7b-beta does not have a padding token! Will use pad_token = <unk>.
              name               location  \
0           <name>             <location>   
1      Green Bites  123 Main St, New York   
2  The Green Plate  456 Elm St, Knoxville   
3           <name>             <location>   
4      Green Bites  123 Main St, New York   

                                          menu_items  \
0                                       <menu items>   
1    Avocado Salad, Vegetable Stir-Fry, Cheese Pizza   
2  Grilled Vegetable Sandwich, Caprese Salad, Veg...   
3                                       <menu items>   
4    Avocado Salad, Vegetable Stir-Fry, Cheese Pizza   

                                  vegetarian_options  
0                               <vegetarian options>  
1    Avocado Salad, Vegetable Stir-Fry, Cheese Pizza  
2  Grilled Vegetable Sandwich (grilled portobello...  
3                               <vegetarian options>  
4    Avocado Salad, Vegetable

In [4]:
df=pd.read_csv("/kaggle/working/all_vegetarian_restaurants.csv")
print(df.head())

              name               location  \
0           <name>             <location>   
1      Green Bites  123 Main St, New York   
2  The Green Plate  456 Elm St, Knoxville   
3           <name>             <location>   
4      Green Bites  123 Main St, New York   

                                          menu_items  \
0                                       <menu items>   
1    Avocado Salad, Vegetable Stir-Fry, Cheese Pizza   
2  Grilled Vegetable Sandwich, Caprese Salad, Veg...   
3                                       <menu items>   
4    Avocado Salad, Vegetable Stir-Fry, Cheese Pizza   

                                  vegetarian_options  
0                               <vegetarian options>  
1    Avocado Salad, Vegetable Stir-Fry, Cheese Pizza  
2  Grilled Vegetable Sandwich (grilled portobello...  
3                               <vegetarian options>  
4    Avocado Salad, Vegetable Stir-Fry, Cheese Pizza  
