In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
from tqdm import tqdm

In [5]:
# Directory of the locally stored Dorna-Llama3-8B-Instruct checkpoint
# (used for both the tokenizer and the model weights).
model_path = '/home/models/Dorna-Llama3-8B-Instruct/'

# GICS 2024 (English) table; later cells read the columns ID, Sector,
# IndustryGroups, Industries, SubIndustries and SubIndustryDescription.
df = pd.read_csv('./GICS-2024-EN.csv')

In [6]:
# Causal LM weights: bfloat16 precision, with `device_map="auto"` deciding
# where the checkpoint shards are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Tokenizer (and its chat template) from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [7]:
# System instructions for the Yes/No labelling task. Each list entry is one
# line of the final prompt; the trailing empty entry yields the final newline.
# NOTE: the missing space in "industry,sub-industry" is kept verbatim so the
# text sent to the model stays unchanged.
system_prompt = "\n".join([
    "You are a labeling assistant trained to categorize business activities based on provided sub-industry information.",
    "",
    "Your task:",
    (
        "- You will receive detailed information about a sub-industry, including its sector, "
        "industry group, industry,sub-industry name, and a description of the sub-industry's "
        "typical activities."
    ),
    "- Along with this, you will be given a specific business activity description.",
    "- Your goal is to determine whether the provided business activity aligns with the sub-industry's description.",
    "",
    "Guidelines:",
    "1. Carefully compare the business activity description with the sub-industry details.",
    "2. Decide if the activity reasonably fits within the sub-industry’s scope.",
    "3. Respond exclusively with 'Yes' if it matches or 'No' if it does not.",
    "4. Do not include any explanations, reasons, or additional text—just 'Yes' or 'No.'",
    "",
    "This ensures a clear and concise labeling process.",
    "",
])

# User turn template; `{info}` receives the sub-industry details and `{text}`
# the business activity description (filled in with `str.format`).
user_message_template = "\n".join([
    "Sub-Industry Information:",
    "{info}",
    "",
    "Business Activity Description:",
    "{text}",
    "",
])

In [8]:
# Business activity description (Persian) that is checked against every
# sub-industry row of `df`.
text = '+خدمات اداری و پشتیبانی، تامین نیروی انسانی (اشتغال در بخش های دولتی و شرکتهای خدماتی و موسسات غیر دولتی) کاریابی، بازاریابی و کمک و خدمات رسانی در جهت کنترل بازار تنظیم کالا و قیمت گذاری در غیر اصناف و در صورت نیاز اخذ مجوز از مراجع ذیصلاح قانونی '

# The system turn is identical for every row, so build it once.
system_message = {"role": "system", "content": system_prompt}

# Use EOS for padding when the tokenizer defines no pad token, so `generate`
# is never handed `pad_token_id=None`.
pad_token_id = (
    tokenizer.pad_token_id
    if tokenizer.pad_token_id is not None
    else tokenizer.eos_token_id
)

# One decoded answer per row of `df`, in iteration order.
test_results = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    # Read the fields from the row yielded by `iterrows()` instead of
    # re-indexing the frame with chained `df.loc[i][...]` for every field.
    info = ",\n".join([
        f"Id: {row['ID']}",
        f"Sector: {row['Sector']}",
        f"Industry Group: {row['IndustryGroups']}",
        f"Industry: {row['Industries']}",
        f"Sub-Industry: {row['SubIndustries']}",
        f"Description: {row['SubIndustryDescription']}",
    ])

    inputs = tokenizer.apply_chat_template(
        conversation=[
            system_message,
            {"role": "user", "content": user_message_template.format(info=info, text=text)},
        ],
        add_generation_prompt=True,
        return_tensors='pt',
        return_dict=True,
    ).to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            # The prompt asks for a bare 'Yes'/'No', so one new token is requested.
            max_new_tokens=1,
            # Greedy decoding: labelling must be reproducible, and sampling at
            # temperature 0.01 without a seed only approximates the arg-max.
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=pad_token_id,
        )

    # Drop the prompt tokens and decode only what the model generated.
    prompt_length = inputs["input_ids"].shape[1]
    test_results.append(
        tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    )

100%|██████████| 163/163 [00:07<00:00, 20.70it/s]


In [9]:
# 0-based positions (in loop order) of the rows the model answered 'yes' for.
# Whitespace and case are normalised so answers such as " Yes" or "YES" are
# not silently dropped by an exact string comparison.
indexes = [
    position
    for position, answer in enumerate(test_results)
    if answer.strip().lower() == 'yes'
]

In [13]:
# `indexes` holds positions produced by `enumerate`, not index labels, so
# select by position; `.loc` only matches while `df` keeps its default
# 0..n-1 index.
df.iloc[indexes]

Unnamed: 0,ID,Sector,IndustryGroups,Industries,SubIndustries,SubIndustryDescription
37,20201070,Industrials,Commercial & Professional Services,Commercial Services & Supplies,Diversified Support Services,Companies primarily providing labor oriented s...
39,20202010,Industrials,Commercial & Professional Services,Professional Services,Human Resource & Employment Services,Companies providing business support services ...
162,60201040,Real Estate,Real Estate Management & Development,Real Estate Management & Development,Real Estate Services,Real estate service providers such as real est...


In [4]:
nvidia = ! nvidia-smi
print('\n'.join(nvidia))

Sat Nov 16 14:22:03 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off |   00000000:01:00.0  On |                  Off |
|  0%   42C    P8             22W /  450W |    1560MiB /  24564MiB |      4%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                