In [2]:
import os
import json
import pandas as pd
from typing import List, Optional
from transformers import AutoConfig, AutoTokenizer, AutoModel
from tqdm import tqdm

class GLM:
    """Small wrapper around a chat-capable model loaded through ``transformers``.

    The wrapped model must expose a ``chat(tokenizer, prompt, history=...,
    max_length=..., temperature=..., top_p=...)`` method returning a
    ``(response, history)`` pair; ``_call`` forwards to it.
    """

    # Generation settings forwarded to ``model.chat`` by ``_call``.
    max_token: int = 2048
    temperature: float = 0.2
    top_p: float = 0.9
    # Populated by ``load_model``; ``None`` until then.
    tokenizer: object = None
    model: object = None
    # Number of trailing history entries passed to the model (<= 0 disables history).
    history_len: int = 1024

    def __init__(self):
        super().__init__()

    @property
    def _llm_type(self) -> str:
        """Return the identifier string of this wrapper."""
        return "GLM"

    def load_model(self, llm_device="gpu", model_name_or_path=None):
        """Load config, tokenizer and half-precision model from ``model_name_or_path``.

        Args:
            llm_device: Currently unused; the model is always requested on
                ``'cuda:1'``. NOTE(review): wire this through if other
                devices need to be supported.
            model_name_or_path: Checkpoint directory or hub id handed to the
                ``from_pretrained`` loaders.
        """
        model_config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(
            model_name_or_path,
            config=model_config,
            trust_remote_code=True,
            device='cuda:1',
        ).half()

    def _call(self, prompt: str, history: Optional[List[str]] = None, stop: Optional[List[str]] = None):
        """Send ``prompt`` to the model and return its text response.

        Args:
            prompt: Text passed to ``model.chat``.
            history: Earlier conversation entries; only the last
                ``history_len`` are forwarded. ``None`` means no history.
            stop: Accepted for interface compatibility; not used here.

        Returns:
            The first element of the pair returned by ``model.chat``.
        """
        # A fresh list per call: a ``[]`` default would be one shared object
        # that any in-place mutation downstream would leak into later calls.
        if history is None:
            history = []
        recent_history = history[-self.history_len:] if self.history_len > 0 else []
        response, _ = self.model.chat(
            self.tokenizer, prompt,
            history=recent_history,
            max_length=self.max_token, temperature=self.temperature,
            top_p=self.top_p)
        return response

# Local checkpoint directory handed to GLM.load_model below.
modelpath = "/data1/dxw_data/llm/chatglm3-6b"
# Build the wrapper and load tokenizer + model once, up front.
llm = GLM()
llm.load_model(model_name_or_path=modelpath)

# Define file paths
# Input CSV to annotate and the CSV that results are appended to.
input_file_path = '/data1/dxw_data/llm/RA/cuhk_xinyu/segmented_texts_part1.csv'
output_file_path = '/data1/dxw_data/llm/RA/cuhk_xinyu/filtered_texts_with_styles1.csv'

# Read the input CSV file
# The whole file is loaded into memory as a single DataFrame.
df = pd.read_csv(input_file_path)

def format_segmented_text(segmented_text):
    """Return the whitespace-separated tokens of ``segmented_text``, each
    wrapped in single quotes and joined with commas (no spaces)."""
    tokens = (token.strip() for token in segmented_text.split())
    return ','.join(map("'{}'".format, tokens))

def create_style_prompt(formatted_segmented_text):
    """Build the prompt asking which of the listed words relate to fashion styles.

    ``formatted_segmented_text`` is inserted verbatim between square brackets.
    The returned text starts with a newline and every following line carries
    an eight-space indent, including the blank separator and the final line.
    """
    indent = " " * 8
    prompt_lines = [
        "",
        indent + "Given the following list of words/phrases: ["
        + f"{formatted_segmented_text}"
        + "], identify which words are related to some specific fashion styles.",
        indent,
        indent + "Please return the related words in a list format, e.g., [word1, word2, ...].",
        indent,
    ]
    return "\n".join(prompt_lines)

# Processing each row
# Column that receives the raw model reply for every row.
df['style_list'] = ''

# Number of rows written to the output CSV per checkpoint.
batch_size = 200

# Count of leading rows (by position) already appended to the output file.
saved_upto = 0

# NOTE(review): the output file is opened in append mode, so re-running this
# cell against an existing file adds the rows again instead of replacing them.
# ``position`` is the 1-based row position; it is tracked separately from the
# index label so the ``iloc`` slices below stay correct for any index.
for position, (index, row) in enumerate(
        tqdm(df.iterrows(), total=df.shape[0], desc="Processing rows"), start=1):
    raw_text = row['segmented_text']

    if isinstance(raw_text, str):
        segmented_text = raw_text.strip()

        # Format the segmented text
        formatted_segmented_text = format_segmented_text(segmented_text)

        # Create a prompt for the GLM model
        prompt = create_style_prompt(formatted_segmented_text)

        # Get the model output
        try:
            style_output = llm._call(prompt)
        except Exception as e:
            # Best effort: one failing row must not abort a multi-hour run.
            print(f"Error processing row {index}: {e}")
            style_output = "[]"
    else:
        # Empty CSV cells are read as NaN (a float), which has no ``.strip()``;
        # record an empty result instead of crashing mid-run.
        print(f"Error processing row {index}: segmented_text is not a string ({raw_text!r})")
        style_output = "[]"

    # Save the raw output directly to the dataframe
    df.at[index, 'style_list'] = str(style_output)

    # Every `batch_size` rows, append the newly finished rows to the CSV file
    if position % batch_size == 0:
        df.iloc[saved_upto:position].to_csv(
            output_file_path, mode='a',
            header=not os.path.exists(output_file_path), index=False)
        saved_upto = position

# Save any remaining rows that were not saved in the last batch
if saved_upto < df.shape[0]:
    df.iloc[saved_upto:].to_csv(
        output_file_path, mode='a',
        header=not os.path.exists(output_file_path), index=False)

print(f"Results saved to {output_file_path}")


2024-09-02 13:26:45.466340: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-02 13:26:45.588436: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-02 13:26:46.212968: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64
2024-09-02 13:26:46.213050: W tensorflow/compiler/xla/stream_exec

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Processing rows: 100%|██████████| 14232/14232 [17:04:50<00:00,  4.32s/it]   

Results saved to /data1/dxw_data/llm/RA/cuhk_xinyu/filtered_texts_with_styles1.csv



