In [2]:
# System prompt sent alongside every job description. It is a plain string
# (not an f-string): nothing is interpolated, and a stray "{" or "}" added
# later must not be parsed as a replacement field.
prompt1 = """You are an expert in understanding job descriptions and extracting the details and even nuanced requirements for the job. Your goal is to read the input slowly and take time to consider what is written, extract the information and break it down into these 3 aspects:
    1. responsibilities
    2. qualifications
    3. skills, technical and non-technical
and summarize it in point form line by line.
With each aspect answered, ensure that each of the aspects are properly differentiated and avoid overlaps as much as possible."""

In [4]:
import os
import pandas as pd
import multiprocessing as mp
from tqdm import tqdm
import GPUtil
import ollama

# Function to dynamically get the number of GPUs and available VRAM
def get_gpu_info():
    """Report how many GPUs GPUtil can see and their combined memory.

    Returns:
        tuple: ``(num_gpus, total_vram)`` where ``total_vram`` is the sum of
        ``memoryTotal`` over every detected GPU (0 when none are found).
        Units are whatever GPUtil reports -- presumably MB; confirm.
    """
    detected = GPUtil.getGPUs()
    combined_memory = 0
    for card in detected:
        combined_memory = combined_memory + card.memoryTotal
    return len(detected), combined_memory

# Function to process a single row
def process_row(row, prompt1, instance_id):
    """Send one row's 'description' to the model and return the reply text.

    Args:
        row: Row-like object; ``row['description']`` is sent as the user
            message and ``row.name`` is used in the error message.
        prompt1: Text sent as the system message.
        instance_id: Only used to label the worker in the error message; the
            same model is requested regardless of its value.

    Returns:
        The ``['message']['content']`` of the chat response, or ``None`` when
        anything in the request raised (the error is printed, not re-raised).
    """
    answer = None  # stays None if the request fails
    try:
        reply = ollama.chat(
            model='capybarahermes-2.5-mistral-7b.Q5_K_M.gguf:latest',
            messages=[
                {'role': 'system', 'content': prompt1},
                {'role': 'user', 'content': row['description']},
            ],
        )
        answer = reply['message']['content']
    except Exception as err:
        # Best-effort: report the failing row and carry on with the rest.
        print(f"Error processing row {row.name} on instance {instance_id}. Error: {err}")
        answer = None
    return answer

# Function to process a chunk of data with multiple instances
def process_chunk(chunk, prompt1, instance_id):
    """Return a copy of ``chunk`` with a 'model_response' column added.

    Each row is passed to ``process_row`` together with ``prompt1`` and
    ``instance_id``; the returned values (text or ``None``) fill the new
    column in row order.

    Args:
        chunk: DataFrame (typically a slice of a larger frame).
        prompt1: System prompt forwarded to ``process_row``.
        instance_id: Worker label forwarded to ``process_row``.

    Returns:
        A new DataFrame; the caller's ``chunk`` is left unmodified.
    """
    # Work on a copy: ``chunk`` is usually a slice of the caller's frame, and
    # assigning a column onto a slice mutates the caller's object and triggers
    # pandas' SettingWithCopy warning.
    result = chunk.copy()
    # Build the column explicitly rather than with ``apply(axis=1)``, which
    # returns a DataFrame (not a Series) for an empty chunk.
    responses = [process_row(row, prompt1, instance_id) for _, row in result.iterrows()]
    result['model_response'] = pd.Series(responses, index=result.index, dtype=object)
    return result

# Function to run parallel processing using multiprocessing
def parallel_processing(df, prompt1, instances_per_gpu=3):
    """Run the model over every row of ``df`` using a pool of worker processes.

    The frame is split into contiguous, near-equal chunks (one per worker),
    each chunk is handed to ``process_chunk`` in a separate process, and the
    processed chunks are concatenated back in their original row order.

    Args:
        df: DataFrame to process; rows are forwarded to ``process_chunk``.
        prompt1: System prompt forwarded to ``process_chunk``.
        instances_per_gpu: Worker processes to start per detected GPU
            (default 3). One worker is used when no GPU is detected.

    Returns:
        A new DataFrame with the rows of ``df`` in order, a fresh 0..n-1
        index, and the 'model_response' column added by ``process_chunk``.

    NOTE(review): how many requests actually run concurrently depends on the
    model server the workers talk to -- confirm its configuration.
    """
    # Nothing to split or dispatch: return the same shape the normal path
    # produces (fresh index plus an empty 'model_response' column).
    if len(df) == 0:
        return df.reset_index(drop=True).assign(model_response=pd.Series(dtype=object))

    # Dynamically get the number of GPUs and available VRAM
    num_gpus, total_vram = get_gpu_info()

    # At least one worker (no GPU found -> single process), and never more
    # workers than rows so that every chunk is non-empty.
    num_processes = min(max(num_gpus * instances_per_gpu, 1), len(df))

    print(f"Detected {num_gpus} GPUs with a total of {total_vram} VRAM.")
    print(f"Running with {num_processes} concurrent model instances.")

    # Split into exactly ``num_processes`` contiguous chunks whose sizes
    # differ by at most one row. Plain floor division would give a chunk size
    # of 0 for small frames and an extra remainder chunk otherwise.
    base_size, remainder = divmod(len(df), num_processes)
    chunks = []
    start = 0
    for worker in range(num_processes):
        stop = start + base_size + (1 if worker < remainder else 0)
        chunks.append(df.iloc[start:stop])
        start = stop

    print(f"Starting parallel processing with {num_processes} processes...")

    result_chunks = []
    # The context manager tears the pool down even if a worker raises; every
    # result has been fetched before the block exits on the success path.
    with mp.Pool(processes=num_processes) as pool, tqdm(total=len(chunks)) as progress_bar:
        # Submit all chunks to be processed concurrently
        futures = [
            pool.apply_async(process_chunk, args=(chunk, prompt1, i % num_processes))
            for i, chunk in enumerate(chunks)
        ]

        # Collect results in submission order so row order is preserved.
        for future in futures:
            result_chunks.append(future.get())
            progress_bar.update(1)

    print("All chunks processed. Concatenating results...")
    # Concatenate all the result chunks back into a single DataFrame
    result_df = pd.concat(result_chunks, ignore_index=True)
    print("Processing complete.")
    return result_df

# Example of how to use the parallel_processing function
# Assuming your DataFrame is 'df' and your prompt is 'prompt1'
# result_df = parallel_processing(df, prompt1)


In [5]:
# Took 12 hours

df = pd.read_csv("5.1.updated_file_with_geolocation.csv")

# Assuming your DataFrame is 'df' and your prompt is 'prompt1'
result_df = parallel_processing(df, prompt1)

# Save the raw model output. The result has a fresh 0..n-1 index that carries
# no information, so it is not written (writing it adds an 'Unnamed: 0'
# column when the file is read back).
result_df.to_csv('result.csv', index=False)

Detected 1 GPUs with a total of 24564.0 VRAM.
Running with 3 concurrent model instances.
Starting parallel processing with 3 processes...


100%|██████████| 3/3 [12:08:14<00:00, 14564.91s/it]   


All chunks processed. Concatenating results...
Processing complete.


In [6]:
# Preview the first rows of the processed frame, including 'model_response'.
result_df.head()

Unnamed: 0.1,Unnamed: 0,title,company,job_type,is_remote,description,address,cleaned_address,lat_long,model_response
0,0,Porter,PHOENIX OPCO PTE. LTD.,fulltime,False,Are you currently working in a service based e...,"Tras Street, #9-177 Union Building, 079025","Tras Street, Union Building, 079025","(1.2744927000000001, 103.84404662674353)",\n\n1. Responsibilities\n - Ensure guest exp...
1,1,Outlet Executive - Tan Tock Seng Hospital,Kopitiam Investment Pte Ltd,fulltime,False,Outlet Executive - Tan Tock Seng Hospital\nRes...,"1 Joo Koon Cir, #13-01 FairPrice Joo Koon, Sin...","1 Joo Koon Cir, FairPrice Joo Koon, Singapore...","(1.2899175, 103.8519072)",\n## Responsibilities\n1. Operations\n * Sup...
2,2,Sales Promoter,Oomph Pte. Ltd.,fulltime,False,"SALARY UP TO $4,000.00 (subject to experience)...","2 Alexandra Rd, #04-01 Delta House, Singapore ...","2 Alexandra Rd, Delta House, Singapore 159919","(1.2899175, 103.8519072)",\n\nAspect 1: Responsibilities\n- Actively pro...
3,3,Quantity Surveyor,LBD ENGINEERING PTE. LTD.,fulltime,False,Job Description\n\n\n* Prepare and analyse cos...,"58A Sungei Kadut Loop, LBD Construction Group ...","58A Sungei Kadut Loop, LBD Construction Group ...",,\n\nResponsibilities:\n- Prepare and analyze c...
4,4,Cleaning Operations Assistant Supervisor,ECOCLEAN MAINTENANCE PTE. LTD.,fulltime,False,**Requirements**\n\n* at least 3 years of work...,"1 Yishun Industrial Street 1, #06-27 A'Posh Bi...","1 Yishun Industrial Street 1, A'Posh BizHub, ...","(1.2899175, 103.8519072)",\n\n**Responsibilities:**\n1. Respond to emerg...


In [7]:
# Count missing values per column.
result_df.isna().sum()

Unnamed: 0            0
title                 0
company               0
job_type              0
is_remote             0
description           0
address               0
cleaned_address       0
lat_long           2528
model_response        0
dtype: int64

In [8]:
# Drop every row that has a missing value in ANY column, then re-count to
# confirm nothing is left.
# NOTE(review): this also discards rows whose only gap is a non-text field
# (e.g. a missing 'lat_long') -- confirm that is intended.
result_df = result_df.dropna()
result_df.isna().sum()

Unnamed: 0         0
title              0
company            0
job_type           0
is_remote          0
description        0
address            0
cleaned_address    0
lat_long           0
model_response     0
dtype: int64

In [22]:
# List the column names currently in the frame.
result_df.columns

Index(['Unnamed: 0', 'title', 'company', 'job_type', 'is_remote',
       'description', 'address', 'cleaned_address', 'lat_long',
       'model_response'],
      dtype='object')

In [23]:
# Remove the leftover index column. errors='ignore' keeps this cell safe to
# re-run: without it a second execution raises KeyError because the column
# is already gone.
result_df = result_df.drop(columns=['Unnamed: 0'], errors='ignore')
result_df.columns

Index(['title', 'company', 'job_type', 'is_remote', 'description', 'address',
       'cleaned_address', 'lat_long', 'model_response'],
      dtype='object')

In [25]:
# Trim leading/trailing whitespace (the raw responses start with newlines).
result_df['model_response'] = result_df['model_response'].str.strip()

In [28]:
# Preview the stripped responses.
result_df['model_response'].head()

0    1. Responsibilities\n   - Ensure guest experie...
1    ## Responsibilities\n1. Operations\n   * Suppo...
2    Aspect 1: Responsibilities\n- Actively promote...
4    **Responsibilities:**\n1. Respond to emergency...
5    ## Summary of Job Description for Accounts Off...
Name: model_response, dtype: object

In [30]:
# Keep only ASCII letters, digits, whitespace, '.' and ','; every other
# character (markdown symbols such as '#', '*', '-', ':') is deleted.
# NOTE(review): this also deletes '+', '#', '/', '&' and non-ASCII letters
# inside words, so e.g. "C++" / "C#" both become "C" -- confirm that is
# acceptable for the skills text.
result_df['model_response'] = result_df['model_response'].str.replace(r'[^A-Za-z0-9\s.,]', '', regex=True)


In [31]:
# Remove list numbering (e.g., "1.", "2.") at the start of a line, keeping any
# indentation, but leave numbers inside the text alone: time-like or decimal
# values (e.g., 1.30PM) and numbers that merely end a sentence
# (e.g., "... since 2023.") are untouched.
# The pattern is anchored to line starts ((?m)^) because an unanchored match
# also deletes sentence-final numbers. Trade-off: numbering that appears
# mid-line is no longer stripped.
result_df['model_response'] = result_df['model_response'].str.replace(r'(?m)^([ \t]*)\d+\.(?!\d)', r'\1', regex=True).str.strip()


In [33]:
# Write the cleaned frame to disk without the index column.
result_df.to_csv("5.1.description_cleaned.csv", index=False)