In [18]:
prompt1 = f"""You are an expert in understanding job descriptions and extracting the details and even nuanced requirements for the job. Your goal is to read the input slowly and take time to consider what is written, extract the information and break it down into these 3 aspects:
    1. responsibilites 
    2. qualifications
    3. skills, technical and non-technical
and summarize it in point form line by line.
With each aspect answered, ensure that each of the aspects are properly differentiated and avoid overlaps as much as possible."""

In [19]:
import ollama

def test_llm():
    # Test description
    test_description = "This is a gaming mouse with RGB lighting, 6 programmable buttons, and ergonomic design"
    
    # System prompt
    prompt = "You are a helpful assistant that cleans product descriptions. Make them more professional and organized."
    
    try:
        # Call the model directly
        response = ollama.chat(
            model='capybarahermes-2.5-mistral-7b.Q5_K_M.gguf:latest',
            messages=[
                {'role': 'system', 'content': prompt},
                {'role': 'user', 'content': test_description}
            ]
        )
        
        print("Input description:", test_description)
        print("\nModel response:", response['message']['content'])
        
    except Exception as e:
        print(f"Error occurred: {e}")

# Run the test
test_llm()

Input description: This is a gaming mouse with RGB lighting, 6 programmable buttons, and ergonomic design

Model response: 

Introducing our state-of-the-art Gaming Mouse, engineered to provide an exceptional gaming experience. The innovative design features RGB lighting, allowing you to customize the illumination according to your preferences. Our product also boasts 6 programmable buttons that can be tailored to suit your specific gameplay style, giving you a competitive edge in any match. Furthermore, the ergonomic design ensures optimal comfort during extended gaming sessions, reducing hand fatigue and allowing you to stay focused on the task at hand.


In [20]:
import os
import pandas as pd
from tqdm import tqdm
import ollama

def process_row(row, prompt1):
    """Function to process a single row."""
    try:
        # Skip if model_response already exists and is not empty
        if pd.notna(row['model_response']) and row['model_response'].strip():
            return row['model_response']
            
        model_name = 'capybarahermes-2.5-mistral-7b.Q5_K_M.gguf:latest'
        
        response = ollama.chat(
            model=model_name,
            messages=[
                {'role': 'system', 'content': prompt1},
                {'role': 'user', 'content': row['description']}
            ]
        )
        return response['message']['content']
    except Exception as e:
        print(f"Error processing row {row.name}. Error: {e}")
        return None

def process_dataframe(df, prompt1):
    """Process rows with missing model_response values."""
    # Create a copy of the dataframe
    processed_df = df.copy()
    
    # Get rows where model_response is NA
    mask = processed_df['model_response'].isna()
    rows_to_process = processed_df[mask]
    
    print(f"Processing {len(rows_to_process)} rows with missing responses...")
    
    # Process each row that needs processing
    for idx, row in tqdm(rows_to_process.iterrows(), total=len(rows_to_process)):
        response = process_row(row, prompt1)
        processed_df.at[idx, 'model_response'] = response
        
    print("Processing complete.")
    return processed_df


In [4]:
# Read both dataframes
df = pd.read_csv("../4. Geolocating/4.2.updated_file_with_geolocation.csv")
df_cleaned = pd.read_csv("5.1.description_cleaned.csv")

# Drop lat_long from df_cleaned since we'll keep df's version
df_cleaned = df_cleaned.drop(columns=['lat_long'])

# Merge df_cleaned into df, keeping all rows from df
df_new = df.merge(df_cleaned[['title', 'company', 'job_type', 'is_remote', 'description', 'address', 'cleaned_address', 'model_response']], 
              on=['title', 'company', 'job_type', 'is_remote', 'description', 'address', 'cleaned_address'],
              how='left')

df_new=df_new.drop_duplicates()
# Verify the merge by checking number of rows
print("Original df rows:", len(df))
print("df_cleaned rows:", len(df_cleaned))
print("Merged df rows:", len(df_new))

df_new.to_csv("5.2.merged_df.csv", index=False)

Original df rows: 25611
df_cleaned rows: 23083
Merged df rows: 25611


In [5]:
df_new['model_response'].isna().sum()

2528

In [8]:
result_df = process_dataframe(df_new, prompt1)

Processing 2528 rows with missing responses...


100%|██████████| 2528/2528 [2:53:16<00:00,  4.11s/it]  

Processing complete.





In [21]:
df_again = pd.read_csv("5.2.description_cleaned.csv")
df_again['model_response'].isna().sum()


1

In [22]:
result_df = process_dataframe(df_again, prompt1)

Processing 1 rows with missing responses...


100%|██████████| 1/1 [00:04<00:00,  4.63s/it]

Processing complete.





In [23]:
result_df.head()

Unnamed: 0,title,company,job_type,is_remote,description,address,cleaned_address,lat_long,model_response
0,Porter,PHOENIX OPCO PTE. LTD.,fulltime,False,Are you currently working in a service based e...,"Tras Street, #9-177 Union Building, 079025","Tras Street, Union Building, 079025","(1.27444651846065, 103.843929515239)",Responsibilities\n Ensure guest experiences...
1,Outlet Executive - Tan Tock Seng Hospital,Kopitiam Investment Pte Ltd,fulltime,False,Outlet Executive - Tan Tock Seng Hospital\nRes...,"1 Joo Koon Cir, #13-01 FairPrice Joo Koon, Sin...","1 Joo Koon Cir, FairPrice Joo Koon, Singapore...","(1.32476879097421, 103.674484690433)",Responsibilities\n Operations\n Support Out...
2,Sales Promoter,Oomph Pte. Ltd.,fulltime,False,"SALARY UP TO $4,000.00 (subject to experience)...","2 Alexandra Rd, #04-01 Delta House, Singapore ...","2 Alexandra Rd, Delta House, Singapore 159919","(1.27425442821763, 103.803711567804)",Aspect 1 Responsibilities\n Actively promote l...
3,Quantity Surveyor,LBD ENGINEERING PTE. LTD.,fulltime,False,Job Description\n\n\n* Prepare and analyse cos...,"58A Sungei Kadut Loop, LBD Construction Group ...","58A Sungei Kadut Loop, LBD Construction Group ...","(1.40981215298244, 103.742781634928)",Responsibilities\n\n Prepare and analyze cost ...
4,Cleaning Operations Assistant Supervisor,ECOCLEAN MAINTENANCE PTE. LTD.,fulltime,False,**Requirements**\n\n* at least 3 years of work...,"1 Yishun Industrial Street 1, #06-27 A'Posh Bi...","1 Yishun Industrial Street 1, A'Posh BizHub, ...","(1.43732110123747, 103.842085763701)","Responsibilities\n Respond to emergency calls,..."


In [24]:
result_df.isna().sum()

title              0
company            0
job_type           0
is_remote          0
description        0
address            0
cleaned_address    0
lat_long           0
model_response     0
dtype: int64

In [25]:
result_df = result_df.dropna()
result_df.isna().sum()

title              0
company            0
job_type           0
is_remote          0
description        0
address            0
cleaned_address    0
lat_long           0
model_response     0
dtype: int64

In [26]:
result_df.columns

Index(['title', 'company', 'job_type', 'is_remote', 'description', 'address',
       'cleaned_address', 'lat_long', 'model_response'],
      dtype='object')

In [27]:
result_df = result_df.drop(columns=['Unnamed: 0'])
result_df.columns

KeyError: "['Unnamed: 0'] not found in axis"

In [28]:
result_df['model_response'] = result_df['model_response'].str.strip()

In [29]:
result_df['model_response'].head()

0    Responsibilities\n    Ensure guest experiences...
1    Responsibilities\n Operations\n    Support Out...
2    Aspect 1 Responsibilities\n Actively promote l...
3    Responsibilities\n\n Prepare and analyze cost ...
4    Responsibilities\n Respond to emergency calls,...
Name: model_response, dtype: object

In [30]:
result_df['model_response'] = result_df['model_response'].str.replace(r'[^A-Za-z0-9\s.,]', '', regex=True)


In [31]:
# Remove point formatted numbers (e.g., 1., 2., etc.) but keep time-like patterns (e.g., 1.30PM)
result_df['model_response'] = result_df['model_response'].str.replace(r'(?<!\d)(\d+)\.(?!\d)', '', regex=True).str.strip()


In [33]:
result_df.to_csv("5.2.description_cleaned.csv", index=False)