In [2]:
!pip install pandas
!pip install requests 

In [1]:
import pandas as pd 
import requests
import re

# Define the Ollama API endpoint
OLLAMA_API_URL = "http://localhost:11434/api/generate"

# Define the model you want to use
model_name = "gemma3:1b"  # Replace with the actual model name

In [3]:
def generate_response(prompt, model_name, max_tokens=500):
    """
    Send a prompt to the Ollama API and return the generated response.
    """
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,  # Set to True if you want streaming responses
        "max_tokens": max_tokens
    }
    response = requests.post(OLLAMA_API_URL, json=payload)
    
    if response.status_code == 200:
        return response.json().get("response", "").strip()
    else:
        print(f"Error: {response.status_code} - {response.text}")
        return None

In [9]:
def extract_imputed_value(response):
    """
    Extracts the imputed value from the AI response.
    """
    match = re.search(r"ANSWER:\s*(.*)", response)
    return match.group(1) if match else None  # Return None if no match found

In [None]:
def data_imputation(row, missing_column):
    """
    Impute missing values in a row using the Ollama API.
    """
    prompt = f"""
    You are a database engineer, also knowledgeble in movies and IMDB. Fill in the missing value.

    Record:
    {row.to_dict()}

    What is the best guess for '{missing_column}'? 
    Provide your reasoning first, then state your final answer in the format: `ANSWER: <your answer>`.
    """
    
    response = generate_response(prompt, model_name)
     

    return extract_imputed_value(response) 



As of now, the current version of prompt makes stupid prediction, but out in a good format. \
To do ;
- few shot prompting 
- change prompt's persona assignment

In [13]:
# Example usage
df = pd.read_csv("testDI.csv")  # Load your dataset
missing_col = "originalTitle"  # Column with missing values

# Apply data imputation to rows with missing values
df[missing_col] = df.apply(
    lambda row: data_imputation(row, missing_col) if pd.isnull(row[missing_col]) else row[missing_col],
    axis=1
)

# Save the preprocessed data
df.to_csv("ImputedOriginalTitle_allOutput.csv", index=False)

20 rows of imputation on laptop = 9 mins

## Try Batch Imputation

In [4]:
#import pandas as pd
#import re
import time
from tqdm import tqdm

# Function to extract only the imputed value
def extract_imputed_value(response):
    match = re.search(r"ANSWER:\s*(.*)", response)
    return match.group(1) if match else None  # Return None if no match found

# Function to perform imputation
def data_imputation(row, missing_column):
    prompt = f"""
    You are an IMDB expert. You know all the details of the movies listed in IMDB. Fill in the missing value.

    Record:
    {row.to_dict()}

    What is the best guess for '{missing_column}'? 
    Provide your reasoning first, then state your final answer in the format: `ANSWER: <your answer>`.
    """
    response = generate_response(prompt, model_name)  # Call your LLM function
    return extract_imputed_value(response)  # Extract imputed value

# Load the dataset
df = pd.read_csv("testDI.csv")

# Define the column with missing values
missing_column = "originalTitle"

# Identify rows with missing values
missing_indices = df[df[missing_column].isna()].index
BATCH_SIZE = 20  # Define batch size

# Process in batches
for i in tqdm(range(0, len(missing_indices), BATCH_SIZE), desc="Batch Imputation Progress"):
    batch_indices = missing_indices[i:i + BATCH_SIZE]  # Select batch
    batch_data = df.loc[batch_indices]  # Extract batch

    for index, row in batch_data.iterrows():
        imputed_value = data_imputation(row, missing_column)
        df.at[index, missing_column] = imputed_value  # Fill missing value

    # Save progress after each batch
    df.to_csv("imputed_data.csv", index=False)

    # Optional: Avoid API rate limits
    time.sleep(2)  # Adjust based on API limits

print("Imputed data saved as 'imputed_data.csv'.")

  df.at[index, missing_column] = imputed_value  # Fill missing value
Batch Imputation Progress: 100%|██████████| 1/1 [05:00<00:00, 300.82s/it]

Imputed data saved as 'imputed_data.csv'.





With batch processing, model's processing time goes down from 9 mins to 5 mins.
Changing the prompt doesn't help much. This is more like copying the primaryTitle. But this might be due to small sample size.  