In [3]:
!pip install langchain langchain-community pandas ollama tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
import logging
from typing import Optional

import pandas as pd
from langchain_community.llms import Ollama
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableSequence

# Configure the root logger for the notebook: INFO level and above, with each
# record rendered as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [14]:
# Prompt sent to the LLM for every review. The string is turned into a template
# with PromptTemplate.from_template (see call_model_llm), and `{content}` is the
# single template variable, filled with the review text by process_review.
# Literal JSON braces are written doubled (`{{` / `}}`) so that they are emitted
# as plain braces instead of being parsed as template variables.
# NOTE(review): the opening sentence says "one distinct outputs" although three
# numbered items follow, and `{content}` is appended right after the example's
# closing code fence without any label. Confirm the wording is intended before
# editing it: any change to this text changes what the model receives.
PROMPT_TEMPLATE = """Instruction: Analyze the following review text and provide one distinct outputs formatted in JSON:

1. **Sentiment Classification:** Indicate whether the sentiment of the review is "positive" or "negative".
2. **Named Entity Extraction:** List all named entities present in the text, categorizing them by label (PERSON, ORG, LOC).
3. **Required JSON Format:** Ensure the response is formatted in JSON according to the following schema:

{{
  "sentiment": "<sentiment>",
  "review": "<review>",
  "entities": [
    {{
      "label": "<label>",
      "value": "<value>"
    }}
  ]
}}

example:

"I recently visited the restaurant 'La Dolce Vita' in Rome and was thrilled with the service and food. The waiter, Marco, was exceptionally friendly and the truffle risotto was simply divine. I can't wait to return and recommend this place to my friends."

```json

{{
  "sentiment": "positive",
  "review": "I recently visited the restaurant 'La Dolce Vita' in Rome and was thrilled with the service and food. The waiter, Marco, was exceptionally friendly and the truffle risotto was simply divine. I can't wait to return and recommend this place to my friends.",
  "entities": [
    {{
      "label": "ORG",
      "value": "La Dolce Vita"
    }},
    {{
      "label": "LOC",
      "value": "Rome"
    }},
    {{
      "label": "PERSON",
      "value": "Marco"
    }}
  ]
}}

```
{content}





"""

In [15]:
def process_review(review: str, chain: RunnableSequence) -> Optional[str]:
    """
    It processes the review text and returns the LLM response as string.

    The review is passed to the chain under the "content" key, which is the
    placeholder name used by PROMPT_TEMPLATE. Failures are handled on a
    best-effort basis: any exception raised while invoking the chain is logged
    (with its traceback) and reported to the caller as None instead of being
    propagated, so callers must check for None before using the result.

    Arguments:
        review (str): The review text.
        chain (RunnableSequence): The LLM chain.

    Return:
        The LLM response as string, or None if invoking the chain failed.
    """
    try:
        return chain.invoke({"content": review})
    except Exception as e:
        # logging.exception records the traceback as well, so the root cause of
        # a failed invocation can be diagnosed from the notebook log.
        logging.exception("Error invoking the chain: %s", e)
        return None

In [21]:
from tqdm import tqdm

def call_model_llm(model_name: str, output_file_path: str) -> pd.DataFrame:
    """
    It calls the LLM model using Ollama on every review that is still pending and
    returns the dataframe enriched with the model responses.

    A row is pending while its "output" column holds the "$$$" placeholder. After
    each successful model call the whole dataframe is written back to
    output_file_path, so an interrupted run can be resumed by calling this
    function again with the same file. Rows whose model call fails keep the
    placeholder and are therefore retried on the next run.

    Arguments:
        model_name: The name of the model to invoke via Ollama.
        output_file_path: Path of the CSV file that is read at start and used as
            checkpoint. It must contain the "review" and "output" columns.

    Return:
        The full dataframe read from output_file_path (already processed rows
        included) with the "output" column updated for the rows processed here.
    """
    pending_marker = "$$$"
    dataframe: pd.DataFrame = pd.read_csv(output_file_path)
    chain: RunnableSequence = RunnableSequence(
        PromptTemplate.from_template(PROMPT_TEMPLATE) | Ollama(model=model_name)
    )
    # Keep the full frame and only *select* the pending rows by index label.
    # Filtering the frame itself would drop the finished rows from every
    # checkpoint, and would make positional and label-based access disagree
    # once a run is resumed.
    pending_labels = dataframe.index[dataframe["output"] == pending_marker]
    for label in tqdm(pending_labels, total=len(pending_labels)):
        response = process_review(dataframe.at[label, "review"], chain)
        if response is None:
            # The failure was already logged by process_review; leave the
            # placeholder in place so the row is picked up again next time.
            continue
        # Strip the Markdown code fences the model tends to wrap the JSON in.
        dataframe.at[label, "output"] = response.replace("```json", "").replace("```", "")
        # Checkpoint after every row: a model call is far more expensive than the
        # write. index=False avoids an extra "Unnamed: 0" column on each reload.
        dataframe.to_csv(output_file_path, index=False)
    return dataframe

In [22]:
import os

# Define the model name to use
MODEL_NAME: str = "phi3:medium"

# Define the output path and get the result as csv.
# ':' in the model tag is replaced by '_' to build the file name.
output_file_path: str = f"../resources/sampled_reviews_with_output_{MODEL_NAME.replace(':', '_')}.csv"

# The output file doubles as a checkpoint. On the first run for a model it is
# seeded from the sampled dataset, with every row marked as pending through the
# "$$$" placeholder; on later runs the existing file is reused as is.
exists: bool = os.path.exists(output_file_path)
if not exists:
    sampled: pd.DataFrame = pd.read_csv("../resources/IMDB Dataset Sampled.csv")
    # A scalar is broadcast to every row; no row-wise apply is needed.
    sampled["output"] = "$$$"
    sampled.to_csv(output_file_path, index=False)

# Produce a dataframe by invoking the chain
output: pd.DataFrame = call_model_llm(model_name=MODEL_NAME, output_file_path=output_file_path)

# Persist the returned dataframe without the index column
output.to_csv(output_file_path, index=False)

logging.info("Completed processing.")

  8%|▊         | 81/1000 [14:31<2:44:45, 10.76s/it]


KeyboardInterrupt: 