In [1]:
!pip install langchain langchain-community pandas ollama

Looking in indexes: http://dev:****@nexus-leitha.servizi.gr-u.it/repository/leitha_python_all/simple

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import logging
import os
import time
from typing import Optional

import pandas as pd
from langchain_community.llms import Ollama
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableSequence

# Configure the root logger: INFO level, each line rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [3]:
# Prompt sent to the LLM for every review. It is compiled with
# PromptTemplate.from_template, so `{content}` is the single input variable
# (filled with the review text) and the doubled braces (`{{`, `}}`) are escapes
# that render as literal braces in the JSON schema and in the worked example.
# NOTE(review): the first sentence reads "one distinct outputs" although two
# tasks are listed, and the review text is appended right after the example
# without any label. Confirm whether that wording/layout is intentional before
# touching it: any edit to this string changes what the model receives.
PROMPT_TEMPLATE = """Instruction: Analyze the following review text and provide one distinct outputs formatted in JSON:

1. **Sentiment Classification:** Indicate whether the sentiment of the review is Positive, Neutral, or Negative.
2. **Named Entity Extraction:** List all named entities present in the text, categorizing them by type (e.g., people, locations, organizations, etc.).
3. **Required JSON Format:** Ensure the response is formatted in JSON according to the following schema:

{{
  "sentiment": "<sentiment>",
  "entities": {{
    "person": ["<person_name1>", "<person_name2>", ...],
    "location": ["<location1>", "<location2>", ...],
    "organization": ["<organization1>", "<organization2>", ...],
    "product": ["<product1>", "<product2>", ...]
  }}
}}

example:

"I recently visited the restaurant 'La Dolce Vita' in Rome and was thrilled with the service and food. The waiter, Marco, was exceptionally friendly and the truffle risotto was simply divine. I can't wait to return and recommend this place to my friends."

```json

{{
  "sentiment": "Positive",
  "entities": {{
    "person": ["Marco"],
    "location": ["Rome"],
    "organization": ["La Dolce Vita"],
    "product": ["truffle risotto"]
  }}
}}

```
{content}





"""

In [32]:
def process_review(review: str, chain: "RunnableSequence", progressive_index: int, total_rows: int) -> Optional[str]:
    """
    It sends one review through the LLM chain and returns the raw response.

    The call is timed and a progress line is logged on success. Any exception
    raised while invoking the chain is logged and converted into a ``None``
    result, so a single failing review does not abort a long batch run.

    Arguments:
        review (str): The review text; it is passed to the chain as the
            ``content`` input.
        chain (RunnableSequence): The LLM chain; only its ``invoke`` method is used.
        progressive_index (int): Progress counter for this review, used only
            in the progress log line.
        total_rows (int): Size of the batch, used only in the progress log line.

    Return:
        The LLM response as string, or ``None`` when invoking the chain failed.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments that happen during a long-running call.
    start_time: float = time.perf_counter()
    try:
        langchain_output: str = chain.invoke({"content": review})
    except Exception as e:
        logging.error(f"Error invoking the chain: {e}")
        return None
    processing_time: float = time.perf_counter() - start_time

    # Progress reporting stays outside the try block: a problem while building
    # the log line (e.g. total_rows == 0) must not discard a valid response.
    percentage: float = (progressive_index / total_rows) * 100 if total_rows else 0.0
    logging.info(f'Processing {progressive_index}/{total_rows} ({percentage:.2f}%) in {processing_time} seconds')
    return langchain_output

In [33]:
def call_model_llm(model_name: str, output_file_path: str) -> pd.DataFrame:
    """
    It calls the LLM model using Ollama on every review that is still pending.

    The CSV at ``output_file_path`` is used as a checkpoint: rows whose
    ``output`` column still holds the ``"$$$"`` placeholder are processed, and
    the whole table is written back after each row so an interrupted run can be
    resumed. Rows that already carry an output are kept untouched.

    Arguments:
        model_name: The name of the model to invoke via Ollama.
        output_file_path: Path of the checkpoint CSV. It must contain a
            ``review`` column and an ``output`` column.

    Return:
        The complete dataframe (all rows, processed or not) with the ``output``
        column filled in for the rows handled by this call.
    """
    dataframe: pd.DataFrame = pd.read_csv(output_file_path)
    chain: RunnableSequence = RunnableSequence(
        PromptTemplate.from_template(PROMPT_TEMPLATE) | Ollama(model=model_name)
    )

    # Work on the full frame and only remember WHICH rows are pending. Filtering
    # the frame itself and saving that subset would erase the rows completed in
    # earlier runs from the checkpoint file.
    pending_mask = dataframe["output"] == "$$$"
    pending_labels = list(dataframe.index[pending_mask])
    total_rows: int = len(pending_labels)

    # The progress counter is the position within the pending rows, not the
    # index label, so it always runs from 1 to total_rows.
    for position, label in enumerate(pending_labels, start=1):
        logging.info(f"Processing row {position} out of {total_rows}")
        dataframe.loc[label, "output"] = process_review(
            dataframe.loc[label, "review"], chain, position, total_rows
        )
        # index=False keeps the file schema stable; otherwise every resume would
        # read the saved index back as an extra "Unnamed: 0" column.
        dataframe.to_csv(output_file_path, index=False)
    return dataframe

In [None]:
# Define the model name to use
MODEL_NAME: str = "phi3:medium"

# Define the output path and get the result as csv. call_model_llm reads this
# file back and rewrites it while it works, so it also acts as the checkpoint
# for the run.
output_file_path: str = f"../resources/sampled_reviews_with_output_{MODEL_NAME}.csv"

# First run only: seed the checkpoint with the sampled reviews and mark every
# row as pending with the "$$$" placeholder that call_model_llm looks for.
exists: bool = os.path.exists(output_file_path)
if not exists:
    sampled: pd.DataFrame = pd.read_csv("../resources/IMDB Dataset Sampled.csv")
    # Scalar assignment broadcasts the placeholder to every row (and is a no-op
    # on an empty frame), with no per-row Python call.
    sampled["output"] = "$$$"
    sampled.to_csv(output_file_path, index=False)

# Produce a dataframe by invoking the chain
output: pd.DataFrame = call_model_llm(model_name=MODEL_NAME, output_file_path=output_file_path)

output.to_csv(output_file_path, index=False)

logging.info("Completed processing.")

2024-07-23 16:30:54,315 - INFO - Processing row 6 out of 3781
2024-07-23 16:30:57,575 - INFO - Processing 6/3781 (0.16%) in 3.2597901821136475 seconds
2024-07-23 16:30:57,735 - INFO - Processing row 7 out of 3781
2024-07-23 16:31:04,620 - INFO - Processing 7/3781 (0.19%) in 6.884856939315796 seconds
2024-07-23 16:31:04,777 - INFO - Processing row 8 out of 3781
2024-07-23 16:31:07,378 - INFO - Processing 8/3781 (0.21%) in 2.600498914718628 seconds
2024-07-23 16:31:07,541 - INFO - Processing row 9 out of 3781
2024-07-23 16:31:10,731 - INFO - Processing 9/3781 (0.24%) in 3.189328193664551 seconds
2024-07-23 16:31:10,895 - INFO - Processing row 10 out of 3781
