In [1]:
!pip install langchain

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!pip install langchain-community

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from langchain_community.llms import Ollama
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableSequence

import logging
import time
import pandas as pd
import logging
import csv
import time
import os
from tqdm import tqdm
# Configure logging once for the whole notebook: show INFO and above, and
# prefix every record with its timestamp and level ("time - LEVEL - message").
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [5]:
def sample_reviews_from_csv(file_path, sample_size=5000,
                            output_path='sampled_reviews.csv', random_state=1):
    """Draw a reproducible random sample of rows from a CSV and save it.

    Args:
        file_path: Path of the source CSV. It must contain a 'review' column.
        sample_size: Number of rows to sample. When the file holds fewer rows
            than requested, all rows are kept (a warning is logged) instead
            of letting ``DataFrame.sample`` raise ``ValueError``.
        output_path: Destination CSV for the sampled rows (written without
            the index column).
        random_state: Seed handed to ``DataFrame.sample`` so repeated runs
            pick the same rows.

    Returns:
        ``output_path`` on success; ``None`` if the source file is missing,
        empty, cannot be parsed, or lacks the 'review' column (each case is
        logged as an error).
    """
    try:
        df = pd.read_csv(file_path)
        if 'review' not in df.columns:
            logging.error("The 'review' column is not present in the dataset.")
            return None

        available_rows = len(df)
        if sample_size > available_rows:
            # Sampling without replacement cannot return more rows than exist.
            logging.warning(
                f"Requested {sample_size} rows but only {available_rows} are available; "
                "using all of them."
            )
        rows_to_draw = min(sample_size, available_rows)

        sample_df = df.sample(n=rows_to_draw, random_state=random_state)
        sample_df.to_csv(output_path, index=False)
        return output_path
    except FileNotFoundError:
        logging.error(f"File {file_path} not found.")
        return None
    except pd.errors.EmptyDataError:
        logging.error("The CSV file is empty.")
        return None
    except pd.errors.ParserError:
        logging.error("Error parsing the CSV file.")
        return None

In [6]:
def load_reviews_from_csv(file_path):
    """Read a CSV file into a list of row dicts keyed by the header line.

    Args:
        file_path: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A list with one dict per data row. The list is empty when the file
        does not exist; if the csv module reports an error part-way through,
        the rows read before the error are returned. Both problems are
        logged rather than raised.
    """
    reviews = []
    try:
        # newline='' hands raw line endings to the csv module, so line breaks
        # inside quoted fields (multi-line reviews) are preserved verbatim.
        with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                reviews.append(row)
    except FileNotFoundError:
        logging.error(f"File {file_path} not found.")
    except csv.Error as e:
        logging.error(f"Error reading CSV file: {e}")
    return reviews

In [7]:
def load_text(file_path):
    """Return the full contents of a UTF-8 text file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file contents as a single string, or ``None`` (after logging an
        error) when the file does not exist.
    """
    try:
        source = open(file_path, "r", encoding="utf-8")
    except FileNotFoundError:
        logging.error(f"File {file_path} not found.")
        return None

    # The context manager closes the handle once the text has been read.
    with source:
        return source.read()


In [8]:
def format_text(text, words_per_line):
    """Re-wrap ``text`` so that every line holds at most ``words_per_line`` words.

    The text is split on whitespace, the words are grouped in order, and each
    group is joined with single spaces and terminated by a newline.

    Args:
        text: The string to re-wrap.
        words_per_line: Maximum number of words placed on one output line.

    Returns:
        The wrapped text; every line (including the last) ends with a
        newline. Text without any words yields an empty string.
    """
    tokens = text.split()
    chunk_starts = range(0, len(tokens), words_per_line)
    wrapped_lines = [
        ' '.join(tokens[start:start + words_per_line]) + '\n'
        for start in chunk_starts
    ]
    return ''.join(wrapped_lines)

In [9]:
def process_review(review, chain):
    """Run a single review through the chain and time the call.

    Args:
        review: Review text; it is passed to the chain as the ``content``
            input variable.
        chain: Any object exposing ``invoke(dict)``.

    Returns:
        ``(output, processing_time)`` where ``processing_time`` is the
        elapsed time of the call in seconds. If ``invoke`` raises, the error
        is logged and ``(None, None)`` is returned instead.
    """
    try:
        # perf_counter() is monotonic, so the measured duration cannot be
        # distorted (or turn negative) when the system clock is adjusted
        # during a long call.
        start_time = time.perf_counter()
        output = chain.invoke({"content": review})
        processing_time = time.perf_counter() - start_time
    except Exception as e:
        logging.error(f"Error invoking the chain: {e}")
        return None, None
    return output, processing_time

In [11]:
def call_model_LLM(csv_file_path, sample_size=5000, prompt_path="single_call.txt",
                   model="llama3", output_file_path="sampled_reviews_with_output.csv"):
    """Sample reviews from a CSV, run each through an Ollama model, save the outputs.

    Steps: sample the input CSV, load the sampled rows, load the prompt
    template, build a prompt -> LLM chain, then process the rows one at a
    time. Each finished row is appended to the output CSV immediately, so an
    interrupted run keeps everything processed so far.

    Args:
        csv_file_path: Source CSV; it must contain a 'review' column.
        sample_size: Number of rows to sample from the source CSV.
        prompt_path: Text file holding the prompt template. The chain is
            invoked with a single ``content`` variable, so the template is
            expected to use ``{content}``.
        model: Name of the Ollama model to call.
        output_file_path: Destination CSV. It is overwritten on every run and
            contains the sampled columns plus 'progressive_index' (1-based
            position in the sample) and 'output' (the model response, or
            "No output generated." when the call failed or returned nothing).

    Returns:
        None. Problems are reported through logging rather than raised.
    """
    sample_file_path = sample_reviews_from_csv(csv_file_path, sample_size)
    if not sample_file_path:
        logging.warning("No sampled file to process.")
        return

    reviews = load_reviews_from_csv(sample_file_path)
    if not reviews:
        logging.warning("No reviews found in the sampled CSV file.")
        return

    prompt_template = load_text(prompt_path)
    if not prompt_template:
        logging.warning("Prompt template is empty or not found.")
        return

    chain = RunnableSequence(
        PromptTemplate.from_template(prompt_template) | Ollama(model=model)
    )

    # Output columns: the input columns in their original order, followed by
    # the two columns added here (unless the input already has them).
    fieldnames = [column for column in reviews[0].keys() if column is not None]
    for added_column in ('progressive_index', 'output'):
        if added_column not in fieldnames:
            fieldnames.append(added_column)

    try:
        # Open once and append row by row. Rewriting the whole accumulated
        # result set after every review made the total I/O quadratic in the
        # number of reviews.
        with open(output_file_path, 'w', newline='', encoding='utf-8') as out_file:
            writer = csv.DictWriter(
                out_file,
                fieldnames=fieldnames,
                extrasaction='ignore',
                # Same line ending pandas' to_csv uses by default, so the
                # file layout matches what earlier runs produced.
                lineterminator=os.linesep,
            )
            writer.writeheader()

            # tqdm renders the progress bar; indices start at 1.
            for i, review in enumerate(tqdm(reviews, desc="Processing Reviews"), start=1):
                review['progressive_index'] = i
                output, _ = process_review(review['review'], chain)
                review['output'] = output if output else "No output generated."

                writer.writerow(review)
                # Push the row to disk now so progress survives a crash.
                out_file.flush()

        logging.info(f"Results saved to {output_file_path}")

    except Exception as e:
        logging.error(f"An error occurred: {e}")

    finally:
        logging.info("Completed processing.")

# Run the whole pipeline on the enriched IMDB dataset.
# NOTE: this is a long-running cell -- the recorded run below processed
# 5000 reviews in about 14.5 hours (roughly 10 seconds per review).
csv_file_path = "IMDB_Dataset_Enriched.csv"
call_model_LLM(csv_file_path)


Processing Reviews: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [14:31:23<00:00, 10.46s/it]
2024-07-23 02:49:12,157 - INFO - Results saved to sampled_reviews_with_output.csv
2024-07-23 02:49:12,157 - INFO - Completed processing.
