# Simple chat response for all rows in CSV

This notebook reads your CSV and your prompt file and, for each row in the CSV, makes a request to the OpenAI chat completions API. Each response is recorded in the output CSV.

The notebook never deletes data; it always appends to the OUTPUT_CSV.

The parameters are set up so that, if you don't change anything, the notebook runs the prompt in ../workspace/buildup_christianity_usa/prompt.txt against the first record of ../workspace/buildup_christianity_usa/tweets.csv

Change the "Input Parameters" as needed.

In [None]:
import os
import datetime

import pandas as pd
import numpy as np
from langchain import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
import openai

In [None]:
# Input parameters
# Folder that holds the input CSV, the prompt file and the output CSV.
WORKSPACE_DIR = "../workspace/buildup_christianity_usa"
INPUT_CSV = f"{WORKSPACE_DIR}/tweets.csv"
PROMPT_FILE = f"{WORKSPACE_DIR}/prompt.txt"
OUTPUT_CSV = f"{WORKSPACE_DIR}/output.csv"
# Columns whose values make up the text sent to the model.
# If multiple columns are listed they are concatenated together.
COLUMNS_FOR_ANALYSIS = ["text"]
# Position of the first row to process and how many rows to process.
# Set NUMBER_OF_RECORDS_TO_PROCESS to None to process every remaining row.
START_OFFSET = 0
NUMBER_OF_RECORDS_TO_PROCESS = 1
# Controls how the work is batched between persists to OUTPUT_CSV.
# If you stop the process half way through, every batch that was already
# written stays in the file.
BATCH_PERSIST_SIZE = 5
# The OpenAI key is taken from the environment so it never lives in the notebook.
API_KEY = os.getenv('OPENAI_API_KEY')

In [None]:
# Load the whole input CSV into a DataFrame.
input_df = pd.read_csv(INPUT_CSV)
# Preview the first rows to check the file loaded with the expected columns.
input_df.head()

In [None]:
# Build the text sent to the model by joining the selected columns with a space.
# Missing cells become empty strings and every value is converted to str first,
# because ' '.join raises TypeError on NaN or numeric values.
input_df["gpt_text_to_process"] = (
    input_df[COLUMNS_FOR_ANALYSIS]
    .fillna("")
    .astype(str)
    .apply(' '.join, axis=1)
)
# Preview the combined text.
input_df["gpt_text_to_process"].head()

In [None]:
# Select the window of rows to process: skip START_OFFSET rows, then take
# NUMBER_OF_RECORDS_TO_PROCESS rows (or every remaining row when it is None).
if NUMBER_OF_RECORDS_TO_PROCESS is None:
    df_to_process = input_df[START_OFFSET:]
else:
    # A slice end is an absolute position, so the record count has to be added
    # to the offset; otherwise a non-zero START_OFFSET selects too few rows.
    df_to_process = input_df[START_OFFSET:START_OFFSET + NUMBER_OF_RECORDS_TO_PROCESS]

print(f"total number of records to process is {df_to_process.shape[0]}")

In [None]:
# Read the prompt text from PROMPT_FILE. The encoding is given explicitly so the
# result does not depend on the platform's default encoding.
with open(PROMPT_FILE, encoding="utf-8") as f:
    prompt = f.read()

# Show the prompt so it can be checked before any request is made.
prompt

In [None]:
from langchain.chat_models import ChatOpenAI

# temperature=0 requests the least random sampling from the model.
# API_KEY is the value set in the "Input parameters" cell; passing it here makes
# a key configured there actually take effect instead of being silently ignored.
# NOTE(review): when API_KEY is None the client is expected to fall back to the
# OPENAI_API_KEY environment variable - confirm for the installed langchain version.
chat = ChatOpenAI(temperature=0, model="gpt-3.5-turbo", openai_api_key=API_KEY)

In [None]:
# The human message is only the {text} placeholder, filled in on each call.
human_template = "{text}"
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
# The system message is built from the text held in `prompt`.
# NOTE(review): from_template presumably treats {name} inside the prompt text as
# a template variable - confirm the prompt file contains no stray braces.
system_message_prompt = SystemMessagePromptTemplate.from_template(prompt)

# Order of the messages: system message first, then the human message.
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)

In [None]:
# Combine the chat model and the prompt template into a single chain object.
chain = LLMChain(llm=chat, prompt=chat_prompt)

In [None]:
def compute(chunked_df_to_process):
    """Return a copy of a batch with the model response and a timestamp added.

    Args:
        chunked_df_to_process: DataFrame with a "gpt_text_to_process" column.

    Returns:
        A new DataFrame (the input is left untouched) with two extra columns:
        "gpt_response", the result of ``chain.run`` for each row's text, and
        "gpt_responsed_at", one UTC timestamp taken after every row of the
        batch has been answered.
    """
    def ask_model(row_text):
        # One request per row.
        return chain.run(text=row_text)

    annotated = chunked_df_to_process.copy()
    annotated["gpt_response"] = annotated["gpt_text_to_process"].apply(ask_model)
    # A single timestamp is shared by all rows of the batch.
    annotated["gpt_responsed_at"] = datetime.datetime.now(tz=datetime.timezone.utc)
    return annotated

In [None]:
# Write the CSV header only when OUTPUT_CSV does not exist yet; every later
# write appends rows without repeating the header.
first_time_with_header = not os.path.exists(OUTPUT_CSV)

dfs = []
# Walk through df_to_process in consecutive batches of BATCH_PERSIST_SIZE rows
# and persist each batch as soon as it has been computed, so an interrupted run
# keeps every batch that was already written.
# np.array_split(df, n) would instead cut the frame into n pieces, whatever
# their size, and yields empty pieces when there are fewer than n rows.
# max(..., 1) keeps a single (empty) pass when there is nothing to process, so
# the output file and `dfs` end up in the same state as with non-empty input.
for batch_start in range(0, max(len(df_to_process), 1), BATCH_PERSIST_SIZE):
    chunk = df_to_process[batch_start:batch_start + BATCH_PERSIST_SIZE]
    result_df = compute(chunk)
    result_df.to_csv(OUTPUT_CSV, mode='a', header=first_time_with_header)
    first_time_with_header = False
    dfs.append(result_df)
    

In [None]:
# Stack the per-batch results collected in `dfs` into one DataFrame.
total_processed = pd.concat(dfs)
# Number of rows processed in this run.
total_processed.shape[0]

In [None]:
# Preview the first processed rows, including the added response columns.
total_processed.head()