In [1]:
import openai
import pandas as pd
import numpy as np
import json
import os
import re
import time
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.prompts.prompt import PromptTemplate #where we change the AI "personality"
from dotenv import load_dotenv, find_dotenv

In [2]:
# Pull variables from a local .env file into the process environment,
# then read the OpenAI key from it (None when the variable is not set).
load_dotenv()
API_KEY = os.getenv("OPENAI_API_KEY")

In [3]:
# Load environment variables again, this time locating the .env file with
# find_dotenv(). The previous cell already called load_dotenv() once.
_ = load_dotenv(find_dotenv()) # read local .env file

#openai.api_key = os.getenv("api.txt")
# Model-name constants. Only GPT_4 is passed to the chat client below; the
# others are not referenced by any other cell visible in this notebook.
COMPLETIONS_MODEL = "text-davinci-002"
BETTER_COMPLETIONS_MODEL = "text-davinci-003" #for my purposes, this is better
LONG_MODEL = "gpt-3.5-turbo-16k"
REGULAR_MODEL = "gpt-3.5-turbo"
GPT_4 = "gpt-4-1106-preview"

# Shared chat client used by every prompt in this notebook. Temperature is
# 0.0 (presumably so repeated runs classify as consistently as possible) and
# the key comes from API_KEY, which is set in the previous cell.
chat = ChatOpenAI(temperature=0.0,
                  openai_api_key = API_KEY,
                  verbose=True,
                  model_name=GPT_4) #depending on how big of a task

#below, we give the AI a "personality"
# The string has two placeholders, {history} and {input}.
# NOTE(review): `template` is not referenced by any other cell visible in
# this notebook -- the prompts sent to the model are built from
# template_string1 / template_string2 instead. Confirm whether it is still needed.
template = """The following is a conversation between a human data scientist and an AI who specializes in data categorization. The AI is direct and provides concise responses. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
{history}
Data Scientist: {input}
AI:"""

NameError: name 'ChatOpenAI' is not defined

In [None]:
# Make the project folder the working directory so that relative paths used
# later in the notebook resolve against it, then echo it as a sanity check.
project_root = '/Users/chrissoria/Documents/Research/Categorization_AI_experiments'
os.chdir(project_root)
current_directory = os.getcwd()
print(current_directory)

In [None]:
survey_participant_input = "a19i" #enter column name here

# Read only the column of interest from the raw survey workbook.
raw_responses = pd.read_excel("/Users/chrissoria/Documents/Research/UCNets_Classification/data/Raw_Cond_for_Coding_all_waves.xlsx", engine='openpyxl',sheet_name="JOINT_DATA",usecols=[survey_participant_input])
raw_unique = raw_responses[survey_participant_input].dropna().unique()  # Drop NaN values and get unique elements

survey_participant_responses = '; '.join(str(item) for item in raw_unique) #what we will feed to the model

# Normalize case and surrounding whitespace FIRST, then drop blanks and
# duplicates. De-duplicating only the raw text (as before) let responses that
# differ merely by case or padding through as separate rows, and each of
# those rows costs two model calls downstream.
normalized_responses = (
    pd.Series(raw_unique, name=survey_participant_input)
    .astype(str)
    .str.lower()
    .str.strip()
)
normalized_responses = normalized_responses[normalized_responses != ''].drop_duplicates() #trimming all empty rows

# One-column DataFrame with a fresh 0..n-1 index; later cells address rows by
# these integer labels.
UCNets = normalized_responses.to_frame().reset_index(drop=True)

# Cap the working set at the first 200 responses.
UCNets = UCNets.iloc[:200]

UCNets.head()

Here, I'm trying to "force" the model to "think" in steps: (A) first restating the response in its own words, and then (B) working with that restated version. That is, instead of giving it all the steps at once, I'm having it work through them one at a time.

This time, I will have it think in a "chain": it will output a response, and I will then feed that response back to it in a separate prompt.

In [None]:
# One response used to smoke-test the two-step chain before the full run.
# The column is looked up through survey_participant_input so this cell
# follows the column chosen when the data was loaded.
survey_input = UCNets[survey_participant_input][105]

# Numbered candidate reasons for moving; the numbers double as the keys the
# model is asked to return in its JSON answer.
category = """1. to start living with or to stay with partner/spouse. \
2. relationship change (divorce, breakup, etc). \
3. the respondent had a job or school or career change, including transferred and retired. \
4. the respondent's partner's job or school or career change, including transferred and retired. \
5. financial reasons (rent is too expensive, pay raise, etc).
6. related to housing features, such as a bigger or smaller yard or house"""

# Example answer shown to the model. Every value is a quoted "0"/"1" so the
# example is type-consistent; it previously mixed a bare integer 1 in with
# the strings, which invites mixed-type output from the model.
example_JSON = """{ \
"1": "0", \
"2": "1", \
"3": "0", \
"4": "1", \
"5": "0", \
"6": "0"
}"""

# Extra instruction appended to the first (summarization) prompt.
caveat_1 ="""Be specific about how the move might've been because of a partner."""

# Step 1 of the chain: ask the model to restate the respondent's answer as a
# short list of reasons. {OBJECT} receives the raw response and {CAVEAT} an
# extra instruction.
template_string1 = """A survey respondent was asked, "Why did you move?" \
They responded with: "{OBJECT}" \
First, filter out anything in this response that doesn't answer the question, "Why did you move?" \
Second, give me all of their reasons in a clear concise manner without any additional extrapolation. \
{CAVEAT}
Format your response in a as few words as possible starting with the words, 'This respondent moved because...'"""

prompt_template1 = ChatPromptTemplate.from_template(template_string1)
# NOTE: a bare expression is only displayed when it is the last statement of
# a cell, so as written this line does not print anything.
prompt_template1.messages[0].prompt #this will show us our prompt template

# Fill the template with the single test response and the caveat.
GPT_Responses1 = prompt_template1.format_messages(
                    OBJECT=survey_input,
                    CAVEAT=caveat_1)

# Send the messages to the chat model and keep only the reply text; TEST1 is
# then used as the input to the second prompt.
TEST1 = chat(GPT_Responses1)
TEST1 = TEST1.content

# Step 2 of the chain: ask the model to mark each listed reason as present (1)
# or absent (0) and answer with JSON only. {OBJECT} receives the step-1
# summary, {CATEGORY} the numbered list of reasons and {EXAMPLE} a sample answer.
template_string2 = """A survey respondent was asked, "Why did you move?" \
"{OBJECT}" \
Please determine how many of the following reasons for moving they provide from this list: \
{CATEGORY} \
Next, provide your answer as a 1 if yes and a 0 if no in JSON format \
Please do not provide any other text beyond the JSON. \
Here's an example of how you should format your response: \
{EXAMPLE}"""

prompt_template2 = ChatPromptTemplate.from_template(template_string2)
# NOTE: a bare expression is only displayed when it is the last statement of
# a cell, so as written this line does not print anything.
prompt_template2.messages[0].prompt #this will show us our prompt template

# Feed the step-1 output (TEST1) into the classification prompt.
GPT_Responses2 = prompt_template2.format_messages(
                    OBJECT=TEST1,
                    CATEGORY=category,
                    EXAMPLE=example_JSON)

# Send the messages to the chat model and keep only the reply text.
TEST2 = chat(GPT_Responses2)
TEST2 = TEST2.content
# Show both stages of the chain for the test response.
print(TEST1)
print(TEST2)

In [None]:
def _run_chain(prompt_template, source_column, target_column, chain_number, **prompt_kwargs):
    """Run one step of the chain over every row of UCNets.

    For each row, the text in ``source_column`` is substituted for ``{OBJECT}``
    in ``prompt_template`` (together with any extra ``prompt_kwargs``), the
    messages are sent to ``chat``, and the reply text is written to
    ``target_column`` of the same row.

    Parameters:
        prompt_template: chat prompt template exposing ``format_messages``.
        source_column: name of the UCNets column holding the text to send.
        target_column: name of the UCNets column that receives the reply.
        chain_number: number shown in the progress message (1 or 2).
        **prompt_kwargs: remaining template variables, e.g. CAVEAT or CATEGORY.
    """
    for i in range(len(UCNets)):
        survey_input = UCNets[source_column][i]
        messages = prompt_template.format_messages(OBJECT=survey_input, **prompt_kwargs)

        response = chat(messages)

        # Assuming a successful attempt means a non-empty response
        if response.content:
            print(f"Successful attempt for row number for chain {chain_number}: {i}")

        # Written row by row, so replies already received stay in the frame
        # even if a later call fails.
        UCNets.at[i, target_column] = response.content


# Chain 1: condense each raw response into its key reasons. The source column
# comes from survey_participant_input rather than a hard-coded 'a19i'.
_run_chain(prompt_template1, survey_participant_input, 'Key_Reasons', 1,
           CAVEAT=caveat_1)

# Chain 2: classify those key reasons against the category list as JSON.
_run_chain(prompt_template2, 'Key_Reasons', 'JSON', 2,
           CATEGORY=category, EXAMPLE=example_JSON)

In [None]:
# Force the model replies to plain strings so the text helpers below can be
# applied to every row.
UCNets['JSON'] = UCNets['JSON'].astype(str)

def remove_non_digits(text):
    """Return ``text`` with every character other than 0-9 removed."""
    non_digit = re.compile(r'[^0-9]')
    return non_digit.sub('', text)

def add_colon_before_comma(text):
    """Insert ':' in front of every character that is directly followed by ','.

    Example: '10,21,30' -> '1:0,2:1,30' (the final pair has no trailing comma,
    so it is left untouched).
    """
    pieces = []
    last_pos = len(text) - 1
    for pos, ch in enumerate(text):
        followed_by_comma = pos < last_pos and text[pos + 1] == ','
        pieces.append(':' + ch if followed_by_comma else ch)
    return ''.join(pieces)

def add_colon_before_end(text):
    """Insert ':"' just before the last three characters of ``text``.

    Strings of three characters or fewer are returned unchanged.
    Example: '{"6"0"}' -> '{"6":"0"}'.
    """
    if len(text) <= 3:
        return text
    head, tail = text[:-3], text[-3:]
    return head + ':"' + tail

# Rebuild a strict JSON string from whatever the model returned, using only
# the digits in the reply. The steps are order-dependent and assume every key
# and every value is a single digit.
# Worked example for a reply with keys 1-6: '{"1": "0", "2": "1", ...}'
# Step 1: keep digits only                     -> '102130405060'
UCNets['JSON_clean'] = UCNets['JSON'].astype(str).apply(remove_non_digits)
# Step 2: keep the last 12 digits = number of categories * 2 (one key digit
# plus one value digit for each of the 6 categories)
UCNets['JSON_clean'] = UCNets['JSON_clean'].apply(lambda x: x[-12:]) #= number of categories * 2
# Step 3: split into key/value pairs            -> '10,21,30,40,50,60'
UCNets['JSON_clean'] = UCNets['JSON_clean'].apply(lambda x: ','.join(x[i:i+2] for i in range(0, len(x), 2)))
# Step 4: put ':' inside every pair that is followed by a comma
#                                               -> '1:0,2:1,3:0,4:0,5:0,60'
UCNets['JSON_clean'] = UCNets['JSON_clean'].apply(add_colon_before_comma)
# Step 5: wrap in braces                        -> '{1:0,2:1,3:0,4:0,5:0,60}'
UCNets['JSON_clean'] = UCNets['JSON_clean'].apply(lambda x: "{" + x + "}")
# Step 6: insert a double quote between every pair of adjacent characters
#                                               -> '{"1":"0",...,"5":"0","6"0"}'
UCNets['JSON_clean'] = UCNets['JSON_clean'].apply(lambda x: '"'.join(x[i:i+1] for i in range(0, len(x))))
# Step 7: repair the last pair, which step 4 left without a colon
#                                               -> '{"1":"0",...,"6":"0"}'
UCNets['JSON_clean'] = UCNets['JSON_clean'].apply(add_colon_before_end)
# NOTE(review): no later cell visible in this notebook reads 'JSON_clean';
# the parsing cell reads the 'JSON' column -- confirm which one is intended.


UCNets.head()

In [None]:
def remove_backticks(text: str) -> str:
    """Return ``text`` with every triple-backtick code fence marker removed."""
    fence = '```'
    without_fences = text.replace(fence, '')
    return without_fences

def remove_json(text: str) -> str:
    """Return ``text`` with every occurrence of the lowercase word 'json' removed."""
    tag = 'json'
    without_tag = text.replace(tag, '')
    return without_tag

# Strip Markdown code-fence markers first, then the "json" language tag, so
# that only the JSON payload is left in the column.
UCNets['JSON'] = UCNets['JSON'].apply(remove_backticks).apply(remove_json)

In [None]:
#UCNets = pd.read_csv('data/a19i_all_COT_gpt4.csv')

normalized_data_list = []
error_lines = []  # index labels of rows whose reply could not be parsed

# Iterate by index label so every parsed row stays tied to the response it
# came from. Previously, failed rows were skipped and the remaining rows were
# renumbered 0..k-1, so a column-wise concat with UCNets attached every code
# after the first failure to the wrong response.
for row_label, json_str in UCNets['JSON'].items():
    try:
        parsed_obj = json.loads(json_str)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers cells that are not strings (e.g. a missing value).
        error_lines.append(row_label)
        continue

    # Each reply must be one non-empty JSON object; anything else (list,
    # bare number, empty object) cannot be turned into a single coded row.
    if not isinstance(parsed_obj, dict) or not parsed_obj:
        error_lines.append(row_label)
        continue

    parsed_row = pd.json_normalize(parsed_obj)
    parsed_row.index = [row_label]
    normalized_data_list.append(parsed_row)

# Concatenate the normalized data into one DataFrame that shares UCNets'
# index; rows that failed to parse are present but hold NaN in every column.
if normalized_data_list:
    normalized_data = pd.concat(normalized_data_list).reindex(UCNets.index)
else:
    # Nothing parsed: pd.concat([]) would raise, so fall back to a frame
    # with the right index and no columns.
    normalized_data = pd.DataFrame(index=UCNets.index)

error_lines

In [None]:
# Attach the parsed category columns to the responses, side by side.
UCNets = pd.concat(objs=[UCNets, normalized_data], axis='columns')
UCNets

In [None]:
# Save the coded responses without the index column. The path is relative,
# so it resolves against the current working directory.
UCNets.to_csv(path_or_buf='data/a19i_all_COT_gpt4.csv', index=False)