In [1]:
from openai import OpenAI
import pandas as pd
import numpy as np
import json
import os
import re
import time
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.prompts.prompt import PromptTemplate #where we change the AI "personality"
from dotenv import load_dotenv, find_dotenv

In [2]:
# Load variables from a local .env file into the process environment, then
# build the notebook-level OpenAI client from the key stored there.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [3]:
# Locate the nearest .env file and load it; the boolean result is discarded.
env_file = find_dotenv()
_ = load_dotenv(env_file)

# Model identifiers used in these experiments.
COMPLETIONS_MODEL = "text-davinci-002"
BETTER_COMPLETIONS_MODEL = "text-davinci-003"  # preferred of the two completion models for this work
LONG_MODEL = "gpt-3.5-turbo-16k"
REGULAR_MODEL = "gpt-3.5-turbo"
GPT_4 = "gpt-4-1106-preview"

In [4]:
# Project root that relative paths (e.g. the 'data/' output folder) resolve against.
# Set the CATEGORIZATION_PROJECT_DIR environment variable to run this notebook on
# another machine; the default is the original author's local checkout.
PROJECT_DIR = os.environ.get(
    "CATEGORIZATION_PROJECT_DIR",
    '/Users/chrissoria/Documents/Research/Categorization_AI_experiments',
)
os.chdir(PROJECT_DIR)
current_directory = os.getcwd()
print(current_directory)

/Users/chrissoria/Documents/Research/Categorization_AI_experiments


In [5]:
survey_participant_input = "a19i"  # enter column name here

# Read only the selected column from the JOINT_DATA sheet.
raw_column = pd.read_excel(
    "/Users/chrissoria/Documents/Research/UCNets_Classification/data/Raw_Cond_for_Coding_all_waves.xlsx",
    engine='openpyxl',
    sheet_name="JOINT_DATA",
    usecols=[survey_participant_input],
)[survey_participant_input]

# Distinct non-missing answers, in first-seen order.
unique_responses = raw_column.dropna().unique()

# All distinct answers joined into one string (what we will feed to the model).
survey_participant_responses = '; '.join(str(item) for item in unique_responses)

# Normalise text (string type, lower case, trimmed), drop rows left empty,
# and keep the first 200 answers.
cleaned_responses = (
    pd.Series(unique_responses, name=survey_participant_input)
    .astype(str)
    .str.lower()
    .str.strip()
)
UCNets = (
    cleaned_responses[cleaned_responses != '']
    .reset_index(drop=True)
    .to_frame()
    .iloc[:200]
)

UCNets.head()

Unnamed: 0,a19i
0,relocated back to east coast - closer to my sons
1,move in together with my partner
2,"out of living with my friends, and into living..."
3,to take a new job in new york city (both becau...
4,wanted to live in my own place outside my pare...


Here, I'm trying to "force" the model to "think" in steps: (A) first restating the response in its own words, and then (B) working with that restated version. That is, instead of giving it all of the steps at once, I'm having it work through them one at a time.

This time, I will have it think in a "chain": it outputs a response, and I then feed that response back to it in a separate prompt.

In [6]:
def _ask_model(client, model, prompt, temperature):
    """Send one user message to the chat-completions endpoint and return the reply text."""
    completion = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
        temperature=temperature,
    )
    return completion.choices[0].message.content


def _extract_json_text(reply):
    """Return the first JSON object in ``reply`` with brackets and whitespace removed, or None."""
    if not reply:
        return None
    fenced = re.findall(r'```json\n(\{.*?\})\n```', reply, re.DOTALL)
    if fenced:
        candidate = fenced[0]
    else:
        # Fall back to a bare {...} object when the model omits the code fence.
        bare = re.search(r'\{.*?\}', reply, re.DOTALL)
        if bare is None:
            return None
        candidate = bare.group(0)
    # Strip list brackets and all whitespace so the text is a flat, compact object.
    for unwanted in ('[', ']', '\n', ' '):
        candidate = candidate.replace(unwanted, '')
    return candidate


def _parse_category_json(json_text):
    """Parse ``json_text`` into a dict; return an empty dict when it is missing or invalid."""
    if json_text is None:
        return {}
    try:
        parsed = json.loads(json_text)
    except json.JSONDecodeError:
        return {}
    return parsed if isinstance(parsed, dict) else {}


def extract_categories(survey_question, 
                       survey_input,
                       user_model,
                       creativity,
                       *categories):
    """Classify open-ended survey responses into categories with a two-prompt chain.

    For every response, the model is first asked to list the specific reasons
    it contains (prompt 1, sampled at temperature ``creativity``). That list
    is then fed back in a second prompt (temperature 0) asking which of the
    numbered categories apply, answered as a JSON object keyed "1", "2", ...

    Parameters
    ----------
    survey_question : str
        The question the respondents were asked.
    survey_input : iterable of str
        The responses to classify (e.g. a pandas Series or a list).
    user_model : str
        Chat model name passed to the OpenAI client.
    creativity : float
        Temperature for the first prompt.
    *categories : str
        Category descriptions, either as separate arguments or as a single
        list/tuple (which is unpacked).

    Returns
    -------
    pandas.DataFrame
        One row per response, in input order, with columns
        ``survey_response``, ``chain1`` (first reply), ``chain2`` (second
        reply), ``json`` (the extracted JSON text, or None) and one column per
        key of the parsed JSON. Rows whose API call failed or whose JSON could
        not be extracted/parsed keep their position and get NaN in the
        category columns.

    Notes
    -----
    API errors are caught and printed so a single failure does not abort the
    run. Each extracted JSON string is printed as it is produced.
    """
    # Accept both f(..., *cats) and f(..., cats): a lone list/tuple is the category list.
    if len(categories) == 1 and isinstance(categories[0], (list, tuple)):
        categories = tuple(categories[0])

    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(categories))
    category_dict = {str(i + 1): "0" for i in range(len(categories))}
    example_JSON = json.dumps(category_dict, indent=4)

    # Materialise once so every output list lines up with the inputs by position.
    responses = list(survey_input)

    chain1 = []
    chain2 = []
    extracted_jsons = []
    parsed_rows = []

    for answer in responses:
        prompt = f"""A survey respondent was asked, "{survey_question}" \
        Their response is here in triple backticks: ```{answer}```. \
        First, identify every specific answer to the question, "{survey_question}" and filter out everything else. \
        Second, give me all of the reasons you were able to extract in a numbered list."""

        try:
            reply = _ask_model(client, user_model, prompt, creativity)
        except Exception as e:
            print(f"An error occurred: {e}")
            failure_note = f"Error processing input: {answer}"
            chain1.append(failure_note)
            # Nothing was extracted, so there is nothing to classify: skip prompt 2.
            chain2.append(f"Error processing response: {failure_note}")
            extracted_jsons.append(None)
            parsed_rows.append({})
            continue

        chain1.append(reply)

        prompt2 = f"""A survey respondent was asked, "{survey_question}" \
        They provided these reasons in triple backticks: ```{reply}```. \
        First, select all of the following categories these reasons fall into: \
        "{categories_str}" \
        Second, form your response into JSON format like this example: {example_JSON}."""

        try:
            reply2 = _ask_model(client, user_model, prompt2, 0)
        except Exception as e:
            print(f"An error occurred in the second prompt: {e}")
            chain2.append(f"Error processing response: {reply}")
            extracted_jsons.append(None)
            parsed_rows.append({})
            continue

        chain2.append(reply2)

        json_text = _extract_json_text(reply2)
        print(json_text)
        extracted_jsons.append(json_text)
        parsed_rows.append(_parse_category_json(json_text))

    # One record per response (empty dict -> NaN row) keeps columns aligned with inputs.
    normalized_data = pd.json_normalize(parsed_rows)

    categorized_data = pd.DataFrame({
        'survey_response': responses,
        'chain1': chain1,
        'chain2': chain2,
        'json': extracted_jsons,
    })

    return pd.concat([categorized_data, normalized_data], axis=1)

In [7]:
survey_question = "Why did you move?"
# Use the column name configured when the data was loaded, so changing
# `survey_participant_input` does not leave a stale hard-coded column here.
survey_input = UCNets[survey_participant_input]

# Categories offered to the model, in the order they are numbered in the prompt.
user_categories = ["to start living with or to stay with partner/spouse",
                   "relationship change (divorce, breakup, etc)",
                   "the respondent had a job or school or career change, including transferred and retired",
                   "the respondent's partner's job or school or career change, including transferred and retired",
                   "financial reasons (rent is too expensive, pay raise, etc)",
                   "related specifically features of the home, such as a bigger or smaller yard"]

user_model = 'gpt-4-1106-preview'

First, I want to find out whether adjusting "creativity" makes a difference in how accurate the output is.

In [12]:
creativity = 0

# extract_categories collects categories through *args, so the list must be
# unpacked; passing the list itself would make the whole list a single category.
creative_00 = extract_categories(survey_question, 
                            survey_input, 
                            user_model,
                            creativity,
                            *user_categories)

creative_00.to_csv('data/a19i_00_creative_6_cats.csv')

{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"1","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"1"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"1","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"0"}


{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"1"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"1","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"1","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"1"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}


In [8]:
creativity = .25

# extract_categories collects categories through *args, so the list must be
# unpacked; passing the list itself would make the whole list a single category.
creative_25 = extract_categories(survey_question, 
                            survey_input, 
                            user_model,
                            creativity,
                            *user_categories)

creative_25.to_csv('data/a19i_25_creative_6_cats.csv')

{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"1","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"1"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"1","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"0"}


{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"1"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"1","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"1","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"1"}
{"1":"0","2":"0","3":"0","4":"1","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}


In [9]:
creativity = .5

# extract_categories collects categories through *args, so the list must be
# unpacked; passing the list itself would make the whole list a single category.
creative_50 = extract_categories(survey_question, 
                            survey_input, 
                            user_model,
                            creativity,
                            *user_categories)

creative_50.to_csv('data/a19i_50_creative_6_cats.csv')

{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"1","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"1"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"1","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"0"}


{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"1"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"1","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"1","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"1"}
{"1":"0","2":"0","3":"0","4":"1","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"1"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}


In [10]:
creativity = .75

# extract_categories collects categories through *args, so the list must be
# unpacked; passing the list itself would make the whole list a single category.
creative_75 = extract_categories(survey_question, 
                            survey_input, 
                            user_model,
                            creativity,
                            *user_categories)

creative_75.to_csv('data/a19i_75_creative_6_cats.csv')

{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"1","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"0"}


{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"1","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"1","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"1"}
{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"1","2":"0","3":"0","4":"0","5":"0","6":"0"}
{"1":"0","2":"0","3":"1","4":"0","5":"0","6":"0"}


In [None]:
# The six candidate categories for classifying "why did you move?" answers.
# The trailing number on each line is the item's 1-based position in the list.
user_categories = [
    "to start living with or to stay with partner/spouse",  # 1
    "relationship change (divorce, breakup, etc)",  # 2
    "the respondent had a job or school or career change, including transferred and retired",  # 3
    "the respondent's partner's job or school or career change, including transferred and retired",  # 4
    "financial reasons (rent is too expensive, pay raise, etc)",  # 5
    "related specifically features of the home, such as a bigger or smaller yard",  # 6
]
