In [5]:
import pandas as pd
import numpy as np
import json
import os
import re
import time
from langchain_community.llms import Ollama
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [6]:
llm = Ollama(model="llama2:13b",
            temperature=0,
            system="output your response in proper JSON format")


llm.invoke("The first man on the moon was ...")

'\n{\n"answer": "Neil Armstrong"\n}'

In [7]:
os.chdir('/Users/chrissoria/Documents/Research/Categorization_AI_experiments')
current_directory = os.getcwd()
print(current_directory)

/Users/chrissoria/Documents/Research/Categorization_AI_experiments


In [8]:
survey_participant_input = "a19f" #enter column name here
UCNets = pd.read_excel("../UCNets_Classification/Hand_Coding_Surveys/a19fg/a19fg_Master.xlsx", engine='openpyxl', sheet_name="master_a19f")

UCNets = UCNets[['Response']]
UCNets.rename(columns={'Response': 'a19f'}, inplace=True)

UCNets = UCNets[survey_participant_input].dropna().unique()  # Drop NaN values and get unique elements

survey_participant_responses = '; '.join(str(item) for item in UCNets) #what we will feed to the model

UCNets = pd.DataFrame(UCNets, columns=[survey_participant_input])
UCNets[survey_participant_input] = UCNets[survey_participant_input].astype(str).str.lower()
UCNets[survey_participant_input] = UCNets[survey_participant_input].str.strip()
UCNets = UCNets[UCNets[survey_participant_input] != ''].reset_index(drop=True) #trimming all empty rows

UCNets = UCNets.iloc[:400]

UCNets.head()

Unnamed: 0,a19f
0,"volunteering, joining an exercise group, chatt..."
1,.n
2,no new friends
3,have not taken any steps
4,hang out with my existing friends who have a s...


Here, I'm trying to "force" the model to "think" in steps by first A. trying to process the response into its own words and B. having it interact with that object. That is, instead of all steps being given at once, I'm having it think in steps. 

This time, I will have it think in a "chain," where I will have it output a response and then feed that response back to it in a seperate prompt. 

In [15]:
def extract_categories(survey_question, 
                       survey_input,
                       user_model,
                       creativity,
                       categories):
    
    categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(categories))
    cat_num = len(user_categories)
    category_dict = {str(i+1): 0 for i in range(cat_num)}
    example_JSON = json.dumps(category_dict, indent=4)
    print(example_JSON)
    llm = Ollama(model=user_model,
        temperature=creativity,
            system=f"output your response in this desired JSON format where each category number is the key: {example_JSON}")
    
    link1 = []
    extracted_jsons = []
    
    for response in survey_input:
        prompt = f"""Categorize this survey response "{response}" into all of the following ways they took steps to make new friends and select all that apply: \
        {categories_str} \
        Provide your work in the JSON format where the number belonging to each category is the key and a 1 if the category is present and a 0 if it is not present as key values."""
        
        reply = ""
        
        try:
            print(prompt)
            reply = llm.invoke(prompt)
            
            link1.append(reply)
            
        except Exception as e:
            print(f"An error occurred: {e}")
            link1.append(f"Error processing input: {input}")
            
        extracted_json = re.findall(r'(\{.*?\})', reply, re.DOTALL)
        extracted_json = ''.join(extracted_json)

        print(extracted_json)
        if extracted_json:
            cleaned_json = extracted_json.replace('[', '').replace(']', '').replace('\n', '').replace(" ", '').replace("  ", '')
            extracted_jsons.append(cleaned_json)
            print(cleaned_json)
        else:
            error_message = """{"1":"e"}"""
            extracted_jsons.append(error_message)
            print(error_message)
            
    normalized_data_list = []
    error_lines = []
    
    for i, json_str in enumerate(extracted_jsons):
        try:
            # Attempt to parse the JSON string
            parsed_obj = json.loads(json_str)
            # Convert the parsed object to a DataFrame and append
            normalized_data_list.append(pd.json_normalize(parsed_obj))
        except json.JSONDecodeError:
            # Define a default JSON object as a dictionary
            default_json_obj = {"1": "j"}
            # Convert the default object to a DataFrame and append
            normalized_data_list.append(pd.json_normalize(default_json_obj))

    normalized_data = pd.concat(normalized_data_list, ignore_index=True)
    
    categorized_data = pd.DataFrame()
    categorized_data['survey_response'] = survey_input
    categorized_data['link1'] = link1
    categorized_data['json'] = extracted_jsons
    
    categorized_data = pd.concat([categorized_data, normalized_data], axis=1)
    
    return categorized_data

In [18]:
survey_question = "After this last move, what steps, if any, did you take in order to make new friends?"
survey_input = UCNets['a19f']

creativity = .0

user_categories = ["Engaged with local religious institutions such as churches, synagogues, mosques, or other forms of religious communities.",
                   "Frequented local establishments like bars, cafes, shops, or malls to interact with individuals present in the vicinity.",
                   "Direct involvement in secular volunteering efforts, contributing through action and service rather than mere membership in volunteer groups.",
                   "Utilizing digital platforms such as online chats, internet networking websites, or dating apps to establish connections and friendships.",
                   "Engaged in informal, non-professional interactions and outings with colleagues to foster friendships.",
                   "Involvement in sports, exercise, or outdoor recreational activities through gyms, teams, or athletic clubs.",
                   "Participated in organizations or taking recreational classes related to arts, books, music, theater, crafts, or similar cultural and hobby pursuits.",
                   "Initiating contact with neighbors, participating in neighborhood or local community groups.",
                   "Engaged in a broad array of in-person events or groups that do not fit into any of the other categories. These could include parties, ethnic activities, political canvassing, festivals, school activities, or senior centers."]

user_model = 'llama2:13b'

In [None]:
bad = extract_categories(survey_question, 
                            survey_input, 
                            user_model,
                            creativity,
                            user_categories)

bad.to_csv('data/a19f_bad_categorization_5_cats_llama.csv')

{
    "1": 0,
    "2": 0,
    "3": 0,
    "4": 0,
    "5": 0,
    "6": 0,
    "7": 0,
    "8": 0,
    "9": 0
}
Categorize this survey response "volunteering, joining an exercise group, chatting up other dog owners." into all of the following ways they took steps to make new friends and select all that apply:         1. Engaged with local religious institutions such as churches, synagogues, mosques, or other forms of religious communities.
2. Frequented local establishments like bars, cafes, shops, or malls to interact with individuals present in the vicinity.
3. Direct involvement in secular volunteering efforts, contributing through action and service rather than mere membership in volunteer groups.
4. Utilizing digital platforms such as online chats, internet networking websites, or dating apps to establish connections and friendships.
5. Engaged in informal, non-professional interactions and outings with colleagues to foster friendships.
6. Involvement in sports, exercise, or outdoo

{
"1": 0,
"2": 0,
"3": 0,
"4": 0,
"5": 1,
"6": 0,
"7": 0,
"8": 0,
"9": 1
}
{"1":0,"2":0,"3":0,"4":0,"5":1,"6":0,"7":0,"8":0,"9":1}
Categorize this survey response "college friends." into all of the following ways they took steps to make new friends and select all that apply:         1. Engaged with local religious institutions such as churches, synagogues, mosques, or other forms of religious communities.
2. Frequented local establishments like bars, cafes, shops, or malls to interact with individuals present in the vicinity.
3. Direct involvement in secular volunteering efforts, contributing through action and service rather than mere membership in volunteer groups.
4. Utilizing digital platforms such as online chats, internet networking websites, or dating apps to establish connections and friendships.
5. Engaged in informal, non-professional interactions and outings with colleagues to foster friendships.
6. Involvement in sports, exercise, or outdoor recreational activities through 

{
"1": 0,
"2": 0,
"3": 0,
"4": 0,
"5": 0,
"6": 0,
"7": 0,
"8": 0,
"9": 1
}
{"1":0,"2":0,"3":0,"4":0,"5":0,"6":0,"7":0,"8":0,"9":1}
Categorize this survey response "okcupid" into all of the following ways they took steps to make new friends and select all that apply:         1. Engaged with local religious institutions such as churches, synagogues, mosques, or other forms of religious communities.
2. Frequented local establishments like bars, cafes, shops, or malls to interact with individuals present in the vicinity.
3. Direct involvement in secular volunteering efforts, contributing through action and service rather than mere membership in volunteer groups.
4. Utilizing digital platforms such as online chats, internet networking websites, or dating apps to establish connections and friendships.
5. Engaged in informal, non-professional interactions and outings with colleagues to foster friendships.
6. Involvement in sports, exercise, or outdoor recreational activities through gyms, tea

{
"1": 0,
"2": 0,
"3": 1,
"4": 0,
"5": 0,
"6": 0,
"7": 1,
"8": 0,
"9": 1
}
{"1":0,"2":0,"3":1,"4":0,"5":0,"6":0,"7":1,"8":0,"9":1}
Categorize this survey response "nothing" into all of the following ways they took steps to make new friends and select all that apply:         1. Engaged with local religious institutions such as churches, synagogues, mosques, or other forms of religious communities.
2. Frequented local establishments like bars, cafes, shops, or malls to interact with individuals present in the vicinity.
3. Direct involvement in secular volunteering efforts, contributing through action and service rather than mere membership in volunteer groups.
4. Utilizing digital platforms such as online chats, internet networking websites, or dating apps to establish connections and friendships.
5. Engaged in informal, non-professional interactions and outings with colleagues to foster friendships.
6. Involvement in sports, exercise, or outdoor recreational activities through gyms, tea

{
"1": 0,
"2": 0,
"3": 1,
"4": 0,
"5": 1,
"6": 0,
"7": 0,
"8": 0,
"9": 1
}
{"1":0,"2":0,"3":1,"4":0,"5":1,"6":0,"7":0,"8":0,"9":1}
Categorize this survey response "not a whole lot, didn't do anything out of the ordinary to make new friends" into all of the following ways they took steps to make new friends and select all that apply:         1. Engaged with local religious institutions such as churches, synagogues, mosques, or other forms of religious communities.
2. Frequented local establishments like bars, cafes, shops, or malls to interact with individuals present in the vicinity.
3. Direct involvement in secular volunteering efforts, contributing through action and service rather than mere membership in volunteer groups.
4. Utilizing digital platforms such as online chats, internet networking websites, or dating apps to establish connections and friendships.
5. Engaged in informal, non-professional interactions and outings with colleagues to foster friendships.
6. Involvement in sp

{
"1": 0,
"2": 1,
"3": 1,
"4": 1,
"5": 1,
"6": 1,
"7": 1,
"8": 0,
"9": 1
}
{"1":0,"2":1,"3":1,"4":1,"5":1,"6":1,"7":1,"8":0,"9":1}
Categorize this survey response "getting to know new neighbors" into all of the following ways they took steps to make new friends and select all that apply:         1. Engaged with local religious institutions such as churches, synagogues, mosques, or other forms of religious communities.
2. Frequented local establishments like bars, cafes, shops, or malls to interact with individuals present in the vicinity.
3. Direct involvement in secular volunteering efforts, contributing through action and service rather than mere membership in volunteer groups.
4. Utilizing digital platforms such as online chats, internet networking websites, or dating apps to establish connections and friendships.
5. Engaged in informal, non-professional interactions and outings with colleagues to foster friendships.
6. Involvement in sports, exercise, or outdoor recreational activi

{
"1": 0,
"2": 0,
"3": 0,
"4": 0,
"5": 0,
"6": 0,
"7": 0,
"8": 0,
"9": 0
}{
"1": 0,
"2": 0,
"3": 0,
"4": 0,
"5": 0,
"6": 0,
"7": 0,
"8": 0,
"9": 1
}
{"1":0,"2":0,"3":0,"4":0,"5":0,"6":0,"7":0,"8":0,"9":0}{"1":0,"2":0,"3":0,"4":0,"5":0,"6":0,"7":0,"8":0,"9":1}
Categorize this survey response "getting involved in local synagogue, reach out to people i know that are local, hang out at the coffee shop, become involved in my son's school." into all of the following ways they took steps to make new friends and select all that apply:         1. Engaged with local religious institutions such as churches, synagogues, mosques, or other forms of religious communities.
2. Frequented local establishments like bars, cafes, shops, or malls to interact with individuals present in the vicinity.
3. Direct involvement in secular volunteering efforts, contributing through action and service rather than mere membership in volunteer groups.
4. Utilizing digital platforms such as online chats, internet netwo

{
"1": 0,
"2": 0,
"3": 0,
"4": 0,
"5": 0,
"6": 0,
"7": 0,
"8": 0,
"9": 0
}{
"1": 0,
"2": 0,
"3": 0,
"4": 0,
"5": 0,
"6": 0,
"7": 0,
"8": 0,
"9": 0
}
{"1":0,"2":0,"3":0,"4":0,"5":0,"6":0,"7":0,"8":0,"9":0}{"1":0,"2":0,"3":0,"4":0,"5":0,"6":0,"7":0,"8":0,"9":0}
Categorize this survey response "none. not necessary. moved close by. still have the same friends." into all of the following ways they took steps to make new friends and select all that apply:         1. Engaged with local religious institutions such as churches, synagogues, mosques, or other forms of religious communities.
2. Frequented local establishments like bars, cafes, shops, or malls to interact with individuals present in the vicinity.
3. Direct involvement in secular volunteering efforts, contributing through action and service rather than mere membership in volunteer groups.
4. Utilizing digital platforms such as online chats, internet networking websites, or dating apps to establish connections and friendships.
5. Eng

In [None]:
print(len(bad))

In [None]:
def extract_categories_improved(survey_question, 
                       survey_input,
                       user_model,
                       creativity,
                       categories):
    
    categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(categories))
    cat_num = len(user_categories)
    category_dict = {str(i+1): "0" for i in range(cat_num)}
    example_JSON = json.dumps(category_dict, indent=4)
    print(example_JSON)
    llm = Ollama(model=user_model,
        temperature=creativity,
            system=f"""You are an expert in identifying themes and patterns in open-ended survey responses to the question, "{survey_question}". \
                      When given a survey response, you analyze it critically and thoroughly to identify user-provided categories present in the response. \
                      Output your response in this desired format: {example_JSON}""")
    
    link1 = []
    extracted_jsons = []
    
    for response in survey_input:
        prompt = f"""A survey respondent was asked, "{survey_question}". \
        Their response is here in triple backticks: ```{response}```. \
        Select all of the following numbered categories present in the response and form your response in proper JSON format: \
        The number belonging to the category should be be the key and a 1 is the key value if the category is present. \
        If none of the categories are present in their response, provide 0's for all key values in your JSON. \
        Numbered categories: "{categories_str}"."""
        print(prompt)
        try:
            reply = llm.invoke(prompt)
            
            link1.append(reply)
            
        except Exception as e:
            print(f"An error occurred: {e}")
            link1.append(f"Error processing input: {input}")
            
        extracted_json = re.findall(r'(\{.*?\})', reply, re.DOTALL)
        extracted_json = ''.join(extracted_json)

        print(extracted_json)
        if extracted_json:
            cleaned_json = extracted_json.replace('[', '').replace(']', '').replace('\n', '').replace(" ", '').replace("  ", '')
            extracted_jsons.append(cleaned_json)
            print(cleaned_json)
        else:
            error_message = """{"1":"e"}"""
            extracted_jsons.append(error_message)
            print(error_message)
            
    normalized_data_list = []
    error_lines = []
    
    for i, json_str in enumerate(extracted_jsons):
        try:
            # Attempt to parse the JSON string
            parsed_obj = json.loads(json_str)
            # Convert the parsed object to a DataFrame and append
            normalized_data_list.append(pd.json_normalize(parsed_obj))
        except json.JSONDecodeError:
            # Define a default JSON object as a dictionary
            default_json_obj = {"1": "j"}
            # Convert the default object to a DataFrame and append
            normalized_data_list.append(pd.json_normalize(default_json_obj))

    normalized_data = pd.concat(normalized_data_list, ignore_index=True)
    
    categorized_data = pd.DataFrame()
    categorized_data['survey_response'] = survey_input
    categorized_data['link1'] = link1
    categorized_data['json'] = extracted_jsons
    
    categorized_data = pd.concat([categorized_data, normalized_data], axis=1)
    
    return categorized_data

In [None]:
good = extract_categories_improved(survey_question, 
                            survey_input, 
                            user_model,
                            creativity,
                            user_categories)

good.to_csv('data/a19f_good_categorization_5_cats_llama.csv')

In [None]:
def extract_categories_cot(survey_question, 
                       survey_input,
                       user_model,
                       creativity,
                       categories):
    
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    
    categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(categories))
    cat_num = len(user_categories)
    category_dict = {str(i+1): "0" for i in range(cat_num)}
    example_JSON = json.dumps(category_dict, indent=4)
    
    link1 = []
    extracted_jsons = []
    
    for response in survey_input:
        prompt = f"""A survey respondent was asked, "{survey_question}" \
        Their response is here in triple backticks: ```{response}```. \
        First, thoruoughly extract all their answers to the question and be as specific as possible. \
        Second, take these reasons and select all of the following numbered categories they fall into: \
        "{categories_str}" \
        Third, form your response in proper JSON format. \
        The number belonging to the category shoulbe be the key and a 1 is the key value if the category is present. \
        If none of the categories are present in their response, provide 0's for all key values in your JSON."""
        print(prompt)
        try:
            response = client.chat.completions.create(
                model=user_model,
                messages=[
                    {
                      "role": "system",
                      "content": f"""You are an expert in identifying themes and patterns in open-ended survey responses to the question, "{survey_question}". \
                      When given a survey response, you analyze it critically and thoroughly to identify user-provided categories present in the response."""
                    },
                    {'role': 'user', 
                     'content': prompt}
                ],
                temperature=creativity
            )

            reply = response.choices[0].message.content
            link1.append(reply)
            
        except Exception as e:
            print(f"An error occurred: {e}")
            link1.append(f"Error processing input: {input}")
            
        extracted_json = re.findall(r'```json\n(\{.*?\})\n```', reply, re.DOTALL)
            
        if extracted_json:
            cleaned_json = extracted_json[0].replace('[', '').replace(']', '').replace('\n', '').replace(" ", '').replace("  ", '')
            extracted_jsons.append(cleaned_json)
            print(cleaned_json)
        else:
            error_message = """{"1":"e"}"""
            extracted_jsons.append(error_message)
            print(error_message)
            
    normalized_data_list = []
    error_lines = []
    
    for i, json_str in enumerate(extracted_jsons):
        try:
            parsed_obj = json.loads(json_str)
            normalized_data_list.append(pd.json_normalize(parsed_obj))
        except json.JSONDecodeError:
            normalized_data_list.append("""{"1":"e"}""")
            continue

    normalized_data = pd.concat(normalized_data_list, ignore_index=True)
    
    categorized_data = pd.DataFrame()
    categorized_data['survey_response'] = survey_input
    categorized_data['link1'] = link1
    categorized_data['json'] = extracted_jsons
    
    categorized_data = pd.concat([categorized_data, normalized_data], axis=1)
    
    return categorized_data

In [None]:
cot = extract_categories_cot(survey_question, 
                            survey_input, 
                            user_model,
                            creativity,
                            user_categories)

cot.to_csv('data/a19f_cot_categorization_5_cats.csv')

In [None]:
def extract_categories_cove(survey_question, 
                       survey_input,
                       user_model,
                       creativity,
                       categories):
    
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    
    categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(categories))
    cat_num = len(categories)
    category_dict = {str(i+1): "0" for i in range(cat_num)}
    example_JSON = json.dumps(category_dict, indent=4)
    
    link1 = []
    link2 = []
    extracted_jsons = []
    
    for response in survey_input:
        prompt = f"""Categorize this survey response "{response}" into all of the following reasons for moving and select all that apply: \
        {categories_str} \
        Provide your work in JSON format where the number belonging to each category is the key and a 1 if the category is present and a 0 if it is not present as key values."""
        try:
            api_response = client.chat.completions.create(
                model=user_model,
                messages=[
                    {'role': 'user', 
                     'content': prompt}
                ],
                temperature=creativity
            )
            reply = api_response.choices[0].message.content
            print(reply)
            link1.append(reply)

            prompt2 = f"""Thank you for categorizing this survey response, "{response}". \
            Can you double check if there are any categories you might've missed or marked as being present incorrectly? \
            Here are the categories once again: {categories_str} \
            If there are any changes, please output a corrected JSON with the new categorization. \
            If there are no changes, please output the original JSON."""
            print(prompt2)
            

            api_response2 = client.chat.completions.create(
                model=user_model,
                messages=[
                    {"role": "system", "content": f"""You just categorized an answer to the question, "{survey_question}". You are revisiting your initial categorizations for accuracy. Here's what you initially identified: {reply}"""}, 
                    {'role':'assistant', 'content': reply},
                    {'role': 'user', 'content': prompt2}
                ],
                temperature=.25,
            )

            reply2 = api_response2.choices[0].message.content
            link2.append(reply2)
            
        extracted_json = re.findall(r'```json\n(\{.*?\})\n```', reply, re.DOTALL)
            
        if extracted_json:
            cleaned_json = extracted_json[0].replace('[', '').replace(']', '').replace('\n', '').replace(" ", '').replace("  ", '')
            extracted_jsons.append(cleaned_json)
            print(cleaned_json)
        else:
            error_message = """{"1":"e"}"""
            extracted_jsons.append(error_message)
            print(error_message)
                
        except Exception as e:
            print(f"An error occurred: {e}")
            link1.append(f"Error processing input: {survey_input}")
            link2.append(f"Error processing response: {reply}")
            
    normalized_data_list = []
    error_lines = []
    
    for i, json_str in enumerate(extracted_jsons):
        try:
            parsed_obj = json.loads(json_str)
            normalized_data_list.append(pd.json_normalize(parsed_obj))
        except json.JSONDecodeError:
            normalized_data_list.append("""{"1":"e"}""")
            continue

    normalized_data = pd.concat(normalized_data_list, ignore_index=True)
    
    categorized_data = pd.DataFrame()
    categorized_data['survey_response'] = survey_input
    categorized_data['link1'] = link1
    categorized_data['link2'] = link2
    categorized_data['json'] = extracted_jsons
    
    categorized_data = pd.concat([categorized_data, normalized_data], axis=1)
    
    return categorized_data

In [None]:
creativity = .0

cove = extract_categories_cove(survey_question, 
                            survey_input, 
                            user_model,
                            creativity,
                            user_categories)

cove.to_csv('data/a19f_cove_categorization_5_cats.csv')

In [None]:
def extract_categories_1s(survey_question, 
                       survey_input,
                       user_model,
                       creativity,
                       categories):
    
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    
    categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(categories))
    cat_num = len(user_categories)
    category_dict = {str(i+1): "0" for i in range(cat_num)}
    example_categorization = """{"1":"0","2":"0","3":"0","4":"0","5":"1","6":"0","7":"1","8":"0","9":"1"}"""
    #1. going out with coworkers, 2. board games (cultural-hobby), 3. activities in the city (broader participation)
    example_response = "going out with coworkers, setting up events with friends and having them invite anyone they wanted, participating in game nights (board games) which tend to have different attendees each night, activities in the city."
    link1 = []
    extracted_jsons = []
    
    for response in survey_input:
        prompt = f"""Categorize this survey response "{response}" into all of the following reasons for moving and select all that apply: \
        {categories_str} \
        Provide your work in JSON format where the number belonging to each category is the key and a 1 if the category is present and a 0 if it is not present as key values. \
        Here's an example of a correct categorization. \
        Example survey response: {example_response}. \
        Example categorization: {example_categorization}."""
        try:
            response = client.chat.completions.create(
                model=user_model,
                messages=[
                    {'role': 'user', 'content': prompt}
                ],
                temperature=creativity
            )

            reply = response.choices[0].message.content
            link1.append(reply)
            
        except Exception as e:
            print(f"An error occurred: {e}")
            link1.append(f"Error processing input: {input}")
           
        extracted_json = re.findall(r'```json\n(\{.*?\})\n```', reply, re.DOTALL)
            
        if extracted_json:
            cleaned_json = extracted_json[0].replace('[', '').replace(']', '').replace('\n', '').replace(" ", '').replace("  ", '')
            extracted_jsons.append(cleaned_json)
            print(cleaned_json)
        else:
            error_message = """{"1":"e"}"""
            extracted_jsons.append(error_message)
            print(error_message)
            
    normalized_data_list = []
    error_lines = []
    
    for i, json_str in enumerate(extracted_jsons):
        try:
            parsed_obj = json.loads(json_str)
            normalized_data_list.append(pd.json_normalize(parsed_obj))
        except json.JSONDecodeError:
            normalized_data_list.append("""{"1":"e"}""")
            continue

    normalized_data = pd.concat(normalized_data_list, ignore_index=True)
    
    categorized_data = pd.DataFrame()
    categorized_data['survey_response'] = survey_input
    categorized_data['link1'] = link1
    categorized_data['json'] = extracted_jsons
    
    categorized_data = pd.concat([categorized_data, normalized_data], axis=1)
    
    return categorized_data

In [None]:
oneshot = extract_categories_1s(survey_question, 
                            survey_input, 
                            user_model,
                            creativity,
                            user_categories)

oneshot.to_csv('data/a19f_1s_categorization_5_cats.csv')

In [None]:
def extract_categories_fs(survey_question, 
                       survey_input,
                       user_model,
                       creativity,
                       categories):
    
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    
    categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(categories))
    cat_num = len(user_categories)
    category_dict = {str(i+1): "0" for i in range(cat_num)}
    example_response = "the rent was increasing; they wanted to renew the rent $400 extra, a month to re-lease. made sense regarding my career. and i wanted a backyard for my dog."
    example_categorization = """{"1":"0","2":"1","3":"0","4":"1","5":"0"}"""
    example_response2 = "lease ended at my old apartment and i wanted to move back to my parents house to pay off more of my student loans"
    example_categorization2 = """{"1":"0","2":"0","3":"0","4":"1","5":"1"}"""
    example_response3 = "there was a fire in building where i previously lived; all tenants displaced, we had to find other housing. after the fire i stayed 3 days with a friend, then 2 months in a hotel, then began living in my current apartment in same city. move was not by choice, was circumstantial."
    example_categorization3 = """{"1":"0","2":"0","3":"0","4":"0","5":"0"}"""
    
    link1 = []
    extracted_jsons = []
    
    for response in survey_input:
        prompt = f"""Categorize this survey response "{response}" into all of the following reasons for moving and select all that apply: \
        {categories_str} \
        Provide your work in JSON format where the number belonging to each category is the key and a 1 if the category is present and a 0 if it is not present as key values. \
        Here are three examples of a correct categorization. \
        Example survey response 1: {example_response}. \
        Example categorization 1: {example_categorization}. \
        Example survey response 2: {example_response2}. \
        Example categorization 2: {example_categorization2}. \
        Example survey response 3: {example_response3}. \
        Example categorization 3: {example_categorization3}."""
        try:
            response = client.chat.completions.create(
                model=user_model,
                messages=[
                    {'role': 'user', 'content': prompt}
                ],
                temperature=creativity
            )

            reply = response.choices[0].message.content
            link1.append(reply)
            
        except Exception as e:
            print(f"An error occurred: {e}")
            link1.append(f"Error processing input: {input}")
            
        extracted_json = re.findall(r'```json\n(\{.*?\})\n```', reply, re.DOTALL)
            
        if extracted_json:
            cleaned_json = extracted_json[0].replace('[', '').replace(']', '').replace('\n', '').replace(" ", '').replace("  ", '')
            extracted_jsons.append(cleaned_json)
            print(cleaned_json)
        else:
            error_message = """{"1":"e"}"""
            extracted_jsons.append(error_message)
            print(error_message)
            
    normalized_data_list = []
    error_lines = []
    
    for i, json_str in enumerate(extracted_jsons):
        try:
            parsed_obj = json.loads(json_str)
            normalized_data_list.append(pd.json_normalize(parsed_obj))
        except json.JSONDecodeError:
            normalized_data_list.append("""{"1":"e"}""")
            continue

    normalized_data = pd.concat(normalized_data_list, ignore_index=True)
    
    categorized_data = pd.DataFrame()
    categorized_data['survey_response'] = survey_input
    categorized_data['link1'] = link1
    categorized_data['json'] = extracted_jsons
    
    categorized_data = pd.concat([categorized_data, normalized_data], axis=1)
    
    return categorized_data

In [None]:
fewshot = extract_categories_fs(survey_question, 
                            survey_input, 
                            user_model,
                            creativity,
                            user_categories)

fewshot.to_csv('data/a19i_fs_categorization_5_cats.csv')