In [1]:
import openai
import pandas as pd
import numpy as np
import json
import os
import re
import time
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.prompts.prompt import PromptTemplate #where we change the AI "personality"
from dotenv import load_dotenv, find_dotenv

In [2]:
# Load variables from a local .env file into the process environment, then
# read the OpenAI key from it (API_KEY is None when the variable is not set).
load_dotenv()
API_KEY = os.getenv("OPENAI_API_KEY")

In [3]:
_ = load_dotenv(find_dotenv())  # read local .env file

# Model identifiers available to this notebook.
COMPLETIONS_MODEL = "text-davinci-002"
BETTER_COMPLETIONS_MODEL = "text-davinci-003"  # for my purposes, this is better
LONG_MODEL = "gpt-3.5-turbo-16k"
REGULAR_MODEL = "gpt-3.5-turbo"
GPT_4 = "gpt-4"

# Below, we give the AI a "personality". The {history} and {input}
# placeholders are left unfilled here.
template = """The following is a conversation between a human data scientist and an AI who specializes in data categorization. The AI is direct and provides concise responses. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
{history}
Data Scientist: {input}
AI:"""

# Chat client used by every request in this notebook; temperature 0.0.
# Swap `model_name` for one of the constants above depending on how big the task is.
chat = ChatOpenAI(
    model_name=REGULAR_MODEL,
    temperature=0.0,
    openai_api_key=API_KEY,
    verbose=True,
)

In [4]:
# Work from the project folder so that relative paths used later in the
# notebook (e.g. 'data/...') resolve against it, then echo where we ended up.
PROJECT_DIR = '/Users/chrissoria/Documents/Research/openai_categorizations'
os.chdir(PROJECT_DIR)
current_directory = os.getcwd()
print(current_directory)

/Users/chrissoria/Documents/Research/openai_categorizations


In [5]:
survey_participant_input = "a19i" #enter column name here

RAW_DATA_PATH = "/Users/chrissoria/Documents/Research/UCNets_Classification/data/Raw_Cond_for_Coding_all_waves.xlsx"
MAX_RESPONSES = 200  # cap on how many responses are sent to the model

# Read only the selected survey column from the JOINT_DATA sheet.
raw_responses = pd.read_excel(RAW_DATA_PATH, engine='openpyxl', sheet_name="JOINT_DATA",
                              usecols=[survey_participant_input])
unique_raw = raw_responses[survey_participant_input].dropna().unique()  # Drop NaN values and get unique elements

survey_participant_responses = '; '.join(str(item) for item in unique_raw) #what we will feed to the model

# Normalise the text (string type, lower-case, trimmed) and drop empty answers.
cleaned = (
    pd.Series(unique_raw, name=survey_participant_input)
    .astype(str)
    .str.lower()
    .str.strip()
)
cleaned = cleaned[cleaned != '']

# De-duplicate again AFTER normalising: answers that differed only in case or
# surrounding whitespace are distinct in `unique_raw` but identical here, and
# would otherwise be sent to the model more than once.
cleaned = cleaned.drop_duplicates()

# One row per distinct response, indexed 0..n-1, limited to the first MAX_RESPONSES.
UCNets = cleaned.to_frame().reset_index(drop=True).iloc[:MAX_RESPONSES]

UCNets.head()

Unnamed: 0,a19i
0,relocated back to east coast - closer to my sons
1,move in together with my partner
2,"out of living with my friends, and into living..."
3,to take a new job in new york city (both becau...
4,wanted to live in my own place outside my pare...


Here, I'm trying to "force" the model to "think" in steps: (A) first it restates the respondent's answer in its own words (a summary), and (B) then it classifies that summary against the categories. In other words, instead of asking for everything at once, I'm having it work through the task step by step.

In [24]:
# Pick one response to preview the fully formatted prompt. The column comes
# from `survey_participant_input` (set when the data was loaded) rather than a
# hard-coded name, and the row is clamped so a shorter frame does not fail.
sample_row = min(191, len(UCNets) - 1)
survey_input = UCNets[survey_participant_input][sample_row]

# Numbered categories the model scores each response against.
# NOTE(review): item 5 has no trailing backslash, so the rendered prompt has a
# newline before item 6 while the other items are joined by spaces. Left as is
# to keep the prompt text unchanged -- confirm whether that is intended.
category = """1. to start living with or to stay with partner/spouse. \
2. relationship change (divorce, breakup, etc). \
3. the respondent had a job/school/career change, including transferred. \
4. the respondent's partner's job/school/career change, including transferred. \
5. financial reasons (rent is too expensive, pay raise, etc).
6. Reasons related to specific housing features and preferences, different housing conditions"""

# Example of the reply format shown to the model.
# NOTE(review): key "4" uses a bare number (1) while every other key uses a
# quoted string. Left as is to keep the prompt text unchanged -- confirm.
example_JSON = """{ \
"summary": "your summary", \
"1": "0", \
"2": "1", \
"3": "0", \
"4": 1, \
"5": "0", \
"6": "0"
}"""

# Prompt with three slots: the response text, the category list and the
# example reply. It asks for a summary first, then the 0/1 coding as JSON.
template_string = """A survey respondent was asked, "Why did you move?" \
They responded with: "{OBJECT}" \
First, succinctly summarize the respondent's answer to pull out the major reasons why they moved. \
Second, please determine how many of the following reasons for moving are in your summary: \
{CATEGORY} \
Next, provide your answer as a 1 if yes and a 0 if no in JSON format \
Here's an example of how you should format your response: \
{EXAMPLE}"""

prompt_template = ChatPromptTemplate.from_template(template_string)
# A bare expression in the middle of a cell is not displayed, so print it.
print(prompt_template.messages[0].prompt) #this will show us our prompt template

GPT_Responses = prompt_template.format_messages(
                    OBJECT=survey_input,
                    CATEGORY=category,
                    EXAMPLE=example_JSON)

print(GPT_Responses[0])

content='A survey respondent was asked, "Why did you move?" They responded with: "we moved from marin county, california to wilmington, delaware in 2016. we rented for about one year (april 2016-august 2017) while searching for a house.  we purchased a house in july 2017." First, succinctly summarize the respondent\'s answer to pull out the major reasons why they moved. Second, please determine how many of the following reasons for moving are in your summary: 1. to start living with or to stay with partner/spouse. 2. relationship change (divorce, breakup, etc). 3. the respondent had a job/school/career change, including transferred. 4. the respondent\'s partner\'s job/school/career change, including transferred. 5. financial reasons (rent is too expensive, pay raise, etc).\n6. Reasons related to specific housing features and preferences, different housing conditions Next, provide your answer as a 1 if yes and a 0 if no in JSON format Here\'s an example of how you should format your res

In [25]:
# Single-request check: send the formatted sample prompt (GPT_Responses) to
# the chat model and print the text of its reply.
TEST = chat(GPT_Responses)
print(TEST.content)

{
"summary": "The respondent moved from Marin County, California to Wilmington, Delaware in 2016. They rented for about one year while searching for a house and purchased a house in July 2017.",
"1": "0",
"2": "0",
"3": "0",
"4": "0",
"5": "0",
"6": "1"
}


In [26]:
# Send every response in UCNets to the model, one request per row, and store
# the raw reply text in the 'JSON' column. Replies are written row by row, so
# rows finished before an interruption are kept.
for i in range(len(UCNets)):
    # Use the configured column instead of a hard-coded name so that changing
    # `survey_participant_input` is enough to run on another question.
    survey_input = UCNets[survey_participant_input][i]
    messages = prompt_template.format_messages(
                    OBJECT=survey_input,
                    CATEGORY=category,
                    EXAMPLE=example_JSON)

    response = chat(messages)

    # Assuming a successful attempt means a non-empty response
    if response.content:
        print(f"Successful attempt for row number: {i}")
    else:
        # Make empty replies visible; they are still stored and will fail to
        # parse as JSON later.
        print(f"Empty response for row number: {i}")

    UCNets.at[i, 'JSON'] = response.content

Successful attempt for row number: 0
Successful attempt for row number: 1
Successful attempt for row number: 2
Successful attempt for row number: 3
Successful attempt for row number: 4
Successful attempt for row number: 5
Successful attempt for row number: 6
Successful attempt for row number: 7
Successful attempt for row number: 8
Successful attempt for row number: 9
Successful attempt for row number: 10
Successful attempt for row number: 11
Successful attempt for row number: 12
Successful attempt for row number: 13
Successful attempt for row number: 14
Successful attempt for row number: 15
Successful attempt for row number: 16
Successful attempt for row number: 17
Successful attempt for row number: 18
Successful attempt for row number: 19
Successful attempt for row number: 20
Successful attempt for row number: 21
Successful attempt for row number: 22
Successful attempt for row number: 23
Successful attempt for row number: 24
Successful attempt for row number: 25
Successful attempt for

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).


Successful attempt for row number: 75
Successful attempt for row number: 76
Successful attempt for row number: 77
Successful attempt for row number: 78
Successful attempt for row number: 79
Successful attempt for row number: 80
Successful attempt for row number: 81
Successful attempt for row number: 82
Successful attempt for row number: 83
Successful attempt for row number: 84
Successful attempt for row number: 85
Successful attempt for row number: 86
Successful attempt for row number: 87
Successful attempt for row number: 88
Successful attempt for row number: 89
Successful attempt for row number: 90
Successful attempt for row number: 91
Successful attempt for row number: 92
Successful attempt for row number: 93
Successful attempt for row number: 94
Successful attempt for row number: 95
Successful attempt for row number: 96
Successful attempt for row number: 97
Successful attempt for row number: 98
Successful attempt for row number: 99
Successful attempt for row number: 100
Successful 

In [27]:
# Parse the raw model replies in UCNets['JSON'] into one row of columns each.
#   normalized_data : parsed fields, indexed like UCNets (failed rows are NaN)
#   error_lines     : positions of the replies that could not be parsed
normalized_data_list = []
error_lines = []

for i, (row_label, json_str) in enumerate(UCNets['JSON'].items()):
    try:
        parsed_obj = json.loads(json_str)
    except (json.JSONDecodeError, TypeError):
        # JSONDecodeError: the reply is not valid JSON.
        # TypeError: the cell holds no string at all (e.g. a missing reply).
        error_lines.append(i)
        continue

    if not isinstance(parsed_obj, dict):
        # Valid JSON but not an object (list, number, ...): nothing to spread
        # into named columns, so treat it as a failed row.
        error_lines.append(i)
        continue

    row_frame = pd.json_normalize(parsed_obj)
    # Keep the row's own label so the parsed values stay attached to the
    # response they came from, even when other rows failed to parse.
    row_frame.index = [row_label]
    normalized_data_list.append(row_frame)

# Concatenate the normalized data into one DataFrame
if normalized_data_list:
    normalized_data = pd.concat(normalized_data_list)
else:
    # pd.concat raises on an empty list; fall back to an empty frame.
    normalized_data = pd.DataFrame()

# Give failed rows an explicit all-NaN entry so the frame lines up with UCNets
# row for row.
normalized_data = normalized_data.reindex(UCNets.index)

error_lines

[]

In [28]:
# Attach the parsed fields to the responses, side by side. Columns left over
# from a previous run of this cell are dropped first, so re-running the cell
# replaces them instead of appending duplicates.
UCNets = pd.concat(
    [UCNets.drop(columns=normalized_data.columns, errors='ignore'), normalized_data],
    axis=1,
)
UCNets

Unnamed: 0,a19i,JSON,summary,1,2,3,4,5,6
0,relocated back to east coast - closer to my sons,"{\n""summary"": ""The respondent moved to be clos...",The respondent moved to be closer to their son...,0,0,0,0,0,0
1,move in together with my partner,"{\n""summary"": ""move in together with my partne...",move in together with my partner,1,0,0,0,0,0
2,"out of living with my friends, and into living...","{\n""summary"": ""The respondent moved out of liv...",The respondent moved out of living with friend...,1,0,0,0,0,0
3,to take a new job in new york city (both becau...,"{\n""summary"": ""The respondent moved to take a ...",The respondent moved to take a new job in New ...,0,0,1,0,0,0
4,wanted to live in my own place outside my pare...,"{\n""summary"": ""The respondent moved to live in...",The respondent moved to live in their own plac...,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
195,opportunity came up,"{""summary"": ""opportunity came up"", ""1"": ""0"", ""...",opportunity came up,0,0,1,0,0,0
196,we preferred the environment/climate in anothe...,"{""summary"": ""preferred environment/climate in ...",preferred environment/climate in another area,0,0,0,0,0,0
197,we wanted to live in a location we'd like more.,"{\n""summary"": ""The respondent moved to live in...",The respondent moved to live in a location the...,0,0,0,0,0,1
198,downsizing/retiring,"{\n""summary"": ""downsizing/retiring"",\n""1"": ""0""...",downsizing/retiring,0,0,0,0,1,0


In [29]:
# Save the coded responses. The file name follows the configured survey column
# and the output folder is created when it does not exist yet.
output_path = f'data/{survey_participant_input}_all_categories_summarize1.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
UCNets.to_csv(output_path, index=False)

In [30]:
survey_participant_input = "a19i" #enter column name here

RAW_DATA_PATH = "/Users/chrissoria/Documents/Research/UCNets_Classification/data/Raw_Cond_for_Coding_all_waves.xlsx"
MAX_RESPONSES = 200  # cap on how many responses are sent to the model

# Reload the responses from the workbook so this run starts from a frame that
# holds only the response column.
raw_responses = pd.read_excel(RAW_DATA_PATH, engine='openpyxl', sheet_name="JOINT_DATA",
                              usecols=[survey_participant_input])
unique_raw = raw_responses[survey_participant_input].dropna().unique()  # Drop NaN values and get unique elements

survey_participant_responses = '; '.join(str(item) for item in unique_raw) #what we will feed to the model

# Normalise the text (string type, lower-case, trimmed) and drop empty answers.
cleaned = (
    pd.Series(unique_raw, name=survey_participant_input)
    .astype(str)
    .str.lower()
    .str.strip()
)
cleaned = cleaned[cleaned != '']

# De-duplicate again AFTER normalising: answers that differed only in case or
# surrounding whitespace are distinct in `unique_raw` but identical here, and
# would otherwise be sent to the model more than once.
cleaned = cleaned.drop_duplicates()

# One row per distinct response, indexed 0..n-1, limited to the first MAX_RESPONSES.
UCNets = cleaned.to_frame().reset_index(drop=True).iloc[:MAX_RESPONSES]

UCNets.head()

Unnamed: 0,a19i
0,relocated back to east coast - closer to my sons
1,move in together with my partner
2,"out of living with my friends, and into living..."
3,to take a new job in new york city (both becau...
4,wanted to live in my own place outside my pare...


In [31]:
# Pick one response to preview the fully formatted prompt. The column comes
# from `survey_participant_input` (set when the data was loaded) rather than a
# hard-coded name, and the row is clamped so a shorter frame does not fail.
sample_row = min(191, len(UCNets) - 1)
survey_input = UCNets[survey_participant_input][sample_row]

# Numbered categories the model scores each response against.
# NOTE(review): item 5 has no trailing backslash, so the rendered prompt has a
# newline before item 6 while the other items are joined by spaces. Left as is
# to keep the prompt text unchanged -- confirm whether that is intended.
category = """1. to start living with or to stay with partner/spouse. \
2. relationship change (divorce, breakup, etc). \
3. the respondent had a job/school/career change, including transferred. \
4. the respondent's partner's job/school/career change, including transferred. \
5. financial reasons (rent is too expensive, pay raise, etc).
6. Reasons related to specific housing features and preferences, different housing conditions"""

# Example of the reply format shown to the model; this variant asks for a
# "reasons" field.
# NOTE(review): key "4" uses a bare number (1) while every other key uses a
# quoted string. Left as is to keep the prompt text unchanged -- confirm.
example_JSON = """{ \
"reasons": "your analysis", \
"1": "0", \
"2": "1", \
"3": "0", \
"4": 1, \
"5": "0", \
"6": "0"
}"""

# Prompt with three slots: the response text, the category list and the
# example reply. This variant asks the model to pull out the major reasons
# directly rather than summarising first.
# NOTE(review): the second step still says "in your summary" although this
# variant never asks for a summary -- confirm whether that wording is intended.
template_string = """A survey respondent was asked, "Why did you move?" \
They responded with: "{OBJECT}" \
First, pull out the major reasons why they moved. \
Second, please determine how many of the following reasons for moving are in your summary: \
{CATEGORY} \
Next, provide your answer as a 1 if yes and a 0 if no in JSON format \
Here's an example of how you should format your response: \
{EXAMPLE}"""

prompt_template = ChatPromptTemplate.from_template(template_string)
# A bare expression in the middle of a cell is not displayed, so print it.
print(prompt_template.messages[0].prompt) #this will show us our prompt template

GPT_Responses = prompt_template.format_messages(
                    OBJECT=survey_input,
                    CATEGORY=category,
                    EXAMPLE=example_JSON)

print(GPT_Responses[0])

content='A survey respondent was asked, "Why did you move?" They responded with: "we moved from marin county, california to wilmington, delaware in 2016. we rented for about one year (april 2016-august 2017) while searching for a house.  we purchased a house in july 2017." First, pull out the major reasons why they moved. Second, please determine how many of the following reasons for moving are in your summary: 1. to start living with or to stay with partner/spouse. 2. relationship change (divorce, breakup, etc). 3. the respondent had a job/school/career change, including transferred. 4. the respondent\'s partner\'s job/school/career change, including transferred. 5. financial reasons (rent is too expensive, pay raise, etc).\n6. Reasons related to specific housing features and preferences, different housing conditions Next, provide your answer as a 1 if yes and a 0 if no in JSON format Here\'s an example of how you should format your response: { "reasons": "your analysis", "1": "0", "2

In [32]:
# Single-request check: send the formatted sample prompt (GPT_Responses) to
# the chat model and print the text of its reply.
TEST = chat(GPT_Responses)
print(TEST.content)

{
  "reasons": "The major reasons for moving are to search for a house and to purchase a house.",
  "1": "0",
  "2": "0",
  "3": "0",
  "4": "0",
  "5": "0",
  "6": "1"
}


In [33]:
# Send every response in UCNets to the model, one request per row, and store
# the raw reply text in the 'JSON' column. Replies are written row by row, so
# rows finished before an interruption are kept.
for i in range(len(UCNets)):
    # Use the configured column instead of a hard-coded name so that changing
    # `survey_participant_input` is enough to run on another question.
    survey_input = UCNets[survey_participant_input][i]
    messages = prompt_template.format_messages(
                    OBJECT=survey_input,
                    CATEGORY=category,
                    EXAMPLE=example_JSON)

    response = chat(messages)

    # Assuming a successful attempt means a non-empty response
    if response.content:
        print(f"Successful attempt for row number: {i}")
    else:
        # Make empty replies visible; they are still stored and will fail to
        # parse as JSON later.
        print(f"Empty response for row number: {i}")

    UCNets.at[i, 'JSON'] = response.content

Successful attempt for row number: 0
Successful attempt for row number: 1
Successful attempt for row number: 2
Successful attempt for row number: 3
Successful attempt for row number: 4
Successful attempt for row number: 5
Successful attempt for row number: 6
Successful attempt for row number: 7
Successful attempt for row number: 8
Successful attempt for row number: 9
Successful attempt for row number: 10
Successful attempt for row number: 11
Successful attempt for row number: 12
Successful attempt for row number: 13
Successful attempt for row number: 14
Successful attempt for row number: 15
Successful attempt for row number: 16
Successful attempt for row number: 17
Successful attempt for row number: 18
Successful attempt for row number: 19
Successful attempt for row number: 20
Successful attempt for row number: 21
Successful attempt for row number: 22
Successful attempt for row number: 23
Successful attempt for row number: 24
Successful attempt for row number: 25
Successful attempt for

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..


Successful attempt for row number: 28
Successful attempt for row number: 29
Successful attempt for row number: 30
Successful attempt for row number: 31
Successful attempt for row number: 32
Successful attempt for row number: 33
Successful attempt for row number: 34
Successful attempt for row number: 35
Successful attempt for row number: 36
Successful attempt for row number: 37
Successful attempt for row number: 38


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).


Successful attempt for row number: 39
Successful attempt for row number: 40
Successful attempt for row number: 41
Successful attempt for row number: 42
Successful attempt for row number: 43
Successful attempt for row number: 44
Successful attempt for row number: 45
Successful attempt for row number: 46
Successful attempt for row number: 47
Successful attempt for row number: 48
Successful attempt for row number: 49
Successful attempt for row number: 50
Successful attempt for row number: 51
Successful attempt for row number: 52
Successful attempt for row number: 53
Successful attempt for row number: 54
Successful attempt for row number: 55
Successful attempt for row number: 56
Successful attempt for row number: 57
Successful attempt for row number: 58
Successful attempt for row number: 59
Successful attempt for row number: 60
Successful attempt for row number: 61
Successful attempt for row number: 62
Successful attempt for row number: 63
Successful attempt for row number: 64
Successful a

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).


Successful attempt for row number: 111
Successful attempt for row number: 112
Successful attempt for row number: 113
Successful attempt for row number: 114
Successful attempt for row number: 115
Successful attempt for row number: 116
Successful attempt for row number: 117
Successful attempt for row number: 118
Successful attempt for row number: 119
Successful attempt for row number: 120
Successful attempt for row number: 121
Successful attempt for row number: 122
Successful attempt for row number: 123
Successful attempt for row number: 124
Successful attempt for row number: 125
Successful attempt for row number: 126
Successful attempt for row number: 127
Successful attempt for row number: 128
Successful attempt for row number: 129
Successful attempt for row number: 130
Successful attempt for row number: 131
Successful attempt for row number: 132
Successful attempt for row number: 133
Successful attempt for row number: 134
Successful attempt for row number: 135
Successful attempt for ro

In [34]:
# Parse the raw model replies in UCNets['JSON'] into one row of columns each.
#   normalized_data : parsed fields, indexed like UCNets (failed rows are NaN)
#   error_lines     : positions of the replies that could not be parsed
normalized_data_list = []
error_lines = []

for i, (row_label, json_str) in enumerate(UCNets['JSON'].items()):
    try:
        parsed_obj = json.loads(json_str)
    except (json.JSONDecodeError, TypeError):
        # JSONDecodeError: the reply is not valid JSON.
        # TypeError: the cell holds no string at all (e.g. a missing reply).
        error_lines.append(i)
        continue

    if not isinstance(parsed_obj, dict):
        # Valid JSON but not an object (list, number, ...): nothing to spread
        # into named columns, so treat it as a failed row.
        error_lines.append(i)
        continue

    row_frame = pd.json_normalize(parsed_obj)
    # Keep the row's own label so the parsed values stay attached to the
    # response they came from, even when other rows failed to parse.
    row_frame.index = [row_label]
    normalized_data_list.append(row_frame)

# Concatenate the normalized data into one DataFrame
if normalized_data_list:
    normalized_data = pd.concat(normalized_data_list)
else:
    # pd.concat raises on an empty list; fall back to an empty frame.
    normalized_data = pd.DataFrame()

# Give failed rows an explicit all-NaN entry so the frame lines up with UCNets
# row for row.
normalized_data = normalized_data.reindex(UCNets.index)

error_lines

[]

In [35]:
# Save the coded responses for this prompt variant. The parsed fields in
# `normalized_data` are never attached to UCNets in this run, so they are
# joined here; otherwise the file would hold only the response text and the
# raw reply. UCNets itself is left unchanged, so the cell can be re-run safely.
output_path = f'data/{survey_participant_input}_all_categories_summarize2.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
coded_responses = pd.concat(
    [UCNets.drop(columns=normalized_data.columns, errors='ignore'), normalized_data],
    axis=1,
)
coded_responses.to_csv(output_path, index=False)