In [10]:
import pandas as pd

# Load the Excel file. header=None: the sheet is read without a header row,
# so columns are addressed purely by position below.
df = pd.read_excel('bp_fp_favorite.xlsx', header=None)

column_index = 4

# Sampling configuration. A fixed seed makes this cell reproducible: re-running
# it draws the same rows, so 'sampled_data.xlsx' stays consistent with any
# results that were computed from an earlier run.
SAMPLE_SEED = 42
MAX_EMPTY_SAMPLES = 400
MAX_NON_EMPTY_SAMPLES = 100

# Compute the "is empty" mask once and split the frame with it.
is_empty = df.iloc[:, column_index].isna()
empty_rows = df[is_empty]
non_empty_rows = df[~is_empty]

# Sample up to MAX_EMPTY_SAMPLES rows where the specified column is empty
# (capped at the number of rows actually available).
empty_samples = empty_rows.sample(
    n=min(MAX_EMPTY_SAMPLES, len(empty_rows)),
    random_state=SAMPLE_SEED,
)

# Sample up to MAX_NON_EMPTY_SAMPLES rows where the specified column is not empty.
non_empty_samples = non_empty_rows.sample(
    n=min(MAX_NON_EMPTY_SAMPLES, len(non_empty_rows)),
    random_state=SAMPLE_SEED,
)

# Combine the samples (empty-column rows first) with a fresh 0..n-1 index.
combined_samples = pd.concat([empty_samples, non_empty_samples], ignore_index=True)

# Save the combined samples to a new Excel file for the next step.
combined_samples.to_excel('sampled_data.xlsx', index=False)


In [11]:
import pandas as pd

# Read back the sampled rows. No `header` argument is passed, so read_excel
# applies its default header handling to this sheet.
df = pd.read_excel('sampled_data.xlsx')

column_index_1 = 4  # category
column_index_2 = 6  # posts

# Build one (category, post) tuple per row, in row order.
category_values = df.iloc[:, column_index_1]
post_values = df.iloc[:, column_index_2]
pairs = [(category, post) for category, post in zip(category_values, post_values)]

# Debugging aid (disabled): uncomment to dump every pair.
# for pair in pairs:
#     print(pair)


In [12]:
import json

file_name = "acryonym_experiment_3.json"

def save_progress_and_responses(last_processed_index, responses):
    """Write the checkpoint (last processed index + responses) to `file_name`.

    The data is first written to a sibling temporary file and then moved over
    the real checkpoint with os.replace. Opening the checkpoint directly in
    'w' mode would truncate it before the new content is written, so an
    interrupt or a serialisation error mid-write would destroy the previously
    saved progress.

    Args:
        last_processed_index: index of the last item whose response is stored.
        responses: JSON-serialisable list of responses collected so far.

    Raises:
        TypeError: if the data cannot be serialised to JSON (the existing
            checkpoint file is left untouched in that case).
        OSError: if the file cannot be written or replaced.
    """
    import os  # local import so this definition does not depend on other cells

    data = {
        'last_processed_index': last_processed_index,
        'responses': responses
    }
    tmp_name = file_name + ".tmp"
    try:
        with open(tmp_name, 'w') as file:
            json.dump(data, file)
        # Swap the fully written file into place in a single step.
        os.replace(tmp_name, file_name)
    finally:
        # After a successful replace the temp file no longer exists; this only
        # cleans up the partial file left behind by a failed write.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)


def load_progress_and_responses():
    """Read the checkpoint stored in `file_name`.

    Returns:
        A `(last_processed_index, responses)` tuple taken from the JSON file,
        or `(-1, [])` when the file does not exist yet, so that a caller
        resuming at `last_processed_index + 1` starts from index 0.
    """
    try:
        handle = open(file_name, 'r')
    except FileNotFoundError:
        # No progress file, start from the beginning
        return -1, []

    with handle:
        saved = json.load(handle)
    return saved['last_processed_index'], saved['responses']

    
import os

import openai
import pandas as pd

# Tunables for the classification run.
MODEL_NAME = "gpt-4-1106-preview"
MAX_POST_CHARS = 50000  # posts are truncated to this many characters before sending
SAVE_EVERY = 10         # write a checkpoint after every SAVE_EVERY posts

SYSTEM_PROMPT = "You are an Adoption Agent who specializes in reviewing and understanding online posts relating to adoptions. The authors occasionally use acronyms or codes; here is a list of relevant ones for reference: AA: African American, Bmom or BM or **; Birth Mom, CC or CA: Caucasian, DD: Dear Daughter, DS: Dear Son, PAP: Prospective Adoptive Parent, SN: Special Needs.  Given the post, only respond yes or no to each of the first three questions:  1) is the author a birth mother, 2) has the birth mother actually given up a child for adoption, 3) did the birth mother give a reason why she chose the adoption family? Finally, if the answer to the first three questions were all yes, then respond concisely to 4) what are the main and specific reasons the adoption family was chosen, if any, choose all that apply: Salary, Age, Occupation, Race, Religion, Order of Presentation, Timing, Siblings, etc, or N/A. Example response: 1) yes 2) yes 3) yes 4) Age: older than self"

# Read the key from the environment instead of hard-coding a secret in the
# notebook. When it is None the OpenAI client falls back to its own lookup of
# OPENAI_API_KEY and raises if no key is configured.
api_key = os.environ.get("OPENAI_API_KEY")
client = openai.OpenAI(api_key=api_key)


# Load the last processed index and existing responses
last_processed_index, responses = load_progress_and_responses()
last_saved_index = last_processed_index

try:
    # Loop through each user post starting from the last processed index
    for i in range(last_processed_index + 1, len(pairs)):
        post = pairs[i][1]
        # An empty spreadsheet cell arrives as NaN (a float), which supports
        # neither len() nor slicing; treat it as an empty post. Any other
        # non-string value is converted to its text form.
        if not isinstance(post, str):
            post = "" if pd.isna(post) else str(post)

        print(f"Processing post: {i + 1}/{len(pairs)}")
        print(len(post))

        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {
                    "role": "system",
                    "content": SYSTEM_PROMPT
                },
                {
                    "role": "user",
                    "content": post[:MAX_POST_CHARS]
                }
            ]
        )
        response_text = response.choices[0].message.content
        responses.append(response_text)
        last_processed_index = i

        # Save progress and responses every SAVE_EVERY posts
        if (i + 1) % SAVE_EVERY == 0:
            save_progress_and_responses(i, responses)
            last_saved_index = i
finally:
    # Persist whatever has been collected since the last periodic checkpoint:
    # the trailing posts when the total is not a multiple of SAVE_EVERY, and
    # everything gathered before an API error or a manual interrupt.
    if last_processed_index != last_saved_index:
        save_progress_and_responses(last_processed_index, responses)

Processing post: 1/500
651
Processing post: 2/500
137
Processing post: 3/500
1073
Processing post: 4/500
314
Processing post: 5/500
458
Processing post: 6/500
1057
Processing post: 7/500
1834
Processing post: 8/500
114
Processing post: 9/500
7843
Processing post: 10/500
147
Processing post: 11/500
1015
Processing post: 12/500
211
Processing post: 13/500
1280
Processing post: 14/500
8190
Processing post: 15/500
27
Processing post: 16/500
8191
Processing post: 17/500
1476
Processing post: 18/500
8191
Processing post: 19/500
1588
Processing post: 20/500
2869
Processing post: 21/500
5454
Processing post: 22/500
329
Processing post: 23/500
257
Processing post: 24/500
8190
Processing post: 25/500
174
Processing post: 26/500
252
Processing post: 27/500
1905
Processing post: 28/500
8191
Processing post: 29/500
3446
Processing post: 30/500
1517
Processing post: 31/500
34
Processing post: 32/500
810
Processing post: 33/500
2994
Processing post: 34/500
757
Processing post: 35/500
8191
Processing 

Processing post: 282/500
468
Processing post: 283/500
230
Processing post: 284/500
253
Processing post: 285/500
8190
Processing post: 286/500
394
Processing post: 287/500
1115
Processing post: 288/500
1629
Processing post: 289/500
4309
Processing post: 290/500
218
Processing post: 291/500
163
Processing post: 292/500
8191
Processing post: 293/500
1708
Processing post: 294/500
2444
Processing post: 295/500
3639
Processing post: 296/500
617
Processing post: 297/500
3303
Processing post: 298/500
8190
Processing post: 299/500
1723
Processing post: 300/500
141
Processing post: 301/500
3344
Processing post: 302/500
8191
Processing post: 303/500
1227
Processing post: 304/500
543
Processing post: 305/500
1713
Processing post: 306/500
5086
Processing post: 307/500
3359
Processing post: 308/500
8191
Processing post: 309/500
2795
Processing post: 310/500
2737
Processing post: 311/500
2833
Processing post: 312/500
1533
Processing post: 313/500
8189
Processing post: 314/500
871
Processing post: 315

In [13]:
# for r in responses:
#     print(r)

In [14]:
import pandas as pd

# Inputs from the earlier steps:
#   pairs     - list of (category, post) tuples, one per sampled row
#   responses - list of model answers, one per processed post

# Every post needs exactly one response before the two can be lined up.
assert len(pairs) == len(responses), "The lists must have the same length"

# Split the pairs back into their two columns.
post_column = [post for _category, post in pairs]
category_column = [category for category, _post in pairs]

# One row per post: the text, its original category, and the model's answer.
df_combined = pd.DataFrame({
    'Post': post_column,
    'Original category': category_column,
    'Acryontm catefory': responses
})

# Write the combined table to a new Excel file (without the index column).
df_combined.to_excel('acryonym_results_3.xlsx', index=False)
