In [1]:
import pandas as pd
from tqdm import tqdm
import json
import time
import openai
import re
from transformers import GPT2Tokenizer

In [3]:
# Load the sample dataset from 'sample.csv' (path is relative to the notebook's
# working directory) and preview the first rows.
df = pd.read_csv('sample.csv')
df.head()

Unnamed: 0,id,author_id,in_reply_to_user_id,quoted_user_id,retweeted_user_id,text,public_metrics.like_count,public_metrics.quote_count,public_metrics.reply_count,public_metrics.retweet_count,...,link_text,party,name,Occupation(s),Education,Residence[5],gpt_sentiment,age,gender,bert_sentiment
0,1174499599181959168,262192574,,,,We should never forget the sacrifices made by ...,22,0,0,2,...,MISSISSIPPI (WLOX) - U.S. Senator Cindy Hyde-S...,Republican,Cindy Hyde-Smith,Lobbyist Farmer,Copiah-Lincoln Community College (AA) Universi...,Brookhaven,positive,65,female,positive
1,1320738770543628288,15808765,,,,It's no secret what Donald Trump and Senate Re...,1494,67,264,323,...,President Trump says in an interview with “60 ...,Democrat,Cory Booker,Lawyer,"Stanford University (BA, MA) Queen's College, ...",Newark,negative,55,male,negative
2,998614598424068101,15745368,,,,Some #Florida elections officials didn’t like ...,259,19,105,112,...,Comment on this story Comment Gift Article Sha...,Republican,Marco Rubio,Lawyer,University of Florida (BA) University of Miami...,West Miami[13],neutral,53,male,negative
3,955084959840129024,13218102,,,,"One woman, two marches: Can women who oppose a...",14,7,61,9,...,Rachel Lamb saw as much fate as coincidence in...,Republican,John Cornyn,Lawyer,"Trinity University (BA) St. Mary's University,...",Austin,neutral,72,male,neutral
4,1255486188892422149,1080960924687704064,,,,The coronavirus pandemic continues to highligh...,8,1,1,2,...,WASHINGTON D.C. (KFVS) - U.S. Senators Josh Ha...,Republican,Josh Hawley,Lawyer Professor,Stanford University (BA) Yale University (JD),Ozark,neutral,44,male,negative


In [4]:
# Function to truncate text to fit within a token limit

# Initialize the pretrained GPT-2 tokenizer. It is meant to be passed to
# truncate_prompt (defined just below); no visible cell calls truncate_prompt yet.
# NOTE(review): GPT-2 tokenization may not match the tokenizer of the chat models
# called later in this notebook, so token counts are presumably approximate - confirm.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def truncate_prompt(prompt, max_tokens, tokenizer):
    """Return `prompt` cut down to at most `max_tokens` tokens.

    Args:
        prompt: Text to (possibly) shorten.
        max_tokens: Maximum number of tokens to keep.
        tokenizer: Object exposing `encode(text) -> sequence of tokens` and
            `decode(tokens) -> str`.

    Returns:
        The original `prompt`, unchanged, when it already fits within
        `max_tokens`; otherwise the decoded text of its first `max_tokens`
        tokens.
    """
    tokens = tokenizer.encode(prompt)
    if len(tokens) <= max_tokens:
        # Nothing to cut: hand back the caller's text as-is instead of pushing
        # it through an encode/decode round trip, which is not guaranteed to
        # reproduce the input exactly.
        return prompt
    return tokenizer.decode(tokens[:max_tokens])

In [5]:
# Build the tweet-generation prompt for one senator / article pair.
def make_prompt(name, news_content):
    """Compose the instruction asking the model to tweet as a given senator.

    Args:
        name: Senator name inserted into the role description.
        news_content: Text placed under the "Article:" heading.

    Returns:
        The full prompt string: role sentence, task sentence, a blank line,
        then the article text.
    """
    prompt = (
        # The trailing space keeps the two sentences from running together
        # ("...on Twitter.Read the article...") when the literals are joined.
        f"You are a U.S. senator {name} arguing on Twitter. "
        f"Read the article below and write a tweet with no more than 280 characters to comment on it.\n\n"
        f"Article:\n{news_content}"
    )
    return prompt

In [33]:
# Point the openai client at the public OpenAI REST endpoint.
openai.api_base = "https://api.openai.com/v1/"
# The API key is deliberately not committed to the notebook. With this line left
# commented out, the key is presumably supplied some other way (e.g. an
# environment variable read by the openai package) - confirm before running.
#openai.api_key = "xxx"

In [11]:
# Resumable-run state shared with the generation loop below.
# current_row_index: where the loop should start (or resume) in df.
# all_responses: one list per processed row, appended to by the loop.
current_row_index = 0
all_responses = []

In [34]:
# Generate a GPT-written tweet for every row of df.
#
# Progress is kept in memory in all_responses / current_row_index (initialised in an
# earlier cell), so re-running this cell after an interruption resumes where it
# stopped. Nothing is written to disk here; a later cell saves all_responses to CSV.
max_consecutive_failures = 5  # stop instead of looping forever on a persistent error
retry_wait_seconds = 5        # pause before retrying after a failed request
consecutive_failures = 0

while current_row_index < df.shape[0]:
    # current_row_index is a *position*, so slice with iloc and advance it by counting
    # processed rows; this stays correct even if df's index is not 0..n-1.
    remaining = df.iloc[current_row_index:]
    try:
        if consecutive_failures:
            time.sleep(retry_wait_seconds)
        for _, row in tqdm(remaining.iterrows(), total=remaining.shape[0]):
            # NOTE(review): row['text'] is sent as the "article"; df also has a
            # 'link_text' column - confirm which column is intended here.
            prompt = make_prompt(row['name'], row['text'])
            responses = openai.ChatCompletion.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=100,
                n=1
            )
            response_list = [row['id'], row['text']]
            response_list.extend([choice['message']['content'] for choice in responses['choices']])
            all_responses.append(response_list)
            current_row_index += 1  # move to the next row
            consecutive_failures = 0

    except KeyboardInterrupt:
        print(f"Interrupted at row {current_row_index}. Saving progress...")
        break

    except Exception as e:
        # The try now sits inside the while loop, so an error restarts the for loop
        # from current_row_index, which is what the "Retrying..." message promises.
        consecutive_failures += 1
        if consecutive_failures >= max_consecutive_failures:
            print(f"An error occurred at row {current_row_index}. Error: {str(e)}. "
                  f"Giving up after {consecutive_failures} consecutive failures.")
            break
        print(f"An error occurred at row {current_row_index}. Error: {str(e)}. Retrying...")

print("Completed or interrupted. Final progress saved.")

100%|███████████████████████████████████████| 2131/2131 [34:53<00:00,  1.02it/s]

Completed or interrupted. Final progress saved.





In [35]:
# Sanity check: number of rows collected in all_responses so far.
len(all_responses)

10000

In [36]:
# Persist the collected rows (tweet id, original text, GPT-generated tweet) to
# 'gpt4_name.csv' without the DataFrame index.
pd.DataFrame(all_responses, columns = ['id', 'text', 'gpt4_name']).to_csv('gpt4_name.csv', index=False)

In [39]:
# Reload the generated tweets from 'gpt4_name.csv' and preview the first rows.
df2 = pd.read_csv('gpt4_name.csv')
df2.head()

Unnamed: 0,id,text,gpt4_name
0,1174499599181959168,We should never forget the sacrifices made by ...,We must always honor the immense sacrifices of...
1,1320738770543628288,It's no secret what Donald Trump and Senate Re...,It's crystal clear: Donald Trump and Senate Re...
2,998614598424068101,Some #Florida elections officials didn’t like ...,.@FloridaElections didn't agree with me before...
3,955084959840129024,"One woman, two marches: Can women who oppose a...",Empowering women means respecting diverse view...
4,1255486188892422149,The coronavirus pandemic continues to highligh...,America's food supply is in crisis. The closin...


## Labelling

In [16]:
# Build the few-shot sentiment-classification prompt for a single tweet.
def create_prompt(tweet):
    """Return the classification prompt for `tweet`.

    The prompt is the fixed instruction plus three labelled example tweets
    (Positive, Negative, Neutral), followed by `tweet` and a trailing
    ' Sentiment:' cue for the model to complete.
    """
    few_shot_prefix = (
        'Classify the sentiment of the following tweet as "Positive", "Negative" or "Neutral" like in the examples: '
        'Tweet: "I’m not surprised Alaska ranks at the top for new Army recruits’ preferred duty station. And once they arrive here, it’s our job to make sure they have the support they need both on and off duty" Sentiment: Positive, '
        'Tweet: "This would be deeply ironic if it wasn’t so absolutely infuriating" Sentiment: Negative, '
        'Tweet: "Given the U.S. is on the brink of recession and much of the world is facing catastrophically high energy prices, the completion of the regulatory process for the Alaska LNG Project must be a top priority for @ENERGY" Sentiment: Neutral, '
        'Tweet: '
    )
    return ''.join((few_shot_prefix, tweet, ' Sentiment:'))

In [18]:
# Reset the resumable-run state before the labelling loop below.
# current_row_index: where the loop should start (or resume) in df2.
# all_responses: one list per labelled row, appended to by the loop.
current_row_index = 0
all_responses = []

In [36]:
# Generate a sentiment label for every GPT-written tweet in df2.
#
# Progress is kept in memory in all_responses / current_row_index (initialised in an
# earlier cell), so re-running this cell after an interruption resumes where it
# stopped. Nothing is written to disk here; a later cell saves all_responses to CSV.
max_consecutive_failures = 5  # stop instead of looping forever on a persistent error
retry_wait_seconds = 5        # pause before retrying after a failed request
consecutive_failures = 0

while current_row_index < df2.shape[0]:
    # current_row_index is a *position*, so slice with iloc and advance it by counting
    # processed rows; this stays correct even if df2's index is not 0..n-1.
    remaining = df2.iloc[current_row_index:]
    try:
        if consecutive_failures:
            time.sleep(retry_wait_seconds)
        for _, row in tqdm(remaining.iterrows(), total=remaining.shape[0]):
            # df2 is read from 'gpt4_name.csv', whose generated-tweet column is
            # 'gpt4_name'; the former 'tweet_name4' lookup is not a column of that file.
            generated_tweet = row['gpt4_name']
            prompt = create_prompt(str(generated_tweet))
            responses = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=2,
                n=1
            )
            response_list = [row['id'], row['text'], generated_tweet]
            response_list.extend([choice['message']['content'] for choice in responses['choices']])
            all_responses.append(response_list)
            current_row_index += 1  # move to the next row
            consecutive_failures = 0

    except KeyboardInterrupt:
        print(f"Interrupted at row {current_row_index}. Saving progress...")
        break

    except Exception as e:
        # The try now sits inside the while loop, so an error restarts the for loop
        # from current_row_index, which is what the "Retrying..." message promises.
        consecutive_failures += 1
        if consecutive_failures >= max_consecutive_failures:
            print(f"An error occurred at row {current_row_index}. Error: {str(e)}. "
                  f"Giving up after {consecutive_failures} consecutive failures.")
            break
        print(f"An error occurred at row {current_row_index}. Error: {str(e)}. Retrying...")

print("Completed or interrupted. Final progress saved.")

100%|█████████████████████████████████████████| 397/397 [02:47<00:00,  2.37it/s]

Completed or interrupted. Final progress saved.





In [37]:
# Sanity check: number of labelled rows collected in all_responses so far.
len(all_responses)

10000

In [38]:
# Persist the labelled rows (tweet id, original text, generated tweet, model label)
# to 'tweet_name4.csv' without the DataFrame index.
pd.DataFrame(all_responses, columns = ['id', 'text', 'tweet_name4', 'label']).to_csv('tweet_name4.csv', index=False)