In [1]:
import os
import re
import time
import warnings

import pandas as pd
import requests
from tqdm import tqdm
from transformers import GPT2Tokenizer

In [2]:
# Hugging Face access token.
# The key is read from the environment instead of being pasted into the notebook,
# so it is never saved with the file. Export it before starting Jupyter, e.g.
#   export HF_TOKEN="hf_..."
API_TOKEN = os.environ.get("HF_TOKEN", "").strip()
if not API_TOKEN:
    # Warn instead of raising so the rest of the notebook can still be inspected;
    # requests made without a token are expected to be refused by the API.
    warnings.warn(
        "HF_TOKEN is not set: export it and re-run this cell before calling the API."
    )

# Endpoint of the hosted model and the auth header sent with every request.
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B-Instruct"
headers = {"Authorization": f"Bearer {API_TOKEN}"}

In [3]:
# Initialize the tokenizer
# The generation loop uses this tokenizer only to measure prompt length and to
# truncate over-long prompts before they are sent.
# NOTE(review): requests go to a Llama 3 model (see API_URL), whose own tokenizer
# presumably differs from GPT-2's, so these counts are an approximation - confirm.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [4]:
# Load the source rows; the prompt-building cell reads the "gender", "party" and
# "link_text" columns from this frame.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the CSV
# was written with its index - consider index_col=0 after confirming.
df = pd.read_csv('sample.csv')

Unnamed: 0,id,author_id,in_reply_to_user_id,quoted_user_id,retweeted_user_id,text,public_metrics.like_count,public_metrics.quote_count,public_metrics.reply_count,public_metrics.retweet_count,...,url,link_text,party,name,Occupation(s),Education,Residence[5],sentiment,age,gender
0,1174499599181959168,262192574,,,,We should never forget the sacrifices made by ...,22,0,0,2,...,https://www.wdam.com/2019/09/18/senator-cindy-...,MISSISSIPPI (WLOX) - U.S. Senator Cindy Hyde-S...,Republican,Cindy Hyde-Smith,Lobbyist Farmer,Copiah-Lincoln Community College (AA) Universi...,Brookhaven,Positive,65,female


In [5]:
# Prompt template: persona sentence, task instruction, then the article text.
def make_prompt(gender, party, news_content):
    """Build the instruction prompt for one article.

    Args:
        gender: Gender inserted into the persona sentence.
        party: Party name inserted into the persona sentence.
        news_content: Article text appended after the "Article:" header.

    Returns:
        The full prompt as a single string.
    """
    persona = f"You are a {gender} politician arguing on Twitter and a representative of the {party} party. "
    task = f"Read the article below and write a tweet with no more than 280 characters to comment on it.\n\n"
    article = f"Article:\n{news_content}"
    return persona + task + article

In [6]:
# Build one prompt per DataFrame row, in row order, by walking the three
# relevant columns in parallel.
prompts = [
    make_prompt(gender, party, article)
    for gender, party, article in zip(df["gender"], df["party"], df["link_text"])
]

In [7]:
# Generated texts, in the order the prompts were processed. The generation loop
# appends to this list, so re-running this cell discards everything collected so far.
all_responses = []

In [8]:
# Index into `prompts` of the next prompt to send. The generation loop starts from
# this position and increments it after each stored response, so re-running the
# loop resumes where it stopped; re-running this cell restarts from the first prompt.
current_row_index = 0

In [9]:
# Function to handle the API request with retry logic
def make_request_with_retry(payload, max_retries=5, retry_delay=60, timeout=120):
    """POST ``payload`` to ``API_URL`` and return the decoded JSON body.

    Responses with status 429 (rate limit) or 503 (service unavailable, e.g. the
    model is still loading) are retried with exponential backoff. Any other
    non-200 response fails immediately.

    Args:
        payload: JSON-serialisable request body.
        max_retries: Maximum number of POST attempts.
        retry_delay: Seconds to wait after the first retryable failure; the wait
            doubles after every retry.
        timeout: Seconds ``requests.post`` may wait for the server before giving
            up, so a stalled connection cannot block the notebook forever.

    Returns:
        The parsed JSON of the first response with status 200.

    Raises:
        requests.HTTPError: For 4xx/5xx responses that are not retryable
            (raised by ``response.raise_for_status()``).
        RuntimeError: For an unexpected non-error status, or when every attempt
            was answered with a retryable status.

    Connection errors and timeouts raised by ``requests`` are not caught here.
    """
    # Status codes worth waiting for, with the message shown to the user.
    retryable = {
        429: "Rate limit reached.",
        503: "Service unavailable (the model may still be loading).",
    }
    response = None
    for attempt in range(1, max_retries + 1):
        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
        status = response.status_code
        if status == 200:
            return response.json()
        if status not in retryable:
            response.raise_for_status()
            # raise_for_status() only raises for 4xx/5xx; anything else that is
            # not 200 would otherwise be retried in a tight loop with no delay.
            raise RuntimeError(f"Unexpected status code {status}. Response: {response.text}")
        # Do not sleep after the final attempt: nothing is left to retry.
        if attempt < max_retries:
            print(f"{retryable[status]} Waiting for {retry_delay} seconds before retrying...")
            time.sleep(retry_delay)
            retry_delay *= 2  # Exponential backoff
    last_text = response.text if response is not None else "no request was made"
    raise RuntimeError(f"Failed after {max_retries} attempts. Last response: {last_text}")

In [39]:
# generate AI responses
#
# The loop is resumable: it starts at `current_row_index` and advances that counter
# after each stored response, so re-running this cell after an interruption or an
# error continues with the first prompt that has no response yet.

# Prompts longer than this many tokens (as counted by `tokenizer`) are truncated.
MAX_PROMPT_TOKENS = 8092

try:
    for prompt in tqdm(prompts[current_row_index:]):

        # Cut over-long prompts down to the token budget. The decoded text replaces
        # the prompt so that the exact string sent is also the one stripped below.
        tokens = tokenizer.encode(prompt)
        if len(tokens) > MAX_PROMPT_TOKENS:
            prompt = tokenizer.decode(tokens[:MAX_PROMPT_TOKENS])

        response = make_request_with_retry({"inputs": prompt})

        # Keep only the newly generated text by removing any copy of the prompt.
        result = response[0]['generated_text'].replace(prompt, "")

        all_responses.append(result)
        current_row_index += 1  # move to the next row

except KeyboardInterrupt:
    print(f"Interrupted at row {current_row_index}. Re-run this cell to resume from that row.")

except Exception as e:
    # Nothing is retried automatically here; the collected responses stay in
    # `all_responses` and the cell can simply be run again.
    print(f"An error occurred at row {current_row_index}. Error: {e}. Re-run this cell to resume from that row.")

else:
    print(f"Completed. {len(all_responses)} responses collected in memory (not yet written to disk).")

100%|█████████████████████████████████████| 1882/1882 [1:43:12<00:00,  3.29s/it]

Completed or interrupted. Final progress saved.





In [40]:
# Number of responses collected so far.
len(all_responses)

10000

In [42]:
# Wrap the collected responses in a single-column frame named "llama".
responses_df = pd.DataFrame({"llama": all_responses})

# Write the frame to disk without the positional index.
responses_df.to_csv("llama_gender.csv", index=False)