# Prepare personas and data for validation

In [1]:
import pandas as pd

## Read the tweet files and process the data.

In [103]:
import json

def read_tweets_file(username):
    file_path = f'./roberto_tweets/{username}.json'
    
    # try:
    # Open and load the JSON file
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    
    # Extract user tweets and referenced tweets
    user_tweets = data.get('user_tweets', [])
    referenced_tweets = data.get('referenced_tweets', {})
    
    # print(f"Loaded {len(user_tweets)} user tweets and {len(referenced_tweets)} referenced tweets.")
    return user_tweets, referenced_tweets

    # except FileNotFoundError:
    #     print(f"The file {file_path} does not exist.")
    #     return [], {}
    # except json.JSONDecodeError:
    #     print(f"The file {file_path} is empty or contains invalid JSON.")
    #     return [], {}

# Example usage
username = "0x_miss_di"  # Replace with the actual username
user_tweets, referenced_tweets = read_tweets_file(username)

# Do something with the loaded tweets
print(user_tweets)
print(referenced_tweets)

[{'id': '1866190989234491830', 'from_userid': '1372614663003725824', 'from_username': '0x_miss_di', 'text': '@coinbase @AleoHQ @NEARProtocol @Coredao_Org 👀👀', 'created_at': '2024-12-09T18:39:38+00:00', 'retweet_count': 0, 'reply_count': 1, 'like_count': 1, 'quote_count': 0, 'bookmark_count': 0, 'impression_count': 410, 'referenced_tweets': [{'id': 1866165703986074000, 'type': 'replied_to'}]}, {'id': '1866155078232756261', 'from_userid': '1372614663003725824', 'from_username': '0x_miss_di', 'text': 'GM GM!\nhttps://t.co/dhDAeEWnWv\n#JustUseCore @Coredao_Org #Bitcoin', 'created_at': '2024-12-09T16:16:56+00:00', 'retweet_count': 1, 'reply_count': 1, 'like_count': 3, 'quote_count': 0, 'bookmark_count': 0, 'impression_count': 188, 'referenced_tweets': []}, {'id': '1866143817478660567', 'from_userid': '1372614663003725824', 'from_username': '0x_miss_di', 'text': '#Bitcoin #CORE #USA institutions 👀👀 https://t.co/NKfTp9IiPY', 'created_at': '2024-12-09T15:32:12+00:00', 'retweet_count': 8, 'repl

In [139]:
user_info = pd.read_pickle('./roberto_user_data.df.pkl')

In [104]:
# Print out the timeline of a user

def get_user_summary(username,user_info):
    return get_user_info(username,user_info) + '\n' + get_tweets_with_references(username)

def get_user_info(username, user_info):
    user = user_info[user_info['username'] == username]
    if user.empty:
        return f"User '{username}' not found."

    user = user.iloc[0]
    # info = f"Name: {user['name']}\n"
    info = f"Username: {user['username']}\n"
    if user['description']:
        info += f"Description: {user['description']}\n"
    info += f"Followers: {user['followers_count']}\n"
    info += f"Following: {user['following_count']}\n"
    info += f"Tweets: {user['tweet_count']}\n"

    return info

def get_tweets_with_references(username):
    
    user_tweets, referenced_tweets = read_tweets_file(username)
    r = ''

    for tweet in user_tweets:
        tweet_id = tweet['id']
        username = tweet['from_username']
        created_at = tweet['created_at']
        text = tweet['text'].replace('\n',' ')

        # Print the current tweet
        r += f"\n@{username} writes on {created_at}: '{text}'\n" 

        # Check if this tweet has a referenced tweet
        if tweet['referenced_tweets']:            
            for ref in tweet['referenced_tweets']:
                ref_id = str(ref['id'])
                ref_type = ref['type']
                # print(ref_id)

                # Ensure the referenced tweet exists in the referenced_tweets dictionary
                if ref_id in referenced_tweets:
                    ref_content = referenced_tweets[ref_id]['text'].replace('\n',' ')
                    # ref_username = referenced_tweets[ref_id].get('from_username', 'unknown_user')
                    ref_created_at = referenced_tweets[ref_id]['created_at']

                    r += (f"in reply to the following message: '{ref_content}'\n")
                    break

        
    return r

# Example usage
# Assuming you have already read user_tweets and referenced_tweets using the read_tweets_file function
# username = "0x_miss_di"  # Replace with the actual username

# Print tweets with references
# print(get_user_summary("tycoons39",user_info))

## Generate persona from timeline

In [105]:

# We take the timelines above and send to ChatGPT, with a prompt similar to the following :
# PROMPT1 = "Describe the information you can assess about the person based on the information and posts provided, such as their interests, views, and personality.
# The information you provide should capture salient attributes, and should allow an LLM to predict what the user might post in a given situation.""


from openai import OpenAI

openaiclient = OpenAI(api_key="xxx")
MODEL = 'gpt-4o'

def get_persona_from_summary(text):

    # PROMPT = "You will be given a set of social media messages from a specific user. Your task is to describe in as much detail the person based on the information and posts provided, such as their interests, political views, and personality, and how they respond to messages. Describe the person as a persona, in such a way that an LLM that is given this persona is able to reproduce the style and content of writing. Formulate your response as a persona written for an LLM."

    PROMPT = """You are an expert in social media analysis. Based on the following tweets, create a concise persona description (30-80 words) that captures the user's communication style, tone, typical topics, and unique language traits. 
    Start the persona with 'You are @[username]'.
    Focus on:
    - Political views, interests and themes.
    - The user's tone (e.g., sarcastic, formal, enthusiastic).
    - Writing style (e.g., concise, detailed, casual, analytical).
    - Distinctive language features (e.g., slang, emojis)."""

    messages = [{"role": "user", "content": f"{PROMPT}\n\n {text}"}]

    print(messages)

    response = openaiclient.chat.completions.create(
        model=MODEL,
        temperature=0.1,
        messages=messages
    )

    result = ''.join(choice.message.content for choice in response.choices)
    return result 






## Prepare pairs of messages vs replies to use for calibration and validation 

In [143]:
def tweets_and_actual_replies(username):
    pairs = []     
    user_tweets, referenced_tweets = read_tweets_file(username)
    r = ''

    for tweet in user_tweets:
        tweet_id = tweet['id']
        username = tweet['from_username']
        created_at = tweet['created_at']
        text = tweet['text'].replace('\n',' ')

        # Check if this tweet has a referenced tweet
        if tweet['referenced_tweets']:            
            for ref in tweet['referenced_tweets']:
                ref_id = str(ref['id'])
                ref_type = ref['type']
                # print(ref_id)

                # Ensure the referenced tweet exists in the referenced_tweets dictionary
                if ref_id in referenced_tweets:
                    ref_content = referenced_tweets[ref_id]['text'].replace('\n',' ')
                    # ref_username = referenced_tweets[ref_id].get('from_username', 'unknown_user')
                    ref_created_at = referenced_tweets[ref_id]['created_at']

                    r += (f"in reply to the following message: '{ref_content}'\n")
                    pairs.append({'reply_to':ref_content,'message':text})
                    break
    return pairs 

# Create persona from tweets 

In [51]:
# user_info[user_info['listed_count']>100].to_pickle('./roberto_user_data_over100.df.pkl')

In [None]:
user_info = user_info.drop(columns=['persona'])

In [154]:
user_info.to_pickle('./roberto_user_data.df.pkl')

In [None]:
import pandas as pd
import time

# Load user information
user_info = pd.read_pickle('./roberto_user_data.df.pkl')

# Iterate through each row in user_info
for index, row in user_info.iterrows():
    
        username = row['username']

        # Check if persona already exists to avoid redundant processing
        if 'persona' in user_info.columns and pd.notnull(row.get('persona')):
            continue

        try:
            # Get user summary and persona
            usersummary = get_user_summary(username, user_info)
            persona = get_persona_from_summary(usersummary)

            # Store persona back in user_info
            user_info.at[index, 'persona'] = persona

            # Save progress regularly
            user_info.to_pickle('./roberto_user_data.df.pkl')
            print(f"{username} done.")

        except Exception as e:
            print(f"Error processing user {username}: {e}")
            user_info.at[index, 'persona'] = 'FAILED'
            continue

        # Optional: Add a slight delay to avoid overwhelming system resources
        time.sleep(0.1)



In [74]:
# user_info.loc[user_info['persona']!='FAILED']
# user_info.to_pickle('./roberto_user_data.df.pkl')

In [166]:
# Select the users that have personas
user_info = pd.read_pickle('./roberto_user_data.df.pkl')
includedusers = user_info.loc[user_info['persona']!='FAILED'].copy()


In [167]:
# Get their messages and the messages they are replies to. Put into the user dataframe
# Remove the mentions from the tweets, as the model does not have the information needed to replicate them.

import re
for index, row in includedusers.iterrows():
    username = row['username']
    persona = row['persona']
    try:
        tweetsandreplies = tweets_and_actual_replies(username)
        for k in tweetsandreplies:
            # REMOVE MENTIONS
            k['message'] = re.sub(r'@\w+', '', k['message'])
            k['message'] = re.sub(r'\s+', ' ', k['message']).strip() #remove extra spaces

            k['persona'] = persona
            k['username'] = username

        includedusers.at[index, 'tweets'] = tweetsandreplies
    except Exception as e:
        print(e)
        continue

Must have equal len keys and value when setting with an iterable
[Errno 2] No such file or directory: './roberto_tweets/tilaryswank.json'
[Errno 2] No such file or directory: './roberto_tweets/OkSoMik.json'
[Errno 2] No such file or directory: './roberto_tweets/HankSouth247.json'
[Errno 2] No such file or directory: './roberto_tweets/knoxrob1.json'
[Errno 2] No such file or directory: './roberto_tweets/Interst8forty4.json'
[Errno 2] No such file or directory: './roberto_tweets/suzy_rice.json'
[Errno 2] No such file or directory: './roberto_tweets/realvikkii.json'
[Errno 2] No such file or directory: './roberto_tweets/KonniBurton.json'


In [174]:
# Get users with at least a certain number of messages. 
iu = includedusers.loc[(~includedusers['tweets'].isna())&(includedusers['tweets'].str.len()>100)]

In [175]:
all_tweets = list(iu['tweets'].sum())

In [176]:
df = pd.DataFrame(all_tweets)

### SEPARATE TRAINING/TESTING - 20% for test

In [189]:


# Separate to training a random sample of 80% rows, 20% it is zero. We use those for texting.
import numpy as np 

df['training'] = 1

# Randomly select 20% of rows
random_indices = np.random.choice(df.index, size=round(0.2*len(df)), replace=False)

# Set 'training' to 0 for the selected rows
df.loc[random_indices, 'training'] = 0

In [190]:
df.to_pickle('personas_and_tweets.df.pkl')

In [4]:
import pandas as pd 
df = pd.read_pickle('personas_and_tweets.df.pkl')

In [14]:
# Remove empty messages

In [10]:
len(df)

102908

In [11]:
df = df.loc[(df['message'].str.len()>1)&(df['reply_to'].str.len()>1)].copy()

In [12]:
len(df)

101097

In [15]:
df.to_pickle('personas_and_tweets.df.pkl')

# Analyze and extract information about the users 

We want to know who the users are and what their opinions are, to judge their actions etc. 

In [109]:

from openai import OpenAI

openaiclient = OpenAI(api_key="xxx")
MODEL = 'gpt-4o'

def extract_information_from_tweets(text):

    # PROMPT = "You will be given a set of social media messages from a specific user. Your task is to describe in as much detail the person based on the information and posts provided, such as their interests, political views, and personality, and how they respond to messages. Describe the person as a persona, in such a way that an LLM that is given this persona is able to reproduce the style and content of writing. Formulate your response as a persona written for an LLM."

    PROMPT = """Analyze the tweet history of a user to extract the following individual attributes. Respond in JSON format. If the information cannot be determined from the available data, respond with "unknown".

Instructions:
-Base all inferences solely on the data provided.
-For attributes that may have multiple relevant values (e.g., issues they feel strongly about, belief in conspiracy theories), provide a list of strings.
-Use the structured JSON output format provided below.

1. Gender, Age, Race/Ethnicity:
- Make best estimation based on the data.
- Reply "unknown" if unsure.

2. Geographic Location:
- Use mentions of places, or local event references.

3. Political Attributes (Affiliation, Ideology, Partisan Strength, Voting Behavior):
- Identify support for political figures, parties, or causes.
- Strong, repetitive expressions indicate higher partisan strength.
- Economic_Ideology: e.g., views on taxes, welfare, regulation
- Cultural_Ideology: e.g., views on social issues like immigration, gender, religion.

4. Political Interest:
- Frequent political tweets suggest higher interest.

5. Trust in Institutions & Media:
- Analyze sentiment toward government and media entities.
- Explicit distrust or endorsement counts as strong indicators.

6. Issues & Conspiracies:
- Identify recurring themes with emotional intensity for issues.
- List conspiracy theories if mentioned or endorsed.

Response format: 
{
  "Gender": "male/female/non-binary/unknown",
  "Age": "exact age if stated or approximate range (e.g., '20-30')/unknown",
  "Race_Ethnicity": "specific race/ethnicity if stated or inferred/unknown",
  "Geographic_Location": "city, state, or country if known/unknown",
  "Political_Affiliation": "Democrat/Republican/Independent/Other/unknown",
  "Political_Ideology": "liberal/conservative/moderate/other/unknown",
  "Economic_Ideology": "left/center-left/center/center-right/right/unknown",
  "Cultural_Ideology": "left/center-left/center/center-right/right/unknown",
  "Partisan_Strength": "strong/weak/leaning/unknown",  
  "Level_of_Political_Interest": "high/medium/low/unknown",
  "Trust_in_Government_and_Institutions": "high/low/neutral/unknown",
  "Trust_in_Media": "high/low/mixed/unknown",
  "Issues_They_Feel_Strongly_About": ["list of issues", "unknown if none"],
  "Belief_in_Conspiracy_Theories": ["list of conspiracy theories mentioned or endorsed", "unknown if none"]
}
"""

    messages = [{"role": "user", "content": f"{PROMPT}\n\n {text}"}]

    print(messages)

    response = openaiclient.chat.completions.create(
        model=MODEL,
        temperature=0.1,
        messages=messages,
        response_format={ "type": "json_object" }
    )

    result = ''.join(choice.message.content for choice in response.choices)
    return result 




In [112]:
user_info['attributes'] = None 

In [None]:
import pandas as pd
import time

# Load user information
user_info = pd.read_pickle('./roberto_user_data.df.pkl')

# Iterate through each row in user_info
for index, row in user_info.iterrows():
    
        username = row['username']
        print(username)
        print(row['persona'])

        # Check if persona already exists to avoid redundant processing
        if 'attributes' in user_info.columns and pd.notnull(row.get('attributes')):
            continue

        try:
            # Get user summary and persona
            usersummary = get_user_summary(username, user_info)
            attributes = extract_information_from_tweets(usersummary)

            print(attributes)
            # break 

            # Store persona back in user_info
            user_info.at[index, 'attributes'] = attributes

            # Save progress regularly
            user_info.to_pickle('./roberto_user_data.df.pkl')
            print(f"{username} done.")

        except Exception as e:
            print(f"Error processing user {username}: {e}")
            user_info.at[index, 'attributes'] = 'FAILED'
            continue

        # Optional: Add a slight delay to avoid overwhelming system resources
        time.sleep(0.1)



In [115]:
user_info.to_pickle('./roberto_user_data.df.pkl')

# Generate messages using personas

In [39]:
# df.loc[df['training']==0]
# df.loc[(df['training']==0)&(~df['baseline'].isna())]
# df.loc[~df['baseline'].isna()]

Unnamed: 0,reply_to,message,persona,username,training,baseline
13,My GOTY is Animal Well. https://t.co/8TqrtzCKlo,Still gotta peep this one,"You are @scottybones, a candid and outspoken s...",scottybones,0,@scottybones: Animal Well as GOTY? Bold choice...
18,@PhillyD https://t.co/hasWbrA8rj,"This is the best gif for ""smh...but I mean...u...","You are @scottybones, a candid and outspoken s...",scottybones,0,"@PhillyD Oh great, another link to the abyss o..."
28,When I worked at McD’s I always made myself th...,Just recently I went to a burger place and I a...,"You are @scottybones, a candid and outspoken s...",scottybones,0,"@scottybones: LOL, the secret onion hack! 🤣 Wh..."
32,The tension between me and this white box http...,https://t.co/IsniW7r3PC,"You are @scottybones, a candid and outspoken s...",scottybones,0,"Oh, I feel ya! That white box is like the fina..."
42,@mymixtapez Why not ? Jail should be rehabilit...,I agree. America only sees The prison system a...,"You are @scottybones, a candid and outspoken s...",scottybones,0,"@scottybones: @mymixtapez Oh, totally! Because..."
...,...,...,...,...,...,...
68367,🚨 #BREAKING: The House COVID Committee has rel...,I knew this without a tax payer funded 'invest...,You are @anggggg7277. Your tweets reveal a pol...,anggggg7277,0,"Wow, who would've thought? 🙄 The NIH funding g..."
68370,@LuvOneAnother7 @ParikPatelCFA @Kash_Patel Doe...,😂😂,You are @anggggg7277. Your tweets reveal a pol...,anggggg7277,0,@LuvOneAnother7 @ParikPatelCFA @Kash_Patel LOL...
68372,Thoughts on Biden pardoning his son for being ...,Idc. It's expected.,You are @anggggg7277. Your tweets reveal a pol...,anggggg7277,0,"@anggggg7277: Oh, you mean the same media that..."
84864,I just got off a briefing with the Secret Serv...,The more I hear the more concerned I become. h...,You are @VictorCanada. A passionate advocate f...,VictorCanada,0,@VictorCanada: Thank you for sharing your conc...


In [None]:
# Generate responses and store them in the column 'baseline'

openaiclient = OpenAI(api_key="XXX")
MODEL = 'gpt-4o'

def generate_tweet(persona,reply_to_message):

    prompt = f"Write a social media message in the style of the persona described below, in reply to the following message: '{reply_to_message}' \n\n Your persona: {persona}"
    
    messages = [{"role": "user", "content": f"{prompt}"}]

    print(messages)

    response = openaiclient.chat.completions.create(
        model=MODEL,
        temperature=0.1,
        messages=messages
    )

    result = ''.join(choice.message.content for choice in response.choices)
    return result 


# Ensure 'baseline' column exists
df['baseline'] = df.get('baseline', pd.NA)

# Loop over the dataframe
for index, row in df.iterrows():
    if row['training'] == 0 and pd.isna(row['baseline']):  # Skip already processed rows
        try:
            print(row['username'])            
            df.at[index, 'baseline'] = generate_tweet(row['persona'], row['reply_to'])
        except Exception as e:
            print(f"Error processing row {index}: {e}")
            continue

        # Save progress every 10 rows
        if index % 10 == 0:
            df.to_pickle('personas_and_tweets.df.pkl')
            print(f"Progress saved at row {index}")

# Final save after loop completion
df.to_pickle('personas_and_tweets.df.pkl')
print("Processing complete and data saved.")

# generate_tweet(df.at[31000,'persona'],df.at[31000,'reply_to'])


In [46]:
df.to_pickle('personas_and_tweets.df.pkl')

In [41]:
df2 = pd.read_pickle('personas_and_tweets_with_flagging.df.pkl')

In [44]:
df['flagged'] = df2['flagged']

In [45]:
df

Unnamed: 0,reply_to,message,persona,username,training,baseline,flagged
0,The moment Kendrick cemented himself as the go...,Hopefully he doesn't go to a man's room bathro...,"You are @scottybones, a candid and outspoken s...",scottybones,1,,False
1,@Travon Pieces of Shit for Breakfast,https://t.co/gXfjLPQf6y,"You are @scottybones, a candid and outspoken s...",scottybones,1,,False
2,The Grifters? Depresh' Mode? The Who...Gives A...,Pieces of Shit for Breakfast,"You are @scottybones, a candid and outspoken s...",scottybones,1,,True
3,white dudes in black jackets with backpacks wh...,When the police don't find anyone the Mafia he...,"You are @scottybones, a candid and outspoken s...",scottybones,1,,False
4,All the dudes in the replies bringing up her f...,Getting a load blown in your face doesn't coun...,"You are @scottybones, a candid and outspoken s...",scottybones,1,,False
...,...,...,...,...,...,...,...
102903,Matt Gaetz is out as AG. Thoughts?,That is a good start. Now to get the rest of T...,"You are @PoliticalAnt, a passionate and outspo...",PoliticalAnt,1,,False
102904,Matt Gaetz has withdrawn from consideration fo...,Pervert.,"You are @PoliticalAnt, a passionate and outspo...",PoliticalAnt,1,,False
102905,Seriously. If you want to know the real answ...,In your opinion.,"You are @PoliticalAnt, a passionate and outspo...",PoliticalAnt,0,,False
102906,Monica Grant and her daughter were finishing l...,"To my knowledge, fines cannot be added to rest...","You are @PoliticalAnt, a passionate and outspo...",PoliticalAnt,1,,False


## Finetune using OpenAI [This doesn't work, because OpenAI is too strict with censorship.]

When finetuning with OpenAI, we use personas as well as finetuning.

In [None]:
# Check whether the messages will be flagged by OpenAI 

In [None]:
# Check whether things are flagged

from openai import OpenAI
openaiclient = OpenAI(api_key="xxx")
for index, row in df.iterrows():
    

        # Check if persona already exists to avoid redundant processing
        if 'flagged' in df.columns and pd.notnull(row.get('flagged')):
            continue

        # Get user summary and persona
        

        response = openaiclient.moderations.create(
            model="omni-moderation-latest",
            input=row['message'],
        )
        print(f'{row['message']}: {response.results[0].flagged}')
        df.at[index, 'flagged'] = response.results[0].flagged
    




In [None]:
# # We structure dataset like this: 
# {
#   "messages": [
#     {"role": "system", "content": "You are User123, known for being concise and witty."},
#     {"role": "user", "content": "What do you think about AI?"},
#     {"role": "assistant", "content": "AI is fascinating—sharp, fast, and evolving rapidly."}
#   ]
# }

In [91]:
df = pd.read_pickle('personas_and_tweets_flagged2.df.pkl')

In [93]:
# df.loc[~df['harassment'].isna()].sort_values(['harassment'])

Unnamed: 0,reply_to,message,persona,username,training,baseline,flagged,harassment,harassment_threatening,hate,...,violence,violence_graphic,harassment/threatening,hate/threatening,illicit/violent,self-harm/intent,self-harm/instructions,self-harm,sexual/minors,violence/graphic
60927,Where do you get MOST of your STOCK trading or...,E-Trade News and Stock financial data summary,"You are @ScottNMinnis, a retired Marine and fo...",ScottNMinnis,1,,False,2.190782e-07,9.422446e-08,5.173919e-07,...,1.012999e-06,2.994441e-07,9.422446e-08,4.450851e-08,9.972942e-07,8.059446e-08,4.592136e-08,2.225281e-07,2.260324e-07,2.994441e-07
38301,🚨WOW: History is being made right before our...,Yes we will ❤️,"You are @DianaHe50951535, a fervent conservati...",DianaHe50951535,1,,False,2.482480e-07,1.003014e-07,1.760343e-07,...,1.670142e-06,2.260324e-07,1.003014e-07,4.450851e-08,4.356880e-07,1.505702e-07,7.571149e-08,4.289333e-07,9.422446e-08,2.260324e-07
19559,Private credit is typically direct lending to ...,Since interest rates have been so high over th...,"You are @jfrankie91, a seasoned finance profes...",jfrankie91,1,,False,3.089494e-07,6.083590e-08,1.260695e-06,...,7.411280e-07,2.260324e-07,6.083590e-08,4.450851e-08,3.966986e-07,1.328777e-07,4.592136e-08,4.222833e-07,8.059446e-08,2.260324e-07
27527,@BitcoinMagazine @kevinolearytv Voglio stare c...,Bitcoin price is skyrocketing now let’s buy to...,"You are @ValerieAdelaa, a passionate advocate ...",ValerieAdelaa,0,@ValerieAdelaa: @BitcoinMagazine @kevinolearyt...,False,3.288749e-07,3.844935e-07,6.439038e-07,...,1.520684e-06,2.260324e-07,3.844935e-07,8.315280e-08,1.544631e-06,2.902312e-07,3.187565e-07,8.398072e-07,1.136564e-07,2.260324e-07
53246,"US 30-YEAR YIELD RISES 23BPS, BIGGEST DAILY JU...",Mortgage rates UP. Apartment Investing (Multif...,"You are @CooperMcCoyRE, a passionate sports en...",CooperMcCoyRE,0,"🏈 Wow, that's a big move in the 30-year yield!...",False,3.288749e-07,7.338210e-08,1.061613e-06,...,4.356880e-07,2.260324e-07,7.338210e-08,5.203571e-08,6.962256e-07,6.681517e-08,7.338210e-08,1.034853e-07,8.059446e-08,2.260324e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8655,UK thinks it could pull off Opium War III http...,Chinese cowards running scared. Remember folks...,"You are @OnlyFansbrokie, an ER doctor with a p...",OnlyFansbrokie,1,,True,9.818299e-01,1.995847e-02,9.965381e-01,...,1.245225e-01,3.569706e-05,1.995847e-02,6.166297e-05,1.941132e-05,2.173221e-04,2.126730e-04,5.475216e-04,1.284308e-04,3.569706e-05
4318,This is for the women who are shaving their he...,I think we are giving these libtards too much ...,"You are @Jodilee01, a fervent supporter of Don...",Jodilee01,1,,True,9.818299e-01,4.614637e-01,9.974435e-01,...,1.532874e-01,3.353501e-05,4.614637e-01,5.155745e-03,1.056684e-04,2.964674e-04,2.845008e-04,5.470853e-04,3.027835e-04,3.353501e-05
16537,@FranklinHa12947 Question for you: Why did P...,"Because he’s a barbaric, uncivilized Russian. ...","You are @dburlingame49, a retired soldier with...",dburlingame49,0,"Hey @FranklinHa12947, why did Putin invade Ukr...",True,9.818299e-01,1.087897e-03,9.987946e-01,...,1.608038e-02,1.584362e-05,1.087897e-03,1.159193e-05,2.468290e-06,1.568955e-06,2.627477e-06,4.607008e-04,8.750300e-06,1.584362e-05
72495,BREAKING: New leaks indicate that the Kamala H...,That's what happens when you make a retarded e...,"You are @MeyerkordD57572, a politically charge...",MeyerkordD57572,1,,True,9.818299e-01,3.875437e-02,3.865119e-01,...,2.276859e-02,1.686528e-05,3.875437e-02,6.014152e-06,8.888085e-06,2.344992e-04,2.400992e-04,5.604548e-04,1.160420e-04,1.686528e-05


In [85]:
#CHECK THAT ALL VALUES ARE BELOW 0.1  IN BADNESS

columns = [
    'harassment', 'harassment_threatening', 'hate',
    'hate_threatening', 'illicit', 'illicit_violent', 'self_harm',
    'self_harm_instructions', 'self_harm_intent', 'sexual', 'sexual_minors',
    'violence', 'violence_graphic', 'harassment/threatening',
    'hate/threatening', 'illicit/violent', 'self-harm/intent',
    'self-harm/instructions', 'self-harm', 'sexual/minors',
    'violence/graphic'
]

# Filter rows where all specified columns are <= 0.1
filtered_df = df[(df[columns] <= 0.01).all(axis=1)]

df = filtered_df

In [87]:
import json 
# Convert to the fine-tuning format
conversations = []
dff = df.loc[(df['training']==1)&(df['flagged']==False)].copy() # ONLY GET ROWS MEANT FOR TRAINING & ONLY GET ROWS THAT ARE NOT FLAGGED

for _, row in dff.iterrows():
    conversation = {
        "messages": [
            {"role": "system", "content": row['persona']},
            {"role": "user", "content": row['reply_to']},
            {"role": "assistant", "content": row['message']}
        ]
    }
    conversations.append(conversation)

# Save to JSONL format for OpenAI fine-tuning
with open('openai_finetuning_data3.jsonl', 'w', encoding='utf-8') as f:
    for conversation in conversations:
        f.write(json.dumps(conversation,ensure_ascii=False) + '\n')

# Display the first conversation for verification
print(json.dumps(conversations[0], indent=2))

{
  "messages": [
    {
      "role": "system",
      "content": "You are @scottybones, a candid and outspoken social media user with a penchant for sarcasm and humor. Your tweets often delve into political commentary, gaming, and pop culture, with a critical eye on societal issues and corporate practices. You communicate in a casual, sometimes abrasive style, frequently using slang, emojis, and all-caps for emphasis. Your language is peppered with pop culture references and a touch of cynicism, making your online presence both entertaining and thought-provoking."
    },
    {
      "role": "user",
      "content": "@Travon Pieces of Shit for Breakfast"
    },
    {
      "role": "assistant",
      "content": "https://t.co/gXfjLPQf6y"
    }
  ]
}


In [88]:
file_response = client.files.create(
    file=open("openai_finetuning_data3.jsonl", "rb"),
    purpose="fine-tune"
)

In [89]:
file_response

FileObject(id='file-MXyknqisGry8FrTgDeYByP', bytes=40472480, created_at=1738683648, filename='openai_finetuning_data3.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [90]:
# UPLOAD TRAINING FILE

# gpt-4o-mini

from openai import OpenAI
# # CREATE A FINE-TUNED MODEL
client = OpenAI(api_key="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")

job = client.fine_tuning.jobs.create(
    training_file=file_response.id,
    model="gpt-4o-mini-2024-07-18",
    hyperparameters={"n_epochs": 3}
    # method={
        # "type": "dpo",
        # "dpo": {
            # "hyperparameters": {"beta": 0.1},
        # },
    # },
)



In [78]:
client.fine_tuning.jobs.list(limit=10)

# from openai import OpenAI
# client = OpenAI()

# client.fine_tuning.jobs.create(
#     training_file="openai_finetuning_data.jsonl",
#     model="gpt-4o-mini-2024-07-18"
# )

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-Nkr3td7zUK3LfMYqqBtTjqxR', created_at=1738679852, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-7Xnn59w2pGnx5hXTVsxLEIMS', result_files=[], seed=329680549, status='validating_files', trained_tokens=None, training_file='file-Dcw7AWvqmJ3dWM8dp6Wko7', validation_file=None, estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs=3)), type='supervised'), user_provided_suffix=None), FineTuningJob(id='ftjob-bwwKAXFl4JvttmS6mJWVnG3R', created_at=1738663655, error=Error(code='invalid_training_file', message="The job failed due to an invalid training file. This training file wa

In [80]:
# from openai import OpenAI
# # client = OpenAI()

# # List 10 fine-tuning jobs
# client.fine_tuning.jobs.list(limit=10)

# # Retrieve the state of a fine-tune
client.fine_tuning.jobs.retrieve("ftjob-Nkr3td7zUK3LfMYqqBtTjqxR").status

# # Cancel a job
# client.fine_tuning.jobs.cancel("ftjob-abc123")

# # List up to 10 events from a fine-tuning job
# client.fine_tuning.jobs.list_events(fine_tuning_job_id="ftjob-abc123", limit=10)

# # Delete a fine-tuned model
# client.models.delete("ft:gpt-3.5-turbo:acemeco:suffix:abc123")

'failed'