In [None]:
import ast
import json
import os
from concurrent.futures import ProcessPoolExecutor

import pandas as pd
from openai import OpenAI

# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# saved inside the notebook. The original placeholder is kept as the fallback,
# so pasting a key here still works exactly as before.
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY', 'PUT API KEY HERE'))

In [24]:
# Folder holding the cleaned CSV files. Set the EMOJIVILLE_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original local folder is used, so existing behaviour is unchanged.
DATA_DIR = os.environ.get(
    'EMOJIVILLE_DATA_DIR',
    'C:/Users/joyse/Desktop/6.8610/Research/Emojiville/Data Processing/Clean Dataset',
)

# Tweets to be labelled (each row's 'tokens' column is sent to the model).
dataset = pd.read_csv(os.path.join(DATA_DIR, 'emojify_cleaned_10k.csv'))

# Hand-labelled rows that are rendered into the prompt as worked examples.
examples_df = pd.read_csv(os.path.join(DATA_DIR, 'manually_labelled_cleaned_30.csv'))


In [25]:
# Render every hand-labelled row as a small "field: value" block and glue the
# blocks together behind a one-line introduction.
example_instructions = "Here are some examples to guide your labeling process:\n\n" + "".join(
    (
        f"Tweet: {example['Tokens']}\n"
        f"Sentiment_score: {example['Sentiment_score']}\n"
        f"Sentiment_emotion: {example['Sentiment_emotion']}\n"
        f"Part_of_speech: {example['Part_of_speech']}\n\n"
    )
    for _, example in examples_df.iterrows()
)

# Combine examples with the main system prompt.
# `example_instructions` is interpolated near the top of the template; the
# doubled braces ({{ }}) are f-string escapes that render as literal braces so
# the JSON template is emitted verbatim.
# NOTE(review): the ```json fence opened in the template is never closed before
# the string ends — confirm whether that is intentional.
# NOTE(review): `example_instructions` already begins with its own "Here are
# some examples..." sentence, so the examples intro appears twice in the
# rendered prompt.
# NOTE(review): the Emotion rule allows "one or more" emotions while the JSON
# template declares a single string — confirm the intended output format.
sentiment_system_prompt = f'''
Your goal is to extract the sentiment score, emotional category, and part of speech for the emoji in a tokenized tweet. Below are examples to guide your labeling process:

{example_instructions}

**Instructions:**

- **Step 1**: Identify the emoji in the tokenized tweet. Emojis are represented as symbols or non-English characters.
- **Step 2**: For the emoji, use the emoji and the context of the language in the tweet to determine:
    - **Sentiment**: An integer where **1** is positive, **-1** is negative, and **0** is neutral.
    - **Emotion**: One or more emotions from ["joy", "sadness", "anger", "fear", "love", "surprise", "disgust"].
    - **Part of Speech (POS)**: One of the 15 UD POS tags that best represents the emoji in the context of the tweet. POS tags include:
        - **NOUN**: Names of people, places, things, or ideas (e.g., "dog", "house").
        - **VERB**: Words representing actions or states of being (e.g., "run", "think").
        - **ADJ**: Words describing or modifying nouns (e.g., "big", "blue").
        - **ADV**: Words describing verbs, adjectives, or other adverbs (e.g., "quickly", "very").
        - **PRON**: Words that replace nouns or noun phrases (e.g., "he", "they").
        - **PROPN**: Specific names of entities (e.g., "London", "Google").
        - **DET**: Words that specify nouns (e.g., "the", "an").
        - **ADP**: Prepositions or postpositions (e.g., "in", "on").
        - **CCONJ**: Coordinating conjunctions joining words or clauses (e.g., "and", "but").
        - **SCONJ**: Subordinating conjunctions joining dependent clauses (e.g., "because", "although").
        - **AUX**: Helper verbs used with main verbs (e.g., "is", "will").
        - **NUM**: Words expressing numbers or quantities (e.g., "one", "100").
        - **PART**: Function words like "to" (in "to run") or "not".
        - **PUNCT**: Punctuation symbols (e.g., ".", ",", "!").
        - **INTJ**: Words expressing exclamations (e.g., "wow", "oops").


- **Step 3**: Provide the output in the following JSON format:

```json
{{
    "sentiment": int,     // Int of sentiment for the emoji.
    "emotion": string,    // Str of emotion for the emoji.
    "pos": string         // Str of POS tag for the emoji.
}}
'''

In [26]:
# Helper that sends one tokenized tweet to the chat model and returns its raw JSON reply

def get_labels(description):
    """Ask the chat model to label one tweet and return its raw reply text.

    Args:
        description: The tweet text sent as the user message.

    Returns:
        The ``content`` of the first choice's message. JSON mode is requested,
        so the caller is expected to parse this string with ``json.loads``.
    """
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0.1,
        # JSON mode: makes sure responses are valid JSON objects
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": sentiment_system_prompt},
            {"role": "user", "content": description},
        ],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content



In [27]:
def process_batch(batch):
    """Label every row of a DataFrame slice, trying each row up to three times.

    For each row the ``'tokens'`` value is sent to ``get_labels``; the reply is
    parsed as JSON and its ``sentiment``, ``emotion`` and ``pos`` fields are
    copied into a result record.

    Args:
        batch: DataFrame slice with a ``'tokens'`` column.

    Returns:
        A list of dicts with keys ``Tokens``, ``Sentiment_score``,
        ``Sentiment_emotion`` and ``Part_of_speech``. A row whose three
        attempts all raise is reported via ``print`` and left out of the list.
    """
    last_attempt = 2  # attempts are numbered 0, 1, 2
    labelled = []
    for idx, record in batch.iterrows():
        attempt = 0
        while attempt <= last_attempt:
            try:
                tokens = record['tokens']
                parsed = json.loads(get_labels(tokens))
                entry = {
                    'Tokens': tokens,
                    'Sentiment_score': parsed['sentiment'],
                    'Sentiment_emotion': parsed['emotion'],
                    'Part_of_speech': parsed['pos'],
                }
            except Exception as err:
                # Any failure (API error, bad JSON, missing key) consumes one attempt.
                if attempt == last_attempt:
                    print(f"Failed to process tweet at index {idx}: {err}")
                else:
                    print(f"Retrying tweet at index {idx}: {err}")
                attempt += 1
            else:
                labelled.append(entry)
                break  # this row is done; move on to the next one
    return labelled

In [28]:
from concurrent.futures import ThreadPoolExecutor
import tqdm  # For progress tracking

def label_tweets_parallel(df, num_workers=8, batch_size=50):
    """Label a DataFrame of tweets by fanning batches out to a thread pool.

    The frame is cut into consecutive slices of ``batch_size`` rows, each slice
    is handed to ``process_batch`` on a worker thread, and the per-batch record
    lists are concatenated in batch order while a tqdm bar tracks progress.

    Args:
        df (pd.DataFrame): Input tweets.
        num_workers (int): Maximum number of worker threads.
        batch_size (int): Rows per batch.

    Returns:
        pd.DataFrame: One row per record returned by ``process_batch``.
    """
    offsets = range(0, len(df), batch_size)
    chunks = [df.iloc[start:start + batch_size] for start in offsets]

    labelled_rows = []
    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        progress = tqdm.tqdm(
            pool.map(process_batch, chunks),
            total=len(chunks),
            desc="Processing Batches",
        )
        for chunk_rows in progress:
            labelled_rows += chunk_rows

    return pd.DataFrame(labelled_rows)

In [29]:
if __name__ == "__main__":
    # Run the full labeling job: 16 worker threads, 10 tweets per batch.
    labeled_df = label_tweets_parallel(dataset, batch_size=10, num_workers=16)
    print("Labeling complete!")

Processing Batches: 100%|██████████| 1000/1000 [07:38<00:00,  2.18it/s]

Labeling complete!





In [30]:
# Show full cell contents instead of truncating long token lists.
pd.options.display.max_colwidth = None

# Preview the first 100 labelled rows.
labeled_df.head(n=100)

Unnamed: 0,Tokens,Sentiment_score,Sentiment_emotion,Part_of_speech
0,"['if', 'not', 'later', ',', 'when', '?', '🍑']",1,Joy,NOUN
1,"['🔗', '|', 'the', 'izombie', '4x08', 'page', 'is', 'now', 'up', '!', 'containing', 'trivia', ',', 'quotes', ',', 'reviews', ',', 'a', 'look', 'at', ""liv's"", 'style', ',', 'comic', 'slides', 'gifs', ',', 'the', 'b', '…']",0,Surprise,NOUN
2,"['hearing', 'bts', 'at', 'work', 'still', 'amazes', 'me', '😍']",1,Love,VERB
3,"['ayyy', 'this', 'is', 'lit', '🔥']",1,Joy,ADJ
4,"['well', 'richard', 'i', 'jumped', 'in', 'the', 'shower', 'and', 'saved', 'her', 'so', 'back', 'off', '😡']",-1,Anger,ADJ
...,...,...,...,...
95,"[""that's"", 'why', 'we', 'go', 'to', 'the', ""'"", 'rewind', ""'"", 'festival', 'these', 'days', '😂']",1,Joy,VERB
96,"['good', 'morning', 'everyone', '😃']",1,Joy,ADJ
97,"['i', ""don't"", 'care', 'about', 'people', 'who', 'judge', 'me', '.', ""i'am"", 'just', 'gonna', 'do', 'whatever', 'makes', 'me', 'happy', 'in', 'my', 'life', '😎']",1,Joy,ADJ
98,"['you', 'know', 'what', 'would', 'be', 'the', 'hottest', 'valentine', 'date', 'for', 'armys', '?', 'hixtape', '🔥', 'kiddin', '…']",1,Joy,NOUN


In [31]:
# Write the labelled tweets into the same folder as the cleaned inputs.
folder_path = "C:/Users/joyse/Desktop/6.8610/Research/Emojiville/Data Processing/Clean Dataset"
filename1 = 'emojify_cleaned_10k_labelled.csv'
file_path1 = os.path.join(folder_path, filename1)

# index=False keeps the DataFrame's row index out of the CSV.
labeled_df.to_csv(path_or_buf=file_path1, index=False)