In [None]:
# OpenAI setup - based on the batching example
import requests
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score

from tqdm import tqdm
tqdm.pandas()

In [None]:
# data setup
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Load the snippet table; later cells read its 'channel_name', 'channel_id'
# and 'snippet' columns.
# NOTE(review): '/content/...' looks like a Colab-specific absolute path —
# confirm the location before running in another environment.
dataset = pd.read_csv('/content/labelled narrative - not yet labelled.csv')
#channels = list(dataset['channel_name'].unique())
# NOTE(review): batch_size is not referenced by any other cell visible here —
# confirm whether it is still needed.
batch_size = 10
# Upper bound on the summed count_tokens() values of one batch (compared in
# batching()). count_tokens() counts NLTK word tokens, so this is a word-token
# budget rather than a count produced by the model's own tokenizer.
# NOTE(review): 8912 may be a transposition of 8192 — confirm the intended value.
max_tokens = 8912 #adjusted for prompt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
!pip install openai==0.28
import openai




In [None]:
def count_tokens(text):
    """Return how many tokens NLTK's ``word_tokenize`` finds in ``text``.

    Args:
        text: The string to tokenize.

    Returns:
        int: Length of the token list produced by ``word_tokenize(text)``.
        These are NLTK word tokens; they are presumably only an approximation
        of the token count a language model would report.
    """
    return len(word_tokenize(text))

In [None]:
#sort channels by ascending # of snippets for easier batching
def snip_sort(df):
    """Reorder the rows of ``df`` so the least frequent channels come first.

    Args:
        df: DataFrame with a 'channel_name' column (one row per snippet).

    Returns:
        A new DataFrame with the same columns as ``df``, rows sorted by how
        many rows each 'channel_name' has, ascending. ``df`` is not modified.
    """
    # Channel names ordered from fewest rows to most rows.
    ascending_channels = df['channel_name'].value_counts().sort_values().index

    # A temporary ordered categorical makes sort_values follow that ranking
    # instead of alphabetical order.
    ranked = df.assign(
        channel_order=pd.Categorical(
            df['channel_name'], categories=ascending_channels, ordered=True
        )
    )
    ranked = ranked.sort_values('channel_order')

    # The helper column was only needed for sorting.
    return ranked.drop(columns='channel_order')

In [None]:
# prompt - input is a batch containing n channels and their related snippets
def prompt_str(input):
    """Build the classification prompt for one batch of channels.

    The parameter keeps its original name ``input``; it shadows the builtin
    only inside this function.

    Args:
        input: Text placed at the end of the prompt. The batching cells pass
            blocks made of "Channel Name: ..." and "Posts: ..." lines.

    Returns:
        str: The AntiSemitic tag definition and the answer instructions,
        followed by ``input``.
    """
    # NOTE(review): the prompt text is reproduced exactly as it was. The
    # sentence "You are with a list ..." appears to be missing a word, and the
    # quoted definition opens with a straight quote but closes with a curly
    # one — confirm before changing, since any edit alters what the model sees.
    return f"""
    Determine whether a channel should be labeled with the AntiSemitic tag. The definition of AntiSemitic tag is: "Features content created by or for an Antisemitic audience.
    Antisemitism is defined as a certain negative perception of Jews, that may be expressed as hatred towards Jewish individuals, community institutions, or the Jewish religion as a whole.
    Antisemitism frequently charges Jews with conspiring to harm humanity, and it is often used to blame Jews for “why things go wrong.” Antisemitic creators often employ sinister stereotypes
    and negative character traits, especially about money or power, or physical characteristics like large noses. Holocaust denial or pro-Nazi speech is also antisemitic, as well as the dual
    loyalty trope, which accuses Jewish citizens of being more loyal to Israel, or to the alleged priorities of Jews worldwide, than to the interests of their own nations.”

    You are with a list of one or more channels, and a sample of their posts, labeled "Channel Name" and "Posts".
    Give your answer in one word for each channel, either yes or no.
    {input}"""

In [None]:
#create formatted input strings and their token count
def batch_string(df) -> pd.DataFrame:
    """Collapse per-snippet rows into one formatted text block per channel.

    Args:
        df: DataFrame with 'channel_name', 'channel_id' and 'snippet' columns,
            where 'snippet' holds strings (they are joined with newlines).

    Returns:
        A DataFrame with one row per ('channel_name', 'channel_id') pair and
        the columns 'channel_name', 'channel_id', 'snippet' (all snippets of
        the channel joined by newlines), 'batch_str' (the text block sent to
        the model) and 'tokens' (``count_tokens`` of 'batch_str').

        Channels appear in the order in which they first occur in ``df``.
    """
    # sort=False keeps the incoming row order. The default (sort=True) would
    # re-order the groups alphabetically by key and silently discard any
    # ordering the caller applied beforehand (e.g. by snippet count).
    # NOTE(review): groupby drops rows whose key is missing by default, so a
    # channel without a channel_id would vanish here — confirm that is intended.
    aggregated = (
        df.groupby(['channel_name', 'channel_id'], sort=False)['snippet']
        .agg('\n'.join)
        .reset_index()
    )

    # Column-wise concatenation instead of a row-wise apply: same text, but it
    # also works when the frame has no rows.
    aggregated['batch_str'] = (
        "Channel Name: " + aggregated['channel_name'].astype(str)
        + "\nPosts: " + aggregated['snippet'].astype(str)
    )
    aggregated['tokens'] = aggregated['batch_str'].apply(count_tokens)

    return aggregated

In [None]:
# batch strings while maintaining channels, max_tokens
def batching(df, token_limit=None):
    """Greedily pack per-channel text blocks into batches under a token budget.

    Rows are consumed in order. A row is added to the current batch unless
    that would push the batch's summed 'tokens' above the limit, in which case
    the current batch is closed and the row starts a new one. A single row
    that exceeds the limit on its own still becomes a batch by itself.

    Args:
        df: DataFrame with 'batch_str' (text), 'tokens' (its token count) and
            'channel_name' columns.
        token_limit: Maximum summed 'tokens' per batch. When ``None`` (the
            default) the module-level ``max_tokens`` is used, which is what
            the function always did before this parameter existed.

    Returns:
        tuple[list[str], list[int]]: The batch strings (member texts joined by
        a single space) and, in the same order, the number of distinct channel
        names in each batch.
    """
    limit = max_tokens if token_limit is None else token_limit

    batches = []
    channel_counts = []
    current = []
    current_len = 0
    current_channels = set()

    for text, text_len, channel in zip(df['batch_str'], df['tokens'], df['channel_name']):
        if current_len + text_len > limit:
            # Close the running batch (if any) and start a new one with this row.
            if current:
                batches.append(' '.join(current))
                channel_counts.append(len(current_channels))
            current = [text]
            current_channels = {channel}
            current_len = text_len
        else:
            current.append(text)
            current_len += text_len
            current_channels.add(channel)

    # Flush the last, still-open batch.
    if current:
        batches.append(' '.join(current))
        channel_counts.append(len(current_channels))

    return batches, channel_counts


In [None]:
import numpy as np
# Order the rows so channels with the fewest snippets come first.
sorted_df = snip_sort(dataset)

# Make every snippet a string so the snippets can be joined per channel.
# Missing values are replaced with '' BEFORE the cast: casting first would
# turn NaN into the literal text 'nan' (leaving fillna with nothing to fill),
# and that text would then be sent to the model as if it were a post.
sorted_df['snippet'] = sorted_df['snippet'].fillna('').astype(str)

# One row per channel with the formatted text block and its token count.
sorted_df = batch_string(sorted_df)

# Batch strings plus the number of channels contained in each batch.
input, chan_tracker = batching(sorted_df)

NameError: name 'snip_sort' is not defined

In [None]:

# Wrap every batch string in the classification prompt and keep the number of
# channels in each batch next to it.
prompts = list(map(prompt_str, input))
input_df = pd.DataFrame(dict(prompt=prompts, channels=chan_tracker))

TypeError: 'method' object is not iterable

In [None]:
# classification task
# get_response(prompt) returns a (prompt_tokens, completion_tokens, response)
# tuple on success, but the bare string "Error" once all of its retries have
# failed. Indexing that string directly would store 'E', 'r' and 'r' in the
# three columns, so a failure is first normalised to (None, None, "Error").
def _as_usage_triplet(result):
    """Return ``result`` as a (prompt_tokens, completion_tokens, response) tuple."""
    if isinstance(result, str):
        return None, None, result
    return result


responses = input_df['prompt'].progress_apply(
    lambda prompt: _as_usage_triplet(get_response(prompt))
)

input_df["output"] = responses.apply(lambda x: x[2])
input_df["prompt_tokens"] = responses.apply(lambda x: x[0])
input_df["completion_tokens"] = responses.apply(lambda x: x[1])



100%|██████████| 3182/3182 [38:57<00:00,  1.36it/s]


In [None]:
# Persist the model replies on their own, then the full table (prompt, channel
# count, reply and token usage). Row indices are left out of both files.
answers_only = input_df['output']
answers_only.to_csv('output.csv', index=False)
input_df.to_csv('input.csv', index=False)

In [None]:
# Turn the raw model replies into parallel lists of channel names and labels.
# Only lines shaped like "<channel> - <label>" are used; any other line is
# ignored.
ds = input_df['output']
channel_names = []
labels = []
for cell in ds:
    # A reply that is not text (e.g. a missing value) has nothing to parse.
    if not isinstance(cell, str):
        continue
    for line in cell.split('\n'):
        # Split on the LAST ' - ' only, so a channel whose own name contains
        # ' - ' is kept instead of being silently dropped.
        parts = line.rsplit(' - ', 1)
        if len(parts) == 2:
            # Extract channel name and label
            channel_name = parts[0].replace('Channel Name: ', '').strip()
            label = parts[1].strip()
            channel_names.append(channel_name)
            labels.append(label)


In [None]:
# One row per parsed reply line: the channel and the label given for it.
new_df = pd.DataFrame(dict(channel_name=channel_names, label=labels))


Unnamed: 0,channel_name,label
0,commietrashh,No
1,committeefortherepublic,Yes
2,committeeonccp,No
3,commonfilth,Yes
4,communismsurvivor,No
...,...,...
2389,😶‍🌫️,No
2390,🙂,Yes
2391,🛹 Warren R.M. Stuart,Yes
2392,🟣↙↙↙ויזיזי,No


In [None]:
# Write the parsed channel/label pairs to labels.csv without the row index.
new_df.to_csv("labels.csv", index = False)

In [None]:
# classification task
def get_response(prompt, model="gpt-4", temperature=0.0, verbose=True):
    """Send ``prompt`` to the OpenAI chat completion endpoint and return the reply.

    The request is attempted up to five times. Every exception is printed and
    the call is retried, pausing 1, 2, 4 and 8 seconds between attempts so
    that a transient failure (for example rate limiting) is not retried
    instantly.

    Args:
        prompt: Text sent as the user message.
        model: Name of the chat model to query.
        temperature: Sampling temperature passed to the API.
        verbose: When true, token usage is returned together with the reply.

    Returns:
        * ``"Error"`` (a plain string) when all five attempts failed,
          regardless of ``verbose``. Callers that index the verbose result
          must check for this first.
        * the reply text when ``verbose`` is false;
        * ``(prompt_tokens, completion_tokens, response)`` when ``verbose``
          is true.
    """
    import time  # local import: only needed for the retry back-off below

    # NOTE(review): `api_key` is read from module scope but is not defined in
    # any cell visible here — confirm where it is set (preferably from an
    # environment variable rather than a literal in the notebook).
    openai.api_key = api_key
    openai.api_base = "https://api.openai.com/v1/"

    max_attempts = 5
    for attempt in range(max_attempts):
        try:
            chat_completion = openai.ChatCompletion.create(
                model=model,
                temperature=temperature,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
            )
            break
        except Exception as e:
            # Best effort: report the failure and try again.
            print(e)
            print("Error. Trying again.")
            # Exponential back-off; no pause after the final attempt.
            if attempt < max_attempts - 1:
                time.sleep(2 ** attempt)
    else:
        # The loop finished without a successful request.
        return "Error"

    response = chat_completion["choices"][0]["message"]["content"]
    if not verbose:
        return response

    prompt_tokens = chat_completion["usage"]["prompt_tokens"]
    completion_tokens = chat_completion["usage"]["completion_tokens"]

    return prompt_tokens, completion_tokens, response

In [None]:
# Read the file 'output_df', splitting each line on the text 'Answer' (a
# multi-character separator, hence engine='python'), and write the result to
# out.csv together with the row index.
# NOTE(review): no cell visible here writes a file named 'output_df' (the
# replies are saved as 'output.csv') — confirm the intended input path.
output = pd.read_csv('output_df', sep = 'Answer', engine='python')
output.to_csv('out.csv', index=True)