In [1]:
import os
import re
import random
import openai
import demoji
import numpy as np
import pandas as pd

from time import sleep
from tqdm.notebook import tqdm
from unidecode import unidecode


# Raise pandas' display limits (row count, column count, total width and
# per-cell width) used when frames are rendered in the notebook.
for option_name, option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
    ("display.max_colwidth", 1000),
):
    pd.set_option(option_name, option_value)

# The OpenAI key is taken from the OPENAI_API_KEY environment variable
# (None when the variable is not set).
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [2]:
# Source corpus for the few-shot examples; the loading cell below handles
# "HAMBURG" and "CARDS".
DATA = "CARDS"
# Generator selector read by the generation loop, which handles "GPT-3",
# "CHATGPT" and "GPT-4".
MODEL = "GPT-4" # alternatives: "GPT-3", "CHATGPT"

In [3]:
# CSV that the generation loop reads (to resume) and rewrites after every
# sample; the file name embeds the DATA and MODEL settings.
FILE = f"../datasets/generated_disinformation_taxonomy_{DATA}_{MODEL}_specific_samples_V1.csv"

In [4]:
# Load the pool of texts that few-shot examples are sampled from.
# `generate_prompt` later reads the `claim` and `text` columns of this frame.
if DATA == "HAMBURG":
    contrarian_data = pd.read_csv("../datasets/hamburg_misinformation_sampled.csv", low_memory=False)
    # Keep only rows whose `cards_pred_score` is above 0.9.
    contrarian_data = contrarian_data[contrarian_data["cards_pred_score"] > 0.9].copy(deep=True)
elif DATA == "CARDS":
    contrarian_data = pd.read_csv("../datasets/cards_waterloo_augmented.csv", low_memory=False)
    # Keep only rows from the "cards" dataset that belong to the TRAIN partition.
    contrarian_data = contrarian_data[
        (contrarian_data.DATASET == "cards") & (contrarian_data.PARTITION == "TRAIN")].copy(deep=True)
else:
    # Fail here instead of leaving `contrarian_data` undefined (or stale from
    # an earlier run of this cell) and breaking much later in the notebook.
    raise ValueError(f'Unsupported DATA value {DATA!r}; expected "HAMBURG" or "CARDS".')

<img src="../images/image.png" alt="drawing" width="600"/>

In [8]:
# Placeholder vocabulary. Each key is the text of a bracketed token that can
# appear in a query template (e.g. "[ice]"); `generate_prompt` replaces the
# token with one randomly chosen string from the corresponding list.
masks = {
    "ice": ["ice", "permafrost", "snow"],
    "ice age" : ["ice age", "global cooling"],
    "cold": ["cold", "snowing"],
    "reason_1_7": ["isn't increasing", "has happened before", "isn't linked to climate change"],
    "cycle": ["cycle", "variation"],
    "greenhouse effect": ["greenhouse effect", "carbon dioxide"],
    "climate sensitivity is low": ["climate sensitivity is low", "climate negative feedbacks reduce warming"],
    "species": ["species", "plants", "reefs"],
    "aren't showing climate impacts": ["aren't showing climate impacts", "are benefiting from climate change"],
    "beneficial": ["beneficial", "not a pollutant"],
    "policies are": ["policies are", "mitigation is", "adaptation is"],
    "ineffective": ["ineffective", "flawed"],
    "technologies": ["technologies", "biofuels"],
    "fossil fuels": ["gas", "fossil fuels", "nuclear"],
    "unreliable": ["unreliable", "uncertain", "unsound"],
    "science": ["data", "methods", "models"],
    "alarmists": ["unreliable", "alarmists", "corrupt"]
}

# Tweet-oriented instruction templates keyed by label code. `generate_prompt`
# looks a label up here and prepends the resulting instruction to the
# numbered few-shot examples.
# NOTE(review): a later cell assigns `queries` again with "claims" wording;
# whichever of the two cells ran last determines the prompts actually used.
queries = {
    '1_1': "From these tweets generate a new one that implies the [ice] is not melting:\n\n",
    '1_2': "From these tweets generate a new tweet contrarian to climate change since we're heading into an [ice age]:\n\n",
    '1_3': "Contrarian tweets to climate change since the weather is [cold]:\n\n",
    '1_4': "Contrarian tweets to climate change since there haven't been changes in the last decades:\n\n",
    '1_6': "Contrarian tweets to climate change since the sea level rise is an exageration:\n\n",
    '1_7': "Contrarian tweets to climate change since extreme weather [reason_1_7]:\n\n",
    '2_1': "Contrarian tweets to climate change since it is a natural [cycle]:\n\n",
    '2_3': "Contrarian tweets to climate change based on no evidence for [greenhouse effect]:\n\n",
    '3_1': "Contrarian tweets to climate change based on [climate sensitivity is low]:\n\n",
    '3_2': "Contrarian tweets to climate change since the [species] [aren't showing climate impacts]:\n\n",
    '3_3': "Contrarian tweets to climate change based on the CO2 is [beneficial]:\n\n",
    '4_1': "Contrarian tweets to climate change based on climate [policies are] harmful:\n\n",
    '4_2': "Contrarian tweets to climate change based on climate policies are [ineffective]:\n\n",
    '4_4': "Contrarian tweets to climate change based on clean energy [technologies] won't work:\n\n",
    '4_5': "Contrarian tweets to climate change based on the need of energy from [fossil fuels]:\n\n",
    '5_1': "Contrarian tweets to climate change based on climate-related [science] is [unreliable]:\n\n",
    '5_2': "Contrarian tweets to climate change based on the fact that of climate change supporters are [alarmists]:\n\n",
}
# Render the templates as a one-column table for inspection.
pd.DataFrame(queries.values(), index=queries.keys())

Unnamed: 0,0
1_1,From these tweets generate a new one that implies the [ice] is not melting:\n\n
1_2,From these tweets generate a new tweet contrarian to climate change since we're heading into an [ice age]:\n\n
1_3,Contrarian tweets to climate change since the weather is [cold]:\n\n
1_4,Contrarian tweets to climate change since there haven't been changes in the last decades:\n\n
1_6,Contrarian tweets to climate change since the sea level rise is an exageration:\n\n
1_7,Contrarian tweets to climate change since extreme weather [reason_1_7]:\n\n
2_1,Contrarian tweets to climate change since it is a natural [cycle]:\n\n
2_3,Contrarian tweets to climate change based on no evidence for [greenhouse effect]:\n\n
3_1,Contrarian tweets to climate change based on [climate sensitivity is low]:\n\n
3_2,Contrarian tweets to climate change since the [species] [aren't showing climate impacts]:\n\n


In [9]:
# Claim-oriented instruction templates. This assignment rebinds `queries`,
# replacing the tweet-oriented wording defined in the previous cell while
# keeping the same label keys. Bracketed tokens are filled in by
# `generate_prompt` from the `masks` dictionary.
queries = {
    '1_1': "From these claims generate a new one that implies the [ice] is not melting:\n\n",
    '1_2': "From these claims generate a new claim contrarian to climate change based on the argument we're heading into an [ice age]:\n\n",
    '1_3': "From these claims generate a new claim contrarian to climate change based on the [cold] weather of the moment:\n\n",
    '1_4': "From these claims generate a new claim contrarian to climate change based on the argument that there haven't been changes or effects in the last decades:\n\n",
    '1_6': "From these claims generate a new claim contrarian to climate change based on the argument that the sea level rise has been exaggerated:\n\n",
    '1_7': "From these claims generate a new claim contrarian to climate change based on the argument that extreme weather [reason_1_7]:\n\n",
    '2_1': "From these claims generate a new claim contrarian to climate change based on the argument that it is a natural [cycle]:\n\n",
    '2_3': "From these claims generate a new claim contrarian to climate change based on no evidence for [greenhouse effect]:\n\n",
    '3_1': "From these claims generate a new claim contrarian to climate change based on [climate sensitivity is low]:\n\n",
    '3_2': "From these claims generate a new claim contrarian to climate change based on the argument that since [species] [aren't showing climate impacts]:\n\n",
    '3_3': "From these claims generate a new claim contrarian to climate change based on the CO2 is [beneficial]:\n\n",
    '4_1': "From these claims generate a new claim contrarian to climate change based on climate [policies are] harmful:\n\n",
    '4_2': "From these claims generate a new claim contrarian to climate change based on climate policies are [ineffective]:\n\n",
    '4_4': "From these claims generate a new claim contrarian to climate change based on clean energy [technologies] won't work:\n\n",
    '4_5': "From these claims generate a new claim contrarian to climate change based on the need of energy from [fossil fuels]:\n\n",
    '5_1': "From these claims generate a new claim contrarian to climate change based on climate-related [science] is [unreliable]:\n\n",
    '5_2': "From these claims generate a new claim contrarian to climate change based on the fact that of climate change supporters are [alarmists]:\n\n",
}
# Render the templates as a one-column table for inspection.
pd.DataFrame(queries.values(), index=queries.keys())

Unnamed: 0,0
1_1,From these claims generate a new one that implies the [ice] is not melting:\n\n
1_2,From these claims generate a new claim contrarian to climate change based on the argument we're heading into an [ice age]:\n\n
1_3,From these claims generate a new claim contrarian to climate change based on the [cold] weather of the moment:\n\n
1_4,From these claims generate a new claim contrarian to climate change based on the argument that there haven't been changes or effects in the last decades:\n\n
1_6,From these claims generate a new claim contrarian to climate change based on the argument that the sea level rise has been exaggerated:\n\n
1_7,From these claims generate a new claim contrarian to climate change based on the argument that extreme weather [reason_1_7]:\n\n
2_1,From these claims generate a new claim contrarian to climate change based on the argument that it is a natural [cycle]:\n\n
2_3,From these claims generate a new claim contrarian to climate change based on no evidence for [greenhouse effect]:\n\n
3_1,From these claims generate a new claim contrarian to climate change based on [climate sensitivity is low]:\n\n
3_2,From these claims generate a new claim contrarian to climate change based on the argument that since [species] [aren't showing climate impacts]:\n\n


In [5]:
# queries = {
#     '4_5': "From these claims generate a new claim contrarian to climate change based on the need of energy from [fossil fuels]:\n\n",
# }

def generate_prompt(data, label, nshots=3):
    """Build a few-shot prompt for one label.

    Randomly draws `nshots` texts whose `claim` equals `label`, numbers them
    1..nshots, and appends the next number as an open slot for the model to
    complete. The instruction line is taken from the module-level `queries`
    dict; every `[token]` placeholder in it is replaced by a randomly chosen
    entry of the module-level `masks` dict.

    Args:
        data: DataFrame with `claim` and `text` columns.
        label: Key into `queries`; also the value matched against `data.claim`.
        nshots: Number of example texts to include in the prompt.

    Returns:
        A tuple `(prompt, idx)` with the prompt string and the list of index
        labels of the sampled rows.

    Raises:
        ValueError: Raised by `Series.sample` when fewer than `nshots` rows
            match `label`.
        KeyError: If `label` is missing from `queries`, or a placeholder is
            missing from `masks`.
    """
    # Few-shot examples: sample matching texts and remember where they came from.
    texts = data.loc[data.claim == label, "text"]
    samples = texts.sample(nshots)
    idx = samples.index.tolist()

    # Strip newlines so each example stays on one line, and pass the text
    # through `unidecode` before numbering it.
    numbered = [
        "{}. {}".format(position, unidecode(str(sample).replace("\n", "")))
        for position, sample in enumerate(samples.tolist(), start=1)
    ]
    # The open slot continues the numbering. It used to be hard-coded to "4. ",
    # which was only correct for the default nshots=3.
    fewshots = "\n\n".join(numbered) + "\n\n{}. ".format(nshots + 1)

    # Instruction line: fill each bracketed placeholder with a random variant.
    query = queries[label]
    matches = re.findall(r"\[.*?\]", query)
    for m in matches:
        # m includes the brackets; the mask key is the text between them.
        choice = random.choice(masks[m[1:-1]])
        query = query.replace(m, choice)

    prompt = query + fewshots

    return prompt, idx

In [None]:
# Continue from an existing output file if there is one, otherwise start empty.
if os.path.isfile(FILE):
    new_data = pd.read_csv(FILE)
else:
    new_data = pd.DataFrame()

n = 400  # samples requested per label
labels = list(queries.keys())

MAX_ATTEMPTS = 5         # API attempts per prompt before the sample is skipped
RETRY_WAIT_SECONDS = 60  # pause after a failed API call

# MODEL setting -> OpenAI chat model name ("GPT-3" uses the completion endpoint).
CHAT_MODELS = {"CHATGPT": "gpt-3.5-turbo", "GPT-4": "gpt-4"}

# Validate up front: previously an unknown MODEL fell through every branch and
# left `completions` undefined (or stale) when the row was built.
if MODEL != "GPT-3" and MODEL not in CHAT_MODELS:
    raise ValueError(f'Unsupported MODEL value {MODEL!r}; expected "GPT-3", "CHATGPT" or "GPT-4".')


def request_completions(prompt):
    """Send `prompt` to the model selected by MODEL and return the generated texts.

    Returns a list with one string per choice in the API response. Errors from
    the OpenAI client propagate to the caller.
    """
    if MODEL == "GPT-3":
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=[prompt],
            temperature=0,
            max_tokens=60,
        )
        return [r["text"] for r in response["choices"]]

    response = openai.ChatCompletion.create(
        model=CHAT_MODELS[MODEL],
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        max_tokens=60,
    )
    return [r["message"]["content"] for r in response["choices"]]


# NOTE(review): `labels[6:]` skips the first six labels, presumably to resume
# after a partial run; confirm the offset before re-running from scratch.
for label in tqdm(labels[6:]):
    for i in tqdm(range(n), desc=f"{label}:"):
        sleep(0.01)
        prompt, idx = generate_prompt(contrarian_data, label)

        # Retry the same prompt on failure. Previously a failed call fell
        # through and the row was built from the *previous* iteration's
        # `completions` (or raised NameError on the very first failure),
        # attributing old text to the new label and `based_claims`.
        completions = None
        for attempt in range(MAX_ATTEMPTS):
            try:
                completions = request_completions(prompt)
                break
            except Exception as e:
                print(f"Exception: {e}.")
                sleep(RETRY_WAIT_SECONDS)

        if completions is None:
            # Every attempt failed: skip this sample rather than store wrong data.
            continue

        tmp = {"text": completions, "generated_label": label, "based_claims": str(idx)}
        tmp = pd.DataFrame(tmp)
        new_data = pd.concat([new_data, tmp])

        # Checkpoint after every sample so an interrupted run loses nothing.
        new_data.to_csv(FILE, index=False)

  0%|          | 0/17 [00:00<?, ?it/s]

1_1::   0%|          | 0/400 [00:00<?, ?it/s]

1_2::   0%|          | 0/400 [00:00<?, ?it/s]

1_3::   0%|          | 0/400 [00:00<?, ?it/s]

1_4::   0%|          | 0/400 [00:00<?, ?it/s]

1_6::   0%|          | 0/400 [00:00<?, ?it/s]

Exception: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
Exception: Error communicating with OpenAI: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')).


1_7::   0%|          | 0/400 [00:00<?, ?it/s]

2_1::   0%|          | 0/400 [00:00<?, ?it/s]

In [None]:
# Display the generated rows with repeated `text` values collapsed to their
# first occurrence; the result is not assigned, so `new_data` is unchanged.
new_data.drop_duplicates(subset=["text"])

In [5]:
# Ad-hoc single request to GPT-4 asking it to continue a paragraph; it does
# not use the few-shot prompt builder.
prompt = """
Complete the next idea:
The algorithm will be developed in PyTorch, a library for Parallel Linear Programming that supports GPU usage. Most of the frameworks that have been mentioned except LCP, have already been implemented in PyTorch and can be found in public repositories. The main goal of this stage is to combine these frameworks into a standard pattern that works with the environments.
"""

messages = [{"role": "user", "content": prompt}]
response = openai.ChatCompletion.create(
    model="gpt-4",
    messages=messages,
    temperature=0,
    max_tokens=60,
)
# One generated string per returned choice.
completions = [choice["message"]["content"] for choice in response["choices"]]

RateLimitError: You exceeded your current quota, please check your plan and billing details.