In [1]:
import os
import re
import random
import openai
import demoji
import numpy as np
import pandas as pd

from time import sleep
from tqdm.notebook import tqdm
from unidecode import unidecode


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)

openai.api_key = os.getenv("OPENAI_API_KEY")

In [2]:
DATA = "CARDS"
MODEL = "CHATGPT" ## GPT-3

In [3]:
FILE = f"../datasets/generated_disinformation_taxonomy_{DATA}_{MODEL}.csv"

In [4]:
if DATA=="HAMBURG":
    contrarian_data = pd.read_csv("../datasets/hamburg_misinformation_sampled.csv", low_memory=False)
    contrarian_data = contrarian_data[contrarian_data["cards_pred_score"] > 0.9].copy(deep=True)
elif DATA=="CARDS":
    contrarian_data = pd.read_csv("../datasets/cards_waterloo_augmented.csv", low_memory=False)
    contrarian_data = contrarian_data[
        (contrarian_data.DATASET=="cards")&(contrarian_data.PARTITION=="TEST")].copy(deep=True)

<img src="../images/image.png" alt="drawing" width="600"/>

In [7]:
masks = {
    "ice": ["ice", "permafrost", "snow"],
    "ice age" : ["ice age", "global cooling"],
    "cold": ["cold", "snowing"],
    "reason_1_7": ["isn't increasing", "has happened before", "isn't linked to climate change"],
    "cycle": ["cycle", "variation"],
    "greenhouse effect": ["greenhouse effect", "carbon dioxide"],
    "climate sensitivity is low": ["climate sensitivity is low", "climate negative feedbacks reduce warming"],
    "species": ["species", "plants", "reefs"],
    "aren't showing climate impacts": ["aren't showing climate impacts", "are benefiting from climate change"],
    "beneficial": ["beneficial", "not a pollutant"],
    "policies are": ["policies are", "mitigation is", "adaptation is"],
    "ineffective": ["ineffective", "flawed"],
    "technologies": ["technologies", "biofuels"],
    "fossil fuels": ["fossil fuels", "nuclear"],
    "unreliable": ["unreliable", "uncertain", "unsound"],
    "science": ["data", "methods", "models"],
    "alarmists": ["unreliable", "alarmists", "corrupt"]
}

queries = {
    '1_1': "From these tweets generate a new one that implies the [ice] is not melting:\n\n",
    '1_2': "From these tweets generate a new tweet contrarian to climate change since we're heading into an [ice age]:\n\n",
    '1_3': "Contrarian tweets to climate change since the weather is [cold]:\n\n",
    '1_4': "Contrarian tweets to climate change since there haven't been changes in the last decades:\n\n",
    '1_6': "Contrarian tweets to climate change since the sea level rise is an exageration:\n\n",
    '1_7': "Contrarian tweets to climate change since extreme weather [reason_1_7]:\n\n",
    '2_1': "Contrarian tweets to climate change since it is a natural [cycle]:\n\n",
    '2_3': "Contrarian tweets to climate change based on no evidence for [greenhouse effect]:\n\n",
    '3_1': "Contrarian tweets to climate change based on [climate sensitivity is low]:\n\n",
    '3_2': "Contrarian tweets to climate change since the [species] [aren't showing climate impacts]:\n\n",
    '3_3': "Contrarian tweets to climate change based on the CO2 is [beneficial]:\n\n",
    '4_1': "Contrarian tweets to climate change based on climate [policies are] harmful:\n\n",
    '4_2': "Contrarian tweets to climate change based on climate policies are [ineffective]:\n\n",
    '4_4': "Contrarian tweets to climate change based on clean energy [technologies] won't work:\n\n",
    '4_5': "Contrarian tweets to climate change based on the need of energy from [fossil fuels]:\n\n",
    '5_1': "Contrarian tweets to climate change based on climate-related [science] is [unreliable]:\n\n",
    '5_2': "Contrarian tweets to climate change based on the fact that of climate change supporters are [alarmists]:\n\n",
}

def generate_prompt(texts, query, nshots=3):
    """Generates"""
    # Generating fewshots
    samples = texts.sample(nshots).tolist()
    samples = ["{}. {}".format(
        i+1, unidecode(str(sample).replace("\n", ""))) for i, sample in enumerate(samples)]
    fewshots = "\n\n".join(samples) + "\n\n4. "
    
    # Generating query
    matches = re.findall(r"\[.*?\]", queries[label])
    for m in matches:
        choice = random.choice(masks[m[1:-1]])
        query = query.replace(m, choice)
    
    prompt = query + fewshots
    
    return prompt

if os.path.isfile(FILE):
    new_data = pd.read_csv(FILE)
else:
    new_data = pd.DataFrame()

n = 120
texts = contrarian_data["text"]
labels = list(queries.keys())

for label in tqdm(labels):
    for i in range(n):       
        
        prompt = generate_prompt(texts, queries[label])
        try:
            if MODEL=="GPT-3":
                prompts = [prompt]
                response = openai.Completion.create(
                  model="text-davinci-003",
                  prompt=prompts,
                  temperature=0,
                  max_tokens=60,
                )
                completions = [r["text"] for r in response["choices"]]
            elif MODEL=="CHATGPT":
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0,
                    max_tokens=60,
                )
                completions = [r["message"]["content"] for r in response["choices"]]
        except Exception as e:
            print(e)
            sleep(60)
        
        tmp = {"text": completions, "generated_label": label}
        
        tmp = pd.DataFrame(tmp)
        new_data = pd.concat([new_data, tmp])
        
        new_data.to_csv(FILE, index=False)

  0%|          | 0/17 [00:00<?, ?it/s]

Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
This model's maximum context length is 4096 tokens. However, your messages resulted in 4983 tokens. Please reduce the length of the messages.
Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
This model's maximum context length is 4096 tokens. However, your messages resulted in 4898 tokens. Please reduce the length of the messages.
This model's maximum context length is 4096 tokens. However, your messages resulted in 4907 tokens. Please reduce the length of the messages.
This model's maximum context length is 4096 tokens. However, your messages resulted in 4920 tokens. Please reduce the length of the messages.


In [14]:
new_data.text.str.contains("6.").value_counts()

False    1677
True      387
Name: text, dtype: int64

In [11]:
new_data

Unnamed: 0,text,generated_label
0,"Climate models have been consistently wrong in their predictions, and therefore cannot be relied upon to make policy decisions. We need to focus on real-world data and evidence, not computer simulations.",5_1
1,"The ice is not melting, as evidenced by the lack of statistically significant ""unprecedented changes"" in the industrial era.",1_1
2,"As we head into a global cooling period, it's time to stop the fear-mongering about climate change. The Earth's climate has always been in flux, and humans have little control over it. Let's focus on adapting to the changing climate instead of trying to stop it. #climate",1_2
3,"The fact that it's cold outside doesn't disprove climate change. It's important to look at long-term trends and data, not just short-term weather patterns.",1_3
4,"Climate change is just a natural cycle of the Earth's climate. The planet has gone through periods of warming and cooling throughout its history, long before humans even existed. It's arrogant to think that we have the power to significantly alter the Earth's climate.",1_4
...,...,...
0,The climate change movement is nothing more than a money-making scheme for corrupt politicians and scientists. They use fear tactics to push their agenda and line their pockets with taxpayer dollars. Don't be fooled by their propaganda.,5_2
0,Climate change alarmists are just fear-mongering. The Earth has gone through natural cycles of warming and cooling for millions of years. We don't need to panic and drastically change our way of life based on unproven theories.\n\n5. The media and politicians push the climate change agenda to,5_2
0,"The so-called ""climate change"" movement is just a way for corrupt politicians and scientists to make money and gain power. Don't believe the hype. #fakenews #climatehoax\n\n5. The Earth's climate has always been changing, long before humans even existed. It's arrogant",5_2
0,"Climate change supporters are unreliable because they rely on flawed models and cherry-picked data to push their agenda. The science behind climate change is far from settled, and we should not be making drastic policy decisions based on incomplete and unreliable information. #climatechange #unreliable #flawedmodels",5_2
