# Import libraries and data

In [1]:
# Import libraries
import os
import openai
import pandas as pd
import numpy as np
import openpyxl

In [2]:
# Get the OpenAI API secret key from a local file
# (replace the contents of this file with your own OpenAI key).
# The context manager closes the file handle as soon as the key is read, and
# .strip() removes surrounding whitespace such as the trailing newline most
# editors append, so it does not become part of the key.
with open('../data/local/openai_key.txt', 'r') as key_file:
    open_ai_key = key_file.read().strip()
openai.api_key = open_ai_key
# Optional sanity check (uncomment to list the models visible to this key):
#openai.Model.list()

In [12]:
# Load the monthly tweet file; reset_index() turns the row index into a column
monthly_csv_path = '../data/local/df_neu_2018-12.csv'
df_2018_12_neu = pd.read_csv(monthly_csv_path)
df_2018_12_neu = df_2018_12_neu.reset_index()
df_2018_12_neu.info()

FileNotFoundError: [Errno 2] No such file or directory: '../data/local/df_neu_2018-12.csv'

## Representative sample based on week and location

In [381]:
# Step 1: Count the rows in every (year_week, LocationName) group
strata_keys = ['year_week', 'LocationName']
rows_per_group = df_2018_12_neu.groupby(strata_keys).size()
group_sizes = rows_per_group.reset_index(name='group_size')

# Step 2: Express each group as a share of the whole dataset
total_rows = len(df_2018_12_neu)
group_sizes['proportion'] = group_sizes['group_size'].div(total_rows)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2019-w48,Acre,2,0.000316
1,2019-w48,Alagoas,10,0.001578
2,2019-w48,Amapa,6,0.000947
3,2019-w48,Amazonas,10,0.001578
4,2019-w48,Bahia,33,0.005206
...,...,...,...,...
118,2019-w52,Rondonia,3,0.000473
119,2019-w52,Roraima,2,0.000316
120,2019-w52,Santa Catarina,27,0.004259
121,2019-w52,Sao Paulo,249,0.039281


In [382]:
# Step 3: Overall number of rows we aim to sample
target_sample_size = 2450

# Step 4: Allocate the target across groups in proportion to their share.
# Fractions are rounded down, so the allocations can sum to less than the target.
expected_rows = group_sizes['proportion'].mul(target_sample_size)
group_sizes['sample_size'] = np.floor(expected_rows).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2019-w48,Acre,2,0.000316,0
1,2019-w48,Alagoas,10,0.001578,3
2,2019-w48,Amapa,6,0.000947,2
3,2019-w48,Amazonas,10,0.001578,3
4,2019-w48,Bahia,33,0.005206,12
...,...,...,...,...,...
118,2019-w52,Rondonia,3,0.000473,1
119,2019-w52,Roraima,2,0.000316,0
120,2019-w52,Santa Catarina,27,0.004259,10
121,2019-w52,Sao Paulo,249,0.039281,96


In [383]:
# Step 5: Attach each row's group sample size to the original DataFrame
sample_size_lookup = group_sizes[['year_week', 'LocationName', 'sample_size']]
df_2018_12_neu_sample_size = df_2018_12_neu.merge(
    sample_size_lookup,
    on=['year_week', 'LocationName'],
    how='left',
)

In [384]:
# Step 6: Perform the stratified sampling
def _draw_group_sample(group):
    """Sample as many rows from `group` as its (constant) sample_size column says."""
    rows_to_draw = group['sample_size'].iloc[0]
    return group.sample(n=rows_to_draw, random_state=42)


grouped_by_stratum = df_2018_12_neu_sample_size.groupby(['year_week', 'LocationName'])
df_2018_12_neu_sample = grouped_by_stratum.apply(_draw_group_sample).reset_index(drop=True)

# Step 7: Report how many rows were drawn in total
print(f"Sampled {len(df_2018_12_neu_sample)} rows.")


Sampled 2387 rows.


In [385]:
# Concatenate the sampled tweets into one newline-separated string for the prompt
tweet_texts = df_2018_12_neu_sample['text'].astype(str)
text_gpt_2018_12_neu = '\n'.join(tweet_texts)
text_gpt_2018_12_neu

'Rolê p tomar vacina? Me chama q eu vou agr não vale rir pq eu vou chorar\nNão tomei a vacina de sarampo e vocês?\n@user vacinação\nAqui dá para entender a vacina de Crivella ao tentar desviar a atenção e culpar o mensageiro. Sempre. <url>\ndei vacina pro filho e o filho tá tudo bem\n@user @user Eu mesmo nunca tomei vacina e tô ótimo\nApós 80 jovens passarem mal, pesquisadores descartam problemas com vacina contra HPV <url> <url>\n@user @user 20 a vacina, com o coquetel sai 50. Então avisa que n é coquetel\nMamães papais cuidadores de quianças: Peloamordedeus não ameacem suas crianças com ir ao médico, tomar vacina, injeção, agulha, etc Sério Isso só cria um trauma neles q quando vem no médico e só gritaria choro Fogo no parquinho um horror\nGoverno e pesquisadores descartam problemas com vacina contra HPV - <url> <url>\nAqui dá para entender a vacina de Crivella ao tentar desviar a atenção e culpar o mensageiro. Sempre. <url>\n@user Sou mais meu pai levando o Gato pra vacinar <url>\nT

# Prompts and loops

In [245]:
# Prompt template for GPT; the trailing TWEETS token is a placeholder for the tweet text
prompt_raw_1 = (
    "I would like to summarise in maximum 300 characters and in English "
    "what the following sentences in Portuguese talk about. "
    "Do not use aspects mentioned only in one sentence as part of the summary: TWEETS"
)

In [386]:
# Fill the placeholder in the prompt template with the joined tweet text
placeholder = 'TWEETS'
prompt_1 = prompt_raw_1.replace(placeholder, text_gpt_2018_12_neu)
prompt_1

'I would like to summarise in maximum 300 characters and in English what the following sentences in Portuguese talk about. Do not use aspects mentioned only in one sentence as part of the summary: Rolê p tomar vacina? Me chama q eu vou agr não vale rir pq eu vou chorar\nNão tomei a vacina de sarampo e vocês?\n@user vacinação\nAqui dá para entender a vacina de Crivella ao tentar desviar a atenção e culpar o mensageiro. Sempre. <url>\ndei vacina pro filho e o filho tá tudo bem\n@user @user Eu mesmo nunca tomei vacina e tô ótimo\nApós 80 jovens passarem mal, pesquisadores descartam problemas com vacina contra HPV <url> <url>\n@user @user 20 a vacina, com o coquetel sai 50. Então avisa que n é coquetel\nMamães papais cuidadores de quianças: Peloamordedeus não ameacem suas crianças com ir ao médico, tomar vacina, injeção, agulha, etc Sério Isso só cria um trauma neles q quando vem no médico e só gritaria choro Fogo no parquinho um horror\nGoverno e pesquisadores descartam problemas com vaci

In [None]:
# Send the filled-in prompt as a single user message and keep the raw response
chat_messages = [{"role": "user", "content": prompt_1}]
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",
    messages=chat_messages,
    max_tokens=400,   # Maximum number of tokens in the response
    temperature=0.7,  # Controls the randomness of the output
)

In [388]:
# Show the text of the first choice returned by the model
first_choice = response.choices[0]
first_choice.message.content

'The text discusses various opinions and experiences related to vaccination, including reluctance to vaccinate children and pets, reactions to vaccines, and the societal implications of anti-vaccine sentiments. It highlights personal stories of fear, confusion, and the importance of vaccination for public health. The mention of government campaigns and the historical context of vaccination resistance, such as the "Revolta da Vacina," reflects ongoing debates about health and safety.'

In [216]:
# Accumulators that collect one dataset label and one GPT summary per run.
# They are created only when missing: on a fresh kernel the later append cells
# no longer fail with a NameError, and re-running this cell in a live session
# does not throw away entries that were already collected.
if 'summary' not in globals():
    summary = []
if 'dataset' not in globals():
    dataset = []

In [371]:
# Record the label of the dataset that was just summarised
dataset_label = "2019_12_neutral"
dataset.append(dataset_label)

In [374]:
# Keep the text of the latest GPT response in the summary list
summary_text = response.choices[0].message.content
summary.append(summary_text)

In [375]:
# Pair labels with summaries position by position (zip stops at the shorter list)
label_summary_pairs = list(zip(dataset, summary))
df_2018 = pd.DataFrame(label_summary_pairs, columns=['dataset', 'summary'])
df_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  11 non-null     object
 1   summary  11 non-null     object
dtypes: object(2)
memory usage: 304.0+ bytes


In [376]:
# Show the last 14 rows (the whole frame when it has fewer rows than that)
df_2018.iloc[-14:]

Unnamed: 0,dataset,summary
0,2019_01_neutral,The sentences discuss various perspectives on ...
1,2019_02_neutral,The sentences discuss various aspects of vacci...
2,2019_03_neutral,The excerpts discuss various aspects of vaccin...
3,2019_04_neutral,The text primarily discusses various aspects o...
4,2019_05_neutral,The texts discuss various aspects of vaccinati...
5,2019_06_neutral,The text discusses various aspects of vaccinat...
6,2019_07_neutral,The text primarily discusses experiences and o...
7,2019_08_neutral,The text discusses vaccination campaigns in Br...
8,2019_09_neutral,Experts discuss the challenges of combating mi...
9,2019_10_neutral,The texts discuss the initiation of a national...


In [377]:
# Display the raw API response object for inspection
response

<OpenAIObject chat.completion id=chatcmpl-ATQgTkhtGLrgBHDKaJqQgWuPrAzV1 at 0x214d08e3810> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The provided texts primarily discuss vaccination, particularly the historical context of the \"Revolta da Vacina\" (Vaccine Revolt) in Brazil and current vaccination campaigns against diseases like measles and foot-and-mouth disease. The narrative includes personal anecdotes about taking vaccines, societal attitudes towards vaccination, and the importance of immunization. Additionally, there are comments about misinformation surrounding vaccines and the public's response to vaccination campaigns. Overall, the texts reflect on the significance of vaccination in public health and individual experiences related to it, particularly in the context of Brazilian society and health policies.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  

In [378]:
# Persist the label/summary table as a CSV file
summary_csv_path = '../data/summary_2018_neutral_tweets.csv'
df_2018.to_csv(summary_csv_path)