# Import libraries and data

In [185]:
# Import libraries
import os
import openai
import pandas as pd
import numpy as np
import openpyxl

In [186]:
# Get OpenAI API secret key (change this file with your own OpenAI key)
open_ai_key = open('../data/local/openai_key.txt', 'r').read()
openai.api_key = open_ai_key
#openai.Model.list()

In [187]:
# Prompt for GPT
prompt_raw = "I would like to summarise in maximum 300 characters and in English what the following sentences in Portuguese talk about. \
Do not use aspects mentioned only in one sentence as part of the summary: TWEETS"

# 2017 positive

## January

In [188]:
# Get monthly data
df_2017_01_pos = pd.read_csv('../data/local/df_pos_2017-01.csv').reset_index()
df_2017_01_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3830 entries, 0 to 3829
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3830 non-null   int64  
 1   LocationCode            3830 non-null   object 
 2   LocationName            3830 non-null   object 
 3   StatesCode              3830 non-null   object 
 4   ...4                    3830 non-null   int64  
 5   id                      3830 non-null   float64
 6   text                    3830 non-null   object 
 7   label                   3830 non-null   object 
 8   in_reply_to_status_id   214 non-null    float64
 9   in_reply_to_user_id     235 non-null    float64
 10  quoted_user_id          71 non-null     float64
 11  quoted_status_id        71 non-null     float64
 12  retweeted_user_id       1100 non-null   float64
 13  retweeted_status_id     1100 non-null   float64
 14  created_at              3830 non-null   

### Representative sample based on week and location

In [189]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2017_01_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2017_01_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w01,Acre,2,0.000522
1,2017-w01,Alagoas,15,0.003916
2,2017-w01,Amapa,3,0.000783
3,2017-w01,Amazonas,42,0.010966
4,2017-w01,Bahia,78,0.020366
...,...,...,...,...
113,2017-w05,Pernambuco,6,0.001567
114,2017-w05,Rio De Janeiro,1,0.000261
115,2017-w05,Rio Grande Do Sul,6,0.001567
116,2017-w05,Santa Catarina,4,0.001044


In [190]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w01,Acre,2,0.000522,1
1,2017-w01,Alagoas,15,0.003916,9
2,2017-w01,Amapa,3,0.000783,1
3,2017-w01,Amazonas,42,0.010966,26
4,2017-w01,Bahia,78,0.020366,49
...,...,...,...,...,...
113,2017-w05,Pernambuco,6,0.001567,3
114,2017-w05,Rio De Janeiro,1,0.000261,0
115,2017-w05,Rio Grande Do Sul,6,0.001567,3
116,2017-w05,Santa Catarina,4,0.001044,2


In [191]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2017_01_pos_sample_size = pd.merge(df_2017_01_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [192]:
# Step 6: Perform the stratified sampling
df_2017_01_pos_sample = df_2017_01_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_01_pos_sample)} rows.")


Sampled 2390 rows.


In [193]:
text_gpt_2017_01_pos = '\n'.join(df_2017_01_pos_sample['text'].astype(str))

### Prompts and loops

In [194]:
prompt_1 = prompt_raw.replace('TWEETS', text_gpt_2017_01_pos)

In [195]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_1}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [196]:
response.choices[0].message.content

'Penápolis will participate in the yellow fever vaccination campaign. The HPV vaccination has also been initiated for boys aged 12 to 13, with the goal of vaccinating over 735,000 boys in São Paulo. Additionally, a new dengue vaccine is expected by 2019, with investments for its production underway.'

In [197]:
summary = []
dataset = []

In [198]:
dataset.append("2017_01_positive")

In [199]:
summary.append(response.choices[0].message.content)

In [200]:
df_2017_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2017_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  1 non-null      object
 1   summary  1 non-null      object
dtypes: object(2)
memory usage: 144.0+ bytes


In [201]:
df_2017_pos.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_positive,Penápolis will participate in the yellow fever...


In [202]:
response

<OpenAIObject chat.completion id=chatcmpl-AUB4iXQsnZAGvVOGC7LLXuHTfsOPa at 0x2170d3b15e0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Pen\u00e1polis will participate in the yellow fever vaccination campaign. The HPV vaccination has also been initiated for boys aged 12 to 13, with the goal of vaccinating over 735,000 boys in S\u00e3o Paulo. Additionally, a new dengue vaccine is expected by 2019, with investments for its production underway.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731755776,
  "id": "chatcmpl-AUB4iXQsnZAGvVOGC7LLXuHTfsOPa",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 64,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "reasoning_tokens": 0,
      "rejected_prediction_toke

## February

In [203]:
# Get monthly data
df_2017_02_pos = pd.read_csv('../data/local/df_pos_2017-02.csv').reset_index()
df_2017_02_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2708 entries, 0 to 2707
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2708 non-null   int64  
 1   LocationCode            2708 non-null   object 
 2   LocationName            2708 non-null   object 
 3   StatesCode              2708 non-null   object 
 4   ...4                    2708 non-null   int64  
 5   id                      2708 non-null   float64
 6   text                    2708 non-null   object 
 7   label                   2708 non-null   object 
 8   in_reply_to_status_id   151 non-null    float64
 9   in_reply_to_user_id     164 non-null    float64
 10  quoted_user_id          71 non-null     float64
 11  quoted_status_id        71 non-null     float64
 12  retweeted_user_id       919 non-null    float64
 13  retweeted_status_id     919 non-null    float64
 14  created_at              2708 non-null   

### Representative sample based on week and location

In [204]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2017_02_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2017_02_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w05,Acre,1,0.000369
1,2017-w05,Alagoas,2,0.000739
2,2017-w05,Amapa,3,0.001108
3,2017-w05,Amazonas,7,0.002585
4,2017-w05,Bahia,21,0.007755
...,...,...,...,...
116,2017-w09,Roraima,1,0.000369
117,2017-w09,Santa Catarina,5,0.001846
118,2017-w09,Sao Paulo,88,0.032496
119,2017-w09,Sergipe,9,0.003323


In [205]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w05,Acre,1,0.000369,0
1,2017-w05,Alagoas,2,0.000739,1
2,2017-w05,Amapa,3,0.001108,2
3,2017-w05,Amazonas,7,0.002585,6
4,2017-w05,Bahia,21,0.007755,18
...,...,...,...,...,...
116,2017-w09,Roraima,1,0.000369,0
117,2017-w09,Santa Catarina,5,0.001846,4
118,2017-w09,Sao Paulo,88,0.032496,79
119,2017-w09,Sergipe,9,0.003323,8


In [206]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2017_02_pos_sample_size = pd.merge(df_2017_02_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [207]:
# Step 6: Perform the stratified sampling
df_2017_02_pos_sample = df_2017_02_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_02_pos_sample)} rows.")


Sampled 2375 rows.


In [208]:
text_gpt_2017_02_pos = '\n'.join(df_2017_02_pos_sample['text'].astype(str))

### Prompts and loops

In [211]:
prompt_2 = prompt_raw.replace('TWEETS', text_gpt_2017_02_pos)

In [212]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_2}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [213]:
response.choices[0].message.content

'The text discusses various vaccination efforts in Brazil, particularly against yellow fever and the Zika virus. It highlights the urgent need for vaccination due to outbreaks, with many health campaigns and increased vaccine availability in response to rising cases. Additionally, it mentions the effectiveness of a new vaccine against HIV.'

In [214]:
#summary = []
#dataset = []

In [215]:
dataset.append("2017_02_positive")

In [216]:
summary.append(response.choices[0].message.content)

In [217]:
df_2017_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2017_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  2 non-null      object
 1   summary  2 non-null      object
dtypes: object(2)
memory usage: 160.0+ bytes


In [218]:
df_2017_pos.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_positive,Penápolis will participate in the yellow fever...
1,2017_02_positive,The text discusses various vaccination efforts...


In [219]:
response

<OpenAIObject chat.completion id=chatcmpl-AUB5guRq9ftdIKVKfANjRzHXd5UJW at 0x2170f34dc70> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination efforts in Brazil, particularly against yellow fever and the Zika virus. It highlights the urgent need for vaccination due to outbreaks, with many health campaigns and increased vaccine availability in response to rising cases. Additionally, it mentions the effectiveness of a new vaccine against HIV.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731755836,
  "id": "chatcmpl-AUB5guRq9ftdIKVKfANjRzHXd5UJW",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 57,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "reasoning_tokens": 0,


## March

In [220]:
# Get monthly data
df_2017_03_pos = pd.read_csv('../data/local/df_pos_2017-03.csv').reset_index()
df_2017_03_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   5000 non-null   int64  
 1   LocationCode            5000 non-null   object 
 2   LocationName            5000 non-null   object 
 3   StatesCode              5000 non-null   object 
 4   ...4                    5000 non-null   int64  
 5   id                      5000 non-null   float64
 6   text                    5000 non-null   object 
 7   label                   5000 non-null   object 
 8   in_reply_to_status_id   264 non-null    float64
 9   in_reply_to_user_id     281 non-null    float64
 10  quoted_user_id          77 non-null     float64
 11  quoted_status_id        77 non-null     float64
 12  retweeted_user_id       1109 non-null   float64
 13  retweeted_status_id     1109 non-null   float64
 14  created_at              5000 non-null   

### Representative sample based on week and location

In [221]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2017_03_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2017_03_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w09,Alagoas,7,0.0014
1,2017-w09,Amapa,1,0.0002
2,2017-w09,Amazonas,3,0.0006
3,2017-w09,Bahia,30,0.0060
4,2017-w09,Ceara,12,0.0024
...,...,...,...,...
113,2017-w13,Rondonia,2,0.0004
114,2017-w13,Roraima,1,0.0002
115,2017-w13,Santa Catarina,25,0.0050
116,2017-w13,Sao Paulo,94,0.0188


In [222]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w09,Alagoas,7,0.0014,3
1,2017-w09,Amapa,1,0.0002,0
2,2017-w09,Amazonas,3,0.0006,1
3,2017-w09,Bahia,30,0.0060,14
4,2017-w09,Ceara,12,0.0024,5
...,...,...,...,...,...
113,2017-w13,Rondonia,2,0.0004,0
114,2017-w13,Roraima,1,0.0002,0
115,2017-w13,Santa Catarina,25,0.0050,12
116,2017-w13,Sao Paulo,94,0.0188,46


In [223]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2017_03_pos_sample_size = pd.merge(df_2017_03_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [224]:
# Step 6: Perform the stratified sampling
df_2017_03_pos_sample = df_2017_03_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_03_pos_sample)} rows.")


Sampled 2382 rows.


In [225]:
text_gpt_2017_03_pos = '\n'.join(df_2017_03_pos_sample['text'].astype(str))

### Prompts and loops

In [226]:
prompt_3 = prompt_raw.replace('TWEETS', text_gpt_2017_03_pos)

In [227]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_3}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [228]:
response.choices[0].message.content

'The text discusses various aspects of vaccination, including personal experiences with vaccines, the importance of vaccinations against diseases like yellow fever and HPV, and government initiatives to increase vaccination rates. It highlights both the emotional reactions to vaccinations and the logistical efforts to ensure widespread vaccination during outbreaks, particularly in Brazil.'

In [229]:
#summary = []
#dataset = []

In [230]:
dataset.append("2017_03_positive")

In [231]:
summary.append(response.choices[0].message.content)

In [232]:
df_2017_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2017_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  3 non-null      object
 1   summary  3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes


In [233]:
df_2017_pos.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_positive,Penápolis will participate in the yellow fever...
1,2017_02_positive,The text discusses various vaccination efforts...
2,2017_03_positive,The text discusses various aspects of vaccinat...


In [234]:
response

<OpenAIObject chat.completion id=chatcmpl-AUB5lZQTOuLU7IYkvCrAgqS9SFUwJ at 0x2170f833040> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various aspects of vaccination, including personal experiences with vaccines, the importance of vaccinations against diseases like yellow fever and HPV, and government initiatives to increase vaccination rates. It highlights both the emotional reactions to vaccinations and the logistical efforts to ensure widespread vaccination during outbreaks, particularly in Brazil.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731755841,
  "id": "chatcmpl-AUB5lZQTOuLU7IYkvCrAgqS9SFUwJ",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 57,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
    

## April

In [235]:
# Get monthly data
df_2017_04_pos = pd.read_csv('../data/local/df_pos_2017-04.csv').reset_index()
df_2017_04_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3405 entries, 0 to 3404
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3405 non-null   int64  
 1   LocationCode            3405 non-null   object 
 2   LocationName            3405 non-null   object 
 3   StatesCode              3405 non-null   object 
 4   ...4                    3405 non-null   int64  
 5   id                      3405 non-null   float64
 6   text                    3405 non-null   object 
 7   label                   3405 non-null   object 
 8   in_reply_to_status_id   176 non-null    float64
 9   in_reply_to_user_id     194 non-null    float64
 10  quoted_user_id          69 non-null     float64
 11  quoted_status_id        69 non-null     float64
 12  retweeted_user_id       1010 non-null   float64
 13  retweeted_status_id     1010 non-null   float64
 14  created_at              3405 non-null   

### Representative sample based on week and location

In [236]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2017_04_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2017_04_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w13,Bahia,17,0.004993
1,2017-w13,Ceara,4,0.001175
2,2017-w13,Distrito Federal,1,0.000294
3,2017-w13,Espirito Santo,4,0.001175
4,2017-w13,Goias,5,0.001468
...,...,...,...,...
110,2017-w18,Espirito Santo,1,0.000294
111,2017-w18,Goias,3,0.000881
112,2017-w18,Minas Gerais,7,0.002056
113,2017-w18,Rio Grande Do Sul,2,0.000587


In [237]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w13,Bahia,17,0.004993,12
1,2017-w13,Ceara,4,0.001175,2
2,2017-w13,Distrito Federal,1,0.000294,0
3,2017-w13,Espirito Santo,4,0.001175,2
4,2017-w13,Goias,5,0.001468,3
...,...,...,...,...,...
110,2017-w18,Espirito Santo,1,0.000294,0
111,2017-w18,Goias,3,0.000881,2
112,2017-w18,Minas Gerais,7,0.002056,5
113,2017-w18,Rio Grande Do Sul,2,0.000587,1


In [238]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2017_04_pos_sample_size = pd.merge(df_2017_04_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [239]:
# Step 6: Perform the stratified sampling
df_2017_04_pos_sample = df_2017_04_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_04_pos_sample)} rows.")


Sampled 2392 rows.


In [240]:
text_gpt_2017_04_pos = '\n'.join(df_2017_04_pos_sample['text'].astype(str))

### Prompts and loops

In [241]:
prompt_4 = prompt_raw.replace('TWEETS', text_gpt_2017_04_pos)

In [242]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_4}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [243]:
response.choices[0].message.content

'The Portuguese sentences discuss various vaccination campaigns, particularly against yellow fever and influenza. They highlight the importance of vaccination for public health, emphasizing ongoing efforts to immunize populations, including children, pregnant women, and healthcare workers. The text also notes successful vaccine tests for diseases like Zika and diabetes, along with calls to address misinformation about vaccines.'

In [244]:
#summary = []
#dataset = []

In [245]:
dataset.append("2017_04_positive")

In [246]:
summary.append(response.choices[0].message.content)

In [247]:
df_2017_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2017_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  4 non-null      object
 1   summary  4 non-null      object
dtypes: object(2)
memory usage: 192.0+ bytes


In [248]:
df_2017_pos.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_positive,Penápolis will participate in the yellow fever...
1,2017_02_positive,The text discusses various vaccination efforts...
2,2017_03_positive,The text discusses various aspects of vaccinat...
3,2017_04_positive,The Portuguese sentences discuss various vacci...


In [249]:
response

<OpenAIObject chat.completion id=chatcmpl-AUB5sMWu6SU5VHs8KjiHZU5VulKV5 at 0x2171083cbd0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The Portuguese sentences discuss various vaccination campaigns, particularly against yellow fever and influenza. They highlight the importance of vaccination for public health, emphasizing ongoing efforts to immunize populations, including children, pregnant women, and healthcare workers. The text also notes successful vaccine tests for diseases like Zika and diabetes, along with calls to address misinformation about vaccines.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731755848,
  "id": "chatcmpl-AUB5sMWu6SU5VHs8KjiHZU5VulKV5",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 67,
    "completion_tokens_details": {
   

## May

In [250]:
# Get monthly data
df_2017_05_pos = pd.read_csv('../data/local/df_pos_2017-05.csv').reset_index()
df_2017_05_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3622 entries, 0 to 3621
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3622 non-null   int64  
 1   LocationCode            3622 non-null   object 
 2   LocationName            3622 non-null   object 
 3   StatesCode              3622 non-null   object 
 4   ...4                    3622 non-null   int64  
 5   id                      3622 non-null   float64
 6   text                    3622 non-null   object 
 7   label                   3622 non-null   object 
 8   in_reply_to_status_id   241 non-null    float64
 9   in_reply_to_user_id     247 non-null    float64
 10  quoted_user_id          125 non-null    float64
 11  quoted_status_id        125 non-null    float64
 12  retweeted_user_id       1045 non-null   float64
 13  retweeted_status_id     1045 non-null   float64
 14  created_at              3622 non-null   

### Representative sample based on week and location

In [251]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2017_05_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2017_05_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w18,Acre,2,0.000552
1,2017-w18,Alagoas,3,0.000828
2,2017-w18,Amapa,2,0.000552
3,2017-w18,Amazonas,8,0.002209
4,2017-w18,Bahia,28,0.007731
...,...,...,...,...
116,2017-w22,Rondonia,4,0.001104
117,2017-w22,Santa Catarina,5,0.001380
118,2017-w22,Sao Paulo,57,0.015737
119,2017-w22,Sergipe,1,0.000276


In [252]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w18,Acre,2,0.000552,1
1,2017-w18,Alagoas,3,0.000828,2
2,2017-w18,Amapa,2,0.000552,1
3,2017-w18,Amazonas,8,0.002209,5
4,2017-w18,Bahia,28,0.007731,18
...,...,...,...,...,...
116,2017-w22,Rondonia,4,0.001104,2
117,2017-w22,Santa Catarina,5,0.001380,3
118,2017-w22,Sao Paulo,57,0.015737,38
119,2017-w22,Sergipe,1,0.000276,0


In [253]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2017_05_pos_sample_size = pd.merge(df_2017_05_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [254]:
# Step 6: Perform the stratified sampling
df_2017_05_pos_sample = df_2017_05_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_05_pos_sample)} rows.")


Sampled 2391 rows.


In [255]:
text_gpt_2017_05_pos = '\n'.join(df_2017_05_pos_sample['text'].astype(str))

### Prompts and loops

In [256]:
prompt_5 = prompt_raw.replace('TWEETS', text_gpt_2017_05_pos)

In [257]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_5}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [258]:
response.choices[0].message.content

'The provided Portuguese sentences discuss various topics related to vaccination, including the importance of vaccines against diseases like influenza, HPV, and yellow fever, as well as campaigns promoting immunization for specific groups such as children, the elderly, and pregnant women. It emphasizes ongoing vaccination efforts, public health initiatives, and the necessity of maintaining vaccination schedules to enhance community health and prevent outbreaks. Additionally, there are mentions of the development and efficacy of vaccines, as well as concerns regarding anti-vaccination sentiments and their impact on public health.'

In [259]:
#summary = []
#dataset = []

In [260]:
dataset.append("2017_05_positive")

In [261]:
summary.append(response.choices[0].message.content)

In [262]:
df_2017_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2017_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  5 non-null      object
 1   summary  5 non-null      object
dtypes: object(2)
memory usage: 208.0+ bytes


In [263]:
df_2017_pos.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_positive,Penápolis will participate in the yellow fever...
1,2017_02_positive,The text discusses various vaccination efforts...
2,2017_03_positive,The text discusses various aspects of vaccinat...
3,2017_04_positive,The Portuguese sentences discuss various vacci...
4,2017_05_positive,The provided Portuguese sentences discuss vari...


In [264]:
response

<OpenAIObject chat.completion id=chatcmpl-AUB5zKfDoiCN8NCNCwC3qItOsBmXR at 0x21710cd5590> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The provided Portuguese sentences discuss various topics related to vaccination, including the importance of vaccines against diseases like influenza, HPV, and yellow fever, as well as campaigns promoting immunization for specific groups such as children, the elderly, and pregnant women. It emphasizes ongoing vaccination efforts, public health initiatives, and the necessity of maintaining vaccination schedules to enhance community health and prevent outbreaks. Additionally, there are mentions of the development and efficacy of vaccines, as well as concerns regarding anti-vaccination sentiments and their impact on public health.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731755855,
  "id": "chatcmpl-AUB5zKfDoiCN8NCN

## June

In [265]:
# Get monthly data
df_2017_06_pos = pd.read_csv('../data/local/df_pos_2017-06.csv').reset_index()
df_2017_06_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3612 entries, 0 to 3611
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3612 non-null   int64  
 1   LocationCode            3612 non-null   object 
 2   LocationName            3612 non-null   object 
 3   StatesCode              3612 non-null   object 
 4   ...4                    3612 non-null   int64  
 5   id                      3612 non-null   float64
 6   text                    3612 non-null   object 
 7   label                   3612 non-null   object 
 8   in_reply_to_status_id   243 non-null    float64
 9   in_reply_to_user_id     251 non-null    float64
 10  quoted_user_id          70 non-null     float64
 11  quoted_status_id        70 non-null     float64
 12  retweeted_user_id       1144 non-null   float64
 13  retweeted_status_id     1144 non-null   float64
 14  created_at              3612 non-null   

### Representative sample based on week and location

In [266]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2017_06_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2017_06_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w22,Acre,3,0.000831
1,2017-w22,Alagoas,7,0.001938
2,2017-w22,Amapa,18,0.004983
3,2017-w22,Amazonas,17,0.004707
4,2017-w22,Bahia,39,0.010797
...,...,...,...,...
119,2017-w26,Rondonia,1,0.000277
120,2017-w26,Santa Catarina,14,0.003876
121,2017-w26,Sao Paulo,87,0.024086
122,2017-w26,Sergipe,1,0.000277


In [267]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w22,Acre,3,0.000831,2
1,2017-w22,Alagoas,7,0.001938,4
2,2017-w22,Amapa,18,0.004983,12
3,2017-w22,Amazonas,17,0.004707,11
4,2017-w22,Bahia,39,0.010797,26
...,...,...,...,...,...
119,2017-w26,Rondonia,1,0.000277,0
120,2017-w26,Santa Catarina,14,0.003876,9
121,2017-w26,Sao Paulo,87,0.024086,59
122,2017-w26,Sergipe,1,0.000277,0


In [268]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2017_06_pos_sample_size = pd.merge(df_2017_06_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [269]:
# Step 6: Perform the stratified sampling
df_2017_06_pos_sample = df_2017_06_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_06_pos_sample)} rows.")


Sampled 2391 rows.


In [270]:
text_gpt_2017_06_pos = '\n'.join(df_2017_06_pos_sample['text'].astype(str))

### Prompts and loops

In [271]:
prompt_6 = prompt_raw.replace('TWEETS', text_gpt_2017_06_pos)

In [272]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_6}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [273]:
response.choices[0].message.content

"The text discusses the Brazilian government's decision to make the flu vaccine available to the entire population starting on June 5. It emphasizes the importance of vaccination in preventing flu and related complications, highlighting various campaigns and initiatives across the country. The text also mentions the successful vaccination drives in different states and the promotion of HPV vaccines for both boys and girls. Overall, it advocates for public health and vaccination awareness."

In [274]:
#summary = []
#dataset = []

In [275]:
dataset.append("2017_06_positive")

In [276]:
summary.append(response.choices[0].message.content)

In [277]:
df_2017_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2017_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  6 non-null      object
 1   summary  6 non-null      object
dtypes: object(2)
memory usage: 224.0+ bytes


In [278]:
df_2017_pos.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_positive,Penápolis will participate in the yellow fever...
1,2017_02_positive,The text discusses various vaccination efforts...
2,2017_03_positive,The text discusses various aspects of vaccinat...
3,2017_04_positive,The Portuguese sentences discuss various vacci...
4,2017_05_positive,The provided Portuguese sentences discuss vari...
5,2017_06_positive,The text discusses the Brazilian government's ...


In [279]:
response

<OpenAIObject chat.completion id=chatcmpl-AUB67pridVTx2TYASm52ghLj5uOzE at 0x21710ca7680> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses the Brazilian government's decision to make the flu vaccine available to the entire population starting on June 5. It emphasizes the importance of vaccination in preventing flu and related complications, highlighting various campaigns and initiatives across the country. The text also mentions the successful vaccination drives in different states and the promotion of HPV vaccines for both boys and girls. Overall, it advocates for public health and vaccination awareness.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731755863,
  "id": "chatcmpl-AUB67pridVTx2TYASm52ghLj5uOzE",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "

## July

In [280]:
# Get monthly data
df_2017_07_pos = pd.read_csv('../data/local/df_pos_2017-07.csv').reset_index()
df_2017_07_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2014 entries, 0 to 2013
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2014 non-null   int64  
 1   LocationCode            2014 non-null   object 
 2   LocationName            2014 non-null   object 
 3   StatesCode              2014 non-null   object 
 4   ...4                    2014 non-null   int64  
 5   id                      2014 non-null   float64
 6   text                    2014 non-null   object 
 7   label                   2014 non-null   object 
 8   in_reply_to_status_id   151 non-null    float64
 9   in_reply_to_user_id     155 non-null    float64
 10  quoted_user_id          72 non-null     float64
 11  quoted_status_id        72 non-null     float64
 12  retweeted_user_id       697 non-null    float64
 13  retweeted_status_id     697 non-null    float64
 14  created_at              2014 non-null   

### Representative sample based on week and location

In [281]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2017_07_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2017_07_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w26,Alagoas,1,0.000497
1,2017-w26,Amazonas,3,0.001490
2,2017-w26,Bahia,8,0.003972
3,2017-w26,Ceara,12,0.005958
4,2017-w26,Distrito Federal,3,0.001490
...,...,...,...,...
110,2017-w31,Parana,1,0.000497
111,2017-w31,Pernambuco,3,0.001490
112,2017-w31,Rio Grande Do Sul,1,0.000497
113,2017-w31,Santa Catarina,2,0.000993


In [288]:
# Step 3: Define the target sample size
#target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
#group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
#group_sizes
# Include all samples since there are not many tweets

In [286]:
# Step 5: Merge this sample size information back to the original DataFrame
#df_2017_07_pos_sample_size = pd.merge(df_2017_07_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
 #                              on=['year_week', 'LocationName'], how='left')

In [289]:
# Step 6: Perform the stratified sampling
#df_2017_07_pos_sample = df_2017_07_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
#    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2017_07_pos_sample = df_2017_07_pos
# Step 7: Check the result
print(f"Sampled {len(df_2017_07_pos_sample)} rows.")


Sampled 2014 rows.


In [291]:
text_gpt_2017_07_pos = '\n'.join(df_2017_07_pos_sample['text'].astype(str))

### Prompts and loops

In [294]:
prompt_7 = prompt_raw.replace('TWEETS', text_gpt_2017_07_pos)

In [297]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_7}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [299]:
response.choices[0].message.content

'The texts discuss various aspects of vaccination, highlighting its importance for public health, the introduction of new vaccines, and ongoing vaccination campaigns in Brazil. Key topics include the need for vaccination against diseases like HPV, HIV, and Zika, as well as the role of public awareness and education in increasing vaccination rates. The texts also mention innovative approaches, such as using fun music to promote immunization and the development of needle-free vaccine delivery methods. Overall, they emphasize that vaccination is a crucial preventive measure against infectious diseases.'

In [300]:
#summary = []
#dataset = []

In [301]:
dataset.append("2017_07_positive")

In [302]:
summary.append(response.choices[0].message.content)

In [303]:
df_2017_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2017_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  7 non-null      object
 1   summary  7 non-null      object
dtypes: object(2)
memory usage: 240.0+ bytes


In [304]:
df_2017_pos.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_positive,Penápolis will participate in the yellow fever...
1,2017_02_positive,The text discusses various vaccination efforts...
2,2017_03_positive,The text discusses various aspects of vaccinat...
3,2017_04_positive,The Portuguese sentences discuss various vacci...
4,2017_05_positive,The provided Portuguese sentences discuss vari...
5,2017_06_positive,The text discusses the Brazilian government's ...
6,2017_07_positive,The texts discuss various aspects of vaccinati...


In [305]:
response

<OpenAIObject chat.completion id=chatcmpl-AUBDFrDzwjvsw0hFvqTq9dXIcbmQk at 0x21716933f40> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The texts discuss various aspects of vaccination, highlighting its importance for public health, the introduction of new vaccines, and ongoing vaccination campaigns in Brazil. Key topics include the need for vaccination against diseases like HPV, HIV, and Zika, as well as the role of public awareness and education in increasing vaccination rates. The texts also mention innovative approaches, such as using fun music to promote immunization and the development of needle-free vaccine delivery methods. Overall, they emphasize that vaccination is a crucial preventive measure against infectious diseases.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731756305,
  "id": "chatcmpl-AUBDFrDzwjvsw0hFvqTq9dXIcbmQk",
  "model": "gp

## August

In [306]:
# Get monthly data
df_2017_08_pos = pd.read_csv('../data/local/df_pos_2017-08.csv').reset_index()
df_2017_08_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4828 entries, 0 to 4827
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   4828 non-null   int64  
 1   LocationCode            4828 non-null   object 
 2   LocationName            4828 non-null   object 
 3   StatesCode              4828 non-null   object 
 4   ...4                    4828 non-null   int64  
 5   id                      4828 non-null   float64
 6   text                    4828 non-null   object 
 7   label                   4828 non-null   object 
 8   in_reply_to_status_id   193 non-null    float64
 9   in_reply_to_user_id     196 non-null    float64
 10  quoted_user_id          71 non-null     float64
 11  quoted_status_id        71 non-null     float64
 12  retweeted_user_id       3196 non-null   float64
 13  retweeted_status_id     3196 non-null   float64
 14  created_at              4828 non-null   

### Representative sample based on week and location

In [307]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2017_08_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2017_08_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w31,Alagoas,1,0.000207
1,2017-w31,Amazonas,2,0.000414
2,2017-w31,Bahia,6,0.001243
3,2017-w31,Ceara,6,0.001243
4,2017-w31,Distrito Federal,5,0.001036
...,...,...,...,...
105,2017-w35,Rio Grande Do Sul,55,0.011392
106,2017-w35,Roraima,2,0.000414
107,2017-w35,Santa Catarina,6,0.001243
108,2017-w35,Sao Paulo,65,0.013463


In [308]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w31,Alagoas,1,0.000207,0
1,2017-w31,Amazonas,2,0.000414,1
2,2017-w31,Bahia,6,0.001243,3
3,2017-w31,Ceara,6,0.001243,3
4,2017-w31,Distrito Federal,5,0.001036,2
...,...,...,...,...,...
105,2017-w35,Rio Grande Do Sul,55,0.011392,27
106,2017-w35,Roraima,2,0.000414,1
107,2017-w35,Santa Catarina,6,0.001243,3
108,2017-w35,Sao Paulo,65,0.013463,32


In [309]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2017_08_pos_sample_size = pd.merge(df_2017_08_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [310]:
# Step 6: Perform the stratified sampling
df_2017_08_pos_sample = df_2017_08_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_08_pos_sample)} rows.")


Sampled 2405 rows.


In [311]:
text_gpt_2017_08_pos = '\n'.join(df_2017_08_pos_sample['text'].astype(str))

### Prompts and loops

In [312]:
prompt_8 = prompt_raw.replace('TWEETS', text_gpt_2017_08_pos)

In [313]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_8}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [314]:
response.choices[0].message.content

'The messages primarily focus on the importance of vaccination, specifically for various diseases including HPV, rabies, and yellow fever. They encourage people to adopt pets that are healthy, vaccinated, and ready for a new family. Additionally, there are mentions of campaigns promoting vaccinations for both animals and humans, emphasizing the need for public awareness and participation in these health initiatives.'

In [315]:
#summary = []
#dataset = []

In [316]:
dataset.append("2017_08_positive")

In [317]:
summary.append(response.choices[0].message.content)

In [318]:
df_2017_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2017_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  8 non-null      object
 1   summary  8 non-null      object
dtypes: object(2)
memory usage: 256.0+ bytes


In [319]:
df_2017_pos.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_positive,Penápolis will participate in the yellow fever...
1,2017_02_positive,The text discusses various vaccination efforts...
2,2017_03_positive,The text discusses various aspects of vaccinat...
3,2017_04_positive,The Portuguese sentences discuss various vacci...
4,2017_05_positive,The provided Portuguese sentences discuss vari...
5,2017_06_positive,The text discusses the Brazilian government's ...
6,2017_07_positive,The texts discuss various aspects of vaccinati...
7,2017_08_positive,The messages primarily focus on the importance...


In [320]:
response

<OpenAIObject chat.completion id=chatcmpl-AUBEQBEwO6H2PadqY9nPlIu2oRyAo at 0x21716940130> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The messages primarily focus on the importance of vaccination, specifically for various diseases including HPV, rabies, and yellow fever. They encourage people to adopt pets that are healthy, vaccinated, and ready for a new family. Additionally, there are mentions of campaigns promoting vaccinations for both animals and humans, emphasizing the need for public awareness and participation in these health initiatives.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731756378,
  "id": "chatcmpl-AUBEQBEwO6H2PadqY9nPlIu2oRyAo",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 71,
    "completion_tokens_details": {
      "accepted

## September

In [321]:
# Get monthly data
df_2017_09_pos = pd.read_csv('../data/local/df_pos_2017-09.csv').reset_index()
df_2017_09_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2435 entries, 0 to 2434
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2435 non-null   int64  
 1   LocationCode            2435 non-null   object 
 2   LocationName            2435 non-null   object 
 3   StatesCode              2435 non-null   object 
 4   ...4                    2435 non-null   int64  
 5   id                      2435 non-null   float64
 6   text                    2435 non-null   object 
 7   label                   2435 non-null   object 
 8   in_reply_to_status_id   200 non-null    float64
 9   in_reply_to_user_id     206 non-null    float64
 10  quoted_user_id          70 non-null     float64
 11  quoted_status_id        70 non-null     float64
 12  retweeted_user_id       700 non-null    float64
 13  retweeted_status_id     700 non-null    float64
 14  created_at              2435 non-null   

### Representative sample based on week and location

In [322]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2017_09_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2017_09_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w35,Amapa,1,0.000411
1,2017-w35,Amazonas,2,0.000821
2,2017-w35,Bahia,14,0.005749
3,2017-w35,Ceara,4,0.001643
4,2017-w35,Distrito Federal,3,0.001232
...,...,...,...,...
108,2017-w39,Rondonia,1,0.000411
109,2017-w39,Santa Catarina,7,0.002875
110,2017-w39,Sao Paulo,60,0.024641
111,2017-w39,Sergipe,4,0.001643


In [326]:
# Step 3: Define the target sample size
#target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
#group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
#group_sizes
# Taking all tweets since there were not enough for sampling

In [324]:
# Step 5: Merge this sample size information back to the original DataFrame
#df_2017_09_pos_sample_size = pd.merge(df_2017_09_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
 #                              on=['year_week', 'LocationName'], how='left')

In [327]:
# Step 6: Perform the stratified sampling
#df_2017_09_pos_sample = df_2017_09_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2017_09_pos_sample = df_2017_09_pos
# Step 7: Check the result
print(f"Sampled {len(df_2017_09_pos_sample)} rows.")


Sampled 2435 rows.


In [329]:
text_gpt_2017_09_pos = '\n'.join(df_2017_09_pos_sample['text'].astype(str))

### Prompts and loops

In [330]:
prompt_9 = prompt_raw.replace('TWEETS', text_gpt_2017_09_pos)

In [333]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_9}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [335]:
response.choices[0].message.content

"The text discusses various aspects of vaccination in Brazil, highlighting campaigns for vaccines against diseases like HPV and rabies, the importance of updating vaccination records, and efforts to improve public health. It emphasizes the country's high vaccination coverage, the availability of vaccines for children and adolescents, and ongoing studies on vaccines, including those for Zika."

In [336]:
#summary = []
#dataset = []

In [337]:
dataset.append("2017_09_positive")

In [338]:
summary.append(response.choices[0].message.content)

In [339]:
df_2017_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2017_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  9 non-null      object
 1   summary  9 non-null      object
dtypes: object(2)
memory usage: 272.0+ bytes


In [340]:
df_2017_pos.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_positive,Penápolis will participate in the yellow fever...
1,2017_02_positive,The text discusses various vaccination efforts...
2,2017_03_positive,The text discusses various aspects of vaccinat...
3,2017_04_positive,The Portuguese sentences discuss various vacci...
4,2017_05_positive,The provided Portuguese sentences discuss vari...
5,2017_06_positive,The text discusses the Brazilian government's ...
6,2017_07_positive,The texts discuss various aspects of vaccinati...
7,2017_08_positive,The messages primarily focus on the importance...
8,2017_09_positive,The text discusses various aspects of vaccinat...


In [341]:
response

<OpenAIObject chat.completion id=chatcmpl-AUBIUHqg1OJ2hJfxNLOUQpGXSbkiC at 0x21716f75220> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various aspects of vaccination in Brazil, highlighting campaigns for vaccines against diseases like HPV and rabies, the importance of updating vaccination records, and efforts to improve public health. It emphasizes the country's high vaccination coverage, the availability of vaccines for children and adolescents, and ongoing studies on vaccines, including those for Zika.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731756630,
  "id": "chatcmpl-AUBIUHqg1OJ2hJfxNLOUQpGXSbkiC",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 65,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
 

## October

In [342]:
# Get monthly data
df_2017_10_pos = pd.read_csv('../data/local/df_pos_2017-10.csv').reset_index()
df_2017_10_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1830 entries, 0 to 1829
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1830 non-null   int64  
 1   LocationCode            1830 non-null   object 
 2   LocationName            1830 non-null   object 
 3   StatesCode              1830 non-null   object 
 4   ...4                    1830 non-null   int64  
 5   id                      1830 non-null   float64
 6   text                    1830 non-null   object 
 7   label                   1830 non-null   object 
 8   in_reply_to_status_id   166 non-null    float64
 9   in_reply_to_user_id     176 non-null    float64
 10  quoted_user_id          58 non-null     float64
 11  quoted_status_id        58 non-null     float64
 12  retweeted_user_id       574 non-null    float64
 13  retweeted_status_id     574 non-null    float64
 14  created_at              1830 non-null   

### Representative sample based on week and location

In [343]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2017_10_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2017_10_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w40,Alagoas,3,0.001639
1,2017-w40,Amapa,1,0.000546
2,2017-w40,Amazonas,17,0.009290
3,2017-w40,Bahia,14,0.007650
4,2017-w40,Ceara,11,0.006011
...,...,...,...,...
99,2017-w44,Parana,9,0.004918
100,2017-w44,Pernambuco,4,0.002186
101,2017-w44,Rio Grande Do Sul,14,0.007650
102,2017-w44,Santa Catarina,3,0.001639


In [344]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w40,Alagoas,3,0.001639,4
1,2017-w40,Amapa,1,0.000546,1
2,2017-w40,Amazonas,17,0.009290,22
3,2017-w40,Bahia,14,0.007650,18
4,2017-w40,Ceara,11,0.006011,14
...,...,...,...,...,...
99,2017-w44,Parana,9,0.004918,12
100,2017-w44,Pernambuco,4,0.002186,5
101,2017-w44,Rio Grande Do Sul,14,0.007650,18
102,2017-w44,Santa Catarina,3,0.001639,4


In [345]:
# Step 5: Merge this sample size information back to the original DataFrame
#df_2017_10_pos_sample_size = pd.merge(df_2017_10_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
 #                              on=['year_week', 'LocationName'], how='left')
# Taking all tweets for not having enough tweets for sampling

In [347]:
# Step 6: Perform the stratified sampling
#df_2017_10_pos_sample = df_2017_10_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2017_10_pos_sample = df_2017_10_pos
# Step 7: Check the result
print(f"Sampled {len(df_2017_10_pos_sample)} rows.")


Sampled 1830 rows.


In [349]:
text_gpt_2017_10_pos = '\n'.join(df_2017_10_pos_sample['text'].astype(str))

### Prompts and loops

In [350]:
prompt_10 = prompt_raw.replace('TWEETS', text_gpt_2017_10_pos)

In [351]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_10}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [352]:
response.choices[0].message.content

'The texts discuss various aspects of vaccination, including personal experiences, public health campaigns, and the significance of vaccines in preventing diseases like yellow fever, HPV, measles, and rabies. They highlight the importance of keeping vaccinations up-to-date, address public health alerts regarding outbreaks, and encourage participation in vaccination campaigns for both humans and pets. Additionally, some posts express humor or skepticism about vaccines and discuss the societal implications of vaccination rates.'

In [353]:
#summary = []
#dataset = []

In [354]:
dataset.append("2017_10_positive")

In [355]:
summary.append(response.choices[0].message.content)

In [356]:
df_2017_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2017_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  10 non-null     object
 1   summary  10 non-null     object
dtypes: object(2)
memory usage: 288.0+ bytes


In [357]:
df_2017_pos.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_positive,Penápolis will participate in the yellow fever...
1,2017_02_positive,The text discusses various vaccination efforts...
2,2017_03_positive,The text discusses various aspects of vaccinat...
3,2017_04_positive,The Portuguese sentences discuss various vacci...
4,2017_05_positive,The provided Portuguese sentences discuss vari...
5,2017_06_positive,The text discusses the Brazilian government's ...
6,2017_07_positive,The texts discuss various aspects of vaccinati...
7,2017_08_positive,The messages primarily focus on the importance...
8,2017_09_positive,The text discusses various aspects of vaccinat...
9,2017_10_positive,The texts discuss various aspects of vaccinati...


In [358]:
response

<OpenAIObject chat.completion id=chatcmpl-AUBK3olSszVC7B0CIAGG9AAP43gjV at 0x21716775680> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The texts discuss various aspects of vaccination, including personal experiences, public health campaigns, and the significance of vaccines in preventing diseases like yellow fever, HPV, measles, and rabies. They highlight the importance of keeping vaccinations up-to-date, address public health alerts regarding outbreaks, and encourage participation in vaccination campaigns for both humans and pets. Additionally, some posts express humor or skepticism about vaccines and discuss the societal implications of vaccination rates.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731756727,
  "id": "chatcmpl-AUBK3olSszVC7B0CIAGG9AAP43gjV",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint

## November

In [359]:
# Get monthly data
df_2017_11_pos = pd.read_csv('../data/local/df_pos_2017-11.csv').reset_index()
df_2017_11_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2325 entries, 0 to 2324
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2325 non-null   int64  
 1   LocationCode            2325 non-null   object 
 2   LocationName            2325 non-null   object 
 3   StatesCode              2325 non-null   object 
 4   ...4                    2325 non-null   int64  
 5   id                      2325 non-null   float64
 6   text                    2325 non-null   object 
 7   label                   2325 non-null   object 
 8   in_reply_to_status_id   162 non-null    float64
 9   in_reply_to_user_id     170 non-null    float64
 10  quoted_user_id          48 non-null     float64
 11  quoted_status_id        48 non-null     float64
 12  retweeted_user_id       1233 non-null   float64
 13  retweeted_status_id     1233 non-null   float64
 14  created_at              2325 non-null   

### Representative sample based on week and location

In [360]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2017_11_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2017_11_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w44,Acre,1,0.000430
1,2017-w44,Bahia,3,0.001290
2,2017-w44,Ceara,7,0.003011
3,2017-w44,Distrito Federal,3,0.001290
4,2017-w44,Espirito Santo,7,0.003011
...,...,...,...,...
103,2017-w48,Rio Grande Do Sul,22,0.009462
104,2017-w48,Rondonia,2,0.000860
105,2017-w48,Santa Catarina,4,0.001720
106,2017-w48,Sao Paulo,94,0.040430


In [361]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w44,Acre,1,0.000430,1
1,2017-w44,Bahia,3,0.001290,3
2,2017-w44,Ceara,7,0.003011,7
3,2017-w44,Distrito Federal,3,0.001290,3
4,2017-w44,Espirito Santo,7,0.003011,7
...,...,...,...,...,...
103,2017-w48,Rio Grande Do Sul,22,0.009462,23
104,2017-w48,Rondonia,2,0.000860,2
105,2017-w48,Santa Catarina,4,0.001720,4
106,2017-w48,Sao Paulo,94,0.040430,99


In [362]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2017_11_pos_sample_size = pd.merge(df_2017_11_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [365]:
# Step 6: Perform the stratified sampling
#df_2017_11_pos_sample = df_2017_11_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2017_11_pos_sample = df_2017_11_pos
# Step 7: Check the result
print(f"Sampled {len(df_2017_11_pos_sample)} rows.")


Sampled 2325 rows.


In [366]:
text_gpt_2017_11_pos = '\n'.join(df_2017_11_pos_sample['text'].astype(str))

### Prompts and loops

In [367]:
prompt_11 = prompt_raw.replace('TWEETS', text_gpt_2017_11_pos)

In [369]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_11}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [371]:
response.choices[0].message.content

"The sentences discuss the importance of vaccination for both humans and animals, expressing gratitude for vaccines and highlighting personal experiences with them. They also emphasize the need for public health measures, such as campaigns against diseases like rabies and yellow fever, and the development of new vaccines, including for addiction and illnesses like leishmaniasis and HPV. Overall, there's a strong advocacy for vaccination as essential for health and well-being."

In [372]:
#summary = []
#dataset = []

In [373]:
dataset.append("2017_11_positive")

In [374]:
summary.append(response.choices[0].message.content)

In [375]:
df_2017_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2017_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  11 non-null     object
 1   summary  11 non-null     object
dtypes: object(2)
memory usage: 304.0+ bytes


In [376]:
df_2017_pos.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_positive,Penápolis will participate in the yellow fever...
1,2017_02_positive,The text discusses various vaccination efforts...
2,2017_03_positive,The text discusses various aspects of vaccinat...
3,2017_04_positive,The Portuguese sentences discuss various vacci...
4,2017_05_positive,The provided Portuguese sentences discuss vari...
5,2017_06_positive,The text discusses the Brazilian government's ...
6,2017_07_positive,The texts discuss various aspects of vaccinati...
7,2017_08_positive,The messages primarily focus on the importance...
8,2017_09_positive,The text discusses various aspects of vaccinat...
9,2017_10_positive,The texts discuss various aspects of vaccinati...


In [377]:
response

<OpenAIObject chat.completion id=chatcmpl-AUBYvCED4jHL8xkRSATvm97sTlpTM at 0x2171732ff40> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The sentences discuss the importance of vaccination for both humans and animals, expressing gratitude for vaccines and highlighting personal experiences with them. They also emphasize the need for public health measures, such as campaigns against diseases like rabies and yellow fever, and the development of new vaccines, including for addiction and illnesses like leishmaniasis and HPV. Overall, there's a strong advocacy for vaccination as essential for health and well-being.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731757649,
  "id": "chatcmpl-AUBYvCED4jHL8xkRSATvm97sTlpTM",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_to

## December

In [378]:
# Get monthly data
df_2017_12_pos = pd.read_csv('../data/local/df_pos_2017-12.csv').reset_index()
df_2017_12_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1355 entries, 0 to 1354
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1355 non-null   int64  
 1   LocationCode            1355 non-null   object 
 2   LocationName            1355 non-null   object 
 3   StatesCode              1355 non-null   object 
 4   ...4                    1355 non-null   int64  
 5   id                      1355 non-null   float64
 6   text                    1355 non-null   object 
 7   label                   1355 non-null   object 
 8   in_reply_to_status_id   144 non-null    float64
 9   in_reply_to_user_id     150 non-null    float64
 10  quoted_user_id          46 non-null     float64
 11  quoted_status_id        46 non-null     float64
 12  retweeted_user_id       439 non-null    float64
 13  retweeted_status_id     439 non-null    float64
 14  created_at              1355 non-null   

### Representative sample based on week and location

In [379]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2017_12_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2017_12_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w48,Amapa,1,0.000738
1,2017-w48,Amazonas,2,0.001476
2,2017-w48,Bahia,7,0.005166
3,2017-w48,Ceara,6,0.004428
4,2017-w48,Distrito Federal,1,0.000738
...,...,...,...,...
113,2017-w53,Minas Gerais,6,0.004428
114,2017-w53,Pernambuco,1,0.000738
115,2017-w53,Rio Grande Do Sul,1,0.000738
116,2017-w53,Roraima,1,0.000738


In [380]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w48,Amapa,1,0.000738,1
1,2017-w48,Amazonas,2,0.001476,3
2,2017-w48,Bahia,7,0.005166,12
3,2017-w48,Ceara,6,0.004428,10
4,2017-w48,Distrito Federal,1,0.000738,1
...,...,...,...,...,...
113,2017-w53,Minas Gerais,6,0.004428,10
114,2017-w53,Pernambuco,1,0.000738,1
115,2017-w53,Rio Grande Do Sul,1,0.000738,1
116,2017-w53,Roraima,1,0.000738,1


In [381]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2017_12_pos_sample_size = pd.merge(df_2017_12_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [384]:
# Step 6: Perform the stratified sampling
#df_2017_12_pos_sample = df_2017_12_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2017_12_pos_sample = df_2017_12_pos
# Step 7: Check the result
print(f"Sampled {len(df_2017_12_pos_sample)} rows.")


Sampled 1355 rows.


In [385]:
text_gpt_2017_12_pos = '\n'.join(df_2017_12_pos_sample['text'].astype(str))

### Prompts and loops

In [386]:
prompt_12 = prompt_raw.replace('TWEETS', text_gpt_2017_12_pos)

In [387]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_12}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [388]:
response.choices[0].message.content

'The provided texts discuss various vaccination campaigns and updates in Brazil, highlighting the expansion of flu and yellow fever vaccinations to the general population, the introduction of new vaccines like Gardasil 9 for HPV, and the recognition of regions as free from diseases like foot-and-mouth disease. Additionally, there are mentions of vaccination initiatives for pets and the importance of keeping vaccination records updated. The overall tone emphasizes the significance of vaccination for public health and the efforts to increase access through pharmacies and community campaigns.'

In [389]:
#summary = []
#dataset = []

In [390]:
dataset.append("2017_12_positive")

In [391]:
summary.append(response.choices[0].message.content)

In [392]:
df_2017_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2017_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  12 non-null     object
 1   summary  12 non-null     object
dtypes: object(2)
memory usage: 320.0+ bytes


In [393]:
df_2017_pos.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_positive,Penápolis will participate in the yellow fever...
1,2017_02_positive,The text discusses various vaccination efforts...
2,2017_03_positive,The text discusses various aspects of vaccinat...
3,2017_04_positive,The Portuguese sentences discuss various vacci...
4,2017_05_positive,The provided Portuguese sentences discuss vari...
5,2017_06_positive,The text discusses the Brazilian government's ...
6,2017_07_positive,The texts discuss various aspects of vaccinati...
7,2017_08_positive,The messages primarily focus on the importance...
8,2017_09_positive,The text discusses various aspects of vaccinat...
9,2017_10_positive,The texts discuss various aspects of vaccinati...


In [394]:
response

<OpenAIObject chat.completion id=chatcmpl-AUBakS8bYts1BTabskKhiN0lGwy9e at 0x2171734c040> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The provided texts discuss various vaccination campaigns and updates in Brazil, highlighting the expansion of flu and yellow fever vaccinations to the general population, the introduction of new vaccines like Gardasil 9 for HPV, and the recognition of regions as free from diseases like foot-and-mouth disease. Additionally, there are mentions of vaccination initiatives for pets and the importance of keeping vaccination records updated. The overall tone emphasizes the significance of vaccination for public health and the efforts to increase access through pharmacies and community campaigns.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731757762,
  "id": "chatcmpl-AUBakS8bYts1BTabskKhiN0lGwy9e",
  "model": "gpt-4o-mini-

## All 2017 positive

In [395]:
df_2017_pos.to_csv('../data/summary_2017_positive_tweets.csv')