# Import libraries and data

In [1]:
# Import libraries
import os
import openai
import pandas as pd
import numpy as np
import openpyxl

In [2]:
# Get OpenAI API secret key (change this file with your own OpenAI key)
open_ai_key = open('../data/local/openai_key.txt', 'r').read()
openai.api_key = open_ai_key
#openai.Model.list()

In [3]:
# Prompt for GPT
prompt_raw = "I would like to summarise in maximum 300 characters and in English what the following sentences in Portuguese talk about. \
Do not use aspects mentioned only in one sentence as part of the summary: TWEETS"

# 2013 neutral

## January

In [4]:
# Get monthly data
df_2013_01_neu = pd.read_csv('../data/local/df_neu_2013-01.csv').reset_index()
df_2013_01_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2832 entries, 0 to 2831
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2832 non-null   int64  
 1   LocationCode            2832 non-null   object 
 2   LocationName            2832 non-null   object 
 3   StatesCode              2832 non-null   object 
 4   ...4                    2832 non-null   int64  
 5   id                      2832 non-null   float64
 6   text                    2832 non-null   object 
 7   label                   2832 non-null   object 
 8   in_reply_to_status_id   275 non-null    float64
 9   in_reply_to_user_id     301 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       507 non-null    float64
 13  retweeted_status_id     507 non-null    float64
 14  created_at              2832 non-null   

### Representative sample based on week and location

In [5]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_01_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_01_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w01,Acre,4,0.001412
1,2013-w01,Alagoas,4,0.001412
2,2013-w01,Amapa,3,0.001059
3,2013-w01,Amazonas,15,0.005297
4,2013-w01,Bahia,61,0.021540
...,...,...,...,...
113,2013-w05,Rio Grande Do Sul,48,0.016949
114,2013-w05,Rondonia,2,0.000706
115,2013-w05,Santa Catarina,11,0.003884
116,2013-w05,Sao Paulo,147,0.051907


In [6]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w01,Acre,4,0.001412,3
1,2013-w01,Alagoas,4,0.001412,3
2,2013-w01,Amapa,3,0.001059,2
3,2013-w01,Amazonas,15,0.005297,12
4,2013-w01,Bahia,61,0.021540,52
...,...,...,...,...,...
113,2013-w05,Rio Grande Do Sul,48,0.016949,41
114,2013-w05,Rondonia,2,0.000706,1
115,2013-w05,Santa Catarina,11,0.003884,9
116,2013-w05,Sao Paulo,147,0.051907,127


In [7]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_01_neu_sample_size = pd.merge(df_2013_01_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [8]:
# Step 6: Perform the stratified sampling
#df_2013_01_neu_sample = df_2013_01_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_01_neu_sample = df_2013_01_neu
# Step 7: Check the result
print(f"Sampled {len(df_2013_01_neu_sample)} rows.")


Sampled 2832 rows.


In [9]:
text_gpt_2013_01_neu = '\n'.join(df_2013_01_neu_sample['text'].astype(str))

### Prompts and loops

In [10]:
prompt_1 = prompt_raw.replace('TWEETS', text_gpt_2013_01_neu)

In [11]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_1}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [12]:
response.choices[0].message.content

'The provided texts cover various topics related to vaccinations, including the beginning of vaccination campaigns against diseases like foot-and-mouth disease in cattle and yellow fever, the testing of new vaccines for HIV and Alzheimer’s, and the promotion of HPV vaccines for girls aged 11 to 13. There are also mentions of personal experiences with vaccinations, including anxiety about getting vaccinated, and humorous takes on the subject.'

In [13]:
summary = []
dataset = []

In [14]:
dataset.append("2013_01_neutral")

In [15]:
summary.append(response.choices[0].message.content)

In [16]:
df_2013_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  1 non-null      object
 1   summary  1 non-null      object
dtypes: object(2)
memory usage: 144.0+ bytes


In [17]:
df_2013_neu.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_neutral,The provided texts cover various topics relate...


In [18]:
response

<OpenAIObject chat.completion id=chatcmpl-AUeiJvR1gRhLXFWyIfushLrsCGpmT at 0x1ba6b20f130> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The provided texts cover various topics related to vaccinations, including the beginning of vaccination campaigns against diseases like foot-and-mouth disease in cattle and yellow fever, the testing of new vaccines for HIV and Alzheimer\u2019s, and the promotion of HPV vaccines for girls aged 11 to 13. There are also mentions of personal experiences with vaccinations, including anxiety about getting vaccinated, and humorous takes on the subject.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869707,
  "id": "chatcmpl-AUeiJvR1gRhLXFWyIfushLrsCGpmT",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 78,
    "completion_tok

## February

In [19]:
# Get monthly data
df_2013_02_neu = pd.read_csv('../data/local/df_neu_2013-02.csv').reset_index()
df_2013_02_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 439 entries, 0 to 438
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   439 non-null    int64  
 1   LocationCode            439 non-null    object 
 2   LocationName            439 non-null    object 
 3   StatesCode              439 non-null    object 
 4   ...4                    439 non-null    int64  
 5   id                      439 non-null    float64
 6   text                    439 non-null    object 
 7   label                   439 non-null    object 
 8   in_reply_to_status_id   51 non-null     float64
 9   in_reply_to_user_id     54 non-null     float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       64 non-null     float64
 13  retweeted_status_id     64 non-null     float64
 14  created_at              439 non-null    ob

### Representative sample based on week and location

In [20]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_02_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_02_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w05,Acre,3,0.006834
1,2013-w05,Alagoas,3,0.006834
2,2013-w05,Amapa,1,0.002278
3,2013-w05,Amazonas,6,0.013667
4,2013-w05,Bahia,13,0.029613
5,2013-w05,Ceara,6,0.013667
6,2013-w05,Distrito Federal,6,0.013667
7,2013-w05,Espirito Santo,5,0.01139
8,2013-w05,Goias,5,0.01139
9,2013-w05,Maranhao,11,0.025057


In [21]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w05,Acre,3,0.006834,16
1,2013-w05,Alagoas,3,0.006834,16
2,2013-w05,Amapa,1,0.002278,5
3,2013-w05,Amazonas,6,0.013667,33
4,2013-w05,Bahia,13,0.029613,72
5,2013-w05,Ceara,6,0.013667,33
6,2013-w05,Distrito Federal,6,0.013667,33
7,2013-w05,Espirito Santo,5,0.01139,27
8,2013-w05,Goias,5,0.01139,27
9,2013-w05,Maranhao,11,0.025057,61


In [22]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_02_neu_sample_size = pd.merge(df_2013_02_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [23]:
# Step 6: Perform the stratified sampling
#df_2013_02_neu_sample = df_2013_02_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_02_neu_sample = df_2013_02_neu
# Step 7: Check the result
print(f"Sampled {len(df_2013_02_neu_sample)} rows.")


Sampled 439 rows.


In [24]:
text_gpt_2013_02_neu = '\n'.join(df_2013_02_neu_sample['text'].astype(str))

### Prompts and loops

In [25]:
prompt_2 = prompt_raw.replace('TWEETS', text_gpt_2013_02_neu)

In [26]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_2}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [27]:
response.choices[0].message.content

'SESI plans to vaccinate over 900,000 workers against the flu. Additionally, a vaccination campaign in Alagoas has successfully immunized 96.13% of livestock against foot-and-mouth disease. In France, a new HIV vaccine is being tested on humans. Studies show that flu vaccine responses vary by age.'

In [28]:
#summary = []
#dataset = []

In [29]:
dataset.append("2013_02_neutral")

In [30]:
summary.append(response.choices[0].message.content)

In [31]:
df_2013_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  2 non-null      object
 1   summary  2 non-null      object
dtypes: object(2)
memory usage: 160.0+ bytes


In [32]:
df_2013_neu.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_neutral,The provided texts cover various topics relate...
1,2013_02_neutral,"SESI plans to vaccinate over 900,000 workers a..."


In [33]:
response

<OpenAIObject chat.completion id=chatcmpl-AUeiPah389Aco7Yfiyau70NL0Zmiw at 0x1ba6b223d10> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "SESI plans to vaccinate over 900,000 workers against the flu. Additionally, a vaccination campaign in Alagoas has successfully immunized 96.13% of livestock against foot-and-mouth disease. In France, a new HIV vaccine is being tested on humans. Studies show that flu vaccine responses vary by age.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869713,
  "id": "chatcmpl-AUeiPah389Aco7Yfiyau70NL0Zmiw",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 65,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "reasoning_tokens": 0,
      "rejected_prediction_tokens": 0
    }

## March

In [34]:
# Get monthly data
df_2013_03_neu = pd.read_csv('../data/local/df_neu_2013-03.csv').reset_index()
df_2013_03_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452 entries, 0 to 451
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   452 non-null    int64  
 1   LocationCode            452 non-null    object 
 2   LocationName            452 non-null    object 
 3   StatesCode              452 non-null    object 
 4   ...4                    452 non-null    int64  
 5   id                      452 non-null    float64
 6   text                    452 non-null    object 
 7   label                   452 non-null    object 
 8   in_reply_to_status_id   39 non-null     float64
 9   in_reply_to_user_id     44 non-null     float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       0 non-null      float64
 13  retweeted_status_id     0 non-null      float64
 14  created_at              452 non-null    ob

### Representative sample based on week and location

In [35]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_03_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_03_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w13,Acre,1,0.002212
1,2013-w13,Alagoas,4,0.00885
2,2013-w13,Amapa,3,0.006637
3,2013-w13,Amazonas,5,0.011062
4,2013-w13,Bahia,30,0.066372
5,2013-w13,Ceara,14,0.030973
6,2013-w13,Distrito Federal,14,0.030973
7,2013-w13,Espirito Santo,6,0.013274
8,2013-w13,Goias,6,0.013274
9,2013-w13,Maranhao,11,0.024336


In [36]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w13,Acre,1,0.002212,5
1,2013-w13,Alagoas,4,0.00885,21
2,2013-w13,Amapa,3,0.006637,16
3,2013-w13,Amazonas,5,0.011062,27
4,2013-w13,Bahia,30,0.066372,162
5,2013-w13,Ceara,14,0.030973,75
6,2013-w13,Distrito Federal,14,0.030973,75
7,2013-w13,Espirito Santo,6,0.013274,32
8,2013-w13,Goias,6,0.013274,32
9,2013-w13,Maranhao,11,0.024336,59


In [37]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_03_neu_sample_size = pd.merge(df_2013_03_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [39]:
# Step 6: Perform the stratified sampling
#df_2013_03_neu_sample = df_2013_03_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_03_neu_sample = df_2013_03_neu
# Step 7: Check the result
print(f"Sampled {len(df_2013_03_neu_sample)} rows.")


Sampled 452 rows.


In [40]:
text_gpt_2013_03_neu = '\n'.join(df_2013_03_neu_sample['text'].astype(str))

### Prompts and loops

In [41]:
prompt_3 = prompt_raw.replace('TWEETS', text_gpt_2013_03_neu)

In [42]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_3}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [43]:
response.choices[0].message.content

'The campaign aims to vaccinate 31.3 million people against the flu in Brazil from April 15 to 26. Priority groups include chronic patients, postpartum women, the elderly, and children. Various states, including MS and AM, set specific vaccination targets, with the Ministry of Health coordinating nationwide efforts.'

In [44]:
#summary = []
#dataset = []

In [45]:
dataset.append("2013_03_neutral")

In [46]:
summary.append(response.choices[0].message.content)

In [47]:
df_2013_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  3 non-null      object
 1   summary  3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes


In [48]:
df_2013_neu.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_neutral,The provided texts cover various topics relate...
1,2013_02_neutral,"SESI plans to vaccinate over 900,000 workers a..."
2,2013_03_neutral,The campaign aims to vaccinate 31.3 million pe...


In [49]:
response

<OpenAIObject chat.completion id=chatcmpl-AUejk0szjQ72oWzZgviGyhxPNTtFY at 0x1ba6f8b20e0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The campaign aims to vaccinate 31.3 million people against the flu in Brazil from April 15 to 26. Priority groups include chronic patients, postpartum women, the elderly, and children. Various states, including MS and AM, set specific vaccination targets, with the Ministry of Health coordinating nationwide efforts.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869796,
  "id": "chatcmpl-AUejk0szjQ72oWzZgviGyhxPNTtFY",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 62,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "reasoning_tokens": 0,
      "rejected_predicti

## April

In [50]:
# Get monthly data
df_2013_04_neu = pd.read_csv('../data/local/df_neu_2013-04.csv').reset_index()
df_2013_04_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8859 entries, 0 to 8858
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   8859 non-null   int64  
 1   LocationCode            8859 non-null   object 
 2   LocationName            8859 non-null   object 
 3   StatesCode              8859 non-null   object 
 4   ...4                    8859 non-null   int64  
 5   id                      8859 non-null   float64
 6   text                    8859 non-null   object 
 7   label                   8859 non-null   object 
 8   in_reply_to_status_id   428 non-null    float64
 9   in_reply_to_user_id     644 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       0 non-null      float64
 13  retweeted_status_id     0 non-null      float64
 14  created_at              8859 non-null   

### Representative sample based on week and location

In [51]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_04_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_04_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w13,Alagoas,3,0.000339
1,2013-w13,Amapa,1,0.000113
2,2013-w13,Amazonas,3,0.000339
3,2013-w13,Bahia,5,0.000564
4,2013-w13,Ceara,6,0.000677
...,...,...,...,...
140,2013-w18,Rondonia,3,0.000339
141,2013-w18,Roraima,2,0.000226
142,2013-w18,Santa Catarina,6,0.000677
143,2013-w18,Sao Paulo,64,0.007224


In [52]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w13,Alagoas,3,0.000339,0
1,2013-w13,Amapa,1,0.000113,0
2,2013-w13,Amazonas,3,0.000339,0
3,2013-w13,Bahia,5,0.000564,1
4,2013-w13,Ceara,6,0.000677,1
...,...,...,...,...,...
140,2013-w18,Rondonia,3,0.000339,0
141,2013-w18,Roraima,2,0.000226,0
142,2013-w18,Santa Catarina,6,0.000677,1
143,2013-w18,Sao Paulo,64,0.007224,17


In [53]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_04_neu_sample_size = pd.merge(df_2013_04_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [54]:
# Step 6: Perform the stratified sampling
df_2013_04_neu_sample = df_2013_04_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2013_04_neu_sample)} rows.")


Sampled 2382 rows.


In [55]:
text_gpt_2013_04_neu = '\n'.join(df_2013_04_neu_sample['text'].astype(str))

### Prompts and loops

In [56]:
prompt_4 = prompt_raw.replace('TWEETS', text_gpt_2013_04_neu)

In [57]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_4}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [58]:
response.choices[0].message.content

'The texts discuss the initiation and ongoing campaigns for vaccination against HPV among girls aged 11 to 13 in schools in the Federal District, Brazil. Additionally, there are mentions of vaccination campaigns against influenza (the flu) across various states, including plans to immunize large populations and specific groups like pregnant women and children. The campaigns aim to protect public health by increasing vaccination rates and addressing vaccine accessibility.'

In [59]:
#summary = []
#dataset = []

In [60]:
dataset.append("2013_04_neutral")

In [61]:
summary.append(response.choices[0].message.content)

In [62]:
df_2013_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  4 non-null      object
 1   summary  4 non-null      object
dtypes: object(2)
memory usage: 192.0+ bytes


In [63]:
df_2013_neu.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_neutral,The provided texts cover various topics relate...
1,2013_02_neutral,"SESI plans to vaccinate over 900,000 workers a..."
2,2013_03_neutral,The campaign aims to vaccinate 31.3 million pe...
3,2013_04_neutral,The texts discuss the initiation and ongoing c...


In [64]:
response

<OpenAIObject chat.completion id=chatcmpl-AUejp5XcDqa0j0eMSXOYoglaBlPEn at 0x1ba70cab1d0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The texts discuss the initiation and ongoing campaigns for vaccination against HPV among girls aged 11 to 13 in schools in the Federal District, Brazil. Additionally, there are mentions of vaccination campaigns against influenza (the flu) across various states, including plans to immunize large populations and specific groups like pregnant women and children. The campaigns aim to protect public health by increasing vaccination rates and addressing vaccine accessibility.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869801,
  "id": "chatcmpl-AUejp5XcDqa0j0eMSXOYoglaBlPEn",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens"

## May

In [65]:
# Get monthly data
df_2013_05_neu = pd.read_csv('../data/local/df_neu_2013-05.csv').reset_index()
df_2013_05_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5260 entries, 0 to 5259
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   5260 non-null   int64  
 1   LocationCode            5260 non-null   object 
 2   LocationName            5260 non-null   object 
 3   StatesCode              5260 non-null   object 
 4   ...4                    5260 non-null   int64  
 5   id                      5260 non-null   float64
 6   text                    5260 non-null   object 
 7   label                   5260 non-null   object 
 8   in_reply_to_status_id   362 non-null    float64
 9   in_reply_to_user_id     524 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       0 non-null      float64
 13  retweeted_status_id     0 non-null      float64
 14  created_at              5260 non-null   

### Representative sample based on week and location

In [66]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_05_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_05_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w18,Acre,17,0.003232
1,2013-w18,Alagoas,14,0.002662
2,2013-w18,Amapa,16,0.003042
3,2013-w18,Amazonas,18,0.003422
4,2013-w18,Bahia,61,0.011597
...,...,...,...,...
122,2013-w22,Roraima,1,0.000190
123,2013-w22,Santa Catarina,13,0.002471
124,2013-w22,Sao Paulo,149,0.028327
125,2013-w22,Sergipe,2,0.000380


In [67]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w18,Acre,17,0.003232,7
1,2013-w18,Alagoas,14,0.002662,6
2,2013-w18,Amapa,16,0.003042,7
3,2013-w18,Amazonas,18,0.003422,8
4,2013-w18,Bahia,61,0.011597,28
...,...,...,...,...,...
122,2013-w22,Roraima,1,0.000190,0
123,2013-w22,Santa Catarina,13,0.002471,6
124,2013-w22,Sao Paulo,149,0.028327,69
125,2013-w22,Sergipe,2,0.000380,0


In [68]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_05_neu_sample_size = pd.merge(df_2013_05_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [69]:
# Step 6: Perform the stratified sampling
df_2013_05_neu_sample = df_2013_05_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2013_05_neu_sample)} rows.")


Sampled 2379 rows.


In [70]:
text_gpt_2013_05_neu = '\n'.join(df_2013_05_neu_sample['text'].astype(str))

### Prompts and loops

In [71]:
prompt_5 = prompt_raw.replace('TWEETS', text_gpt_2013_05_neu)

In [72]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_5}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [73]:
response.choices[0].message.content

'The sentences discuss various vaccination campaigns in Brazil, particularly focusing on the vaccination against foot-and-mouth disease (aftosa), HPV, and influenza (gripe). They mention adjustments to schedules due to drought, goals to vaccinate animals, and efforts to reach populations such as pregnant women and children. There are also mentions of challenges in meeting vaccination targets and the importance of public health initiatives.'

In [74]:
#summary = []
#dataset = []

In [75]:
dataset.append("2013_05_neutral")

In [76]:
summary.append(response.choices[0].message.content)

In [77]:
df_2013_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  5 non-null      object
 1   summary  5 non-null      object
dtypes: object(2)
memory usage: 208.0+ bytes


In [78]:
df_2013_neu.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_neutral,The provided texts cover various topics relate...
1,2013_02_neutral,"SESI plans to vaccinate over 900,000 workers a..."
2,2013_03_neutral,The campaign aims to vaccinate 31.3 million pe...
3,2013_04_neutral,The texts discuss the initiation and ongoing c...
4,2013_05_neutral,The sentences discuss various vaccination camp...


In [79]:
response

<OpenAIObject chat.completion id=chatcmpl-AUejzZhRZmNVK3atcOgTUv8zukOL0 at 0x1ba70cad220> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The sentences discuss various vaccination campaigns in Brazil, particularly focusing on the vaccination against foot-and-mouth disease (aftosa), HPV, and influenza (gripe). They mention adjustments to schedules due to drought, goals to vaccinate animals, and efforts to reach populations such as pregnant women and children. There are also mentions of challenges in meeting vaccination targets and the importance of public health initiatives.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869811,
  "id": "chatcmpl-AUejzZhRZmNVK3atcOgTUv8zukOL0",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 77,
    "completion_tokens_det

## June

In [80]:
# Get monthly data
df_2013_06_neu = pd.read_csv('../data/local/df_neu_2013-06.csv').reset_index()
df_2013_06_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5139 entries, 0 to 5138
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   5139 non-null   int64  
 1   LocationCode            5139 non-null   object 
 2   LocationName            5139 non-null   object 
 3   StatesCode              5139 non-null   object 
 4   ...4                    5139 non-null   int64  
 5   id                      5139 non-null   float64
 6   text                    5139 non-null   object 
 7   label                   5139 non-null   object 
 8   in_reply_to_status_id   356 non-null    float64
 9   in_reply_to_user_id     447 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       685 non-null    float64
 13  retweeted_status_id     685 non-null    float64
 14  created_at              5139 non-null   

### Representative sample based on week and location

In [81]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_06_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_06_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w22,Alagoas,3,0.000584
1,2013-w22,Amazonas,2,0.000389
2,2013-w22,Bahia,19,0.003697
3,2013-w22,Ceara,9,0.001751
4,2013-w22,Distrito Federal,11,0.002140
...,...,...,...,...
117,2013-w26,Roraima,2,0.000389
118,2013-w26,Santa Catarina,33,0.006421
119,2013-w26,Sao Paulo,181,0.035221
120,2013-w26,Sergipe,3,0.000584


In [82]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w22,Alagoas,3,0.000584,1
1,2013-w22,Amazonas,2,0.000389,0
2,2013-w22,Bahia,19,0.003697,9
3,2013-w22,Ceara,9,0.001751,4
4,2013-w22,Distrito Federal,11,0.002140,5
...,...,...,...,...,...
117,2013-w26,Roraima,2,0.000389,0
118,2013-w26,Santa Catarina,33,0.006421,15
119,2013-w26,Sao Paulo,181,0.035221,86
120,2013-w26,Sergipe,3,0.000584,1


In [83]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_06_neu_sample_size = pd.merge(df_2013_06_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [84]:
# Step 6: Perform the stratified sampling
#df_2013_06_neu_sample = df_2013_06_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_06_neu_sample = df_2013_06_neu
# Step 7: Check the result
print(f"Sampled {len(df_2013_06_neu_sample)} rows.")


Sampled 5139 rows.


In [85]:
text_gpt_2013_06_neu = '\n'.join(df_2013_06_neu_sample['text'].astype(str))

### Prompts and loops

In [86]:
prompt_6 = prompt_raw.replace('TWEETS', text_gpt_2013_06_neu)

In [87]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_6}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [88]:
response.choices[0].message.content

'The text discusses various vaccination campaigns in Brazil, particularly focusing on the vaccination against poliomyelitis (polio) and the need for children aged 6 months to 5 years to receive these vaccinations. It mentions specific dates for campaigns, the number of children targeted, and the importance of vaccinations, while also referencing personal experiences and societal attitudes towards vaccines.'

In [89]:
#summary = []
#dataset = []

In [90]:
dataset.append("2013_06_neutral")

In [91]:
summary.append(response.choices[0].message.content)

In [92]:
df_2013_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  6 non-null      object
 1   summary  6 non-null      object
dtypes: object(2)
memory usage: 224.0+ bytes


In [93]:
df_2013_neu.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_neutral,The provided texts cover various topics relate...
1,2013_02_neutral,"SESI plans to vaccinate over 900,000 workers a..."
2,2013_03_neutral,The campaign aims to vaccinate 31.3 million pe...
3,2013_04_neutral,The texts discuss the initiation and ongoing c...
4,2013_05_neutral,The sentences discuss various vaccination camp...
5,2013_06_neutral,The text discusses various vaccination campaig...


In [94]:
response

<OpenAIObject chat.completion id=chatcmpl-AUekA9tfjho6Zkt2uCTPMOgS9s1f3 at 0x1ba70404360> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns in Brazil, particularly focusing on the vaccination against poliomyelitis (polio) and the need for children aged 6 months to 5 years to receive these vaccinations. It mentions specific dates for campaigns, the number of children targeted, and the importance of vaccinations, while also referencing personal experiences and societal attitudes towards vaccines.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869822,
  "id": "chatcmpl-AUekA9tfjho6Zkt2uCTPMOgS9s1f3",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 71,
    "completion_tokens_details": {
      "accepted_predictio

## July

In [95]:
# Get monthly data
df_2013_07_neu = pd.read_csv('../data/local/df_neu_2013-07.csv').reset_index()
df_2013_07_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3236 entries, 0 to 3235
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3236 non-null   int64  
 1   LocationCode            3236 non-null   object 
 2   LocationName            3236 non-null   object 
 3   StatesCode              3236 non-null   object 
 4   ...4                    3236 non-null   int64  
 5   id                      3236 non-null   float64
 6   text                    3236 non-null   object 
 7   label                   3236 non-null   object 
 8   in_reply_to_status_id   293 non-null    float64
 9   in_reply_to_user_id     331 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       641 non-null    float64
 13  retweeted_status_id     641 non-null    float64
 14  created_at              3236 non-null   

### Representative sample based on week and location

In [96]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_07_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_07_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w26,Alagoas,8,0.002472
1,2013-w26,Amapa,3,0.000927
2,2013-w26,Amazonas,1,0.000309
3,2013-w26,Bahia,11,0.003399
4,2013-w26,Ceara,13,0.004017
...,...,...,...,...
125,2013-w31,Rio Grande Do Sul,9,0.002781
126,2013-w31,Santa Catarina,8,0.002472
127,2013-w31,Sao Paulo,69,0.021323
128,2013-w31,Sergipe,1,0.000309


In [97]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w26,Alagoas,8,0.002472,6
1,2013-w26,Amapa,3,0.000927,2
2,2013-w26,Amazonas,1,0.000309,0
3,2013-w26,Bahia,11,0.003399,8
4,2013-w26,Ceara,13,0.004017,9
...,...,...,...,...,...
125,2013-w31,Rio Grande Do Sul,9,0.002781,6
126,2013-w31,Santa Catarina,8,0.002472,6
127,2013-w31,Sao Paulo,69,0.021323,52
128,2013-w31,Sergipe,1,0.000309,0


In [98]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_07_neu_sample_size = pd.merge(df_2013_07_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [99]:
# Step 6: Perform the stratified sampling
#df_2013_07_neu_sample = df_2013_07_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_07_neu_sample = df_2013_07_neu
# Step 7: Check the result
print(f"Sampled {len(df_2013_07_neu_sample)} rows.")


Sampled 3236 rows.


In [100]:
text_gpt_2013_07_neu = '\n'.join(df_2013_07_neu_sample['text'].astype(str))

### Prompts and loops

In [101]:
prompt_7 = prompt_raw.replace('TWEETS', text_gpt_2013_07_neu)

In [102]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_7}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [103]:
response.choices[0].message.content

'The text discusses various aspects of vaccination in Brazil, focusing on the incorporation of the HPV vaccine into the public health system (SUS) for girls aged 10 and 11 starting in 2014. It mentions ongoing vaccination campaigns against diseases like polio and rabies, public health initiatives, and personal anecdotes about vaccinations. There are also humorous remarks about the need for a vaccine against issues like "jealousy" and "acne." The overall theme highlights the importance of vaccinations for both humans and animals while addressing public health challenges and awareness.'

In [104]:
#summary = []
#dataset = []

In [105]:
dataset.append("2013_07_neutral")

In [106]:
summary.append(response.choices[0].message.content)

In [107]:
df_2013_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  7 non-null      object
 1   summary  7 non-null      object
dtypes: object(2)
memory usage: 240.0+ bytes


In [108]:
df_2013_neu.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_neutral,The provided texts cover various topics relate...
1,2013_02_neutral,"SESI plans to vaccinate over 900,000 workers a..."
2,2013_03_neutral,The campaign aims to vaccinate 31.3 million pe...
3,2013_04_neutral,The texts discuss the initiation and ongoing c...
4,2013_05_neutral,The sentences discuss various vaccination camp...
5,2013_06_neutral,The text discusses various vaccination campaig...
6,2013_07_neutral,The text discusses various aspects of vaccinat...


In [109]:
response

<OpenAIObject chat.completion id=chatcmpl-AUekQR6wlTrhNMIKQIMpyzVcpAAN6 at 0x1ba707a3b80> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various aspects of vaccination in Brazil, focusing on the incorporation of the HPV vaccine into the public health system (SUS) for girls aged 10 and 11 starting in 2014. It mentions ongoing vaccination campaigns against diseases like polio and rabies, public health initiatives, and personal anecdotes about vaccinations. There are also humorous remarks about the need for a vaccine against issues like \"jealousy\" and \"acne.\" The overall theme highlights the importance of vaccinations for both humans and animals while addressing public health challenges and awareness.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869838,
  "id": "chatcmpl-AUekQR6wlTrhNMIKQIMpyzVcpAAN6",
  "model": "gpt-4o-mini-20

## August

In [110]:
# Get monthly data
df_2013_08_neu = pd.read_csv('../data/local/df_neu_2013-08.csv').reset_index()
df_2013_08_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   4340 non-null   int64  
 1   LocationCode            4340 non-null   object 
 2   LocationName            4340 non-null   object 
 3   StatesCode              4340 non-null   object 
 4   ...4                    4340 non-null   int64  
 5   id                      4340 non-null   float64
 6   text                    4340 non-null   object 
 7   label                   4340 non-null   object 
 8   in_reply_to_status_id   255 non-null    float64
 9   in_reply_to_user_id     318 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       596 non-null    float64
 13  retweeted_status_id     596 non-null    float64
 14  created_at              4340 non-null   

### Representative sample based on week and location

In [111]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_08_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_08_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w31,Acre,1,0.000230
1,2013-w31,Alagoas,3,0.000691
2,2013-w31,Amapa,1,0.000230
3,2013-w31,Amazonas,12,0.002765
4,2013-w31,Bahia,35,0.008065
...,...,...,...,...
122,2013-w35,Roraima,4,0.000922
123,2013-w35,Santa Catarina,12,0.002765
124,2013-w35,Sao Paulo,194,0.044700
125,2013-w35,Sergipe,11,0.002535


In [112]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w31,Acre,1,0.000230,0
1,2013-w31,Alagoas,3,0.000691,1
2,2013-w31,Amapa,1,0.000230,0
3,2013-w31,Amazonas,12,0.002765,6
4,2013-w31,Bahia,35,0.008065,19
...,...,...,...,...,...
122,2013-w35,Roraima,4,0.000922,2
123,2013-w35,Santa Catarina,12,0.002765,6
124,2013-w35,Sao Paulo,194,0.044700,109
125,2013-w35,Sergipe,11,0.002535,6


In [113]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_08_neu_sample_size = pd.merge(df_2013_08_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [114]:
# Step 6: Perform the stratified sampling
#df_2013_08_neu_sample = df_2013_08_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_08_neu_sample = df_2013_08_neu
# Step 7: Check the result
print(f"Sampled {len(df_2013_08_neu_sample)} rows.")


Sampled 4340 rows.


In [115]:
text_gpt_2013_08_neu = '\n'.join(df_2013_08_neu_sample['text'].astype(str))

### Prompts and loops

In [116]:
prompt_8 = prompt_raw.replace('TWEETS', text_gpt_2013_08_neu)

In [117]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_8}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [118]:
response.choices[0].message.content

'The provided sentences discuss various vaccination campaigns in Brazil, including those aimed at both humans (children, adults) and animals (pet vaccination). Key themes include updates to vaccination records, the introduction of new vaccines (like those against dengue and Aids), and reminders about vaccination schedules, especially for children under five. There are also humorous references to personal experiences with vaccines and discussions about the need for vaccines against various ailments. Overall, the content emphasizes public health initiatives and the importance of staying up-to-date with vaccinations.'

In [119]:
#summary = []
#dataset = []

In [120]:
dataset.append("2013_08_neutral")

In [121]:
summary.append(response.choices[0].message.content)

In [122]:
df_2013_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  8 non-null      object
 1   summary  8 non-null      object
dtypes: object(2)
memory usage: 256.0+ bytes


In [123]:
df_2013_neu.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_neutral,The provided texts cover various topics relate...
1,2013_02_neutral,"SESI plans to vaccinate over 900,000 workers a..."
2,2013_03_neutral,The campaign aims to vaccinate 31.3 million pe...
3,2013_04_neutral,The texts discuss the initiation and ongoing c...
4,2013_05_neutral,The sentences discuss various vaccination camp...
5,2013_06_neutral,The text discusses various vaccination campaig...
6,2013_07_neutral,The text discusses various aspects of vaccinat...
7,2013_08_neutral,The provided sentences discuss various vaccina...


In [124]:
response

<OpenAIObject chat.completion id=chatcmpl-AUekb8Xrjsq30vYntFjkeX7QbLXUL at 0x1ba71ecff90> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The provided sentences discuss various vaccination campaigns in Brazil, including those aimed at both humans (children, adults) and animals (pet vaccination). Key themes include updates to vaccination records, the introduction of new vaccines (like those against dengue and Aids), and reminders about vaccination schedules, especially for children under five. There are also humorous references to personal experiences with vaccines and discussions about the need for vaccines against various ailments. Overall, the content emphasizes public health initiatives and the importance of staying up-to-date with vaccinations.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869849,
  "id": "chatcmpl-AUekb8Xrjsq30vYntFjkeX7QbLXUL",

## September

In [125]:
# Get monthly data
df_2013_09_neu = pd.read_csv('../data/local/df_neu_2013-09.csv').reset_index()
df_2013_09_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3430 entries, 0 to 3429
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3430 non-null   int64  
 1   LocationCode            3430 non-null   object 
 2   LocationName            3430 non-null   object 
 3   StatesCode              3430 non-null   object 
 4   ...4                    3430 non-null   int64  
 5   id                      3430 non-null   float64
 6   text                    3430 non-null   object 
 7   label                   3430 non-null   object 
 8   in_reply_to_status_id   204 non-null    float64
 9   in_reply_to_user_id     268 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       417 non-null    float64
 13  retweeted_status_id     417 non-null    float64
 14  created_at              3430 non-null   

### Representative sample based on week and location

In [126]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_09_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_09_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w35,Alagoas,3,0.000875
1,2013-w35,Amapa,3,0.000875
2,2013-w35,Amazonas,7,0.002041
3,2013-w35,Bahia,11,0.003207
4,2013-w35,Ceara,7,0.002041
...,...,...,...,...
114,2013-w39,Roraima,1,0.000292
115,2013-w39,Santa Catarina,15,0.004373
116,2013-w39,Sao Paulo,137,0.039942
117,2013-w39,Sergipe,22,0.006414


In [127]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w35,Alagoas,3,0.000875,2
1,2013-w35,Amapa,3,0.000875,2
2,2013-w35,Amazonas,7,0.002041,5
3,2013-w35,Bahia,11,0.003207,7
4,2013-w35,Ceara,7,0.002041,5
...,...,...,...,...,...
114,2013-w39,Roraima,1,0.000292,0
115,2013-w39,Santa Catarina,15,0.004373,10
116,2013-w39,Sao Paulo,137,0.039942,97
117,2013-w39,Sergipe,22,0.006414,15


In [128]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_09_neu_sample_size = pd.merge(df_2013_09_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [129]:
# Step 6: Perform the stratified sampling
df_2013_09_neu_sample = df_2013_09_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2013_09_neu_sample)} rows.")


Sampled 2397 rows.


In [130]:
text_gpt_2013_09_neu = '\n'.join(df_2013_09_neu_sample['text'].astype(str))

### Prompts and loops

In [131]:
prompt_9 = prompt_raw.replace('TWEETS', text_gpt_2013_09_neu)

In [132]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_9}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [133]:
response.choices[0].message.content

'The Brazilian public healthcare system (SUS) is now offering the varicella (chickenpox) vaccine for children starting at 15 months old. The vaccine will be administered in two doses: the first at one year of age and the second at four years. The campaign aims to increase vaccination coverage and protect children from this disease.'

In [134]:
#summary = []
#dataset = []

In [135]:
dataset.append("2013_09_neutral")

In [136]:
summary.append(response.choices[0].message.content)

In [137]:
df_2013_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  9 non-null      object
 1   summary  9 non-null      object
dtypes: object(2)
memory usage: 272.0+ bytes


In [138]:
df_2013_neu.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_neutral,The provided texts cover various topics relate...
1,2013_02_neutral,"SESI plans to vaccinate over 900,000 workers a..."
2,2013_03_neutral,The campaign aims to vaccinate 31.3 million pe...
3,2013_04_neutral,The texts discuss the initiation and ongoing c...
4,2013_05_neutral,The sentences discuss various vaccination camp...
5,2013_06_neutral,The text discusses various vaccination campaig...
6,2013_07_neutral,The text discusses various aspects of vaccinat...
7,2013_08_neutral,The provided sentences discuss various vaccina...
8,2013_09_neutral,The Brazilian public healthcare system (SUS) i...


In [139]:
response

<OpenAIObject chat.completion id=chatcmpl-AUekoEyEoGkqdOOCBqQRFiDgxbpL8 at 0x1ba7216a400> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The Brazilian public healthcare system (SUS) is now offering the varicella (chickenpox) vaccine for children starting at 15 months old. The vaccine will be administered in two doses: the first at one year of age and the second at four years. The campaign aims to increase vaccination coverage and protect children from this disease.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869862,
  "id": "chatcmpl-AUekoEyEoGkqdOOCBqQRFiDgxbpL8",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 68,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "reasoning_tokens": 0,
      "r

## October

In [140]:
# Get monthly data
df_2013_10_neu = pd.read_csv('../data/local/df_neu_2013-10.csv').reset_index()
df_2013_10_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2299 entries, 0 to 2298
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2299 non-null   int64  
 1   LocationCode            2299 non-null   object 
 2   LocationName            2299 non-null   object 
 3   StatesCode              2299 non-null   object 
 4   ...4                    2299 non-null   int64  
 5   id                      2299 non-null   float64
 6   text                    2299 non-null   object 
 7   label                   2299 non-null   object 
 8   in_reply_to_status_id   193 non-null    float64
 9   in_reply_to_user_id     215 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       295 non-null    float64
 13  retweeted_status_id     295 non-null    float64
 14  created_at              2299 non-null   

### Representative sample based on week and location

In [141]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_10_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_10_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w40,Alagoas,4,0.001740
1,2013-w40,Amapa,3,0.001305
2,2013-w40,Amazonas,8,0.003480
3,2013-w40,Bahia,40,0.017399
4,2013-w40,Ceara,7,0.003045
...,...,...,...,...
107,2013-w44,Pernambuco,14,0.006090
108,2013-w44,Rio De Janeiro,1,0.000435
109,2013-w44,Rio Grande Do Sul,26,0.011309
110,2013-w44,Santa Catarina,5,0.002175


In [142]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w40,Alagoas,4,0.001740,4
1,2013-w40,Amapa,3,0.001305,3
2,2013-w40,Amazonas,8,0.003480,8
3,2013-w40,Bahia,40,0.017399,42
4,2013-w40,Ceara,7,0.003045,7
...,...,...,...,...,...
107,2013-w44,Pernambuco,14,0.006090,14
108,2013-w44,Rio De Janeiro,1,0.000435,1
109,2013-w44,Rio Grande Do Sul,26,0.011309,27
110,2013-w44,Santa Catarina,5,0.002175,5


In [143]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_10_neu_sample_size = pd.merge(df_2013_10_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [144]:
# Step 6: Perform the stratified sampling
#df_2013_10_neu_sample = df_2013_10_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_10_neu_sample = df_2013_10_neu
# Step 7: Check the result
print(f"Sampled {len(df_2013_10_neu_sample)} rows.")


Sampled 2299 rows.


In [145]:
text_gpt_2013_10_neu = '\n'.join(df_2013_10_neu_sample['text'].astype(str))

### Prompts and loops

In [146]:
prompt_10 = prompt_raw.replace('TWEETS', text_gpt_2013_10_neu)

In [147]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_10}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [148]:
response.choices[0].message.content

'The text discusses various aspects of vaccination, including personal reflections on vaccination experiences, public health campaigns, and the lack of vaccines for certain conditions. It mentions the need for vaccinations against diseases like rabies, dengue, and HPV, and highlights the absurdity of not having a vaccine for acne despite advancements in medicine. Additionally, it touches on societal attitudes towards vaccinations and the necessity of testing them on animals.'

In [149]:
#summary = []
#dataset = []

In [150]:
dataset.append("2013_10_neutral")

In [151]:
summary.append(response.choices[0].message.content)

In [152]:
df_2013_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  10 non-null     object
 1   summary  10 non-null     object
dtypes: object(2)
memory usage: 288.0+ bytes


In [153]:
df_2013_neu.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_neutral,The provided texts cover various topics relate...
1,2013_02_neutral,"SESI plans to vaccinate over 900,000 workers a..."
2,2013_03_neutral,The campaign aims to vaccinate 31.3 million pe...
3,2013_04_neutral,The texts discuss the initiation and ongoing c...
4,2013_05_neutral,The sentences discuss various vaccination camp...
5,2013_06_neutral,The text discusses various vaccination campaig...
6,2013_07_neutral,The text discusses various aspects of vaccinat...
7,2013_08_neutral,The provided sentences discuss various vaccina...
8,2013_09_neutral,The Brazilian public healthcare system (SUS) i...
9,2013_10_neutral,The text discusses various aspects of vaccinat...


In [154]:
response

<OpenAIObject chat.completion id=chatcmpl-AUekuiPPz70MIXlQpaD2XzopVe7p0 at 0x1ba721599a0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various aspects of vaccination, including personal reflections on vaccination experiences, public health campaigns, and the lack of vaccines for certain conditions. It mentions the need for vaccinations against diseases like rabies, dengue, and HPV, and highlights the absurdity of not having a vaccine for acne despite advancements in medicine. Additionally, it touches on societal attitudes towards vaccinations and the necessity of testing them on animals.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869868,
  "id": "chatcmpl-AUekuiPPz70MIXlQpaD2XzopVe7p0",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_9b78b61c52",
  "usage": {
    "completion_tok

## November

In [155]:
# Get monthly data
df_2013_11_neu = pd.read_csv('../data/local/df_neu_2013-11.csv').reset_index()
df_2013_11_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1955 entries, 0 to 1954
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1955 non-null   int64  
 1   LocationCode            1955 non-null   object 
 2   LocationName            1955 non-null   object 
 3   StatesCode              1955 non-null   object 
 4   ...4                    1955 non-null   int64  
 5   id                      1955 non-null   float64
 6   text                    1955 non-null   object 
 7   label                   1955 non-null   object 
 8   in_reply_to_status_id   135 non-null    float64
 9   in_reply_to_user_id     142 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       260 non-null    float64
 13  retweeted_status_id     260 non-null    float64
 14  created_at              1955 non-null   

### Representative sample based on week and location

In [156]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_11_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_11_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w44,Acre,3,0.001535
1,2013-w44,Alagoas,2,0.001023
2,2013-w44,Amapa,1,0.000512
3,2013-w44,Amazonas,5,0.002558
4,2013-w44,Bahia,24,0.012276
...,...,...,...,...
103,2013-w48,Piaui,1,0.000512
104,2013-w48,Rio Grande Do Sul,36,0.018414
105,2013-w48,Santa Catarina,3,0.001535
106,2013-w48,Sao Paulo,70,0.035806


In [157]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w44,Acre,3,0.001535,3
1,2013-w44,Alagoas,2,0.001023,2
2,2013-w44,Amapa,1,0.000512,1
3,2013-w44,Amazonas,5,0.002558,6
4,2013-w44,Bahia,24,0.012276,30
...,...,...,...,...,...
103,2013-w48,Piaui,1,0.000512,1
104,2013-w48,Rio Grande Do Sul,36,0.018414,45
105,2013-w48,Santa Catarina,3,0.001535,3
106,2013-w48,Sao Paulo,70,0.035806,87


In [158]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_11_neu_sample_size = pd.merge(df_2013_11_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [160]:
# Step 6: Perform the stratified sampling
#df_2013_11_neu_sample = df_2013_11_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_11_neu_sample = df_2013_11_neu
# Step 7: Check the result
print(f"Sampled {len(df_2013_11_neu_sample)} rows.")


Sampled 1955 rows.


In [161]:
text_gpt_2013_11_neu = '\n'.join(df_2013_11_neu_sample['text'].astype(str))

### Prompts and loops

In [162]:
prompt_11 = prompt_raw.replace('TWEETS', text_gpt_2013_11_neu)

In [163]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_11}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [164]:
response.choices[0].message.content

'The text discusses various vaccination campaigns, including the second phase of vaccination against foot-and-mouth disease for livestock, the testing of a Brazilian HIV vaccine on monkeys, and rabies vaccination for pets. It highlights the importance of vaccinations for public health and livestock management, while also portraying humorous personal anecdotes about vaccination experiences.'

In [165]:
#summary = []
#dataset = []

In [166]:
dataset.append("2013_11_neutral")

In [167]:
summary.append(response.choices[0].message.content)

In [168]:
df_2013_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  11 non-null     object
 1   summary  11 non-null     object
dtypes: object(2)
memory usage: 304.0+ bytes


In [169]:
df_2013_neu.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_neutral,The provided texts cover various topics relate...
1,2013_02_neutral,"SESI plans to vaccinate over 900,000 workers a..."
2,2013_03_neutral,The campaign aims to vaccinate 31.3 million pe...
3,2013_04_neutral,The texts discuss the initiation and ongoing c...
4,2013_05_neutral,The sentences discuss various vaccination camp...
5,2013_06_neutral,The text discusses various vaccination campaig...
6,2013_07_neutral,The text discusses various aspects of vaccinat...
7,2013_08_neutral,The provided sentences discuss various vaccina...
8,2013_09_neutral,The Brazilian public healthcare system (SUS) i...
9,2013_10_neutral,The text discusses various aspects of vaccinat...


In [170]:
response

<OpenAIObject chat.completion id=chatcmpl-AUemZFbOUR6NXrnJiwMKxBS04M2HJ at 0x1ba7225fd10> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns, including the second phase of vaccination against foot-and-mouth disease for livestock, the testing of a Brazilian HIV vaccine on monkeys, and rabies vaccination for pets. It highlights the importance of vaccinations for public health and livestock management, while also portraying humorous personal anecdotes about vaccination experiences.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869971,
  "id": "chatcmpl-AUemZFbOUR6NXrnJiwMKxBS04M2HJ",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 61,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
   

## December

In [171]:
# Get monthly data
df_2013_12_neu = pd.read_csv('../data/local/df_neu_2013-12.csv').reset_index()
df_2013_12_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1507 entries, 0 to 1506
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1507 non-null   int64  
 1   LocationCode            1507 non-null   object 
 2   LocationName            1507 non-null   object 
 3   StatesCode              1507 non-null   object 
 4   ...4                    1507 non-null   int64  
 5   id                      1507 non-null   float64
 6   text                    1507 non-null   object 
 7   label                   1507 non-null   object 
 8   in_reply_to_status_id   141 non-null    float64
 9   in_reply_to_user_id     153 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       250 non-null    float64
 13  retweeted_status_id     250 non-null    float64
 14  created_at              1507 non-null   

### Representative sample based on week and location

In [172]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_12_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_12_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w48,Alagoas,2,0.001327
1,2013-w48,Amazonas,4,0.002654
2,2013-w48,Bahia,8,0.005309
3,2013-w48,Ceara,4,0.002654
4,2013-w48,Distrito Federal,1,0.000664
...,...,...,...,...
105,2013-w53,Parana,4,0.002654
106,2013-w53,Pernambuco,1,0.000664
107,2013-w53,Rio Grande Do Sul,7,0.004645
108,2013-w53,Roraima,1,0.000664


In [173]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w48,Alagoas,2,0.001327,3
1,2013-w48,Amazonas,4,0.002654,6
2,2013-w48,Bahia,8,0.005309,13
3,2013-w48,Ceara,4,0.002654,6
4,2013-w48,Distrito Federal,1,0.000664,1
...,...,...,...,...,...
105,2013-w53,Parana,4,0.002654,6
106,2013-w53,Pernambuco,1,0.000664,1
107,2013-w53,Rio Grande Do Sul,7,0.004645,11
108,2013-w53,Roraima,1,0.000664,1


In [174]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_12_neu_sample_size = pd.merge(df_2013_12_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [175]:
# Step 6: Perform the stratified sampling
#df_2013_12_neu_sample = df_2013_12_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_12_neu_sample = df_2013_12_neu
# Step 7: Check the result
print(f"Sampled {len(df_2013_12_neu_sample)} rows.")


Sampled 1507 rows.


In [176]:
text_gpt_2013_12_neu = '\n'.join(df_2013_12_neu_sample['text'].astype(str))

### Prompts and loops

In [177]:
prompt_12 = prompt_raw.replace('TWEETS', text_gpt_2013_12_neu)

In [178]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_12}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [179]:
response.choices[0].message.content

'The text discusses various aspects of vaccination, highlighting personal experiences, fears, and societal comments regarding various vaccines, including those for rabies, dengue, and routine childhood vaccinations. It includes mentions of campaigns for animal vaccination and the importance of vaccines in public health, along with humorous remarks about wanting vaccines for emotional issues like disappointment and jealousy.'

In [180]:
#summary = []
#dataset = []

In [181]:
dataset.append("2013_12_neutral")

In [182]:
summary.append(response.choices[0].message.content)

In [183]:
df_2013_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  12 non-null     object
 1   summary  12 non-null     object
dtypes: object(2)
memory usage: 320.0+ bytes


In [184]:
df_2013_neu.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_neutral,The provided texts cover various topics relate...
1,2013_02_neutral,"SESI plans to vaccinate over 900,000 workers a..."
2,2013_03_neutral,The campaign aims to vaccinate 31.3 million pe...
3,2013_04_neutral,The texts discuss the initiation and ongoing c...
4,2013_05_neutral,The sentences discuss various vaccination camp...
5,2013_06_neutral,The text discusses various vaccination campaig...
6,2013_07_neutral,The text discusses various aspects of vaccinat...
7,2013_08_neutral,The provided sentences discuss various vaccina...
8,2013_09_neutral,The Brazilian public healthcare system (SUS) i...
9,2013_10_neutral,The text discusses various aspects of vaccinat...


In [185]:
response

<OpenAIObject chat.completion id=chatcmpl-AUemgUNzJ3And9U0WC9N6RuZY7Dan at 0x1ba7265c6d0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various aspects of vaccination, highlighting personal experiences, fears, and societal comments regarding various vaccines, including those for rabies, dengue, and routine childhood vaccinations. It includes mentions of campaigns for animal vaccination and the importance of vaccines in public health, along with humorous remarks about wanting vaccines for emotional issues like disappointment and jealousy.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869978,
  "id": "chatcmpl-AUemgUNzJ3And9U0WC9N6RuZY7Dan",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_9b78b61c52",
  "usage": {
    "completion_tokens": 66,
    "completion_tokens_details": {
      "

## All 2013 neutral

In [186]:
#df_2013_neu['dataset'] = df_2013_neu['dataset'].str.replace('neuitive','neutral')

In [187]:
df_2013_neu.to_csv('../data/summary_2013_neutral_tweets.csv')