# Import libraries and data

In [1]:
# Import libraries
import os
import openai
import pandas as pd
import numpy as np
import openpyxl

In [2]:
# Get OpenAI API secret key (change this file with your own OpenAI key)
open_ai_key = open('../data/local/openai_key.txt', 'r').read()
openai.api_key = open_ai_key
#openai.Model.list()

In [3]:
# Prompt for GPT
prompt_raw = "I would like to summarise in maximum 300 characters and in English what the following sentences in Portuguese talk about. \
Do not use aspects mentioned only in one sentence as part of the summary: TWEETS"

# 2016 neutral

## January

In [4]:
# Get monthly data
df_2016_01_neu = pd.read_csv('../data/local/df_neu_2016-01.csv').reset_index()
df_2016_01_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4621 entries, 0 to 4620
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   4621 non-null   int64  
 1   LocationCode            4621 non-null   object 
 2   LocationName            4621 non-null   object 
 3   StatesCode              4621 non-null   object 
 4   ...4                    4621 non-null   int64  
 5   id                      4621 non-null   float64
 6   text                    4621 non-null   object 
 7   label                   4621 non-null   object 
 8   in_reply_to_status_id   228 non-null    float64
 9   in_reply_to_user_id     309 non-null    float64
 10  quoted_user_id          59 non-null     float64
 11  quoted_status_id        59 non-null     float64
 12  retweeted_user_id       1370 non-null   float64
 13  retweeted_status_id     1370 non-null   float64
 14  created_at              4621 non-null   

### Representative sample based on week and location

In [5]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_01_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_01_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w01,Acre,7,0.001515
1,2016-w01,Alagoas,21,0.004544
2,2016-w01,Amapa,8,0.001731
3,2016-w01,Amazonas,27,0.005843
4,2016-w01,Bahia,81,0.017529
...,...,...,...,...
117,2016-w05,Rio Grande Do Sul,13,0.002813
118,2016-w05,Rondonia,5,0.001082
119,2016-w05,Santa Catarina,6,0.001298
120,2016-w05,Sao Paulo,64,0.013850


In [6]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w01,Acre,7,0.001515,3
1,2016-w01,Alagoas,21,0.004544,11
2,2016-w01,Amapa,8,0.001731,4
3,2016-w01,Amazonas,27,0.005843,14
4,2016-w01,Bahia,81,0.017529,42
...,...,...,...,...,...
117,2016-w05,Rio Grande Do Sul,13,0.002813,6
118,2016-w05,Rondonia,5,0.001082,2
119,2016-w05,Santa Catarina,6,0.001298,3
120,2016-w05,Sao Paulo,64,0.013850,33


In [7]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_01_neu_sample_size = pd.merge(df_2016_01_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [8]:
# Step 6: Perform the stratified sampling
df_2016_01_neu_sample = df_2016_01_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2016_01_neu_sample)} rows.")


Sampled 2391 rows.


In [9]:
text_gpt_2016_01_neu = '\n'.join(df_2016_01_neu_sample['text'].astype(str))

### Prompts and loops

In [10]:
prompt_1 = prompt_raw.replace('TWEETS', text_gpt_2016_01_neu)

In [11]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_1}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [12]:
response.choices[0].message.content

'The Ministry of Health of Brazil announced updates to the national vaccination calendar, including reducing the HPV vaccine dosage for adolescents and adjustments to other childhood vaccinations. There are also discussions around the development of vaccines for dengue and Zika, with the latter expected to take years before availability. The vaccination coverage against foot-and-mouth disease in Alagoas exceeds 96%.'

In [13]:
summary = []
dataset = []

In [14]:
dataset.append("2016_01_neutral")

In [15]:
summary.append(response.choices[0].message.content)

In [16]:
df_2016_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  1 non-null      object
 1   summary  1 non-null      object
dtypes: object(2)
memory usage: 144.0+ bytes


In [17]:
df_2016_neu.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_neutral,The Ministry of Health of Brazil announced upd...


In [18]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcuLrLYrFiv6uSdAgmdNPXH9yVkg at 0x2228667dae0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The Ministry of Health of Brazil announced updates to the national vaccination calendar, including reducing the HPV vaccine dosage for adolescents and adjustments to other childhood vaccinations. There are also discussions around the development of vaccines for dengue and Zika, with the latter expected to take years before availability. The vaccination coverage against foot-and-mouth disease in Alagoas exceeds 96%.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862765,
  "id": "chatcmpl-AUcuLrLYrFiv6uSdAgmdNPXH9yVkg",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 70,
    "completion_tokens_details": {
      "accepted

## February

In [19]:
# Get monthly data
df_2016_02_neu = pd.read_csv('../data/local/df_neu_2016-02.csv').reset_index()
df_2016_02_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3858 entries, 0 to 3857
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3858 non-null   int64  
 1   LocationCode            3858 non-null   object 
 2   LocationName            3858 non-null   object 
 3   StatesCode              3858 non-null   object 
 4   ...4                    3858 non-null   int64  
 5   id                      3858 non-null   float64
 6   text                    3858 non-null   object 
 7   label                   3858 non-null   object 
 8   in_reply_to_status_id   270 non-null    float64
 9   in_reply_to_user_id     300 non-null    float64
 10  quoted_user_id          53 non-null     float64
 11  quoted_status_id        53 non-null     float64
 12  retweeted_user_id       1064 non-null   float64
 13  retweeted_status_id     1064 non-null   float64
 14  created_at              3858 non-null   

### Representative sample based on week and location

In [20]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_02_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_02_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w05,Acre,1,0.000259
1,2016-w05,Alagoas,13,0.003370
2,2016-w05,Amapa,6,0.001555
3,2016-w05,Amazonas,15,0.003888
4,2016-w05,Bahia,29,0.007517
...,...,...,...,...
113,2016-w09,Rondonia,2,0.000518
114,2016-w09,Santa Catarina,14,0.003629
115,2016-w09,Sao Paulo,57,0.014774
116,2016-w09,Sergipe,1,0.000259


In [21]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w05,Acre,1,0.000259,0
1,2016-w05,Alagoas,13,0.003370,8
2,2016-w05,Amapa,6,0.001555,3
3,2016-w05,Amazonas,15,0.003888,9
4,2016-w05,Bahia,29,0.007517,18
...,...,...,...,...,...
113,2016-w09,Rondonia,2,0.000518,1
114,2016-w09,Santa Catarina,14,0.003629,8
115,2016-w09,Sao Paulo,57,0.014774,36
116,2016-w09,Sergipe,1,0.000259,0


In [22]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_02_neu_sample_size = pd.merge(df_2016_02_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [23]:
# Step 6: Perform the stratified sampling
df_2016_02_neu_sample = df_2016_02_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2016_02_neu_sample)} rows.")


Sampled 2394 rows.


In [24]:
text_gpt_2016_02_neu = '\n'.join(df_2016_02_neu_sample['text'].astype(str))

### Prompts and loops

In [25]:
prompt_2 = prompt_raw.replace('TWEETS', text_gpt_2016_02_neu)

In [26]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_2}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [27]:
response.choices[0].message.content

'The texts discuss the development of vaccines against the Zika virus, highlighting efforts by institutions like Fiocruz and Butantan, and international collaborations, particularly between Brazil and the USA. There is emphasis on the timeframes for vaccine readiness, varying from one to five years, and the ongoing challenges faced in vaccine research and funding, amidst public health concerns related to Zika and its effects.'

In [28]:
#summary = []
#dataset = []

In [29]:
dataset.append("2016_02_neutral")

In [30]:
summary.append(response.choices[0].message.content)

In [31]:
df_2016_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  2 non-null      object
 1   summary  2 non-null      object
dtypes: object(2)
memory usage: 160.0+ bytes


In [32]:
df_2016_neu.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_neutral,The Ministry of Health of Brazil announced upd...
1,2016_02_neutral,The texts discuss the development of vaccines ...


In [33]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcuRS2OuJA9nSFzDT80qnkecNUUv at 0x22286736360> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The texts discuss the development of vaccines against the Zika virus, highlighting efforts by institutions like Fiocruz and Butantan, and international collaborations, particularly between Brazil and the USA. There is emphasis on the timeframes for vaccine readiness, varying from one to five years, and the ongoing challenges faced in vaccine research and funding, amidst public health concerns related to Zika and its effects.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862771,
  "id": "chatcmpl-AUcuRS2OuJA9nSFzDT80qnkecNUUv",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 77,
    "completion_tokens_details": {
     

## March

In [34]:
# Get monthly data
df_2016_03_neu = pd.read_csv('../data/local/df_neu_2016-03.csv').reset_index()
df_2016_03_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3726 entries, 0 to 3725
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3726 non-null   int64  
 1   LocationCode            3726 non-null   object 
 2   LocationName            3726 non-null   object 
 3   StatesCode              3726 non-null   object 
 4   ...4                    3726 non-null   int64  
 5   id                      3726 non-null   float64
 6   text                    3726 non-null   object 
 7   label                   3726 non-null   object 
 8   in_reply_to_status_id   233 non-null    float64
 9   in_reply_to_user_id     272 non-null    float64
 10  quoted_user_id          59 non-null     float64
 11  quoted_status_id        59 non-null     float64
 12  retweeted_user_id       899 non-null    float64
 13  retweeted_status_id     899 non-null    float64
 14  created_at              3726 non-null   

### Representative sample based on week and location

In [35]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_03_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_03_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w09,Amapa,2,0.000537
1,2016-w09,Amazonas,2,0.000537
2,2016-w09,Bahia,14,0.003757
3,2016-w09,Ceara,4,0.001074
4,2016-w09,Distrito Federal,9,0.002415
...,...,...,...,...
110,2016-w13,Roraima,6,0.001610
111,2016-w13,Santa Catarina,61,0.016371
112,2016-w13,Sao Paulo,373,0.100107
113,2016-w13,Sergipe,19,0.005099


In [36]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w09,Amapa,2,0.000537,1
1,2016-w09,Amazonas,2,0.000537,1
2,2016-w09,Bahia,14,0.003757,9
3,2016-w09,Ceara,4,0.001074,2
4,2016-w09,Distrito Federal,9,0.002415,5
...,...,...,...,...,...
110,2016-w13,Roraima,6,0.001610,3
111,2016-w13,Santa Catarina,61,0.016371,40
112,2016-w13,Sao Paulo,373,0.100107,245
113,2016-w13,Sergipe,19,0.005099,12


In [37]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_03_neu_sample_size = pd.merge(df_2016_03_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [38]:
# Step 6: Perform the stratified sampling
df_2016_03_neu_sample = df_2016_03_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2016_03_neu_sample)} rows.")


Sampled 2389 rows.


In [39]:
text_gpt_2016_03_neu = '\n'.join(df_2016_03_neu_sample['text'].astype(str))

### Prompts and loops

In [40]:
prompt_3 = prompt_raw.replace('TWEETS', text_gpt_2016_03_neu)

In [41]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_3}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [42]:
response.choices[0].message.content

'The text discusses various vaccination campaigns in Brazil, including those against foot-and-mouth disease, rabies, HPV, and H1N1. It highlights the urgency of vaccination due to outbreaks, public health initiatives, and the need for timely vaccination to prevent diseases. Additionally, it features personal anecdotes about taking pets for vaccination and the challenges faced by individuals regarding their vaccinations.'

In [43]:
#summary = []
#dataset = []

In [44]:
dataset.append("2016_03_neutral")

In [45]:
summary.append(response.choices[0].message.content)

In [46]:
df_2016_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  3 non-null      object
 1   summary  3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes


In [47]:
df_2016_neu.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_neutral,The Ministry of Health of Brazil announced upd...
1,2016_02_neutral,The texts discuss the development of vaccines ...
2,2016_03_neutral,The text discusses various vaccination campaig...


In [48]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcuXnhm1gczodewS4cf2I9lGOy3f at 0x2228675f5e0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns in Brazil, including those against foot-and-mouth disease, rabies, HPV, and H1N1. It highlights the urgency of vaccination due to outbreaks, public health initiatives, and the need for timely vaccination to prevent diseases. Additionally, it features personal anecdotes about taking pets for vaccination and the challenges faced by individuals regarding their vaccinations.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862777,
  "id": "chatcmpl-AUcuXnhm1gczodewS4cf2I9lGOy3f",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 73,
    "completion_tokens_details": {
      "acce

## April

In [49]:
# Get monthly data
df_2016_04_neu = pd.read_csv('../data/local/df_neu_2016-04.csv').reset_index()
df_2016_04_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16001 entries, 0 to 16000
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   16001 non-null  int64  
 1   LocationCode            16001 non-null  object 
 2   LocationName            16001 non-null  object 
 3   StatesCode              16001 non-null  object 
 4   ...4                    16001 non-null  int64  
 5   id                      16001 non-null  float64
 6   text                    16001 non-null  object 
 7   label                   16001 non-null  object 
 8   in_reply_to_status_id   595 non-null    float64
 9   in_reply_to_user_id     933 non-null    float64
 10  quoted_user_id          165 non-null    float64
 11  quoted_status_id        165 non-null    float64
 12  retweeted_user_id       2844 non-null   float64
 13  retweeted_status_id     2844 non-null   float64
 14  created_at              16001 non-null

### Representative sample based on week and location

In [50]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_04_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_04_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w14,Acre,13,0.000812
1,2016-w14,Alagoas,48,0.003000
2,2016-w14,Amapa,57,0.003562
3,2016-w14,Amazonas,40,0.002500
4,2016-w14,Bahia,166,0.010374
...,...,...,...,...
123,2016-w18,Roraima,13,0.000812
124,2016-w18,Santa Catarina,47,0.002937
125,2016-w18,Sao Paulo,221,0.013812
126,2016-w18,Sergipe,43,0.002687


In [51]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w14,Acre,13,0.000812,1
1,2016-w14,Alagoas,48,0.003000,7
2,2016-w14,Amapa,57,0.003562,8
3,2016-w14,Amazonas,40,0.002500,6
4,2016-w14,Bahia,166,0.010374,25
...,...,...,...,...,...
123,2016-w18,Roraima,13,0.000812,1
124,2016-w18,Santa Catarina,47,0.002937,7
125,2016-w18,Sao Paulo,221,0.013812,33
126,2016-w18,Sergipe,43,0.002687,6


In [52]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_04_neu_sample_size = pd.merge(df_2016_04_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [53]:
# Step 6: Perform the stratified sampling
df_2016_04_neu_sample = df_2016_04_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2016_04_neu_sample)} rows.")


Sampled 2381 rows.


In [54]:
text_gpt_2016_04_neu = '\n'.join(df_2016_04_neu_sample['text'].astype(str))

### Prompts and loops

In [55]:
prompt_4 = prompt_raw.replace('TWEETS', text_gpt_2016_04_neu)

In [56]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_4}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [57]:
response.choices[0].message.content

'The texts discuss various vaccination campaigns against H1N1 and rabies in Brazil. They highlight the urgency for vaccinations due to rising flu cases, the long lines at clinics, and public health efforts to immunize vulnerable populations. The government is distributing vaccines, with specific dates and locations announced for public access.'

In [58]:
#summary = []
#dataset = []

In [59]:
dataset.append("2016_04_neutral")

In [60]:
summary.append(response.choices[0].message.content)

In [61]:
df_2016_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  4 non-null      object
 1   summary  4 non-null      object
dtypes: object(2)
memory usage: 192.0+ bytes


In [62]:
df_2016_neu.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_neutral,The Ministry of Health of Brazil announced upd...
1,2016_02_neutral,The texts discuss the development of vaccines ...
2,2016_03_neutral,The text discusses various vaccination campaig...
3,2016_04_neutral,The texts discuss various vaccination campaign...


In [63]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcudcftv8PLtueBe67Fda9BDP9LB at 0x22288587860> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The texts discuss various vaccination campaigns against H1N1 and rabies in Brazil. They highlight the urgency for vaccinations due to rising flu cases, the long lines at clinics, and public health efforts to immunize vulnerable populations. The government is distributing vaccines, with specific dates and locations announced for public access.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862783,
  "id": "chatcmpl-AUcudcftv8PLtueBe67Fda9BDP9LB",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 61,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "reasoning_tokens":

## May

In [64]:
# Get monthly data
df_2016_05_neu = pd.read_csv('../data/local/df_neu_2016-05.csv').reset_index()
df_2016_05_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10773 entries, 0 to 10772
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   10773 non-null  int64  
 1   LocationCode            10773 non-null  object 
 2   LocationName            10773 non-null  object 
 3   StatesCode              10773 non-null  object 
 4   ...4                    10773 non-null  int64  
 5   id                      10773 non-null  float64
 6   text                    10773 non-null  object 
 7   label                   10773 non-null  object 
 8   in_reply_to_status_id   554 non-null    float64
 9   in_reply_to_user_id     952 non-null    float64
 10  quoted_user_id          141 non-null    float64
 11  quoted_status_id        141 non-null    float64
 12  retweeted_user_id       1344 non-null   float64
 13  retweeted_status_id     1344 non-null   float64
 14  created_at              10773 non-null

### Representative sample based on week and location

In [65]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_05_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_05_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w18,Acre,6,0.000557
1,2016-w18,Alagoas,22,0.002042
2,2016-w18,Amapa,12,0.001114
3,2016-w18,Amazonas,46,0.004270
4,2016-w18,Bahia,178,0.016523
...,...,...,...,...
119,2016-w22,Rondonia,7,0.000650
120,2016-w22,Roraima,1,0.000093
121,2016-w22,Santa Catarina,11,0.001021
122,2016-w22,Sao Paulo,259,0.024042


In [66]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w18,Acre,6,0.000557,1
1,2016-w18,Alagoas,22,0.002042,5
2,2016-w18,Amapa,12,0.001114,2
3,2016-w18,Amazonas,46,0.004270,10
4,2016-w18,Bahia,178,0.016523,40
...,...,...,...,...,...
119,2016-w22,Rondonia,7,0.000650,1
120,2016-w22,Roraima,1,0.000093,0
121,2016-w22,Santa Catarina,11,0.001021,2
122,2016-w22,Sao Paulo,259,0.024042,58


In [67]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_05_neu_sample_size = pd.merge(df_2016_05_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [68]:
# Step 6: Perform the stratified sampling
df_2016_05_neu_sample = df_2016_05_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2016_05_neu_sample)} rows.")


Sampled 2387 rows.


In [69]:
text_gpt_2016_05_neu = '\n'.join(df_2016_05_neu_sample['text'].astype(str))

### Prompts and loops

In [70]:
prompt_5 = prompt_raw.replace('TWEETS', text_gpt_2016_05_neu)

In [71]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_5}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [72]:
response.choices[0].message.content

'The provided texts discuss various vaccination campaigns in Brazil, including those against H1N1, influenza, and foot-and-mouth disease. They highlight the importance of vaccination, reports of vaccine shortages in several cities, efforts to increase public awareness, and specific vaccination events. The overall message emphasizes the need for proper vaccination coverage and ongoing public health initiatives.'

In [73]:
#summary = []
#dataset = []

In [74]:
dataset.append("2016_05_neutral")

In [75]:
summary.append(response.choices[0].message.content)

In [76]:
df_2016_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  5 non-null      object
 1   summary  5 non-null      object
dtypes: object(2)
memory usage: 208.0+ bytes


In [77]:
df_2016_neu.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_neutral,The Ministry of Health of Brazil announced upd...
1,2016_02_neutral,The texts discuss the development of vaccines ...
2,2016_03_neutral,The text discusses various vaccination campaig...
3,2016_04_neutral,The texts discuss various vaccination campaign...
4,2016_05_neutral,The provided texts discuss various vaccination...


In [78]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcujVubrAY7xIDfbiIuGOwA29rtA at 0x222893514a0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The provided texts discuss various vaccination campaigns in Brazil, including those against H1N1, influenza, and foot-and-mouth disease. They highlight the importance of vaccination, reports of vaccine shortages in several cities, efforts to increase public awareness, and specific vaccination events. The overall message emphasizes the need for proper vaccination coverage and ongoing public health initiatives.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862789,
  "id": "chatcmpl-AUcujVubrAY7xIDfbiIuGOwA29rtA",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 68,
    "completion_tokens_details": {
      "accepted_predi

## June

In [79]:
# Get monthly data
df_2016_06_neu = pd.read_csv('../data/local/df_neu_2016-06.csv').reset_index()
df_2016_06_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3857 entries, 0 to 3856
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3857 non-null   int64  
 1   LocationCode            3857 non-null   object 
 2   LocationName            3857 non-null   object 
 3   StatesCode              3857 non-null   object 
 4   ...4                    3857 non-null   int64  
 5   id                      3857 non-null   float64
 6   text                    3857 non-null   object 
 7   label                   3857 non-null   object 
 8   in_reply_to_status_id   246 non-null    float64
 9   in_reply_to_user_id     297 non-null    float64
 10  quoted_user_id          40 non-null     float64
 11  quoted_status_id        40 non-null     float64
 12  retweeted_user_id       482 non-null    float64
 13  retweeted_status_id     482 non-null    float64
 14  created_at              3857 non-null   

### Representative sample based on week and location

In [80]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_06_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_06_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w22,Acre,1,0.000259
1,2016-w22,Amapa,1,0.000259
2,2016-w22,Amazonas,4,0.001037
3,2016-w22,Bahia,9,0.002333
4,2016-w22,Ceara,12,0.003111
...,...,...,...,...
111,2016-w26,Roraima,2,0.000519
112,2016-w26,Santa Catarina,14,0.003630
113,2016-w26,Sao Paulo,88,0.022816
114,2016-w26,Sergipe,1,0.000259


In [81]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w22,Acre,1,0.000259,0
1,2016-w22,Amapa,1,0.000259,0
2,2016-w22,Amazonas,4,0.001037,2
3,2016-w22,Bahia,9,0.002333,5
4,2016-w22,Ceara,12,0.003111,7
...,...,...,...,...,...
111,2016-w26,Roraima,2,0.000519,1
112,2016-w26,Santa Catarina,14,0.003630,8
113,2016-w26,Sao Paulo,88,0.022816,55
114,2016-w26,Sergipe,1,0.000259,0


In [82]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_06_neu_sample_size = pd.merge(df_2016_06_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [83]:
# Step 6: Perform the stratified sampling
df_2016_06_neu_sample = df_2016_06_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2016_06_neu_sample)} rows.")


Sampled 2390 rows.


In [84]:
text_gpt_2016_06_neu = '\n'.join(df_2016_06_neu_sample['text'].astype(str))

### Prompts and loops

In [85]:
prompt_6 = prompt_raw.replace('TWEETS', text_gpt_2016_06_neu)

In [86]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_6}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [87]:
response.choices[0].message.content

'The texts discuss various vaccination campaigns and updates in Brazil, focusing on the flu, rabies, and other diseases. They mention extensions of vaccination deadlines, requirements for proof of vaccination for travel, and the introduction of new vaccines. Additionally, there are references to personal experiences and public health initiatives aimed at increasing vaccination rates among different populations.'

In [88]:
#summary = []
#dataset = []

In [89]:
dataset.append("2016_06_neutral")

In [90]:
summary.append(response.choices[0].message.content)

In [91]:
df_2016_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  6 non-null      object
 1   summary  6 non-null      object
dtypes: object(2)
memory usage: 224.0+ bytes


In [92]:
df_2016_neu.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_neutral,The Ministry of Health of Brazil announced upd...
1,2016_02_neutral,The texts discuss the development of vaccines ...
2,2016_03_neutral,The text discusses various vaccination campaig...
3,2016_04_neutral,The texts discuss various vaccination campaign...
4,2016_05_neutral,The provided texts discuss various vaccination...
5,2016_06_neutral,The texts discuss various vaccination campaign...


In [93]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcuppfu0UmDYMZ0A2lZBy3j0Zopf at 0x222866c7720> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The texts discuss various vaccination campaigns and updates in Brazil, focusing on the flu, rabies, and other diseases. They mention extensions of vaccination deadlines, requirements for proof of vaccination for travel, and the introduction of new vaccines. Additionally, there are references to personal experiences and public health initiatives aimed at increasing vaccination rates among different populations.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862795,
  "id": "chatcmpl-AUcuppfu0UmDYMZ0A2lZBy3j0Zopf",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_f8dd36be86",
  "usage": {
    "completion_tokens": 66,
    "completion_tokens_details": {
      "accepted_pred

## July

In [94]:
# Get monthly data
df_2016_07_neu = pd.read_csv('../data/local/df_neu_2016-07.csv').reset_index()
df_2016_07_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3734 entries, 0 to 3733
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3734 non-null   int64  
 1   LocationCode            3734 non-null   object 
 2   LocationName            3734 non-null   object 
 3   StatesCode              3734 non-null   object 
 4   ...4                    3734 non-null   int64  
 5   id                      3734 non-null   float64
 6   text                    3734 non-null   object 
 7   label                   3734 non-null   object 
 8   in_reply_to_status_id   215 non-null    float64
 9   in_reply_to_user_id     239 non-null    float64
 10  quoted_user_id          44 non-null     float64
 11  quoted_status_id        44 non-null     float64
 12  retweeted_user_id       630 non-null    float64
 13  retweeted_status_id     630 non-null    float64
 14  created_at              3734 non-null   

### Representative sample based on week and location

In [95]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_07_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_07_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w27,Acre,2,0.000536
1,2016-w27,Alagoas,10,0.002678
2,2016-w27,Amapa,10,0.002678
3,2016-w27,Amazonas,20,0.005356
4,2016-w27,Bahia,53,0.014194
...,...,...,...,...
111,2016-w31,Rio Grande Do Sul,23,0.006160
112,2016-w31,Roraima,1,0.000268
113,2016-w31,Santa Catarina,9,0.002410
114,2016-w31,Sao Paulo,54,0.014462


In [96]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w27,Acre,2,0.000536,1
1,2016-w27,Alagoas,10,0.002678,6
2,2016-w27,Amapa,10,0.002678,6
3,2016-w27,Amazonas,20,0.005356,13
4,2016-w27,Bahia,53,0.014194,34
...,...,...,...,...,...
111,2016-w31,Rio Grande Do Sul,23,0.006160,15
112,2016-w31,Roraima,1,0.000268,0
113,2016-w31,Santa Catarina,9,0.002410,5
114,2016-w31,Sao Paulo,54,0.014462,35


In [97]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_07_neu_sample_size = pd.merge(df_2016_07_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [98]:
# Step 6: Perform the stratified sampling
df_2016_07_neu_sample = df_2016_07_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2016_07_neu_sample)} rows.")


Sampled 2390 rows.


In [99]:
text_gpt_2016_07_neu = '\n'.join(df_2016_07_neu_sample['text'].astype(str))

### Prompts and loops

In [100]:
prompt_7 = prompt_raw.replace('TWEETS', text_gpt_2016_07_neu)

In [101]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_7}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [102]:
response.choices[0].message.content

'The text covers various topics related to vaccination in Brazil, including the scheduling and availability of vaccines against diseases like polio, dengue, and HPV. It highlights upcoming vaccination campaigns, the importance of vaccination for children and animals, and mentions the costs associated with some vaccines. Additionally, it addresses public health concerns and the need for community participation in vaccination efforts.'

In [103]:
#summary = []
#dataset = []

In [104]:
dataset.append("2016_07_neutral")

In [105]:
summary.append(response.choices[0].message.content)

In [106]:
df_2016_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  7 non-null      object
 1   summary  7 non-null      object
dtypes: object(2)
memory usage: 240.0+ bytes


In [107]:
df_2016_neu.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_neutral,The Ministry of Health of Brazil announced upd...
1,2016_02_neutral,The texts discuss the development of vaccines ...
2,2016_03_neutral,The text discusses various vaccination campaig...
3,2016_04_neutral,The texts discuss various vaccination campaign...
4,2016_05_neutral,The provided texts discuss various vaccination...
5,2016_06_neutral,The texts discuss various vaccination campaign...
6,2016_07_neutral,The text covers various topics related to vacc...


In [108]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcuv4m8EGcVyJh22lCyX5Sy8B3N3 at 0x2228b2129a0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text covers various topics related to vaccination in Brazil, including the scheduling and availability of vaccines against diseases like polio, dengue, and HPV. It highlights upcoming vaccination campaigns, the importance of vaccination for children and animals, and mentions the costs associated with some vaccines. Additionally, it addresses public health concerns and the need for community participation in vaccination efforts.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862801,
  "id": "chatcmpl-AUcuv4m8EGcVyJh22lCyX5Sy8B3N3",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 70,
    "completion_tokens_details": 

## August

In [109]:
# Get monthly data
df_2016_08_neu = pd.read_csv('../data/local/df_neu_2016-08.csv').reset_index()
df_2016_08_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3282 entries, 0 to 3281
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3282 non-null   int64  
 1   LocationCode            3282 non-null   object 
 2   LocationName            3282 non-null   object 
 3   StatesCode              3282 non-null   object 
 4   ...4                    3282 non-null   int64  
 5   id                      3282 non-null   float64
 6   text                    3282 non-null   object 
 7   label                   3282 non-null   object 
 8   in_reply_to_status_id   215 non-null    float64
 9   in_reply_to_user_id     235 non-null    float64
 10  quoted_user_id          36 non-null     float64
 11  quoted_status_id        36 non-null     float64
 12  retweeted_user_id       458 non-null    float64
 13  retweeted_status_id     458 non-null    float64
 14  created_at              3282 non-null   

### Representative sample based on week and location

In [110]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_08_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_08_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w31,Acre,2,0.000609
1,2016-w31,Alagoas,13,0.003961
2,2016-w31,Amapa,4,0.001219
3,2016-w31,Amazonas,18,0.005484
4,2016-w31,Bahia,43,0.013102
...,...,...,...,...
112,2016-w35,Rio Grande Do Sul,45,0.013711
113,2016-w35,Rondonia,8,0.002438
114,2016-w35,Roraima,2,0.000609
115,2016-w35,Santa Catarina,6,0.001828


In [111]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w31,Acre,2,0.000609,1
1,2016-w31,Alagoas,13,0.003961,9
2,2016-w31,Amapa,4,0.001219,2
3,2016-w31,Amazonas,18,0.005484,13
4,2016-w31,Bahia,43,0.013102,32
...,...,...,...,...,...
112,2016-w35,Rio Grande Do Sul,45,0.013711,33
113,2016-w35,Rondonia,8,0.002438,5
114,2016-w35,Roraima,2,0.000609,1
115,2016-w35,Santa Catarina,6,0.001828,4


In [112]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_08_neu_sample_size = pd.merge(df_2016_08_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [113]:
# Step 6: Perform the stratified sampling
df_2016_08_neu_sample = df_2016_08_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2016_08_neu_sample)} rows.")


Sampled 2385 rows.


In [114]:
text_gpt_2016_08_neu = '\n'.join(df_2016_08_neu_sample['text'].astype(str))

### Prompts and loops

In [115]:
prompt_8 = prompt_raw.replace('TWEETS', text_gpt_2016_08_neu)

In [116]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_8}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [117]:
response.choices[0].message.content

'The articles discuss various vaccination initiatives in Brazil, particularly focusing on the intensification of HPV vaccination in schools, the introduction and sales of dengue vaccines at high prices, and the ongoing vaccination campaigns against rabies and other diseases. Concerns about the affordability of vaccines and low vaccination coverage rates are also highlighted. Additionally, there are mentions of new vaccine trials for Zika and dengue, alongside calls for reporting abusive pricing practices.'

In [118]:
#summary = []
#dataset = []

In [119]:
dataset.append("2016_08_neutral")

In [120]:
summary.append(response.choices[0].message.content)

In [121]:
df_2016_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  8 non-null      object
 1   summary  8 non-null      object
dtypes: object(2)
memory usage: 256.0+ bytes


In [122]:
df_2016_neu.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_neutral,The Ministry of Health of Brazil announced upd...
1,2016_02_neutral,The texts discuss the development of vaccines ...
2,2016_03_neutral,The text discusses various vaccination campaig...
3,2016_04_neutral,The texts discuss various vaccination campaign...
4,2016_05_neutral,The provided texts discuss various vaccination...
5,2016_06_neutral,The texts discuss various vaccination campaign...
6,2016_07_neutral,The text covers various topics related to vacc...
7,2016_08_neutral,The articles discuss various vaccination initi...


In [123]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcv0VG3rraTmqon8ZrPhGAVWKCTt at 0x2228b696ae0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The articles discuss various vaccination initiatives in Brazil, particularly focusing on the intensification of HPV vaccination in schools, the introduction and sales of dengue vaccines at high prices, and the ongoing vaccination campaigns against rabies and other diseases. Concerns about the affordability of vaccines and low vaccination coverage rates are also highlighted. Additionally, there are mentions of new vaccine trials for Zika and dengue, alongside calls for reporting abusive pricing practices.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862806,
  "id": "chatcmpl-AUcv0VG3rraTmqon8ZrPhGAVWKCTt",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
 

## September

In [124]:
# Get monthly data
df_2016_09_neu = pd.read_csv('../data/local/df_neu_2016-09.csv').reset_index()
df_2016_09_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5507 entries, 0 to 5506
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   5507 non-null   int64  
 1   LocationCode            5507 non-null   object 
 2   LocationName            5507 non-null   object 
 3   StatesCode              5507 non-null   object 
 4   ...4                    5507 non-null   int64  
 5   id                      5507 non-null   float64
 6   text                    5507 non-null   object 
 7   label                   5507 non-null   object 
 8   in_reply_to_status_id   314 non-null    float64
 9   in_reply_to_user_id     335 non-null    float64
 10  quoted_user_id          67 non-null     float64
 11  quoted_status_id        67 non-null     float64
 12  retweeted_user_id       848 non-null    float64
 13  retweeted_status_id     848 non-null    float64
 14  created_at              5507 non-null   

### Representative sample based on week and location

In [125]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_09_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_09_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w35,Acre,1,0.000182
1,2016-w35,Amapa,1,0.000182
2,2016-w35,Bahia,2,0.000363
3,2016-w35,Ceara,2,0.000363
4,2016-w35,Distrito Federal,1,0.000182
...,...,...,...,...
129,2016-w40,Rio Grande Do Sul,35,0.006356
130,2016-w40,Rondonia,2,0.000363
131,2016-w40,Santa Catarina,5,0.000908
132,2016-w40,Sao Paulo,55,0.009987


In [126]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w35,Acre,1,0.000182,0
1,2016-w35,Amapa,1,0.000182,0
2,2016-w35,Bahia,2,0.000363,0
3,2016-w35,Ceara,2,0.000363,0
4,2016-w35,Distrito Federal,1,0.000182,0
...,...,...,...,...,...
129,2016-w40,Rio Grande Do Sul,35,0.006356,15
130,2016-w40,Rondonia,2,0.000363,0
131,2016-w40,Santa Catarina,5,0.000908,2
132,2016-w40,Sao Paulo,55,0.009987,24


In [127]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_09_neu_sample_size = pd.merge(df_2016_09_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [128]:
# Step 6: Perform the stratified sampling
df_2016_09_neu_sample = df_2016_09_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2016_09_neu_sample)} rows.")


Sampled 2383 rows.


In [129]:
text_gpt_2016_09_neu = '\n'.join(df_2016_09_neu_sample['text'].astype(str))

### Prompts and loops

In [130]:
prompt_9 = prompt_raw.replace('TWEETS', text_gpt_2016_09_neu)

In [131]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_9}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [132]:
response.choices[0].message.content

'The text discusses various vaccination campaigns in Brazil, including those for dogs and cats against rabies, and vaccinations for children and adolescents. It mentions specific dates and locations for vaccination events, such as the "Dia D" for rabies vaccination and the national campaign for updating vaccination records. The text reflects public reactions, experiences with vaccinations, and the importance of compliance with vaccination schedules.'

In [133]:
#summary = []
#dataset = []

In [134]:
dataset.append("2016_09_neutral")

In [135]:
summary.append(response.choices[0].message.content)

In [136]:
df_2016_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  9 non-null      object
 1   summary  9 non-null      object
dtypes: object(2)
memory usage: 272.0+ bytes


In [137]:
df_2016_neu.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_neutral,The Ministry of Health of Brazil announced upd...
1,2016_02_neutral,The texts discuss the development of vaccines ...
2,2016_03_neutral,The text discusses various vaccination campaig...
3,2016_04_neutral,The texts discuss various vaccination campaign...
4,2016_05_neutral,The provided texts discuss various vaccination...
5,2016_06_neutral,The texts discuss various vaccination campaign...
6,2016_07_neutral,The text covers various topics related to vacc...
7,2016_08_neutral,The articles discuss various vaccination initi...
8,2016_09_neutral,The text discusses various vaccination campaig...


In [138]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcv7zOYZug4JoKhSltD9yOpUR1dH at 0x222898167c0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns in Brazil, including those for dogs and cats against rabies, and vaccinations for children and adolescents. It mentions specific dates and locations for vaccination events, such as the \"Dia D\" for rabies vaccination and the national campaign for updating vaccination records. The text reflects public reactions, experiences with vaccinations, and the importance of compliance with vaccination schedules.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862813,
  "id": "chatcmpl-AUcv7zOYZug4JoKhSltD9yOpUR1dH",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 75,
    "completio

## October

In [139]:
# Get monthly data
df_2016_10_neu = pd.read_csv('../data/local/df_neu_2016-10.csv').reset_index()
df_2016_10_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3508 entries, 0 to 3507
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3508 non-null   int64  
 1   LocationCode            3508 non-null   object 
 2   LocationName            3508 non-null   object 
 3   StatesCode              3508 non-null   object 
 4   ...4                    3508 non-null   int64  
 5   id                      3508 non-null   float64
 6   text                    3508 non-null   object 
 7   label                   3508 non-null   object 
 8   in_reply_to_status_id   254 non-null    float64
 9   in_reply_to_user_id     277 non-null    float64
 10  quoted_user_id          60 non-null     float64
 11  quoted_status_id        60 non-null     float64
 12  retweeted_user_id       396 non-null    float64
 13  retweeted_status_id     396 non-null    float64
 14  created_at              3508 non-null   

### Representative sample based on week and location

In [140]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_10_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_10_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w40,Acre,2,0.000570
1,2016-w40,Alagoas,6,0.001710
2,2016-w40,Amapa,3,0.000855
3,2016-w40,Amazonas,26,0.007412
4,2016-w40,Bahia,27,0.007697
...,...,...,...,...
115,2016-w44,Roraima,4,0.001140
116,2016-w44,Santa Catarina,3,0.000855
117,2016-w44,Sao Paulo,61,0.017389
118,2016-w44,Sergipe,1,0.000285


In [141]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w40,Acre,2,0.000570,1
1,2016-w40,Alagoas,6,0.001710,4
2,2016-w40,Amapa,3,0.000855,2
3,2016-w40,Amazonas,26,0.007412,18
4,2016-w40,Bahia,27,0.007697,18
...,...,...,...,...,...
115,2016-w44,Roraima,4,0.001140,2
116,2016-w44,Santa Catarina,3,0.000855,2
117,2016-w44,Sao Paulo,61,0.017389,42
118,2016-w44,Sergipe,1,0.000285,0


In [142]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_10_neu_sample_size = pd.merge(df_2016_10_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [143]:
# Step 6: Perform the stratified sampling
df_2016_10_neu_sample = df_2016_10_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2016_10_neu_sample)} rows.")


Sampled 2390 rows.


In [144]:
text_gpt_2016_10_neu = '\n'.join(df_2016_10_neu_sample['text'].astype(str))

### Prompts and loops

In [145]:
prompt_10 = prompt_raw.replace('TWEETS', text_gpt_2016_10_neu)

In [146]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_10}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [147]:
response.choices[0].message.content

'The text contains various references to vaccination campaigns in Brazil, highlighting the importance of vaccines for children, animals, and specific diseases like HPV, rabies, and foot-and-mouth disease. It discusses public health initiatives, personal experiences with vaccination, and mentions upcoming vaccination events and requirements. The overall sentiment includes humor and anxiety about receiving vaccines, particularly in children and pets.'

In [148]:
#summary = []
#dataset = []

In [149]:
dataset.append("2016_10_neutral")

In [150]:
summary.append(response.choices[0].message.content)

In [151]:
df_2016_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  10 non-null     object
 1   summary  10 non-null     object
dtypes: object(2)
memory usage: 288.0+ bytes


In [152]:
df_2016_neu.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_neutral,The Ministry of Health of Brazil announced upd...
1,2016_02_neutral,The texts discuss the development of vaccines ...
2,2016_03_neutral,The text discusses various vaccination campaig...
3,2016_04_neutral,The texts discuss various vaccination campaign...
4,2016_05_neutral,The provided texts discuss various vaccination...
5,2016_06_neutral,The texts discuss various vaccination campaign...
6,2016_07_neutral,The text covers various topics related to vacc...
7,2016_08_neutral,The articles discuss various vaccination initi...
8,2016_09_neutral,The text discusses various vaccination campaig...
9,2016_10_neutral,The text contains various references to vaccin...


In [153]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcvD3ZrPx4bQTauhIRNMAJ3Vdxq5 at 0x2228c987540> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text contains various references to vaccination campaigns in Brazil, highlighting the importance of vaccines for children, animals, and specific diseases like HPV, rabies, and foot-and-mouth disease. It discusses public health initiatives, personal experiences with vaccination, and mentions upcoming vaccination events and requirements. The overall sentiment includes humor and anxiety about receiving vaccines, particularly in children and pets.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862819,
  "id": "chatcmpl-AUcvD3ZrPx4bQTauhIRNMAJ3Vdxq5",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 72,
    "completion_t

## November

In [154]:
# Get monthly data
df_2016_11_neu = pd.read_csv('../data/local/df_neu_2016-11.csv').reset_index()
df_2016_11_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3039 entries, 0 to 3038
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3039 non-null   int64  
 1   LocationCode            3039 non-null   object 
 2   LocationName            3039 non-null   object 
 3   StatesCode              3039 non-null   object 
 4   ...4                    3039 non-null   int64  
 5   id                      3039 non-null   float64
 6   text                    3039 non-null   object 
 7   label                   3039 non-null   object 
 8   in_reply_to_status_id   245 non-null    float64
 9   in_reply_to_user_id     256 non-null    float64
 10  quoted_user_id          80 non-null     float64
 11  quoted_status_id        80 non-null     float64
 12  retweeted_user_id       508 non-null    float64
 13  retweeted_status_id     508 non-null    float64
 14  created_at              3039 non-null   

### Representative sample based on week and location

In [155]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_11_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_11_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w44,Acre,3,0.000987
1,2016-w44,Alagoas,3,0.000987
2,2016-w44,Amazonas,16,0.005265
3,2016-w44,Bahia,39,0.012833
4,2016-w44,Ceara,20,0.006581
...,...,...,...,...
113,2016-w48,Roraima,1,0.000329
114,2016-w48,Santa Catarina,4,0.001316
115,2016-w48,Sao Paulo,55,0.018098
116,2016-w48,Sergipe,3,0.000987


In [156]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w44,Acre,3,0.000987,2
1,2016-w44,Alagoas,3,0.000987,2
2,2016-w44,Amazonas,16,0.005265,12
3,2016-w44,Bahia,39,0.012833,31
4,2016-w44,Ceara,20,0.006581,16
...,...,...,...,...,...
113,2016-w48,Roraima,1,0.000329,0
114,2016-w48,Santa Catarina,4,0.001316,3
115,2016-w48,Sao Paulo,55,0.018098,44
116,2016-w48,Sergipe,3,0.000987,2


In [157]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_11_neu_sample_size = pd.merge(df_2016_11_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [158]:
# Step 6: Perform the stratified sampling
df_2016_11_neu_sample = df_2016_11_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2016_11_neu_sample)} rows.")


Sampled 2390 rows.


In [159]:
text_gpt_2016_11_neu = '\n'.join(df_2016_11_neu_sample['text'].astype(str))

### Prompts and loops

In [160]:
prompt_11 = prompt_raw.replace('TWEETS', text_gpt_2016_11_neu)

In [161]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_11}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [162]:
response.choices[0].message.content

'The text discusses vaccination campaigns in Brazil, focusing on mandatory vaccines for both humans and animals, particularly against foot-and-mouth disease and rabies. It highlights the importance of immunization, mentions specific vaccination phases, and shares personal experiences and opinions about getting vaccinated, including fears and preferences regarding vaccine administration.'

In [163]:
#summary = []
#dataset = []

In [164]:
dataset.append("2016_11_neutral")

In [165]:
summary.append(response.choices[0].message.content)

In [166]:
df_2016_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  11 non-null     object
 1   summary  11 non-null     object
dtypes: object(2)
memory usage: 304.0+ bytes


In [167]:
df_2016_neu.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_neutral,The Ministry of Health of Brazil announced upd...
1,2016_02_neutral,The texts discuss the development of vaccines ...
2,2016_03_neutral,The text discusses various vaccination campaig...
3,2016_04_neutral,The texts discuss various vaccination campaign...
4,2016_05_neutral,The provided texts discuss various vaccination...
5,2016_06_neutral,The texts discuss various vaccination campaign...
6,2016_07_neutral,The text covers various topics related to vacc...
7,2016_08_neutral,The articles discuss various vaccination initi...
8,2016_09_neutral,The text discusses various vaccination campaig...
9,2016_10_neutral,The text contains various references to vaccin...


In [168]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcvMx4zpYL1pEVuQHlkTjd9k2j0T at 0x2228ca89540> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses vaccination campaigns in Brazil, focusing on mandatory vaccines for both humans and animals, particularly against foot-and-mouth disease and rabies. It highlights the importance of immunization, mentions specific vaccination phases, and shares personal experiences and opinions about getting vaccinated, including fears and preferences regarding vaccine administration.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862828,
  "id": "chatcmpl-AUcvMx4zpYL1pEVuQHlkTjd9k2j0T",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 59,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      

## December

In [169]:
# Get monthly data
df_2016_12_neu = pd.read_csv('../data/local/df_neu_2016-12.csv').reset_index()
df_2016_12_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1737 entries, 0 to 1736
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1737 non-null   int64  
 1   LocationCode            1737 non-null   object 
 2   LocationName            1737 non-null   object 
 3   StatesCode              1737 non-null   object 
 4   ...4                    1737 non-null   int64  
 5   id                      1737 non-null   float64
 6   text                    1737 non-null   object 
 7   label                   1737 non-null   object 
 8   in_reply_to_status_id   230 non-null    float64
 9   in_reply_to_user_id     240 non-null    float64
 10  quoted_user_id          56 non-null     float64
 11  quoted_status_id        56 non-null     float64
 12  retweeted_user_id       285 non-null    float64
 13  retweeted_status_id     285 non-null    float64
 14  created_at              1737 non-null   

### Representative sample based on week and location

In [170]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_12_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_12_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w48,Amapa,2,0.001151
1,2016-w48,Amazonas,3,0.001727
2,2016-w48,Bahia,1,0.000576
3,2016-w48,Ceara,2,0.001151
4,2016-w48,Distrito Federal,7,0.004030
...,...,...,...,...
115,2016-w53,Parana,4,0.002303
116,2016-w53,Rio De Janeiro,1,0.000576
117,2016-w53,Rio Grande Do Sul,3,0.001727
118,2016-w53,Santa Catarina,2,0.001151


In [171]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w48,Amapa,2,0.001151,2
1,2016-w48,Amazonas,3,0.001727,4
2,2016-w48,Bahia,1,0.000576,1
3,2016-w48,Ceara,2,0.001151,2
4,2016-w48,Distrito Federal,7,0.004030,9
...,...,...,...,...,...
115,2016-w53,Parana,4,0.002303,5
116,2016-w53,Rio De Janeiro,1,0.000576,1
117,2016-w53,Rio Grande Do Sul,3,0.001727,4
118,2016-w53,Santa Catarina,2,0.001151,2


In [172]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_12_neu_sample_size = pd.merge(df_2016_12_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [174]:
# Step 6: Perform the stratified sampling
#df_2016_12_neu_sample = df_2016_12_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2016_12_neu_sample = df_2016_12_neu
# Step 7: Check the result
print(f"Sampled {len(df_2016_12_neu_sample)} rows.")


Sampled 1737 rows.


In [175]:
text_gpt_2016_12_neu = '\n'.join(df_2016_12_neu_sample['text'].astype(str))

### Prompts and loops

In [176]:
prompt_12 = prompt_raw.replace('TWEETS', text_gpt_2016_12_neu)

In [177]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_12}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [178]:
response.choices[0].message.content

'The text discusses various vaccination campaigns, including those for cattle against foot-and-mouth disease, rabies, and other diseases. It emphasizes the importance of vaccination schedules for both livestock and pets, mentions government initiatives to facilitate vaccinations, and highlights specific cases where vaccines are crucial for public health, like dengue and HPV.'

In [179]:
#summary = []
#dataset = []

In [180]:
dataset.append("2016_12_neutral")

In [181]:
summary.append(response.choices[0].message.content)

In [182]:
df_2016_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  12 non-null     object
 1   summary  12 non-null     object
dtypes: object(2)
memory usage: 320.0+ bytes


In [183]:
df_2016_neu.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_neutral,The Ministry of Health of Brazil announced upd...
1,2016_02_neutral,The texts discuss the development of vaccines ...
2,2016_03_neutral,The text discusses various vaccination campaig...
3,2016_04_neutral,The texts discuss various vaccination campaign...
4,2016_05_neutral,The provided texts discuss various vaccination...
5,2016_06_neutral,The texts discuss various vaccination campaign...
6,2016_07_neutral,The text covers various topics related to vacc...
7,2016_08_neutral,The articles discuss various vaccination initi...
8,2016_09_neutral,The text discusses various vaccination campaig...
9,2016_10_neutral,The text contains various references to vaccin...


In [184]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcwHwKAIBGd5LvjjSRb7U91n94yo at 0x22286672220> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns, including those for cattle against foot-and-mouth disease, rabies, and other diseases. It emphasizes the importance of vaccination schedules for both livestock and pets, mentions government initiatives to facilitate vaccinations, and highlights specific cases where vaccines are crucial for public health, like dengue and HPV.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862885,
  "id": "chatcmpl-AUcwHwKAIBGd5LvjjSRb7U91n94yo",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 61,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_token

## All 2016 neutral

In [185]:
#df_2016_neu['dataset'] = df_2016_neu['dataset'].str.replace('neuitive','neutral')

In [186]:
df_2016_neu.to_csv('../data/summary_2016_neutral_tweets.csv')