# Import libraries and data

In [1]:
# Import libraries
import os
import openai
import pandas as pd
import numpy as np
import openpyxl

In [2]:
# Get OpenAI API secret key (change this file with your own OpenAI key)
open_ai_key = open('../data/local/openai_key.txt', 'r').read()
openai.api_key = open_ai_key
#openai.Model.list()

In [3]:
# Prompt for GPT
prompt_raw = "I would like to summarise in maximum 300 characters and in English what the following sentences in Portuguese talk about. \
Do not use aspects mentioned only in one sentence as part of the summary: TWEETS"

# 2014 positive

## January

In [4]:
# Get monthly data
df_2014_01_pos = pd.read_csv('../data/local/df_pos_2014-01.csv').reset_index()
df_2014_01_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2019 entries, 0 to 2018
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2019 non-null   int64  
 1   LocationCode            2019 non-null   object 
 2   LocationName            2019 non-null   object 
 3   StatesCode              2019 non-null   object 
 4   ...4                    2019 non-null   int64  
 5   id                      2019 non-null   float64
 6   text                    2019 non-null   object 
 7   label                   2019 non-null   object 
 8   in_reply_to_status_id   103 non-null    float64
 9   in_reply_to_user_id     111 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       558 non-null    float64
 13  retweeted_status_id     558 non-null    float64
 14  created_at              2019 non-null   

### Representative sample based on week and location

In [5]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_01_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_01_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w01,Amapa,3,0.001486
1,2014-w01,Amazonas,5,0.002476
2,2014-w01,Bahia,14,0.006934
3,2014-w01,Ceara,7,0.003467
4,2014-w01,Distrito Federal,10,0.004953
...,...,...,...,...
102,2014-w05,Rio De Janeiro,1,0.000495
103,2014-w05,Rio Grande Do Sul,23,0.011392
104,2014-w05,Rondonia,2,0.000991
105,2014-w05,Santa Catarina,6,0.002972


In [6]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w01,Amapa,3,0.001486,3
1,2014-w01,Amazonas,5,0.002476,6
2,2014-w01,Bahia,14,0.006934,16
3,2014-w01,Ceara,7,0.003467,8
4,2014-w01,Distrito Federal,10,0.004953,12
...,...,...,...,...,...
102,2014-w05,Rio De Janeiro,1,0.000495,1
103,2014-w05,Rio Grande Do Sul,23,0.011392,27
104,2014-w05,Rondonia,2,0.000991,2
105,2014-w05,Santa Catarina,6,0.002972,7


In [7]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_01_pos_sample_size = pd.merge(df_2014_01_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [8]:
# Step 6: Perform the stratified sampling
#df_2014_01_pos_sample = df_2014_01_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_01_pos_sample = df_2014_01_pos
# Step 7: Check the result
print(f"Sampled {len(df_2014_01_pos_sample)} rows.")


Sampled 2019 rows.


In [9]:
text_gpt_2014_01_pos = '\n'.join(df_2014_01_pos_sample['text'].astype(str))

### Prompts and loops

In [10]:
prompt_1 = prompt_raw.replace('TWEETS', text_gpt_2014_01_pos)

In [11]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_1}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [12]:
response.choices[0].message.content

'The provided texts discuss the introduction of a nasal vaccine for whooping cough, the successful testing of this vaccine in humans, and the implementation of a public vaccination campaign against HPV in Brazil starting March 10, 2014. The HPV vaccine will be offered for free to girls aged 11 to 13, aiming to prevent cervical cancer. Additionally, vaccination efforts are being made for Haitian immigrants and general vaccination initiatives within the health system.'

In [13]:
summary = []
dataset = []

In [14]:
dataset.append("2014_01_positive")

In [15]:
summary.append(response.choices[0].message.content)

In [16]:
df_2014_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  1 non-null      object
 1   summary  1 non-null      object
dtypes: object(2)
memory usage: 144.0+ bytes


In [17]:
df_2014_pos.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_positive,The provided texts discuss the introduction of...


In [18]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdHEF1clgwLmB8hT6JVMj4pSijRF at 0x1d359530400> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The provided texts discuss the introduction of a nasal vaccine for whooping cough, the successful testing of this vaccine in humans, and the implementation of a public vaccination campaign against HPV in Brazil starting March 10, 2014. The HPV vaccine will be offered for free to girls aged 11 to 13, aiming to prevent cervical cancer. Additionally, vaccination efforts are being made for Haitian immigrants and general vaccination initiatives within the health system.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864184,
  "id": "chatcmpl-AUdHEF1clgwLmB8hT6JVMj4pSijRF",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 87,

## February

In [19]:
# Get monthly data
df_2014_02_pos = pd.read_csv('../data/local/df_pos_2014-02.csv').reset_index()
df_2014_02_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2234 entries, 0 to 2233
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2234 non-null   int64  
 1   LocationCode            2234 non-null   object 
 2   LocationName            2234 non-null   object 
 3   StatesCode              2234 non-null   object 
 4   ...4                    2234 non-null   int64  
 5   id                      2234 non-null   float64
 6   text                    2234 non-null   object 
 7   label                   2234 non-null   object 
 8   in_reply_to_status_id   67 non-null     float64
 9   in_reply_to_user_id     89 non-null     float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       559 non-null    float64
 13  retweeted_status_id     559 non-null    float64
 14  created_at              2234 non-null   

### Representative sample based on week and location

In [20]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_02_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_02_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w05,Alagoas,4,0.001791
1,2014-w05,Amapa,1,0.000448
2,2014-w05,Amazonas,8,0.003581
3,2014-w05,Bahia,6,0.002686
4,2014-w05,Ceara,24,0.010743
...,...,...,...,...
106,2014-w09,Piaui,1,0.000448
107,2014-w09,Rio Grande Do Sul,11,0.004924
108,2014-w09,Santa Catarina,4,0.001791
109,2014-w09,Sao Paulo,36,0.016115


In [21]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w05,Alagoas,4,0.001791,4
1,2014-w05,Amapa,1,0.000448,1
2,2014-w05,Amazonas,8,0.003581,8
3,2014-w05,Bahia,6,0.002686,6
4,2014-w05,Ceara,24,0.010743,26
...,...,...,...,...,...
106,2014-w09,Piaui,1,0.000448,1
107,2014-w09,Rio Grande Do Sul,11,0.004924,12
108,2014-w09,Santa Catarina,4,0.001791,4
109,2014-w09,Sao Paulo,36,0.016115,39


In [22]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_02_pos_sample_size = pd.merge(df_2014_02_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [23]:
# Step 6: Perform the stratified sampling
#df_2014_02_pos_sample = df_2014_02_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_02_pos_sample = df_2014_02_pos
# Step 7: Check the result
print(f"Sampled {len(df_2014_02_pos_sample)} rows.")


Sampled 2234 rows.


In [24]:
text_gpt_2014_02_pos = '\n'.join(df_2014_02_pos_sample['text'].astype(str))

### Prompts and loops

In [25]:
prompt_2 = prompt_raw.replace('TWEETS', text_gpt_2014_02_pos)

In [26]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_2}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [27]:
response.choices[0].message.content

'The texts discuss various vaccination campaigns and research in Brazil, particularly focusing on the HPV vaccine for girls aged 11 to 13, which will be available in schools starting in March. Additionally, there are advancements in HIV vaccine research, with successful tests on monkeys and plans for human trials within three years.'

In [28]:
#summary = []
#dataset = []

In [29]:
dataset.append("2014_02_positive")

In [30]:
summary.append(response.choices[0].message.content)

In [31]:
df_2014_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  2 non-null      object
 1   summary  2 non-null      object
dtypes: object(2)
memory usage: 160.0+ bytes


In [32]:
df_2014_pos.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_positive,The provided texts discuss the introduction of...
1,2014_02_positive,The texts discuss various vaccination campaign...


In [33]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdHLvSeApov7mJRiUPnDNqUwviEQ at 0x1d359716e00> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The texts discuss various vaccination campaigns and research in Brazil, particularly focusing on the HPV vaccine for girls aged 11 to 13, which will be available in schools starting in March. Additionally, there are advancements in HIV vaccine research, with successful tests on monkeys and plans for human trials within three years.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864191,
  "id": "chatcmpl-AUdHLvSeApov7mJRiUPnDNqUwviEQ",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 60,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "reasoning_tokens": 0,
      "

## March

In [34]:
# Get monthly data
df_2014_03_pos = pd.read_csv('../data/local/df_pos_2014-03.csv').reset_index()
df_2014_03_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4045 entries, 0 to 4044
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   4045 non-null   int64  
 1   LocationCode            4045 non-null   object 
 2   LocationName            4045 non-null   object 
 3   StatesCode              4045 non-null   object 
 4   ...4                    4045 non-null   int64  
 5   id                      4045 non-null   float64
 6   text                    4045 non-null   object 
 7   label                   4045 non-null   object 
 8   in_reply_to_status_id   177 non-null    float64
 9   in_reply_to_user_id     234 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       937 non-null    float64
 13  retweeted_status_id     937 non-null    float64
 14  created_at              4045 non-null   

### Representative sample based on week and location

In [35]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_03_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_03_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w09,Alagoas,1,0.000247
1,2014-w09,Amapa,1,0.000247
2,2014-w09,Amazonas,5,0.001236
3,2014-w09,Bahia,13,0.003214
4,2014-w09,Ceara,8,0.001978
...,...,...,...,...
112,2014-w13,Roraima,7,0.001731
113,2014-w13,Santa Catarina,10,0.002472
114,2014-w13,Sao Paulo,88,0.021755
115,2014-w13,Sergipe,3,0.000742


In [36]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w09,Alagoas,1,0.000247,0
1,2014-w09,Amapa,1,0.000247,0
2,2014-w09,Amazonas,5,0.001236,3
3,2014-w09,Bahia,13,0.003214,7
4,2014-w09,Ceara,8,0.001978,4
...,...,...,...,...,...
112,2014-w13,Roraima,7,0.001731,4
113,2014-w13,Santa Catarina,10,0.002472,6
114,2014-w13,Sao Paulo,88,0.021755,53
115,2014-w13,Sergipe,3,0.000742,1


In [37]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_03_pos_sample_size = pd.merge(df_2014_03_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [38]:
# Step 6: Perform the stratified sampling
#df_2014_03_pos_sample = df_2014_03_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_03_pos_sample = df_2014_03_pos
# Step 7: Check the result
print(f"Sampled {len(df_2014_03_pos_sample)} rows.")


Sampled 4045 rows.


In [39]:
text_gpt_2014_03_pos = '\n'.join(df_2014_03_pos_sample['text'].astype(str))

### Prompts and loops

In [40]:
prompt_3 = prompt_raw.replace('TWEETS', text_gpt_2014_03_pos)

In [41]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_3}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [42]:
response.choices[0].message.content

'The text discusses the upcoming nationwide vaccination campaign in Brazil against HPV, targeting girls aged 11 to 13, starting from March 10. It emphasizes the importance of vaccination in preventing cervical cancer, highlights the availability of the vaccine through public health systems, and encourages parents to ensure their daughters receive the vaccine. Concerns about vaccine safety are addressed, with authorities assuring the public of its efficacy and necessity. The campaign aims to raise awareness and increase participation among the target age group.'

In [43]:
#summary = []
#dataset = []

In [44]:
dataset.append("2014_03_positive")

In [45]:
summary.append(response.choices[0].message.content)

In [46]:
df_2014_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  3 non-null      object
 1   summary  3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes


In [47]:
df_2014_pos.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_positive,The provided texts discuss the introduction of...
1,2014_02_positive,The texts discuss various vaccination campaign...
2,2014_03_positive,The text discusses the upcoming nationwide vac...


In [48]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdHS6SuYsWFOWfCrO1ZA0a9xbvl8 at 0x1d35971c680> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses the upcoming nationwide vaccination campaign in Brazil against HPV, targeting girls aged 11 to 13, starting from March 10. It emphasizes the importance of vaccination in preventing cervical cancer, highlights the availability of the vaccine through public health systems, and encourages parents to ensure their daughters receive the vaccine. Concerns about vaccine safety are addressed, with authorities assuring the public of its efficacy and necessity. The campaign aims to raise awareness and increase participation among the target age group.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864198,
  "id": "chatcmpl-AUdHS6SuYsWFOWfCrO1ZA0a9xbvl8",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat

## April

In [49]:
# Get monthly data
df_2014_04_pos = pd.read_csv('../data/local/df_pos_2014-04.csv').reset_index()
df_2014_04_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2761 entries, 0 to 2760
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2761 non-null   int64  
 1   LocationCode            2761 non-null   object 
 2   LocationName            2761 non-null   object 
 3   StatesCode              2761 non-null   object 
 4   ...4                    2761 non-null   int64  
 5   id                      2761 non-null   float64
 6   text                    2761 non-null   object 
 7   label                   2761 non-null   object 
 8   in_reply_to_status_id   86 non-null     float64
 9   in_reply_to_user_id     101 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       639 non-null    float64
 13  retweeted_status_id     639 non-null    float64
 14  created_at              2761 non-null   

### Representative sample based on week and location

In [50]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_04_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_04_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w13,Alagoas,3,0.001087
1,2014-w13,Bahia,10,0.003622
2,2014-w13,Ceara,2,0.000724
3,2014-w13,Distrito Federal,2,0.000724
4,2014-w13,Goias,1,0.000362
...,...,...,...,...
121,2014-w18,Pernambuco,3,0.001087
122,2014-w18,Rio Grande Do Sul,9,0.003260
123,2014-w18,Santa Catarina,5,0.001811
124,2014-w18,Sao Paulo,15,0.005433


In [51]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w13,Alagoas,3,0.001087,2
1,2014-w13,Bahia,10,0.003622,8
2,2014-w13,Ceara,2,0.000724,1
3,2014-w13,Distrito Federal,2,0.000724,1
4,2014-w13,Goias,1,0.000362,0
...,...,...,...,...,...
121,2014-w18,Pernambuco,3,0.001087,2
122,2014-w18,Rio Grande Do Sul,9,0.003260,7
123,2014-w18,Santa Catarina,5,0.001811,4
124,2014-w18,Sao Paulo,15,0.005433,13


In [52]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_04_pos_sample_size = pd.merge(df_2014_04_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [53]:
# Step 6: Perform the stratified sampling
#df_2014_04_pos_sample = df_2014_04_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_04_pos_sample = df_2014_04_pos
# Step 7: Check the result
print(f"Sampled {len(df_2014_04_pos_sample)} rows.")


Sampled 2761 rows.


In [54]:
text_gpt_2014_04_pos = '\n'.join(df_2014_04_pos_sample['text'].astype(str))

### Prompts and loops

In [55]:
prompt_4 = prompt_raw.replace('TWEETS', text_gpt_2014_04_pos)

In [56]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_4}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [57]:
response.choices[0].message.content

'The sentences discuss vaccination campaigns in Brazil, focusing on the importance of vaccines for children, pregnant women, and specific diseases like HPV and influenza. Key details include the start dates for various vaccination campaigns, the groups targeted (such as children under 5 and young girls for HPV), and efforts to increase vaccination rates nationwide. The messages emphasize the safety and necessity of vaccinations in preventing serious diseases.'

In [58]:
#summary = []
#dataset = []

In [59]:
dataset.append("2014_04_positive")

In [60]:
summary.append(response.choices[0].message.content)

In [61]:
df_2014_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  4 non-null      object
 1   summary  4 non-null      object
dtypes: object(2)
memory usage: 192.0+ bytes


In [62]:
df_2014_pos.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_positive,The provided texts discuss the introduction of...
1,2014_02_positive,The texts discuss various vaccination campaign...
2,2014_03_positive,The text discusses the upcoming nationwide vac...
3,2014_04_positive,The sentences discuss vaccination campaigns in...


In [63]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdHeqSRKIqHpnYe2IexbvG5qgH7Y at 0x1d359bddae0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The sentences discuss vaccination campaigns in Brazil, focusing on the importance of vaccines for children, pregnant women, and specific diseases like HPV and influenza. Key details include the start dates for various vaccination campaigns, the groups targeted (such as children under 5 and young girls for HPV), and efforts to increase vaccination rates nationwide. The messages emphasize the safety and necessity of vaccinations in preventing serious diseases.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864210,
  "id": "chatcmpl-AUdHeqSRKIqHpnYe2IexbvG5qgH7Y",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_f8dd36be86",
  "usage": {
    "completion_tokens": 77,
    "c

## May

In [64]:
# Get monthly data
df_2014_05_pos = pd.read_csv('../data/local/df_pos_2014-05.csv').reset_index()
df_2014_05_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2155 entries, 0 to 2154
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2155 non-null   int64  
 1   LocationCode            2155 non-null   object 
 2   LocationName            2155 non-null   object 
 3   StatesCode              2155 non-null   object 
 4   ...4                    2155 non-null   int64  
 5   id                      2155 non-null   float64
 6   text                    2155 non-null   object 
 7   label                   2155 non-null   object 
 8   in_reply_to_status_id   106 non-null    float64
 9   in_reply_to_user_id     124 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       482 non-null    float64
 13  retweeted_status_id     482 non-null    float64
 14  created_at              2155 non-null   

### Representative sample based on week and location

In [65]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_05_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_05_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w18,Acre,1,0.000464
1,2014-w18,Alagoas,8,0.003712
2,2014-w18,Amapa,3,0.001392
3,2014-w18,Amazonas,8,0.003712
4,2014-w18,Bahia,32,0.014849
...,...,...,...,...
107,2014-w22,Roraima,1,0.000464
108,2014-w22,Santa Catarina,2,0.000928
109,2014-w22,Sao Paulo,35,0.016241
110,2014-w22,Sergipe,7,0.003248


In [66]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w18,Acre,1,0.000464,1
1,2014-w18,Alagoas,8,0.003712,9
2,2014-w18,Amapa,3,0.001392,3
3,2014-w18,Amazonas,8,0.003712,9
4,2014-w18,Bahia,32,0.014849,36
...,...,...,...,...,...
107,2014-w22,Roraima,1,0.000464,1
108,2014-w22,Santa Catarina,2,0.000928,2
109,2014-w22,Sao Paulo,35,0.016241,39
110,2014-w22,Sergipe,7,0.003248,7


In [67]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_05_pos_sample_size = pd.merge(df_2014_05_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [69]:
# Step 6: Perform the stratified sampling
#df_2014_05_pos_sample = df_2014_05_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_05_pos_sample = df_2014_05_pos
# Step 7: Check the result
print(f"Sampled {len(df_2014_05_pos_sample)} rows.")


Sampled 2155 rows.


In [70]:
text_gpt_2014_05_pos = '\n'.join(df_2014_05_pos_sample['text'].astype(str))

### Prompts and loops

In [71]:
prompt_5 = prompt_raw.replace('TWEETS', text_gpt_2014_05_pos)

In [72]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_5}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [73]:
response.choices[0].message.content

'The text discusses various vaccination campaigns in Brazil, focusing mainly on the influenza vaccine and the importance of immunization against diseases like HPV and measles. It highlights government efforts to extend vaccination deadlines and achieve vaccination targets, as well as the development of new vaccines for diseases like malaria and cancer. The public is encouraged to participate in vaccination drives to protect against illnesses and maintain community health.'

In [74]:
#summary = []
#dataset = []

In [75]:
dataset.append("2014_05_positive")

In [76]:
summary.append(response.choices[0].message.content)

In [77]:
df_2014_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  5 non-null      object
 1   summary  5 non-null      object
dtypes: object(2)
memory usage: 208.0+ bytes


In [78]:
df_2014_pos.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_positive,The provided texts discuss the introduction of...
1,2014_02_positive,The texts discuss various vaccination campaign...
2,2014_03_positive,The text discusses the upcoming nationwide vac...
3,2014_04_positive,The sentences discuss vaccination campaigns in...
4,2014_05_positive,The text discusses various vaccination campaig...


In [79]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdJGLmlFZioeCgaeFaRgf9sFhdAo at 0x1d35f2c2e00> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns in Brazil, focusing mainly on the influenza vaccine and the importance of immunization against diseases like HPV and measles. It highlights government efforts to extend vaccination deadlines and achieve vaccination targets, as well as the development of new vaccines for diseases like malaria and cancer. The public is encouraged to participate in vaccination drives to protect against illnesses and maintain community health.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864310,
  "id": "chatcmpl-AUdJGLmlFZioeCgaeFaRgf9sFhdAo",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens

## June

In [80]:
# Get monthly data
df_2014_06_pos = pd.read_csv('../data/local/df_pos_2014-06.csv').reset_index()
df_2014_06_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1272 entries, 0 to 1271
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1272 non-null   int64  
 1   LocationCode            1272 non-null   object 
 2   LocationName            1272 non-null   object 
 3   StatesCode              1272 non-null   object 
 4   ...4                    1272 non-null   int64  
 5   id                      1272 non-null   float64
 6   text                    1272 non-null   object 
 7   label                   1272 non-null   object 
 8   in_reply_to_status_id   89 non-null     float64
 9   in_reply_to_user_id     95 non-null     float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       211 non-null    float64
 13  retweeted_status_id     211 non-null    float64
 14  created_at              1272 non-null   

### Representative sample based on week and location

In [81]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_06_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_06_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w22,Acre,2,0.001572
1,2014-w22,Amapa,4,0.003145
2,2014-w22,Amazonas,8,0.006289
3,2014-w22,Bahia,15,0.011792
4,2014-w22,Ceara,4,0.003145
...,...,...,...,...
102,2014-w26,Roraima,3,0.002358
103,2014-w26,Santa Catarina,2,0.001572
104,2014-w26,Sao Paulo,55,0.043239
105,2014-w26,Sergipe,2,0.001572


In [82]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w22,Acre,2,0.001572,3
1,2014-w22,Amapa,4,0.003145,7
2,2014-w22,Amazonas,8,0.006289,15
3,2014-w22,Bahia,15,0.011792,28
4,2014-w22,Ceara,4,0.003145,7
...,...,...,...,...,...
102,2014-w26,Roraima,3,0.002358,5
103,2014-w26,Santa Catarina,2,0.001572,3
104,2014-w26,Sao Paulo,55,0.043239,105
105,2014-w26,Sergipe,2,0.001572,3


In [83]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_06_pos_sample_size = pd.merge(df_2014_06_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [84]:
# Step 6: Perform the stratified sampling
#df_2014_06_pos_sample = df_2014_06_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_06_pos_sample = df_2014_06_pos
# Step 7: Check the result
print(f"Sampled {len(df_2014_06_pos_sample)} rows.")


Sampled 1272 rows.


In [85]:
text_gpt_2014_06_pos = '\n'.join(df_2014_06_pos_sample['text'].astype(str))

### Prompts and loops

In [86]:
prompt_6 = prompt_raw.replace('TWEETS', text_gpt_2014_06_pos)

In [87]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_6}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [88]:
response.choices[0].message.content

'The text discusses various vaccination campaigns and initiatives in Brazil, emphasizing the importance of vaccinations against diseases like HPV, influenza, and coqueluche. It highlights statistics on coverage rates, ongoing campaigns, and the benefits of vaccination for both individuals and public health. The overall message promotes awareness and participation in vaccination programs.'

In [89]:
#summary = []
#dataset = []

In [90]:
dataset.append("2014_06_positive")

In [91]:
summary.append(response.choices[0].message.content)

In [92]:
df_2014_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  6 non-null      object
 1   summary  6 non-null      object
dtypes: object(2)
memory usage: 224.0+ bytes


In [93]:
df_2014_pos.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_positive,The provided texts discuss the introduction of...
1,2014_02_positive,The texts discuss various vaccination campaign...
2,2014_03_positive,The text discusses the upcoming nationwide vac...
3,2014_04_positive,The sentences discuss vaccination campaigns in...
4,2014_05_positive,The text discusses various vaccination campaig...
5,2014_06_positive,The text discusses various vaccination campaig...


In [94]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdJNz3GxL1cpDk5zuNruhrNRGkeQ at 0x1d35f3d8860> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns and initiatives in Brazil, emphasizing the importance of vaccinations against diseases like HPV, influenza, and coqueluche. It highlights statistics on coverage rates, ongoing campaigns, and the benefits of vaccination for both individuals and public health. The overall message promotes awareness and participation in vaccination programs.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864317,
  "id": "chatcmpl-AUdJNz3GxL1cpDk5zuNruhrNRGkeQ",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 61,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
     

## July

In [95]:
# Get monthly data
df_2014_07_pos = pd.read_csv('../data/local/df_pos_2014-07.csv').reset_index()
df_2014_07_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1901 entries, 0 to 1900
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1901 non-null   int64  
 1   LocationCode            1901 non-null   object 
 2   LocationName            1901 non-null   object 
 3   StatesCode              1901 non-null   object 
 4   ...4                    1901 non-null   int64  
 5   id                      1901 non-null   float64
 6   text                    1901 non-null   object 
 7   label                   1901 non-null   object 
 8   in_reply_to_status_id   80 non-null     float64
 9   in_reply_to_user_id     84 non-null     float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       420 non-null    float64
 13  retweeted_status_id     420 non-null    float64
 14  created_at              1901 non-null   

### Representative sample based on week and location

In [96]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_07_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_07_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w26,Bahia,8,0.004208
1,2014-w26,Ceara,2,0.001052
2,2014-w26,Distrito Federal,2,0.001052
3,2014-w26,Goias,3,0.001578
4,2014-w26,Maranhao,3,0.001578
...,...,...,...,...
123,2014-w31,Rio Grande Do Sul,12,0.006312
124,2014-w31,Roraima,2,0.001052
125,2014-w31,Santa Catarina,9,0.004734
126,2014-w31,Sao Paulo,44,0.023146


In [97]:
# Step 3: Define the target sample size
#target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
#group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
#group_sizes
# Include all samples since there are not many tweets

In [98]:
# Step 5: Merge this sample size information back to the original DataFrame
#df_2014_07_pos_sample_size = pd.merge(df_2014_07_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
 #                              on=['year_week', 'LocationName'], how='left')

In [99]:
# Step 6: Perform the stratified sampling
#df_2014_07_pos_sample = df_2014_07_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
#    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_07_pos_sample = df_2014_07_pos
# Step 7: Check the result
print(f"Sampled {len(df_2014_07_pos_sample)} rows.")


Sampled 1901 rows.


In [100]:
text_gpt_2014_07_pos = '\n'.join(df_2014_07_pos_sample['text'].astype(str))

### Prompts and loops

In [101]:
prompt_7 = prompt_raw.replace('TWEETS', text_gpt_2014_07_pos)

In [102]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_7}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [103]:
response.choices[0].message.content

'The text discusses various vaccination campaigns in Brazil, highlighting the effectiveness of a dengue vaccine showing 88% efficacy in hemorrhagic cases. It notes the inclusion of the hepatitis A vaccine in the national immunization schedule, with plans to reach children aged 1-2 years. Additionally, it mentions significant vaccination rates against influenza and the need for public awareness regarding hepatitis vaccinations.'

In [104]:
#summary = []
#dataset = []

In [105]:
dataset.append("2014_07_positive")

In [106]:
summary.append(response.choices[0].message.content)

In [107]:
df_2014_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  7 non-null      object
 1   summary  7 non-null      object
dtypes: object(2)
memory usage: 240.0+ bytes


In [108]:
df_2014_pos.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_positive,The provided texts discuss the introduction of...
1,2014_02_positive,The texts discuss various vaccination campaign...
2,2014_03_positive,The text discusses the upcoming nationwide vac...
3,2014_04_positive,The sentences discuss vaccination campaigns in...
4,2014_05_positive,The text discusses various vaccination campaig...
5,2014_06_positive,The text discusses various vaccination campaig...
6,2014_07_positive,The text discusses various vaccination campaig...


In [109]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdJVgQYAD7B9gArlS4P8X6vtZVbN at 0x1d35f27ee50> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns in Brazil, highlighting the effectiveness of a dengue vaccine showing 88% efficacy in hemorrhagic cases. It notes the inclusion of the hepatitis A vaccine in the national immunization schedule, with plans to reach children aged 1-2 years. Additionally, it mentions significant vaccination rates against influenza and the need for public awareness regarding hepatitis vaccinations.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864325,
  "id": "chatcmpl-AUdJVgQYAD7B9gArlS4P8X6vtZVbN",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 74,
    "completion_tokens_details": {
    

## August

In [110]:
# Get monthly data
df_2014_08_pos = pd.read_csv('../data/local/df_pos_2014-08.csv').reset_index()
df_2014_08_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1653 entries, 0 to 1652
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1653 non-null   int64  
 1   LocationCode            1653 non-null   object 
 2   LocationName            1653 non-null   object 
 3   StatesCode              1653 non-null   object 
 4   ...4                    1653 non-null   int64  
 5   id                      1653 non-null   float64
 6   text                    1653 non-null   object 
 7   label                   1653 non-null   object 
 8   in_reply_to_status_id   69 non-null     float64
 9   in_reply_to_user_id     138 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       385 non-null    float64
 13  retweeted_status_id     385 non-null    float64
 14  created_at              1653 non-null   

### Representative sample based on week and location

In [111]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_08_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_08_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w31,Alagoas,1,0.000605
1,2014-w31,Amapa,3,0.001815
2,2014-w31,Amazonas,13,0.007864
3,2014-w31,Bahia,38,0.022989
4,2014-w31,Ceara,6,0.003630
...,...,...,...,...
105,2014-w35,Roraima,2,0.001210
106,2014-w35,Santa Catarina,6,0.003630
107,2014-w35,Sao Paulo,69,0.041742
108,2014-w35,Sergipe,3,0.001815


In [112]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w31,Alagoas,1,0.000605,1
1,2014-w31,Amapa,3,0.001815,4
2,2014-w31,Amazonas,13,0.007864,19
3,2014-w31,Bahia,38,0.022989,56
4,2014-w31,Ceara,6,0.003630,8
...,...,...,...,...,...
105,2014-w35,Roraima,2,0.001210,2
106,2014-w35,Santa Catarina,6,0.003630,8
107,2014-w35,Sao Paulo,69,0.041742,102
108,2014-w35,Sergipe,3,0.001815,4


In [113]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_08_pos_sample_size = pd.merge(df_2014_08_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [114]:
# Step 6: Perform the stratified sampling
#df_2014_08_pos_sample = df_2014_08_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_08_pos_sample = df_2014_08_pos
# Step 7: Check the result
print(f"Sampled {len(df_2014_08_pos_sample)} rows.")


Sampled 1653 rows.


In [115]:
text_gpt_2014_08_pos = '\n'.join(df_2014_08_pos_sample['text'].astype(str))

### Prompts and loops

In [116]:
prompt_8 = prompt_raw.replace('TWEETS', text_gpt_2014_08_pos)

In [117]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_8}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [118]:
response.choices[0].message.content

'The text covers various announcements regarding vaccines for different diseases, including dengue, rabies, HPV, hepatitis A, and Ebola. It highlights the introduction of new vaccines, the start of vaccination campaigns in multiple states, and the importance of vaccinating children and pets. Additionally, it discusses ongoing vaccine trials for Ebola and chikungunya, emphasizing public health initiatives and the necessity of vaccinations to prevent disease spread.'

In [119]:
#summary = []
#dataset = []

In [120]:
dataset.append("2014_08_positive")

In [121]:
summary.append(response.choices[0].message.content)

In [122]:
df_2014_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  8 non-null      object
 1   summary  8 non-null      object
dtypes: object(2)
memory usage: 256.0+ bytes


In [123]:
df_2014_pos.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_positive,The provided texts discuss the introduction of...
1,2014_02_positive,The texts discuss various vaccination campaign...
2,2014_03_positive,The text discusses the upcoming nationwide vac...
3,2014_04_positive,The sentences discuss vaccination campaigns in...
4,2014_05_positive,The text discusses various vaccination campaig...
5,2014_06_positive,The text discusses various vaccination campaig...
6,2014_07_positive,The text discusses various vaccination campaig...
7,2014_08_positive,The text covers various announcements regardin...


In [124]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdJZSkGYp4aZXIIio9xnPxHvP2tn at 0x1d35f66f810> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text covers various announcements regarding vaccines for different diseases, including dengue, rabies, HPV, hepatitis A, and Ebola. It highlights the introduction of new vaccines, the start of vaccination campaigns in multiple states, and the importance of vaccinating children and pets. Additionally, it discusses ongoing vaccine trials for Ebola and chikungunya, emphasizing public health initiatives and the necessity of vaccinations to prevent disease spread.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864329,
  "id": "chatcmpl-AUdJZSkGYp4aZXIIio9xnPxHvP2tn",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_9b78b61c52",
  "usage": {
    "completion_tokens": 80,
 

## September

In [125]:
# Get monthly data
df_2014_09_pos = pd.read_csv('../data/local/df_pos_2014-09.csv').reset_index()
df_2014_09_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2535 entries, 0 to 2534
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2535 non-null   int64  
 1   LocationCode            2535 non-null   object 
 2   LocationName            2535 non-null   object 
 3   StatesCode              2535 non-null   object 
 4   ...4                    2535 non-null   int64  
 5   id                      2535 non-null   float64
 6   text                    2535 non-null   object 
 7   label                   2535 non-null   object 
 8   in_reply_to_status_id   140 non-null    float64
 9   in_reply_to_user_id     158 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       571 non-null    float64
 13  retweeted_status_id     571 non-null    float64
 14  created_at              2535 non-null   

### Representative sample based on week and location

In [126]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_09_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_09_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w35,Acre,2,0.000789
1,2014-w35,Alagoas,5,0.001972
2,2014-w35,Amapa,2,0.000789
3,2014-w35,Amazonas,10,0.003945
4,2014-w35,Bahia,29,0.011440
...,...,...,...,...
110,2014-w39,Rio Grande Do Sul,27,0.010651
111,2014-w39,Roraima,5,0.001972
112,2014-w39,Santa Catarina,8,0.003156
113,2014-w39,Sao Paulo,45,0.017751


In [127]:
# Step 3: Define the target sample size
#target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
#group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
#group_sizes
# Taking all tweets since there were not enough for sampling

In [128]:
# Step 5: Merge this sample size information back to the original DataFrame
#df_2014_09_pos_sample_size = pd.merge(df_2014_09_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
 #                              on=['year_week', 'LocationName'], how='left')

In [129]:
# Step 6: Perform the stratified sampling
#df_2014_09_pos_sample = df_2014_09_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_09_pos_sample = df_2014_09_pos
# Step 7: Check the result
print(f"Sampled {len(df_2014_09_pos_sample)} rows.")


Sampled 2535 rows.


In [130]:
text_gpt_2014_09_pos = '\n'.join(df_2014_09_pos_sample['text'].astype(str))

### Prompts and loops

In [131]:
prompt_9 = prompt_raw.replace('TWEETS', text_gpt_2014_09_pos)

In [132]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_9}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [133]:
response.choices[0].message.content

'The texts discuss various vaccination campaigns and their significance, including the availability of HPV vaccines for girls aged 11-13, the effectiveness of the dengue vaccine showing 60.8% efficacy, and the ongoing efforts to develop and test vaccines against Ebola. They emphasize the importance of timely vaccinations for both humans and pets, and the role of vaccines in preventing diseases. Additionally, there are mentions of new vaccines for hepatitis A and meningitis, and the need for public health initiatives to encourage vaccination.'

In [134]:
#summary = []
#dataset = []

In [135]:
dataset.append("2014_09_positive")

In [136]:
summary.append(response.choices[0].message.content)

In [137]:
df_2014_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  9 non-null      object
 1   summary  9 non-null      object
dtypes: object(2)
memory usage: 272.0+ bytes


In [138]:
df_2014_pos.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_positive,The provided texts discuss the introduction of...
1,2014_02_positive,The texts discuss various vaccination campaign...
2,2014_03_positive,The text discusses the upcoming nationwide vac...
3,2014_04_positive,The sentences discuss vaccination campaigns in...
4,2014_05_positive,The text discusses various vaccination campaig...
5,2014_06_positive,The text discusses various vaccination campaig...
6,2014_07_positive,The text discusses various vaccination campaig...
7,2014_08_positive,The text covers various announcements regardin...
8,2014_09_positive,The texts discuss various vaccination campaign...


In [139]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdJgmSkXBE4agGmF0QiXsIFxzK1I at 0x1d35f828860> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The texts discuss various vaccination campaigns and their significance, including the availability of HPV vaccines for girls aged 11-13, the effectiveness of the dengue vaccine showing 60.8% efficacy, and the ongoing efforts to develop and test vaccines against Ebola. They emphasize the importance of timely vaccinations for both humans and pets, and the role of vaccines in preventing diseases. Additionally, there are mentions of new vaccines for hepatitis A and meningitis, and the need for public health initiatives to encourage vaccination.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864336,
  "id": "chatcmpl-AUdJgmSkXBE4agGmF0QiXsIFxzK1I",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "sy

## October

In [140]:
# Get monthly data
df_2014_10_pos = pd.read_csv('../data/local/df_pos_2014-10.csv').reset_index()
df_2014_10_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2330 entries, 0 to 2329
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2330 non-null   int64  
 1   LocationCode            2330 non-null   object 
 2   LocationName            2330 non-null   object 
 3   StatesCode              2330 non-null   object 
 4   ...4                    2330 non-null   int64  
 5   id                      2330 non-null   float64
 6   text                    2330 non-null   object 
 7   label                   2330 non-null   object 
 8   in_reply_to_status_id   141 non-null    float64
 9   in_reply_to_user_id     164 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       840 non-null    float64
 13  retweeted_status_id     840 non-null    float64
 14  created_at              2330 non-null   

### Representative sample based on week and location

In [141]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_10_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_10_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w40,Alagoas,2,0.000858
1,2014-w40,Amapa,2,0.000858
2,2014-w40,Amazonas,5,0.002146
3,2014-w40,Bahia,11,0.004721
4,2014-w40,Ceara,7,0.003004
...,...,...,...,...
101,2014-w44,Rondonia,2,0.000858
102,2014-w44,Santa Catarina,5,0.002146
103,2014-w44,Sao Paulo,54,0.023176
104,2014-w44,Sergipe,1,0.000429


In [142]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w40,Alagoas,2,0.000858,2
1,2014-w40,Amapa,2,0.000858,2
2,2014-w40,Amazonas,5,0.002146,5
3,2014-w40,Bahia,11,0.004721,11
4,2014-w40,Ceara,7,0.003004,7
...,...,...,...,...,...
101,2014-w44,Rondonia,2,0.000858,2
102,2014-w44,Santa Catarina,5,0.002146,5
103,2014-w44,Sao Paulo,54,0.023176,56
104,2014-w44,Sergipe,1,0.000429,1


In [143]:
# Step 5: Merge this sample size information back to the original DataFrame
#df_2014_10_pos_sample_size = pd.merge(df_2014_10_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
 #                              on=['year_week', 'LocationName'], how='left')
# Taking all tweets for not having enough tweets for sampling

In [144]:
# Step 6: Perform the stratified sampling
#df_2014_10_pos_sample = df_2014_10_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_10_pos_sample = df_2014_10_pos
# Step 7: Check the result
print(f"Sampled {len(df_2014_10_pos_sample)} rows.")


Sampled 2330 rows.


In [145]:
text_gpt_2014_10_pos = '\n'.join(df_2014_10_pos_sample['text'].astype(str))

### Prompts and loops

In [146]:
prompt_10 = prompt_raw.replace('TWEETS', text_gpt_2014_10_pos)

In [147]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_10}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [148]:
response.choices[0].message.content

'The text discusses various vaccination campaigns and initiatives in Brazil, including vaccinations against polio, measles, HPV, and rabies. It highlights the urgency and importance of these vaccinations, especially in light of potential epidemics like Ebola. It also mentions the testing of an Ebola vaccine in Canada and Switzerland, and emphasizes the role of vaccinations in public health and disease prevention.'

In [149]:
#summary = []
#dataset = []

In [150]:
dataset.append("2014_10_positive")

In [151]:
summary.append(response.choices[0].message.content)

In [152]:
df_2014_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  10 non-null     object
 1   summary  10 non-null     object
dtypes: object(2)
memory usage: 288.0+ bytes


In [153]:
df_2014_pos.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_positive,The provided texts discuss the introduction of...
1,2014_02_positive,The texts discuss various vaccination campaign...
2,2014_03_positive,The text discusses the upcoming nationwide vac...
3,2014_04_positive,The sentences discuss vaccination campaigns in...
4,2014_05_positive,The text discusses various vaccination campaig...
5,2014_06_positive,The text discusses various vaccination campaig...
6,2014_07_positive,The text discusses various vaccination campaig...
7,2014_08_positive,The text covers various announcements regardin...
8,2014_09_positive,The texts discuss various vaccination campaign...
9,2014_10_positive,The text discusses various vaccination campaig...


In [154]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdJuYLpRj5iByuK7yeapCUxZPHUt at 0x1d35f9d0590> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns and initiatives in Brazil, including vaccinations against polio, measles, HPV, and rabies. It highlights the urgency and importance of these vaccinations, especially in light of potential epidemics like Ebola. It also mentions the testing of an Ebola vaccine in Canada and Switzerland, and emphasizes the role of vaccinations in public health and disease prevention.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864350,
  "id": "chatcmpl-AUdJuYLpRj5iByuK7yeapCUxZPHUt",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_9b78b61c52",
  "usage": {
    "completion_tokens": 73,
    "completion_tokens_details": {
      "accepted_pr

## November

In [155]:
# Get monthly data
df_2014_11_pos = pd.read_csv('../data/local/df_pos_2014-11.csv').reset_index()
df_2014_11_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4173 entries, 0 to 4172
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   4173 non-null   int64  
 1   LocationCode            4173 non-null   object 
 2   LocationName            4173 non-null   object 
 3   StatesCode              4173 non-null   object 
 4   ...4                    4173 non-null   int64  
 5   id                      4173 non-null   float64
 6   text                    4173 non-null   object 
 7   label                   4173 non-null   object 
 8   in_reply_to_status_id   85 non-null     float64
 9   in_reply_to_user_id     99 non-null     float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       919 non-null    float64
 13  retweeted_status_id     919 non-null    float64
 14  created_at              4173 non-null   

### Representative sample based on week and location

In [156]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_11_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_11_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w44,Acre,4,0.000959
1,2014-w44,Alagoas,14,0.003355
2,2014-w44,Amapa,3,0.000719
3,2014-w44,Amazonas,10,0.002396
4,2014-w44,Bahia,62,0.014857
...,...,...,...,...
117,2014-w48,Roraima,7,0.001677
118,2014-w48,Santa Catarina,9,0.002157
119,2014-w48,Sao Paulo,80,0.019171
120,2014-w48,Sergipe,3,0.000719


In [157]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w44,Acre,4,0.000959,2
1,2014-w44,Alagoas,14,0.003355,8
2,2014-w44,Amapa,3,0.000719,1
3,2014-w44,Amazonas,10,0.002396,5
4,2014-w44,Bahia,62,0.014857,36
...,...,...,...,...,...
117,2014-w48,Roraima,7,0.001677,4
118,2014-w48,Santa Catarina,9,0.002157,5
119,2014-w48,Sao Paulo,80,0.019171,46
120,2014-w48,Sergipe,3,0.000719,1


In [158]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_11_pos_sample_size = pd.merge(df_2014_11_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [159]:
# Step 6: Perform the stratified sampling
#df_2014_11_pos_sample = df_2014_11_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_11_pos_sample = df_2014_11_pos
# Step 7: Check the result
print(f"Sampled {len(df_2014_11_pos_sample)} rows.")


Sampled 4173 rows.


In [160]:
text_gpt_2014_11_pos = '\n'.join(df_2014_11_pos_sample['text'].astype(str))

### Prompts and loops

In [161]:
prompt_11 = prompt_raw.replace('TWEETS', text_gpt_2014_11_pos)

In [162]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_11}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [163]:
response.choices[0].message.content

'The provided sentences in Portuguese discuss various vaccination campaigns in Brazil, focusing on the importance of keeping vaccinations up to date for children, pregnant women, and pets. Key topics include the launch of vaccines for diseases like dengue, polio, measles, and whooping cough, with emphasis on specific campaigns and deadlines for immunization. Additionally, advancements in vaccine development, including one against Ebola, and the introduction of health apps to track vaccination schedules are highlighted.'

In [164]:
#summary = []
#dataset = []

In [165]:
dataset.append("2014_11_positive")

In [166]:
summary.append(response.choices[0].message.content)

In [167]:
df_2014_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  11 non-null     object
 1   summary  11 non-null     object
dtypes: object(2)
memory usage: 304.0+ bytes


In [168]:
df_2014_pos.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_positive,The provided texts discuss the introduction of...
1,2014_02_positive,The texts discuss various vaccination campaign...
2,2014_03_positive,The text discusses the upcoming nationwide vac...
3,2014_04_positive,The sentences discuss vaccination campaigns in...
4,2014_05_positive,The text discusses various vaccination campaig...
5,2014_06_positive,The text discusses various vaccination campaig...
6,2014_07_positive,The text discusses various vaccination campaig...
7,2014_08_positive,The text covers various announcements regardin...
8,2014_09_positive,The texts discuss various vaccination campaign...
9,2014_10_positive,The text discusses various vaccination campaig...


In [169]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdK2J89AuxYc1ZId3DE3tm3UcgEg at 0x1d35fc54310> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The provided sentences in Portuguese discuss various vaccination campaigns in Brazil, focusing on the importance of keeping vaccinations up to date for children, pregnant women, and pets. Key topics include the launch of vaccines for diseases like dengue, polio, measles, and whooping cough, with emphasis on specific campaigns and deadlines for immunization. Additionally, advancements in vaccine development, including one against Ebola, and the introduction of health apps to track vaccination schedules are highlighted.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864358,
  "id": "chatcmpl-AUdK2J89AuxYc1ZId3DE3tm3UcgEg",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_

## December

In [170]:
# Get monthly data
df_2014_12_pos = pd.read_csv('../data/local/df_pos_2014-12.csv').reset_index()
df_2014_12_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1160 entries, 0 to 1159
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1160 non-null   int64  
 1   LocationCode            1160 non-null   object 
 2   LocationName            1160 non-null   object 
 3   StatesCode              1160 non-null   object 
 4   ...4                    1160 non-null   int64  
 5   id                      1160 non-null   float64
 6   text                    1160 non-null   object 
 7   label                   1160 non-null   object 
 8   in_reply_to_status_id   58 non-null     float64
 9   in_reply_to_user_id     65 non-null     float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       203 non-null    float64
 13  retweeted_status_id     203 non-null    float64
 14  created_at              1160 non-null   

### Representative sample based on week and location

In [171]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_12_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_12_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w48,Alagoas,1,0.000862
1,2014-w48,Amazonas,3,0.002586
2,2014-w48,Bahia,4,0.003448
3,2014-w48,Ceara,9,0.007759
4,2014-w48,Distrito Federal,2,0.001724
...,...,...,...,...
100,2014-w53,Mato Grosso,1,0.000862
101,2014-w53,Minas Gerais,1,0.000862
102,2014-w53,Piaui,1,0.000862
103,2014-w53,Rio Grande Do Sul,2,0.001724


In [172]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w48,Alagoas,1,0.000862,2
1,2014-w48,Amazonas,3,0.002586,6
2,2014-w48,Bahia,4,0.003448,8
3,2014-w48,Ceara,9,0.007759,19
4,2014-w48,Distrito Federal,2,0.001724,4
...,...,...,...,...,...
100,2014-w53,Mato Grosso,1,0.000862,2
101,2014-w53,Minas Gerais,1,0.000862,2
102,2014-w53,Piaui,1,0.000862,2
103,2014-w53,Rio Grande Do Sul,2,0.001724,4


In [173]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_12_pos_sample_size = pd.merge(df_2014_12_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [174]:
# Step 6: Perform the stratified sampling
#df_2014_12_pos_sample = df_2014_12_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_12_pos_sample = df_2014_12_pos
# Step 7: Check the result
print(f"Sampled {len(df_2014_12_pos_sample)} rows.")


Sampled 1160 rows.


In [175]:
text_gpt_2014_12_pos = '\n'.join(df_2014_12_pos_sample['text'].astype(str))

### Prompts and loops

In [176]:
prompt_12 = prompt_raw.replace('TWEETS', text_gpt_2014_12_pos)

In [177]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_12}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [178]:
response.choices[0].message.content

'The text discusses various vaccination campaigns in Brazil, focusing on the vaccination against polio, measles, and other diseases such as dengue and HPV. It highlights the importance of public health initiatives, with many cities achieving significant coverage rates. Additionally, it mentions the development and testing of vaccines for diseases like Ebola and cancer, emphasizing the necessity of staying updated with vaccinations for children and pets.'

In [179]:
#summary = []
#dataset = []

In [180]:
dataset.append("2014_12_positive")

In [181]:
summary.append(response.choices[0].message.content)

In [182]:
df_2014_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  12 non-null     object
 1   summary  12 non-null     object
dtypes: object(2)
memory usage: 320.0+ bytes


In [183]:
df_2014_pos.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_positive,The provided texts discuss the introduction of...
1,2014_02_positive,The texts discuss various vaccination campaign...
2,2014_03_positive,The text discusses the upcoming nationwide vac...
3,2014_04_positive,The sentences discuss vaccination campaigns in...
4,2014_05_positive,The text discusses various vaccination campaig...
5,2014_06_positive,The text discusses various vaccination campaig...
6,2014_07_positive,The text discusses various vaccination campaig...
7,2014_08_positive,The text covers various announcements regardin...
8,2014_09_positive,The texts discuss various vaccination campaign...
9,2014_10_positive,The text discusses various vaccination campaig...


In [184]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdKC9QGY6CjXjILO1CR4ZSCQzVfP at 0x1d35fe59040> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns in Brazil, focusing on the vaccination against polio, measles, and other diseases such as dengue and HPV. It highlights the importance of public health initiatives, with many cities achieving significant coverage rates. Additionally, it mentions the development and testing of vaccines for diseases like Ebola and cancer, emphasizing the necessity of staying updated with vaccinations for children and pets.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864368,
  "id": "chatcmpl-AUdKC9QGY6CjXjILO1CR4ZSCQzVfP",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 76,
    "complet

## All 2014 positive

In [185]:
df_2014_pos.to_csv('../data/summary_2014_positive_tweets.csv')