# Import libraries and data

In [1]:
# Import libraries
import os
import openai
import pandas as pd
import numpy as np
import openpyxl

In [2]:
# Get OpenAI API secret key (change this file with your own OpenAI key)
open_ai_key = open('../data/local/openai_key.txt', 'r').read()
openai.api_key = open_ai_key
#openai.Model.list()

In [3]:
# Prompt for GPT
prompt_raw = "I would like to summarise in maximum 300 characters and in English what the following sentences in Portuguese talk about. \
Do not use aspects mentioned only in one sentence as part of the summary: TWEETS"

# 2015 positive

## January

In [4]:
# Get monthly data
df_2015_01_pos = pd.read_csv('../data/local/df_pos_2015-01.csv').reset_index()
df_2015_01_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1047 entries, 0 to 1046
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1047 non-null   int64  
 1   LocationCode            1047 non-null   object 
 2   LocationName            1047 non-null   object 
 3   StatesCode              1047 non-null   object 
 4   ...4                    1047 non-null   int64  
 5   id                      1047 non-null   float64
 6   text                    1047 non-null   object 
 7   label                   1047 non-null   object 
 8   in_reply_to_status_id   58 non-null     float64
 9   in_reply_to_user_id     60 non-null     float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       276 non-null    float64
 13  retweeted_status_id     276 non-null    float64
 14  created_at              1047 non-null   

### Representative sample based on week and location

In [5]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_01_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_01_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2015-w01,Alagoas,3,0.002865
1,2015-w01,Amazonas,3,0.002865
2,2015-w01,Bahia,5,0.004776
3,2015-w01,Ceara,7,0.006686
4,2015-w01,Distrito Federal,5,0.004776
...,...,...,...,...
96,2015-w05,Rio Grande Do Sul,9,0.008596
97,2015-w05,Rondonia,1,0.000955
98,2015-w05,Roraima,1,0.000955
99,2015-w05,Santa Catarina,1,0.000955


In [6]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2015-w01,Alagoas,3,0.002865,7
1,2015-w01,Amazonas,3,0.002865,7
2,2015-w01,Bahia,5,0.004776,11
3,2015-w01,Ceara,7,0.006686,16
4,2015-w01,Distrito Federal,5,0.004776,11
...,...,...,...,...,...
96,2015-w05,Rio Grande Do Sul,9,0.008596,21
97,2015-w05,Rondonia,1,0.000955,2
98,2015-w05,Roraima,1,0.000955,2
99,2015-w05,Santa Catarina,1,0.000955,2


In [7]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_01_pos_sample_size = pd.merge(df_2015_01_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [8]:
# Step 6: Perform the stratified sampling
#df_2015_01_pos_sample = df_2015_01_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2015_01_pos_sample = df_2015_01_pos
# Step 7: Check the result
print(f"Sampled {len(df_2015_01_pos_sample)} rows.")


Sampled 1047 rows.


In [9]:
text_gpt_2015_01_pos = '\n'.join(df_2015_01_pos_sample['text'].astype(str))

### Prompts and loops

In [10]:
prompt_1 = prompt_raw.replace('TWEETS', text_gpt_2015_01_pos)

In [11]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_1}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [12]:
response.choices[0].message.content

"Recent developments include Johnson & Johnson's initiation of human trials for an Ebola vaccine, new Israeli cancer vaccines showing promising results for 90% of cancer types, and ongoing vaccination campaigns for HPV targeted at girls aged 11 to 13. Additionally, the FDA has approved a meningitis vaccine, enhancing public health efforts."

In [13]:
summary = []
dataset = []

In [14]:
dataset.append("2015_01_positive")

In [15]:
summary.append(response.choices[0].message.content)

In [16]:
df_2015_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  1 non-null      object
 1   summary  1 non-null      object
dtypes: object(2)
memory usage: 144.0+ bytes


In [17]:
df_2015_pos.tail(14)

Unnamed: 0,dataset,summary
0,2015_01_positive,Recent developments include Johnson & Johnson'...


In [18]:
response

<OpenAIObject chat.completion id=chatcmpl-AUd7c3p23gk1KIlzwunnVDbUJOCHd at 0x21d82a2aa90> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Recent developments include Johnson & Johnson's initiation of human trials for an Ebola vaccine, new Israeli cancer vaccines showing promising results for 90% of cancer types, and ongoing vaccination campaigns for HPV targeted at girls aged 11 to 13. Additionally, the FDA has approved a meningitis vaccine, enhancing public health efforts.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731863588,
  "id": "chatcmpl-AUd7c3p23gk1KIlzwunnVDbUJOCHd",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 63,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "reasoning_tokens": 0,


## February

In [19]:
# Get monthly data
df_2015_02_pos = pd.read_csv('../data/local/df_pos_2015-02.csv').reset_index()
df_2015_02_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1394 entries, 0 to 1393
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1394 non-null   int64  
 1   LocationCode            1394 non-null   object 
 2   LocationName            1394 non-null   object 
 3   StatesCode              1394 non-null   object 
 4   ...4                    1394 non-null   int64  
 5   id                      1394 non-null   float64
 6   text                    1394 non-null   object 
 7   label                   1394 non-null   object 
 8   in_reply_to_status_id   58 non-null     float64
 9   in_reply_to_user_id     67 non-null     float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       516 non-null    float64
 13  retweeted_status_id     516 non-null    float64
 14  created_at              1394 non-null   

### Representative sample based on week and location

In [20]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_02_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_02_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2015-w05,Alagoas,2,0.001435
1,2015-w05,Amapa,1,0.000717
2,2015-w05,Amazonas,9,0.006456
3,2015-w05,Bahia,19,0.013630
4,2015-w05,Ceara,10,0.007174
...,...,...,...,...
100,2015-w09,Rio Grande Do Sul,16,0.011478
101,2015-w09,Roraima,3,0.002152
102,2015-w09,Santa Catarina,4,0.002869
103,2015-w09,Sao Paulo,20,0.014347


In [21]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2015-w05,Alagoas,2,0.001435,3
1,2015-w05,Amapa,1,0.000717,1
2,2015-w05,Amazonas,9,0.006456,15
3,2015-w05,Bahia,19,0.013630,33
4,2015-w05,Ceara,10,0.007174,17
...,...,...,...,...,...
100,2015-w09,Rio Grande Do Sul,16,0.011478,28
101,2015-w09,Roraima,3,0.002152,5
102,2015-w09,Santa Catarina,4,0.002869,7
103,2015-w09,Sao Paulo,20,0.014347,35


In [22]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_02_pos_sample_size = pd.merge(df_2015_02_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [24]:
# Step 6: Perform the stratified sampling
#df_2015_02_pos_sample = df_2015_02_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2015_02_pos_sample = df_2015_02_pos
# Step 7: Check the result
print(f"Sampled {len(df_2015_02_pos_sample)} rows.")


Sampled 1394 rows.


In [25]:
text_gpt_2015_02_pos = '\n'.join(df_2015_02_pos_sample['text'].astype(str))

### Prompts and loops

In [26]:
prompt_2 = prompt_raw.replace('TWEETS', text_gpt_2015_02_pos)

In [27]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_2}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [28]:
response.choices[0].message.content

'The provided Portuguese sentences discuss various aspects of vaccination, highlighting its importance in preventing diseases like HPV, measles, yellow fever, and Chagas. Research indicates that vaccines do not encourage risky sexual behavior or other negative outcomes. New vaccines are being developed for HIV and other diseases. The text emphasizes the significance of vaccination in public health, especially amid outbreaks. Additionally, it mentions the role of organizations like WHO in promoting vaccination campaigns.'

In [29]:
#summary = []
#dataset = []

In [30]:
dataset.append("2015_02_positive")

In [31]:
summary.append(response.choices[0].message.content)

In [32]:
df_2015_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  2 non-null      object
 1   summary  2 non-null      object
dtypes: object(2)
memory usage: 160.0+ bytes


In [33]:
df_2015_pos.tail(14)

Unnamed: 0,dataset,summary
0,2015_01_positive,Recent developments include Johnson & Johnson'...
1,2015_02_positive,The provided Portuguese sentences discuss vari...


In [34]:
response

<OpenAIObject chat.completion id=chatcmpl-AUd8adoBEhNQegzgywzmalb3mnM3m at 0x21d87010db0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The provided Portuguese sentences discuss various aspects of vaccination, highlighting its importance in preventing diseases like HPV, measles, yellow fever, and Chagas. Research indicates that vaccines do not encourage risky sexual behavior or other negative outcomes. New vaccines are being developed for HIV and other diseases. The text emphasizes the significance of vaccination in public health, especially amid outbreaks. Additionally, it mentions the role of organizations like WHO in promoting vaccination campaigns.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731863648,
  "id": "chatcmpl-AUd8adoBEhNQegzgywzmalb3mnM3m",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp

## March

In [35]:
# Get monthly data
df_2015_03_pos = pd.read_csv('../data/local/df_pos_2015-03.csv').reset_index()
df_2015_03_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2370 entries, 0 to 2369
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2370 non-null   int64  
 1   LocationCode            2370 non-null   object 
 2   LocationName            2370 non-null   object 
 3   StatesCode              2370 non-null   object 
 4   ...4                    2370 non-null   int64  
 5   id                      2370 non-null   float64
 6   text                    2370 non-null   object 
 7   label                   2370 non-null   object 
 8   in_reply_to_status_id   79 non-null     float64
 9   in_reply_to_user_id     86 non-null     float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       554 non-null    float64
 13  retweeted_status_id     554 non-null    float64
 14  created_at              2370 non-null   

### Representative sample based on week and location

In [36]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_03_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_03_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2015-w09,Alagoas,1,0.000422
1,2015-w09,Amapa,19,0.008017
2,2015-w09,Amazonas,3,0.001266
3,2015-w09,Bahia,25,0.010549
4,2015-w09,Ceara,7,0.002954
...,...,...,...,...
111,2015-w13,Rondonia,1,0.000422
112,2015-w13,Roraima,2,0.000844
113,2015-w13,Santa Catarina,5,0.002110
114,2015-w13,Sao Paulo,93,0.039241


In [37]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2015-w09,Alagoas,1,0.000422,1
1,2015-w09,Amapa,19,0.008017,19
2,2015-w09,Amazonas,3,0.001266,3
3,2015-w09,Bahia,25,0.010549,25
4,2015-w09,Ceara,7,0.002954,7
...,...,...,...,...,...
111,2015-w13,Rondonia,1,0.000422,1
112,2015-w13,Roraima,2,0.000844,2
113,2015-w13,Santa Catarina,5,0.002110,5
114,2015-w13,Sao Paulo,93,0.039241,96


In [38]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_03_pos_sample_size = pd.merge(df_2015_03_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [40]:
# Step 6: Perform the stratified sampling
#df_2015_03_pos_sample = df_2015_03_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2015_03_pos_sample = df_2015_03_pos
# Step 7: Check the result
print(f"Sampled {len(df_2015_03_pos_sample)} rows.")


Sampled 2370 rows.


In [41]:
text_gpt_2015_03_pos = '\n'.join(df_2015_03_pos_sample['text'].astype(str))

### Prompts and loops

In [42]:
prompt_3 = prompt_raw.replace('TWEETS', text_gpt_2015_03_pos)

In [43]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_3}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [44]:
response.choices[0].message.content

'The provided texts primarily discuss various vaccination campaigns, particularly against HPV for girls aged 9 to 11, which aims to prevent cervical cancer. The texts emphasize the importance of vaccination in public health, including plans to vaccinate women with HIV and the expected availability of vaccines for dengue and other diseases. There are mentions of vaccination efforts across different regions and the efficacy of the HPV vaccine, which is noted for its high effectiveness.'

In [45]:
#summary = []
#dataset = []

In [46]:
dataset.append("2015_03_positive")

In [47]:
summary.append(response.choices[0].message.content)

In [48]:
df_2015_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  3 non-null      object
 1   summary  3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes


In [49]:
df_2015_pos.tail(14)

Unnamed: 0,dataset,summary
0,2015_01_positive,Recent developments include Johnson & Johnson'...
1,2015_02_positive,The provided Portuguese sentences discuss vari...
2,2015_03_positive,The provided texts primarily discuss various v...


In [50]:
response

<OpenAIObject chat.completion id=chatcmpl-AUd9uEBBXcSRoqkVayfr4Cc3O7FCl at 0x21d872c95e0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The provided texts primarily discuss various vaccination campaigns, particularly against HPV for girls aged 9 to 11, which aims to prevent cervical cancer. The texts emphasize the importance of vaccination in public health, including plans to vaccinate women with HIV and the expected availability of vaccines for dengue and other diseases. There are mentions of vaccination efforts across different regions and the efficacy of the HPV vaccine, which is noted for its high effectiveness.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731863730,
  "id": "chatcmpl-AUd9uEBBXcSRoqkVayfr4Cc3O7FCl",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "compl

## April

In [51]:
# Get monthly data
df_2015_04_pos = pd.read_csv('../data/local/df_pos_2015-04.csv').reset_index()
df_2015_04_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2005 entries, 0 to 2004
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2005 non-null   int64  
 1   LocationCode            2005 non-null   object 
 2   LocationName            2005 non-null   object 
 3   StatesCode              2005 non-null   object 
 4   ...4                    2005 non-null   int64  
 5   id                      2005 non-null   float64
 6   text                    2005 non-null   object 
 7   label                   2005 non-null   object 
 8   in_reply_to_status_id   53 non-null     float64
 9   in_reply_to_user_id     95 non-null     float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       535 non-null    float64
 13  retweeted_status_id     535 non-null    float64
 14  created_at              2005 non-null   

### Representative sample based on week and location

In [52]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_04_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_04_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2015-w13,Amapa,1,0.000499
1,2015-w13,Bahia,3,0.001496
2,2015-w13,Ceara,4,0.001995
3,2015-w13,Distrito Federal,4,0.001995
4,2015-w13,Goias,6,0.002993
...,...,...,...,...
121,2015-w18,Parana,5,0.002494
122,2015-w18,Pernambuco,5,0.002494
123,2015-w18,Rio Grande Do Sul,8,0.003990
124,2015-w18,Santa Catarina,3,0.001496


In [53]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2015-w13,Amapa,1,0.000499,1
1,2015-w13,Bahia,3,0.001496,3
2,2015-w13,Ceara,4,0.001995,4
3,2015-w13,Distrito Federal,4,0.001995,4
4,2015-w13,Goias,6,0.002993,7
...,...,...,...,...,...
121,2015-w18,Parana,5,0.002494,6
122,2015-w18,Pernambuco,5,0.002494,6
123,2015-w18,Rio Grande Do Sul,8,0.003990,9
124,2015-w18,Santa Catarina,3,0.001496,3


In [54]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_04_pos_sample_size = pd.merge(df_2015_04_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [56]:
# Step 6: Perform the stratified sampling
#df_2015_04_pos_sample = df_2015_04_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2015_04_pos_sample = df_2015_04_pos
# Step 7: Check the result
print(f"Sampled {len(df_2015_04_pos_sample)} rows.")


Sampled 2005 rows.


In [57]:
text_gpt_2015_04_pos = '\n'.join(df_2015_04_pos_sample['text'].astype(str))

### Prompts and loops

In [58]:
prompt_4 = prompt_raw.replace('TWEETS', text_gpt_2015_04_pos)

In [59]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_4}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [60]:
response.choices[0].message.content

'Recent studies highlight the efficacy and safety of various vaccines, including those against HPV and dengue, while dispelling myths linking vaccines to autism. Cuba has announced a new lung cancer vaccine and plans to export it to the US. A significant campaign for influenza vaccination is set to begin in Brazil, targeting millions.'

In [61]:
#summary = []
#dataset = []

In [62]:
dataset.append("2015_04_positive")

In [63]:
summary.append(response.choices[0].message.content)

In [64]:
df_2015_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  4 non-null      object
 1   summary  4 non-null      object
dtypes: object(2)
memory usage: 192.0+ bytes


In [65]:
df_2015_pos.tail(14)

Unnamed: 0,dataset,summary
0,2015_01_positive,Recent developments include Johnson & Johnson'...
1,2015_02_positive,The provided Portuguese sentences discuss vari...
2,2015_03_positive,The provided texts primarily discuss various v...
3,2015_04_positive,Recent studies highlight the efficacy and safe...


In [66]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdAwBgFReXMriy5u0N8rEXcRABvH at 0x21d8746ba90> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Recent studies highlight the efficacy and safety of various vaccines, including those against HPV and dengue, while dispelling myths linking vaccines to autism. Cuba has announced a new lung cancer vaccine and plans to export it to the US. A significant campaign for influenza vaccination is set to begin in Brazil, targeting millions.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731863794,
  "id": "chatcmpl-AUdAwBgFReXMriy5u0N8rEXcRABvH",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_9b78b61c52",
  "usage": {
    "completion_tokens": 60,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "reasoning_tokens": 0,
     

## May

In [67]:
# Get monthly data
df_2015_05_pos = pd.read_csv('../data/local/df_pos_2015-05.csv').reset_index()
df_2015_05_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4228 entries, 0 to 4227
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   4228 non-null   int64  
 1   LocationCode            4228 non-null   object 
 2   LocationName            4228 non-null   object 
 3   StatesCode              4228 non-null   object 
 4   ...4                    4228 non-null   int64  
 5   id                      4228 non-null   float64
 6   text                    4228 non-null   object 
 7   label                   4228 non-null   object 
 8   in_reply_to_status_id   136 non-null    float64
 9   in_reply_to_user_id     185 non-null    float64
 10  quoted_user_id          1 non-null      float64
 11  quoted_status_id        1 non-null      float64
 12  retweeted_user_id       1293 non-null   float64
 13  retweeted_status_id     1293 non-null   float64
 14  created_at              4228 non-null   

### Representative sample based on week and location

In [68]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_05_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_05_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2015-w18,Acre,3,0.000710
1,2015-w18,Alagoas,24,0.005676
2,2015-w18,Amapa,10,0.002365
3,2015-w18,Amazonas,18,0.004257
4,2015-w18,Bahia,55,0.013009
...,...,...,...,...
121,2015-w22,Roraima,1,0.000237
122,2015-w22,Santa Catarina,4,0.000946
123,2015-w22,Sao Paulo,75,0.017739
124,2015-w22,Sergipe,7,0.001656


In [69]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2015-w18,Acre,3,0.000710,1
1,2015-w18,Alagoas,24,0.005676,13
2,2015-w18,Amapa,10,0.002365,5
3,2015-w18,Amazonas,18,0.004257,10
4,2015-w18,Bahia,55,0.013009,31
...,...,...,...,...,...
121,2015-w22,Roraima,1,0.000237,0
122,2015-w22,Santa Catarina,4,0.000946,2
123,2015-w22,Sao Paulo,75,0.017739,43
124,2015-w22,Sergipe,7,0.001656,4


In [70]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_05_pos_sample_size = pd.merge(df_2015_05_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [71]:
# Step 6: Perform the stratified sampling
df_2015_05_pos_sample = df_2015_05_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2015_05_pos_sample)} rows.")


Sampled 2382 rows.


In [72]:
text_gpt_2015_05_pos = '\n'.join(df_2015_05_pos_sample['text'].astype(str))

### Prompts and loops

In [73]:
prompt_5 = prompt_raw.replace('TWEETS', text_gpt_2015_05_pos)

In [74]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_5}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [75]:
response.choices[0].message.content

'The provided sentences discuss various vaccination campaigns in Brazil and Nepal, focusing on the flu vaccine, the HPV vaccine, and a new meningitis vaccine. They highlight the importance of vaccination to prevent diseases, the targets for immunization, and the rollout of vaccines amid public health efforts. The campaigns aim to vaccinate vulnerable populations, including children, pregnant women, and the elderly. Additionally, there are mentions of a Cuban vaccine for lung cancer and ongoing research for vaccines against dengue and AIDS.'

In [76]:
#summary = []
#dataset = []

In [77]:
dataset.append("2015_05_positive")

In [78]:
summary.append(response.choices[0].message.content)

In [79]:
df_2015_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  5 non-null      object
 1   summary  5 non-null      object
dtypes: object(2)
memory usage: 208.0+ bytes


In [80]:
df_2015_pos.tail(14)

Unnamed: 0,dataset,summary
0,2015_01_positive,Recent developments include Johnson & Johnson'...
1,2015_02_positive,The provided Portuguese sentences discuss vari...
2,2015_03_positive,The provided texts primarily discuss various v...
3,2015_04_positive,Recent studies highlight the efficacy and safe...
4,2015_05_positive,The provided sentences discuss various vaccina...


In [81]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdB2wJGV4zhQXjEKqHfZiFfIJyKa at 0x21d87430f40> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The provided sentences discuss various vaccination campaigns in Brazil and Nepal, focusing on the flu vaccine, the HPV vaccine, and a new meningitis vaccine. They highlight the importance of vaccination to prevent diseases, the targets for immunization, and the rollout of vaccines amid public health efforts. The campaigns aim to vaccinate vulnerable populations, including children, pregnant women, and the elderly. Additionally, there are mentions of a Cuban vaccine for lung cancer and ongoing research for vaccines against dengue and AIDS.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731863800,
  "id": "chatcmpl-AUdB2wJGV4zhQXjEKqHfZiFfIJyKa",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "syst

## June

In [82]:
# Get monthly data
df_2015_06_pos = pd.read_csv('../data/local/df_pos_2015-06.csv').reset_index()
df_2015_06_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2054 entries, 0 to 2053
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2054 non-null   int64  
 1   LocationCode            2054 non-null   object 
 2   LocationName            2054 non-null   object 
 3   StatesCode              2054 non-null   object 
 4   ...4                    2054 non-null   int64  
 5   id                      2054 non-null   float64
 6   text                    2054 non-null   object 
 7   label                   2054 non-null   object 
 8   in_reply_to_status_id   73 non-null     float64
 9   in_reply_to_user_id     79 non-null     float64
 10  quoted_user_id          10 non-null     float64
 11  quoted_status_id        10 non-null     float64
 12  retweeted_user_id       398 non-null    float64
 13  retweeted_status_id     398 non-null    float64
 14  created_at              2054 non-null   

### Representative sample based on week and location

In [83]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_06_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_06_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2015-w22,Acre,1,0.000487
1,2015-w22,Alagoas,4,0.001947
2,2015-w22,Amapa,2,0.000974
3,2015-w22,Amazonas,2,0.000974
4,2015-w22,Bahia,14,0.006816
...,...,...,...,...
115,2015-w26,Roraima,1,0.000487
116,2015-w26,Santa Catarina,7,0.003408
117,2015-w26,Sao Paulo,58,0.028238
118,2015-w26,Sergipe,2,0.000974


In [84]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2015-w22,Acre,1,0.000487,1
1,2015-w22,Alagoas,4,0.001947,4
2,2015-w22,Amapa,2,0.000974,2
3,2015-w22,Amazonas,2,0.000974,2
4,2015-w22,Bahia,14,0.006816,16
...,...,...,...,...,...
115,2015-w26,Roraima,1,0.000487,1
116,2015-w26,Santa Catarina,7,0.003408,8
117,2015-w26,Sao Paulo,58,0.028238,69
118,2015-w26,Sergipe,2,0.000974,2


In [85]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_06_pos_sample_size = pd.merge(df_2015_06_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [87]:
# Step 6: Perform the stratified sampling
#df_2015_06_pos_sample = df_2015_06_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2015_06_pos_sample = df_2015_06_pos
# Step 7: Check the result
print(f"Sampled {len(df_2015_06_pos_sample)} rows.")


Sampled 2054 rows.


In [88]:
text_gpt_2015_06_pos = '\n'.join(df_2015_06_pos_sample['text'].astype(str))

### Prompts and loops

In [89]:
prompt_6 = prompt_raw.replace('TWEETS', text_gpt_2015_06_pos)

In [90]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_6}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [91]:
response.choices[0].message.content

'The text discusses the ongoing vaccination campaigns against the flu in Brazil, highlighting that more than 35 million people have been vaccinated. Various municipalities have surpassed their vaccination targets, with some reaching over 80% coverage. Additionally, it mentions the development of experimental vaccines for HIV and other diseases, emphasizing the importance of vaccination in preventing infections and serious health issues.'

In [92]:
#summary = []
#dataset = []

In [93]:
dataset.append("2015_06_positive")

In [94]:
summary.append(response.choices[0].message.content)

In [95]:
df_2015_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  6 non-null      object
 1   summary  6 non-null      object
dtypes: object(2)
memory usage: 224.0+ bytes


In [96]:
df_2015_pos.tail(14)

Unnamed: 0,dataset,summary
0,2015_01_positive,Recent developments include Johnson & Johnson'...
1,2015_02_positive,The provided Portuguese sentences discuss vari...
2,2015_03_positive,The provided texts primarily discuss various v...
3,2015_04_positive,Recent studies highlight the efficacy and safe...
4,2015_05_positive,The provided sentences discuss various vaccina...
5,2015_06_positive,The text discusses the ongoing vaccination cam...


In [97]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdBsSkY4miUc5ll8MvnNZ9TrUZw4 at 0x21d870543b0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses the ongoing vaccination campaigns against the flu in Brazil, highlighting that more than 35 million people have been vaccinated. Various municipalities have surpassed their vaccination targets, with some reaching over 80% coverage. Additionally, it mentions the development of experimental vaccines for HIV and other diseases, emphasizing the importance of vaccination in preventing infections and serious health issues.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731863852,
  "id": "chatcmpl-AUdBsSkY4miUc5ll8MvnNZ9TrUZw4",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 70,
    "completion_tokens_detail

## July

In [98]:
# Get monthly data
df_2015_07_pos = pd.read_csv('../data/local/df_pos_2015-07.csv').reset_index()
df_2015_07_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2485 entries, 0 to 2484
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2485 non-null   int64  
 1   LocationCode            2485 non-null   object 
 2   LocationName            2485 non-null   object 
 3   StatesCode              2485 non-null   object 
 4   ...4                    2485 non-null   int64  
 5   id                      2485 non-null   float64
 6   text                    2485 non-null   object 
 7   label                   2485 non-null   object 
 8   in_reply_to_status_id   110 non-null    float64
 9   in_reply_to_user_id     120 non-null    float64
 10  quoted_user_id          12 non-null     float64
 11  quoted_status_id        12 non-null     float64
 12  retweeted_user_id       514 non-null    float64
 13  retweeted_status_id     514 non-null    float64
 14  created_at              2485 non-null   

### Representative sample based on week and location

In [99]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_07_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_07_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2015-w26,Alagoas,1,0.000402
1,2015-w26,Bahia,1,0.000402
2,2015-w26,Ceara,5,0.002012
3,2015-w26,Distrito Federal,2,0.000805
4,2015-w26,Espirito Santo,1,0.000402
...,...,...,...,...
127,2015-w31,Rio Grande Do Sul,58,0.023340
128,2015-w31,Rondonia,2,0.000805
129,2015-w31,Roraima,4,0.001610
130,2015-w31,Santa Catarina,10,0.004024


In [100]:
# Step 3: Define the target sample size
#target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
#group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
#group_sizes
# Include all samples since there are not many tweets

In [101]:
# Step 5: Merge this sample size information back to the original DataFrame
#df_2015_07_pos_sample_size = pd.merge(df_2015_07_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
 #                              on=['year_week', 'LocationName'], how='left')

In [102]:
# Step 6: Perform the stratified sampling
#df_2015_07_pos_sample = df_2015_07_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
#    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2015_07_pos_sample = df_2015_07_pos
# Step 7: Check the result
print(f"Sampled {len(df_2015_07_pos_sample)} rows.")


Sampled 2485 rows.


In [103]:
text_gpt_2015_07_pos = '\n'.join(df_2015_07_pos_sample['text'].astype(str))

### Prompts and loops

In [104]:
prompt_7 = prompt_raw.replace('TWEETS', text_gpt_2015_07_pos)

In [105]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_7}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [106]:
response.choices[0].message.content

'The text discusses various aspects of vaccination in Brazil and worldwide, highlighting successes in vaccination campaigns for diseases such as influenza, dengue, hepatitis, and Ebola. It mentions specific locations, such as Rio Branco and Cachoeirinha, achieving or exceeding vaccination targets, and addresses the importance of vaccination for both humans and pets. The effectiveness of new vaccines, including those against malaria and HIV, is also emphasized, alongside calls for public participation in vaccination programs to ensure community health.'

In [107]:
#summary = []
#dataset = []

In [108]:
dataset.append("2015_07_positive")

In [109]:
summary.append(response.choices[0].message.content)

In [110]:
df_2015_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  7 non-null      object
 1   summary  7 non-null      object
dtypes: object(2)
memory usage: 240.0+ bytes


In [111]:
df_2015_pos.tail(14)

Unnamed: 0,dataset,summary
0,2015_01_positive,Recent developments include Johnson & Johnson'...
1,2015_02_positive,The provided Portuguese sentences discuss vari...
2,2015_03_positive,The provided texts primarily discuss various v...
3,2015_04_positive,Recent studies highlight the efficacy and safe...
4,2015_05_positive,The provided sentences discuss various vaccina...
5,2015_06_positive,The text discusses the ongoing vaccination cam...
6,2015_07_positive,The text discusses various aspects of vaccinat...


In [112]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdByAY7oOBoQWS99ZmiEdP9Ec48C at 0x21d870595e0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various aspects of vaccination in Brazil and worldwide, highlighting successes in vaccination campaigns for diseases such as influenza, dengue, hepatitis, and Ebola. It mentions specific locations, such as Rio Branco and Cachoeirinha, achieving or exceeding vaccination targets, and addresses the importance of vaccination for both humans and pets. The effectiveness of new vaccines, including those against malaria and HIV, is also emphasized, alongside calls for public participation in vaccination programs to ensure community health.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731863858,
  "id": "chatcmpl-AUdByAY7oOBoQWS99ZmiEdP9Ec48C",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completi

## August

In [113]:
# Get monthly data
df_2015_08_pos = pd.read_csv('../data/local/df_pos_2015-08.csv').reset_index()
df_2015_08_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4455 entries, 0 to 4454
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   4455 non-null   int64  
 1   LocationCode            4455 non-null   object 
 2   LocationName            4455 non-null   object 
 3   StatesCode              4455 non-null   object 
 4   ...4                    4455 non-null   int64  
 5   id                      4455 non-null   float64
 6   text                    4455 non-null   object 
 7   label                   4455 non-null   object 
 8   in_reply_to_status_id   108 non-null    float64
 9   in_reply_to_user_id     130 non-null    float64
 10  quoted_user_id          40 non-null     float64
 11  quoted_status_id        40 non-null     float64
 12  retweeted_user_id       1224 non-null   float64
 13  retweeted_status_id     1224 non-null   float64
 14  created_at              4455 non-null   

### Representative sample based on week and location

In [114]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_08_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_08_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2015-w31,Acre,1,0.000224
1,2015-w31,Alagoas,3,0.000673
2,2015-w31,Amapa,8,0.001796
3,2015-w31,Amazonas,7,0.001571
4,2015-w31,Bahia,38,0.008530
...,...,...,...,...
118,2015-w35,Roraima,4,0.000898
119,2015-w35,Santa Catarina,3,0.000673
120,2015-w35,Sao Paulo,83,0.018631
121,2015-w35,Sergipe,26,0.005836


In [115]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2015-w31,Acre,1,0.000224,0
1,2015-w31,Alagoas,3,0.000673,1
2,2015-w31,Amapa,8,0.001796,4
3,2015-w31,Amazonas,7,0.001571,3
4,2015-w31,Bahia,38,0.008530,20
...,...,...,...,...,...
118,2015-w35,Roraima,4,0.000898,2
119,2015-w35,Santa Catarina,3,0.000673,1
120,2015-w35,Sao Paulo,83,0.018631,45
121,2015-w35,Sergipe,26,0.005836,14


In [116]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_08_pos_sample_size = pd.merge(df_2015_08_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [117]:
# Step 6: Perform the stratified sampling
#df_2015_08_pos_sample = df_2015_08_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2015_08_pos_sample = df_2015_08_pos
# Step 7: Check the result
print(f"Sampled {len(df_2015_08_pos_sample)} rows.")


Sampled 4455 rows.


In [118]:
text_gpt_2015_08_pos = '\n'.join(df_2015_08_pos_sample['text'].astype(str))

### Prompts and loops

In [119]:
prompt_8 = prompt_raw.replace('TWEETS', text_gpt_2015_08_pos)

In [120]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_8}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [121]:
response.choices[0].message.content

'The sentences discuss the importance of vaccination against poliomyelitis (polio) and other diseases, emphasizing that vaccination is crucial for protecting children. Various campaigns are highlighted, including the National Vaccination Campaign against polio, which runs from August 15 to 31. The text mentions the availability of vaccines, the goal to immunize millions of children, and the efforts of health officials to increase vaccination rates. It also stresses the safety and efficacy of vaccines and the need for community participation in vaccination efforts.'

In [122]:
#summary = []
#dataset = []

In [123]:
dataset.append("2015_08_positive")

In [124]:
summary.append(response.choices[0].message.content)

In [125]:
df_2015_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  8 non-null      object
 1   summary  8 non-null      object
dtypes: object(2)
memory usage: 256.0+ bytes


In [126]:
df_2015_pos.tail(14)

Unnamed: 0,dataset,summary
0,2015_01_positive,Recent developments include Johnson & Johnson'...
1,2015_02_positive,The provided Portuguese sentences discuss vari...
2,2015_03_positive,The provided texts primarily discuss various v...
3,2015_04_positive,Recent studies highlight the efficacy and safe...
4,2015_05_positive,The provided sentences discuss various vaccina...
5,2015_06_positive,The text discusses the ongoing vaccination cam...
6,2015_07_positive,The text discusses various aspects of vaccinat...
7,2015_08_positive,The sentences discuss the importance of vaccin...


In [127]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdC6EAVjlW6zdGIPn2b8gzoSbrbl at 0x21d878e0db0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The sentences discuss the importance of vaccination against poliomyelitis (polio) and other diseases, emphasizing that vaccination is crucial for protecting children. Various campaigns are highlighted, including the National Vaccination Campaign against polio, which runs from August 15 to 31. The text mentions the availability of vaccines, the goal to immunize millions of children, and the efforts of health officials to increase vaccination rates. It also stresses the safety and efficacy of vaccines and the need for community participation in vaccination efforts.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731863866,
  "id": "chatcmpl-AUdC6EAVjlW6zdGIPn2b8gzoSbrbl",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "

## September

In [128]:
# Get monthly data
df_2015_09_pos = pd.read_csv('../data/local/df_pos_2015-09.csv').reset_index()
df_2015_09_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2413 entries, 0 to 2412
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2413 non-null   int64  
 1   LocationCode            2413 non-null   object 
 2   LocationName            2413 non-null   object 
 3   StatesCode              2413 non-null   object 
 4   ...4                    2413 non-null   int64  
 5   id                      2413 non-null   float64
 6   text                    2413 non-null   object 
 7   label                   2413 non-null   object 
 8   in_reply_to_status_id   68 non-null     float64
 9   in_reply_to_user_id     74 non-null     float64
 10  quoted_user_id          13 non-null     float64
 11  quoted_status_id        13 non-null     float64
 12  retweeted_user_id       480 non-null    float64
 13  retweeted_status_id     480 non-null    float64
 14  created_at              2413 non-null   

### Representative sample based on week and location

In [129]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_09_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_09_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2015-w35,Alagoas,11,0.004559
1,2015-w35,Amapa,1,0.000414
2,2015-w35,Amazonas,7,0.002901
3,2015-w35,Bahia,26,0.010775
4,2015-w35,Ceara,15,0.006216
...,...,...,...,...
110,2015-w39,Roraima,1,0.000414
111,2015-w39,Santa Catarina,10,0.004144
112,2015-w39,Sao Paulo,74,0.030667
113,2015-w39,Sergipe,6,0.002487


In [130]:
# Step 3: Define the target sample size
#target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
#group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
#group_sizes
# Taking all tweets since there were not enough for sampling

In [131]:
# Step 5: Merge this sample size information back to the original DataFrame
#df_2015_09_pos_sample_size = pd.merge(df_2015_09_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
 #                              on=['year_week', 'LocationName'], how='left')

In [132]:
# Step 6: Perform the stratified sampling
#df_2015_09_pos_sample = df_2015_09_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2015_09_pos_sample = df_2015_09_pos
# Step 7: Check the result
print(f"Sampled {len(df_2015_09_pos_sample)} rows.")


Sampled 2413 rows.


In [133]:
text_gpt_2015_09_pos = '\n'.join(df_2015_09_pos_sample['text'].astype(str))

### Prompts and loops

In [134]:
prompt_9 = prompt_raw.replace('TWEETS', text_gpt_2015_09_pos)

In [135]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_9}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [136]:
response.choices[0].message.content

'The text discusses vaccination campaigns in Brazil, highlighting the ongoing vaccination against poliomyelitis (polio) and the initiation of the second dose of the HPV vaccine for girls aged 9 to 11. Various regions are mentioned, indicating their vaccination rates and goals. The importance of vaccination in preventing diseases is emphasized, along with efforts to reach underserved populations and promote public health awareness.'

In [137]:
#summary = []
#dataset = []

In [138]:
dataset.append("2015_09_positive")

In [139]:
summary.append(response.choices[0].message.content)

In [140]:
df_2015_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  9 non-null      object
 1   summary  9 non-null      object
dtypes: object(2)
memory usage: 272.0+ bytes


In [141]:
df_2015_pos.tail(14)

Unnamed: 0,dataset,summary
0,2015_01_positive,Recent developments include Johnson & Johnson'...
1,2015_02_positive,The provided Portuguese sentences discuss vari...
2,2015_03_positive,The provided texts primarily discuss various v...
3,2015_04_positive,Recent studies highlight the efficacy and safe...
4,2015_05_positive,The provided sentences discuss various vaccina...
5,2015_06_positive,The text discusses the ongoing vaccination cam...
6,2015_07_positive,The text discusses various aspects of vaccinat...
7,2015_08_positive,The sentences discuss the importance of vaccin...
8,2015_09_positive,The text discusses vaccination campaigns in Br...


In [142]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdCGN9nyEALpoviIXPaQmybBLn8K at 0x21d87bc46d0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses vaccination campaigns in Brazil, highlighting the ongoing vaccination against poliomyelitis (polio) and the initiation of the second dose of the HPV vaccine for girls aged 9 to 11. Various regions are mentioned, indicating their vaccination rates and goals. The importance of vaccination in preventing diseases is emphasized, along with efforts to reach underserved populations and promote public health awareness.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731863876,
  "id": "chatcmpl-AUdCGN9nyEALpoviIXPaQmybBLn8K",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 76,
    "completion_tokens_details": {


## October

In [143]:
# Get monthly data
df_2015_10_pos = pd.read_csv('../data/local/df_pos_2015-10.csv').reset_index()
df_2015_10_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1641 entries, 0 to 1640
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1641 non-null   int64  
 1   LocationCode            1641 non-null   object 
 2   LocationName            1641 non-null   object 
 3   StatesCode              1641 non-null   object 
 4   ...4                    1641 non-null   int64  
 5   id                      1641 non-null   float64
 6   text                    1641 non-null   object 
 7   label                   1641 non-null   object 
 8   in_reply_to_status_id   88 non-null     float64
 9   in_reply_to_user_id     104 non-null    float64
 10  quoted_user_id          22 non-null     float64
 11  quoted_status_id        22 non-null     float64
 12  retweeted_user_id       629 non-null    float64
 13  retweeted_status_id     629 non-null    float64
 14  created_at              1641 non-null   

### Representative sample based on week and location

In [144]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_10_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_10_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2015-w40,Acre,3,0.001828
1,2015-w40,Alagoas,9,0.005484
2,2015-w40,Amapa,2,0.001219
3,2015-w40,Amazonas,4,0.002438
4,2015-w40,Bahia,18,0.010969
...,...,...,...,...
109,2015-w44,Pernambuco,3,0.001828
110,2015-w44,Rio De Janeiro,1,0.000609
111,2015-w44,Rio Grande Do Sul,23,0.014016
112,2015-w44,Santa Catarina,5,0.003047


In [145]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2015-w40,Acre,3,0.001828,4
1,2015-w40,Alagoas,9,0.005484,13
2,2015-w40,Amapa,2,0.001219,2
3,2015-w40,Amazonas,4,0.002438,5
4,2015-w40,Bahia,18,0.010969,26
...,...,...,...,...,...
109,2015-w44,Pernambuco,3,0.001828,4
110,2015-w44,Rio De Janeiro,1,0.000609,1
111,2015-w44,Rio Grande Do Sul,23,0.014016,34
112,2015-w44,Santa Catarina,5,0.003047,7


In [146]:
# Step 5: Merge this sample size information back to the original DataFrame
#df_2015_10_pos_sample_size = pd.merge(df_2015_10_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
 #                              on=['year_week', 'LocationName'], how='left')
# Taking all tweets for not having enough tweets for sampling

In [147]:
# Step 6: Perform the stratified sampling
#df_2015_10_pos_sample = df_2015_10_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2015_10_pos_sample = df_2015_10_pos
# Step 7: Check the result
print(f"Sampled {len(df_2015_10_pos_sample)} rows.")


Sampled 1641 rows.


In [148]:
text_gpt_2015_10_pos = '\n'.join(df_2015_10_pos_sample['text'].astype(str))

### Prompts and loops

In [149]:
prompt_10 = prompt_raw.replace('TWEETS', text_gpt_2015_10_pos)

In [150]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_10}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [151]:
response.choices[0].message.content

'The text discusses various vaccination campaigns and initiatives in Brazil, highlighting the importance of vaccinations for both humans and animals. It covers topics such as free vaccinations for pets, the HPV vaccine for girls aged 9-13, anti-rabies campaigns, and advancements in vaccines against diseases like tuberculosis and dengue. The text emphasizes the need for awareness and participation in vaccination programs to prevent outbreaks and protect public health.'

In [152]:
#summary = []
#dataset = []

In [153]:
dataset.append("2015_10_positive")

In [154]:
summary.append(response.choices[0].message.content)

In [155]:
df_2015_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  10 non-null     object
 1   summary  10 non-null     object
dtypes: object(2)
memory usage: 288.0+ bytes


In [156]:
df_2015_pos.tail(14)

Unnamed: 0,dataset,summary
0,2015_01_positive,Recent developments include Johnson & Johnson'...
1,2015_02_positive,The provided Portuguese sentences discuss vari...
2,2015_03_positive,The provided texts primarily discuss various v...
3,2015_04_positive,Recent studies highlight the efficacy and safe...
4,2015_05_positive,The provided sentences discuss various vaccina...
5,2015_06_positive,The text discusses the ongoing vaccination cam...
6,2015_07_positive,The text discusses various aspects of vaccinat...
7,2015_08_positive,The sentences discuss the importance of vaccin...
8,2015_09_positive,The text discusses vaccination campaigns in Br...
9,2015_10_positive,The text discusses various vaccination campaig...


In [157]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdCMxRBSxblLtD76gj9fVtobhZ9U at 0x21d87d53ae0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns and initiatives in Brazil, highlighting the importance of vaccinations for both humans and animals. It covers topics such as free vaccinations for pets, the HPV vaccine for girls aged 9-13, anti-rabies campaigns, and advancements in vaccines against diseases like tuberculosis and dengue. The text emphasizes the need for awareness and participation in vaccination programs to prevent outbreaks and protect public health.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731863882,
  "id": "chatcmpl-AUdCMxRBSxblLtD76gj9fVtobhZ9U",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_9b78b61c52",
  "usage": {
    "completion_tokens": 79

## November

In [158]:
# Get monthly data
df_2015_11_pos = pd.read_csv('../data/local/df_pos_2015-11.csv').reset_index()
df_2015_11_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1217 entries, 0 to 1216
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1217 non-null   int64  
 1   LocationCode            1217 non-null   object 
 2   LocationName            1217 non-null   object 
 3   StatesCode              1217 non-null   object 
 4   ...4                    1217 non-null   int64  
 5   id                      1217 non-null   float64
 6   text                    1217 non-null   object 
 7   label                   1217 non-null   object 
 8   in_reply_to_status_id   55 non-null     float64
 9   in_reply_to_user_id     65 non-null     float64
 10  quoted_user_id          10 non-null     float64
 11  quoted_status_id        10 non-null     float64
 12  retweeted_user_id       358 non-null    float64
 13  retweeted_status_id     358 non-null    float64
 14  created_at              1217 non-null   

### Representative sample based on week and location

In [159]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_11_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_11_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2015-w44,Acre,1,0.000822
1,2015-w44,Amazonas,6,0.004930
2,2015-w44,Bahia,9,0.007395
3,2015-w44,Ceara,4,0.003287
4,2015-w44,Distrito Federal,10,0.008217
...,...,...,...,...
97,2015-w48,Rondonia,1,0.000822
98,2015-w48,Roraima,2,0.001643
99,2015-w48,Santa Catarina,3,0.002465
100,2015-w48,Sao Paulo,33,0.027116


In [160]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2015-w44,Acre,1,0.000822,2
1,2015-w44,Amazonas,6,0.004930,12
2,2015-w44,Bahia,9,0.007395,18
3,2015-w44,Ceara,4,0.003287,8
4,2015-w44,Distrito Federal,10,0.008217,20
...,...,...,...,...,...
97,2015-w48,Rondonia,1,0.000822,2
98,2015-w48,Roraima,2,0.001643,4
99,2015-w48,Santa Catarina,3,0.002465,6
100,2015-w48,Sao Paulo,33,0.027116,66


In [161]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_11_pos_sample_size = pd.merge(df_2015_11_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [162]:
# Step 6: Perform the stratified sampling
#df_2015_11_pos_sample = df_2015_11_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2015_11_pos_sample = df_2015_11_pos
# Step 7: Check the result
print(f"Sampled {len(df_2015_11_pos_sample)} rows.")


Sampled 1217 rows.


In [163]:
text_gpt_2015_11_pos = '\n'.join(df_2015_11_pos_sample['text'].astype(str))

### Prompts and loops

In [164]:
prompt_11 = prompt_raw.replace('TWEETS', text_gpt_2015_11_pos)

In [165]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_11}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [166]:
response.choices[0].message.content

'The text discusses various aspects of vaccination, highlighting the importance of immunization against diseases for both humans and animals. It mentions new vaccines, such as the first lung cancer vaccine developed in Cuba, and ongoing vaccination campaigns against rabies, HPV, and foot-and-mouth disease. There are also references to the effectiveness and necessity of vaccinations in public health strategies to prevent diseases. Additionally, it emphasizes the role of vaccines in protecting both individual and community health.'

In [167]:
#summary = []
#dataset = []

In [168]:
dataset.append("2015_11_positive")

In [169]:
summary.append(response.choices[0].message.content)

In [170]:
df_2015_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  11 non-null     object
 1   summary  11 non-null     object
dtypes: object(2)
memory usage: 304.0+ bytes


In [171]:
df_2015_pos.tail(14)

Unnamed: 0,dataset,summary
0,2015_01_positive,Recent developments include Johnson & Johnson'...
1,2015_02_positive,The provided Portuguese sentences discuss vari...
2,2015_03_positive,The provided texts primarily discuss various v...
3,2015_04_positive,Recent studies highlight the efficacy and safe...
4,2015_05_positive,The provided sentences discuss various vaccina...
5,2015_06_positive,The text discusses the ongoing vaccination cam...
6,2015_07_positive,The text discusses various aspects of vaccinat...
7,2015_08_positive,The sentences discuss the importance of vaccin...
8,2015_09_positive,The text discusses vaccination campaigns in Br...
9,2015_10_positive,The text discusses various vaccination campaig...


In [172]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdCSEJznWkPlpW00HklWwA7UCdgZ at 0x21d87eb9c70> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various aspects of vaccination, highlighting the importance of immunization against diseases for both humans and animals. It mentions new vaccines, such as the first lung cancer vaccine developed in Cuba, and ongoing vaccination campaigns against rabies, HPV, and foot-and-mouth disease. There are also references to the effectiveness and necessity of vaccinations in public health strategies to prevent diseases. Additionally, it emphasizes the role of vaccines in protecting both individual and community health.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731863888,
  "id": "chatcmpl-AUdCSEJznWkPlpW00HklWwA7UCdgZ",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerpr

## December

In [173]:
# Get monthly data
df_2015_12_pos = pd.read_csv('../data/local/df_pos_2015-12.csv').reset_index()
df_2015_12_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2577 entries, 0 to 2576
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2577 non-null   int64  
 1   LocationCode            2577 non-null   object 
 2   LocationName            2577 non-null   object 
 3   StatesCode              2577 non-null   object 
 4   ...4                    2577 non-null   int64  
 5   id                      2577 non-null   float64
 6   text                    2577 non-null   object 
 7   label                   2577 non-null   object 
 8   in_reply_to_status_id   68 non-null     float64
 9   in_reply_to_user_id     77 non-null     float64
 10  quoted_user_id          18 non-null     float64
 11  quoted_status_id        18 non-null     float64
 12  retweeted_user_id       967 non-null    float64
 13  retweeted_status_id     967 non-null    float64
 14  created_at              2577 non-null   

### Representative sample based on week and location

In [174]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_12_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_12_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2015-w48,Amazonas,3,0.001164
1,2015-w48,Bahia,5,0.001940
2,2015-w48,Ceara,4,0.001552
3,2015-w48,Distrito Federal,3,0.001164
4,2015-w48,Espirito Santo,1,0.000388
...,...,...,...,...
112,2015-w53,Mato Grosso Do Sul,2,0.000776
113,2015-w53,Minas Gerais,9,0.003492
114,2015-w53,Paraiba,2,0.000776
115,2015-w53,Parana,1,0.000388


In [175]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2015-w48,Amazonas,3,0.001164,2
1,2015-w48,Bahia,5,0.001940,4
2,2015-w48,Ceara,4,0.001552,3
3,2015-w48,Distrito Federal,3,0.001164,2
4,2015-w48,Espirito Santo,1,0.000388,0
...,...,...,...,...,...
112,2015-w53,Mato Grosso Do Sul,2,0.000776,1
113,2015-w53,Minas Gerais,9,0.003492,8
114,2015-w53,Paraiba,2,0.000776,1
115,2015-w53,Parana,1,0.000388,0


In [176]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_12_pos_sample_size = pd.merge(df_2015_12_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [177]:
# Step 6: Perform the stratified sampling
#df_2015_12_pos_sample = df_2015_12_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2015_12_pos_sample = df_2015_12_pos
# Step 7: Check the result
print(f"Sampled {len(df_2015_12_pos_sample)} rows.")


Sampled 2577 rows.


In [178]:
text_gpt_2015_12_pos = '\n'.join(df_2015_12_pos_sample['text'].astype(str))

### Prompts and loops

In [179]:
prompt_12 = prompt_raw.replace('TWEETS', text_gpt_2015_12_pos)

In [180]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_12}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [181]:
response.choices[0].message.content

"The content discusses the approval and development of dengue vaccines in Brazil and Mexico. It highlights the first dengue vaccine registered in the world, the vaccine's effectiveness against various dengue virus types, and ongoing clinical trials involving thousands of participants. Additionally, it mentions public health campaigns emphasizing the importance of vaccinations and debunks myths linking vaccines to conditions like microcephaly."

In [182]:
#summary = []
#dataset = []

In [183]:
dataset.append("2015_12_positive")

In [184]:
summary.append(response.choices[0].message.content)

In [185]:
df_2015_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  12 non-null     object
 1   summary  12 non-null     object
dtypes: object(2)
memory usage: 320.0+ bytes


In [186]:
df_2015_pos.tail(14)

Unnamed: 0,dataset,summary
0,2015_01_positive,Recent developments include Johnson & Johnson'...
1,2015_02_positive,The provided Portuguese sentences discuss vari...
2,2015_03_positive,The provided texts primarily discuss various v...
3,2015_04_positive,Recent studies highlight the efficacy and safe...
4,2015_05_positive,The provided sentences discuss various vaccina...
5,2015_06_positive,The text discusses the ongoing vaccination cam...
6,2015_07_positive,The text discusses various aspects of vaccinat...
7,2015_08_positive,The sentences discuss the importance of vaccin...
8,2015_09_positive,The text discusses vaccination campaigns in Br...
9,2015_10_positive,The text discusses various vaccination campaig...


In [187]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdCXLrvoB8NOSD2BsvODe3oQFsq7 at 0x21d88120e50> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The content discusses the approval and development of dengue vaccines in Brazil and Mexico. It highlights the first dengue vaccine registered in the world, the vaccine's effectiveness against various dengue virus types, and ongoing clinical trials involving thousands of participants. Additionally, it mentions public health campaigns emphasizing the importance of vaccinations and debunks myths linking vaccines to conditions like microcephaly.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731863893,
  "id": "chatcmpl-AUdCXLrvoB8NOSD2BsvODe3oQFsq7",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_f8dd36be86",
  "usage": {
    "completion_tokens": 71,
    "completion_tokens_

## All 2015 positive

In [188]:
df_2015_pos.to_csv('../data/summary_2015_positive_tweets.csv')