# Import libraries and data

In [1]:
# Import libraries
import os
import openai
import pandas as pd
import numpy as np
import openpyxl

In [2]:
# Get OpenAI API secret key (change this file with your own OpenAI key)
open_ai_key = open('../data/local/openai_key.txt', 'r').read()
openai.api_key = open_ai_key
#openai.Model.list()

In [3]:
# Prompt for GPT
prompt_raw = "I would like to summarise in maximum 300 characters and in English what the following sentences in Portuguese talk about. \
Do not use aspects mentioned only in one sentence as part of the summary: TWEETS"

# 2016 positive

## January

In [4]:
# Get monthly data
df_2016_01_pos = pd.read_csv('../data/local/df_pos_2016-01.csv').reset_index()
df_2016_01_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1910 entries, 0 to 1909
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1910 non-null   int64  
 1   LocationCode            1910 non-null   object 
 2   LocationName            1910 non-null   object 
 3   StatesCode              1910 non-null   object 
 4   ...4                    1910 non-null   int64  
 5   id                      1910 non-null   float64
 6   text                    1910 non-null   object 
 7   label                   1910 non-null   object 
 8   in_reply_to_status_id   105 non-null    float64
 9   in_reply_to_user_id     114 non-null    float64
 10  quoted_user_id          15 non-null     float64
 11  quoted_status_id        15 non-null     float64
 12  retweeted_user_id       612 non-null    float64
 13  retweeted_status_id     612 non-null    float64
 14  created_at              1910 non-null   

### Representative sample based on week and location

In [5]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_01_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_01_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w01,Alagoas,1,0.000524
1,2016-w01,Amazonas,5,0.002618
2,2016-w01,Bahia,16,0.008377
3,2016-w01,Ceara,3,0.001571
4,2016-w01,Distrito Federal,12,0.006283
...,...,...,...,...
109,2016-w05,Roraima,4,0.002094
110,2016-w05,Santa Catarina,3,0.001571
111,2016-w05,Sao Paulo,75,0.039267
112,2016-w05,Sergipe,5,0.002618


In [6]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w01,Alagoas,1,0.000524,1
1,2016-w01,Amazonas,5,0.002618,6
2,2016-w01,Bahia,16,0.008377,20
3,2016-w01,Ceara,3,0.001571,3
4,2016-w01,Distrito Federal,12,0.006283,15
...,...,...,...,...,...
109,2016-w05,Roraima,4,0.002094,5
110,2016-w05,Santa Catarina,3,0.001571,3
111,2016-w05,Sao Paulo,75,0.039267,96
112,2016-w05,Sergipe,5,0.002618,6


In [7]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_01_pos_sample_size = pd.merge(df_2016_01_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [9]:
# Step 6: Perform the stratified sampling
#df_2016_01_pos_sample = df_2016_01_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2016_01_pos_sample = df_2016_01_pos
# Step 7: Check the result
print(f"Sampled {len(df_2016_01_pos_sample)} rows.")


Sampled 1910 rows.


In [10]:
text_gpt_2016_01_pos = '\n'.join(df_2016_01_pos_sample['text'].astype(str))

### Prompts and loops

In [11]:
prompt_1 = prompt_raw.replace('TWEETS', text_gpt_2016_01_pos)

In [12]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_1}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [13]:
response.choices[0].message.content

'The sentences discuss various aspects of vaccination and public health initiatives in Brazil, including the importance of vaccines against diseases like Zika, dengue, and HPV. They highlight partnerships between countries, such as Brazil and the USA, to develop new vaccines, and emphasize the significance of vaccination campaigns to prevent outbreaks. Additionally, the text notes the global acknowledgment of handwashing as a critical health measure.'

In [14]:
summary = []
dataset = []

In [15]:
dataset.append("2016_01_positive")

In [16]:
summary.append(response.choices[0].message.content)

In [17]:
df_2016_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  1 non-null      object
 1   summary  1 non-null      object
dtypes: object(2)
memory usage: 144.0+ bytes


In [18]:
df_2016_pos.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_positive,The sentences discuss various aspects of vacci...


In [19]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcrR5jhMNeo9yGAqVYvnwoiFapsK at 0x2107f0c5a40> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The sentences discuss various aspects of vaccination and public health initiatives in Brazil, including the importance of vaccines against diseases like Zika, dengue, and HPV. They highlight partnerships between countries, such as Brazil and the USA, to develop new vaccines, and emphasize the significance of vaccination campaigns to prevent outbreaks. Additionally, the text notes the global acknowledgment of handwashing as a critical health measure.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862585,
  "id": "chatcmpl-AUcrR5jhMNeo9yGAqVYvnwoiFapsK",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 76,
    "completion

## February

In [20]:
# Get monthly data
df_2016_02_pos = pd.read_csv('../data/local/df_pos_2016-02.csv').reset_index()
df_2016_02_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3118 entries, 0 to 3117
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3118 non-null   int64  
 1   LocationCode            3118 non-null   object 
 2   LocationName            3118 non-null   object 
 3   StatesCode              3118 non-null   object 
 4   ...4                    3118 non-null   int64  
 5   id                      3118 non-null   float64
 6   text                    3118 non-null   object 
 7   label                   3118 non-null   object 
 8   in_reply_to_status_id   103 non-null    float64
 9   in_reply_to_user_id     124 non-null    float64
 10  quoted_user_id          14 non-null     float64
 11  quoted_status_id        14 non-null     float64
 12  retweeted_user_id       969 non-null    float64
 13  retweeted_status_id     969 non-null    float64
 14  created_at              3118 non-null   

### Representative sample based on week and location

In [21]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_02_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_02_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w05,Acre,1,0.000321
1,2016-w05,Alagoas,3,0.000962
2,2016-w05,Amapa,5,0.001604
3,2016-w05,Amazonas,17,0.005452
4,2016-w05,Bahia,41,0.013149
...,...,...,...,...
113,2016-w09,Roraima,2,0.000641
114,2016-w09,Santa Catarina,3,0.000962
115,2016-w09,Sao Paulo,38,0.012187
116,2016-w09,Sergipe,3,0.000962


In [22]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w05,Acre,1,0.000321,0
1,2016-w05,Alagoas,3,0.000962,2
2,2016-w05,Amapa,5,0.001604,3
3,2016-w05,Amazonas,17,0.005452,13
4,2016-w05,Bahia,41,0.013149,32
...,...,...,...,...,...
113,2016-w09,Roraima,2,0.000641,1
114,2016-w09,Santa Catarina,3,0.000962,2
115,2016-w09,Sao Paulo,38,0.012187,29
116,2016-w09,Sergipe,3,0.000962,2


In [23]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_02_pos_sample_size = pd.merge(df_2016_02_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [24]:
# Step 6: Perform the stratified sampling
df_2016_02_pos_sample = df_2016_02_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2016_02_pos_sample)} rows.")


Sampled 2392 rows.


In [25]:
text_gpt_2016_02_pos = '\n'.join(df_2016_02_pos_sample['text'].astype(str))

### Prompts and loops

In [26]:
prompt_2 = prompt_raw.replace('TWEETS', text_gpt_2016_02_pos)

In [27]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_2}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [28]:
response.choices[0].message.content

'The excerpts discuss the urgent need for a vaccine against the Zika virus, highlighting efforts by Brazil and the United States to develop one. They mention partnerships with various laboratories and research institutions, including the Butantan Institute and the University of Texas. There are references to ongoing studies and the adaptation of existing dengue vaccines for Zika. Additionally, the texts caution against misinformation regarding vaccines and emphasize the importance of combating the Aedes aegypti mosquito while vaccine development is underway.'

In [29]:
#summary = []
#dataset = []

In [30]:
dataset.append("2016_02_positive")

In [31]:
summary.append(response.choices[0].message.content)

In [32]:
df_2016_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  2 non-null      object
 1   summary  2 non-null      object
dtypes: object(2)
memory usage: 160.0+ bytes


In [33]:
df_2016_pos.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_positive,The sentences discuss various aspects of vacci...
1,2016_02_positive,The excerpts discuss the urgent need for a vac...


In [34]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcrXytBLGZcjYEFoz18J89Xv91od at 0x2107f2a5d10> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The excerpts discuss the urgent need for a vaccine against the Zika virus, highlighting efforts by Brazil and the United States to develop one. They mention partnerships with various laboratories and research institutions, including the Butantan Institute and the University of Texas. There are references to ongoing studies and the adaptation of existing dengue vaccines for Zika. Additionally, the texts caution against misinformation regarding vaccines and emphasize the importance of combating the Aedes aegypti mosquito while vaccine development is underway.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862591,
  "id": "chatcmpl-AUcrXytBLGZcjYEFoz18J89Xv91od",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.c

## March

In [35]:
# Get monthly data
df_2016_03_pos = pd.read_csv('../data/local/df_pos_2016-03.csv').reset_index()
df_2016_03_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2567 entries, 0 to 2566
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2567 non-null   int64  
 1   LocationCode            2567 non-null   object 
 2   LocationName            2567 non-null   object 
 3   StatesCode              2567 non-null   object 
 4   ...4                    2567 non-null   int64  
 5   id                      2567 non-null   float64
 6   text                    2567 non-null   object 
 7   label                   2567 non-null   object 
 8   in_reply_to_status_id   95 non-null     float64
 9   in_reply_to_user_id     105 non-null    float64
 10  quoted_user_id          37 non-null     float64
 11  quoted_status_id        37 non-null     float64
 12  retweeted_user_id       1022 non-null   float64
 13  retweeted_status_id     1022 non-null   float64
 14  created_at              2567 non-null   

### Representative sample based on week and location

In [36]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_03_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_03_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w09,Acre,1,0.000390
1,2016-w09,Alagoas,3,0.001169
2,2016-w09,Amapa,3,0.001169
3,2016-w09,Amazonas,4,0.001558
4,2016-w09,Bahia,15,0.005843
...,...,...,...,...
108,2016-w13,Roraima,2,0.000779
109,2016-w13,Santa Catarina,18,0.007012
110,2016-w13,Sao Paulo,144,0.056097
111,2016-w13,Sergipe,9,0.003506


In [37]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w09,Acre,1,0.000390,0
1,2016-w09,Alagoas,3,0.001169,2
2,2016-w09,Amapa,3,0.001169,2
3,2016-w09,Amazonas,4,0.001558,3
4,2016-w09,Bahia,15,0.005843,14
...,...,...,...,...,...
108,2016-w13,Roraima,2,0.000779,1
109,2016-w13,Santa Catarina,18,0.007012,17
110,2016-w13,Sao Paulo,144,0.056097,137
111,2016-w13,Sergipe,9,0.003506,8


In [38]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_03_pos_sample_size = pd.merge(df_2016_03_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [39]:
# Step 6: Perform the stratified sampling
df_2016_03_pos_sample = df_2016_03_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2016_03_pos_sample)} rows.")


Sampled 2375 rows.


In [40]:
text_gpt_2016_03_pos = '\n'.join(df_2016_03_pos_sample['text'].astype(str))

### Prompts and loops

In [41]:
prompt_3 = prompt_raw.replace('TWEETS', text_gpt_2016_03_pos)

In [42]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_3}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [43]:
response.choices[0].message.content

'In Brazil, there is a strong emphasis on vaccination, particularly against HPV for boys and girls aged 9 to 13, as advocated by a Nobel Prize-winning doctor. Additionally, campaigns for rabies vaccination in pets and national initiatives against diseases like dengue and H1N1 are underway. Vaccination is highlighted as crucial for public health, with ongoing efforts to enhance awareness and accessibility.'

In [44]:
#summary = []
#dataset = []

In [45]:
dataset.append("2016_03_positive")

In [46]:
summary.append(response.choices[0].message.content)

In [47]:
df_2016_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  3 non-null      object
 1   summary  3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes


In [48]:
df_2016_pos.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_positive,The sentences discuss various aspects of vacci...
1,2016_02_positive,The excerpts discuss the urgent need for a vac...
2,2016_03_positive,"In Brazil, there is a strong emphasis on vacci..."


In [49]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcrdQl7qWhX9GOlwYxSj3vLF4JrA at 0x2107f2a5e50> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "In Brazil, there is a strong emphasis on vaccination, particularly against HPV for boys and girls aged 9 to 13, as advocated by a Nobel Prize-winning doctor. Additionally, campaigns for rabies vaccination in pets and national initiatives against diseases like dengue and H1N1 are underway. Vaccination is highlighted as crucial for public health, with ongoing efforts to enhance awareness and accessibility.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862597,
  "id": "chatcmpl-AUcrdQl7qWhX9GOlwYxSj3vLF4JrA",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 77,
    "completion_tokens_details": {
      "accepted_prediction

## April

In [50]:
# Get monthly data
df_2016_04_pos = pd.read_csv('../data/local/df_pos_2016-04.csv').reset_index()
df_2016_04_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5617 entries, 0 to 5616
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   5617 non-null   int64  
 1   LocationCode            5617 non-null   object 
 2   LocationName            5617 non-null   object 
 3   StatesCode              5617 non-null   object 
 4   ...4                    5617 non-null   int64  
 5   id                      5617 non-null   float64
 6   text                    5617 non-null   object 
 7   label                   5617 non-null   object 
 8   in_reply_to_status_id   245 non-null    float64
 9   in_reply_to_user_id     314 non-null    float64
 10  quoted_user_id          62 non-null     float64
 11  quoted_status_id        62 non-null     float64
 12  retweeted_user_id       1252 non-null   float64
 13  retweeted_status_id     1252 non-null   float64
 14  created_at              5617 non-null   

### Representative sample based on week and location

In [51]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_04_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_04_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w14,Acre,2,0.000356
1,2016-w14,Alagoas,14,0.002492
2,2016-w14,Amapa,17,0.003027
3,2016-w14,Amazonas,23,0.004095
4,2016-w14,Bahia,58,0.010326
...,...,...,...,...
123,2016-w18,Roraima,2,0.000356
124,2016-w18,Santa Catarina,22,0.003917
125,2016-w18,Sao Paulo,100,0.017803
126,2016-w18,Sergipe,12,0.002136


In [52]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w14,Acre,2,0.000356,0
1,2016-w14,Alagoas,14,0.002492,6
2,2016-w14,Amapa,17,0.003027,7
3,2016-w14,Amazonas,23,0.004095,10
4,2016-w14,Bahia,58,0.010326,25
...,...,...,...,...,...
123,2016-w18,Roraima,2,0.000356,0
124,2016-w18,Santa Catarina,22,0.003917,9
125,2016-w18,Sao Paulo,100,0.017803,43
126,2016-w18,Sergipe,12,0.002136,5


In [53]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_04_pos_sample_size = pd.merge(df_2016_04_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [54]:
# Step 6: Perform the stratified sampling
df_2016_04_pos_sample = df_2016_04_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2016_04_pos_sample)} rows.")


Sampled 2392 rows.


In [55]:
text_gpt_2016_04_pos = '\n'.join(df_2016_04_pos_sample['text'].astype(str))

### Prompts and loops

In [56]:
prompt_4 = prompt_raw.replace('TWEETS', text_gpt_2016_04_pos)

In [57]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_4}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [58]:
response.choices[0].message.content

'The text discusses various vaccination campaigns in Brazil, focusing on the importance of vaccinations against HPV and H1N1. It highlights efforts by health authorities to encourage girls to get vaccinated against HPV, the organization of vaccination days for the flu, and the urgency of vaccination in response to outbreaks. Additionally, it mentions the vaccination of pets against rabies and public health initiatives to ensure widespread immunization among vulnerable groups.'

In [59]:
#summary = []
#dataset = []

In [60]:
dataset.append("2016_04_positive")

In [61]:
summary.append(response.choices[0].message.content)

In [62]:
df_2016_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  4 non-null      object
 1   summary  4 non-null      object
dtypes: object(2)
memory usage: 192.0+ bytes


In [63]:
df_2016_pos.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_positive,The sentences discuss various aspects of vacci...
1,2016_02_positive,The excerpts discuss the urgent need for a vac...
2,2016_03_positive,"In Brazil, there is a strong emphasis on vacci..."
3,2016_04_positive,The text discusses various vaccination campaig...


In [64]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcrjHoj75z8WZ0kMpYtzzDIOn6fK at 0x210000175e0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns in Brazil, focusing on the importance of vaccinations against HPV and H1N1. It highlights efforts by health authorities to encourage girls to get vaccinated against HPV, the organization of vaccination days for the flu, and the urgency of vaccination in response to outbreaks. Additionally, it mentions the vaccination of pets against rabies and public health initiatives to ensure widespread immunization among vulnerable groups.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862603,
  "id": "chatcmpl-AUcrjHoj75z8WZ0kMpYtzzDIOn6fK",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_to

## May

In [65]:
# Get monthly data
df_2016_05_pos = pd.read_csv('../data/local/df_pos_2016-05.csv').reset_index()
df_2016_05_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4577 entries, 0 to 4576
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   4577 non-null   int64  
 1   LocationCode            4577 non-null   object 
 2   LocationName            4577 non-null   object 
 3   StatesCode              4577 non-null   object 
 4   ...4                    4577 non-null   int64  
 5   id                      4577 non-null   float64
 6   text                    4577 non-null   object 
 7   label                   4577 non-null   object 
 8   in_reply_to_status_id   224 non-null    float64
 9   in_reply_to_user_id     276 non-null    float64
 10  quoted_user_id          68 non-null     float64
 11  quoted_status_id        68 non-null     float64
 12  retweeted_user_id       1194 non-null   float64
 13  retweeted_status_id     1194 non-null   float64
 14  created_at              4577 non-null   

### Representative sample based on week and location

In [66]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_05_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_05_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w18,Alagoas,9,0.001966
1,2016-w18,Amapa,2,0.000437
2,2016-w18,Amazonas,15,0.003277
3,2016-w18,Bahia,33,0.007210
4,2016-w18,Ceara,41,0.008958
...,...,...,...,...
117,2016-w22,Rondonia,2,0.000437
118,2016-w22,Roraima,1,0.000218
119,2016-w22,Santa Catarina,8,0.001748
120,2016-w22,Sao Paulo,51,0.011143


In [67]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w18,Alagoas,9,0.001966,4
1,2016-w18,Amapa,2,0.000437,1
2,2016-w18,Amazonas,15,0.003277,8
3,2016-w18,Bahia,33,0.007210,17
4,2016-w18,Ceara,41,0.008958,21
...,...,...,...,...,...
117,2016-w22,Rondonia,2,0.000437,1
118,2016-w22,Roraima,1,0.000218,0
119,2016-w22,Santa Catarina,8,0.001748,4
120,2016-w22,Sao Paulo,51,0.011143,27


In [68]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_05_pos_sample_size = pd.merge(df_2016_05_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [69]:
# Step 6: Perform the stratified sampling
df_2016_05_pos_sample = df_2016_05_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2016_05_pos_sample)} rows.")


Sampled 2389 rows.


In [70]:
text_gpt_2016_05_pos = '\n'.join(df_2016_05_pos_sample['text'].astype(str))

### Prompts and loops

In [71]:
prompt_5 = prompt_raw.replace('TWEETS', text_gpt_2016_05_pos)

In [72]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_5}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [73]:
response.choices[0].message.content

'The text discusses various vaccination campaigns in Brazil, particularly against influenza (H1N1) and foot-and-mouth disease for livestock. It emphasizes the importance of vaccination for specific groups, such as children, pregnant women, and the elderly. There are mentions of vaccination rates, the availability of vaccines, and health authorities encouraging the public to get vaccinated to prevent outbreaks. The campaigns aim to achieve high coverage rates and have reported significant participation from the public.'

In [74]:
#summary = []
#dataset = []

In [75]:
dataset.append("2016_05_positive")

In [76]:
summary.append(response.choices[0].message.content)

In [77]:
df_2016_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  5 non-null      object
 1   summary  5 non-null      object
dtypes: object(2)
memory usage: 208.0+ bytes


In [78]:
df_2016_pos.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_positive,The sentences discuss various aspects of vacci...
1,2016_02_positive,The excerpts discuss the urgent need for a vac...
2,2016_03_positive,"In Brazil, there is a strong emphasis on vacci..."
3,2016_04_positive,The text discusses various vaccination campaig...
4,2016_05_positive,The text discusses various vaccination campaig...


In [79]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcrpFh5AxmCTZnJjTOaa3naw8E7j at 0x21000027810> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns in Brazil, particularly against influenza (H1N1) and foot-and-mouth disease for livestock. It emphasizes the importance of vaccination for specific groups, such as children, pregnant women, and the elderly. There are mentions of vaccination rates, the availability of vaccines, and health authorities encouraging the public to get vaccinated to prevent outbreaks. The campaigns aim to achieve high coverage rates and have reported significant participation from the public.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862609,
  "id": "chatcmpl-AUcrpFh5AxmCTZnJjTOaa3naw8E7j",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0

## June

In [80]:
# Get monthly data
df_2016_06_pos = pd.read_csv('../data/local/df_pos_2016-06.csv').reset_index()
df_2016_06_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2676 entries, 0 to 2675
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2676 non-null   int64  
 1   LocationCode            2676 non-null   object 
 2   LocationName            2676 non-null   object 
 3   StatesCode              2676 non-null   object 
 4   ...4                    2676 non-null   int64  
 5   id                      2676 non-null   float64
 6   text                    2676 non-null   object 
 7   label                   2676 non-null   object 
 8   in_reply_to_status_id   95 non-null     float64
 9   in_reply_to_user_id     106 non-null    float64
 10  quoted_user_id          30 non-null     float64
 11  quoted_status_id        30 non-null     float64
 12  retweeted_user_id       640 non-null    float64
 13  retweeted_status_id     640 non-null    float64
 14  created_at              2676 non-null   

### Representative sample based on week and location

In [81]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_06_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_06_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w22,Acre,1,0.000374
1,2016-w22,Alagoas,7,0.002616
2,2016-w22,Amazonas,6,0.002242
3,2016-w22,Bahia,20,0.007474
4,2016-w22,Ceara,4,0.001495
...,...,...,...,...
111,2016-w26,Roraima,7,0.002616
112,2016-w26,Santa Catarina,14,0.005232
113,2016-w26,Sao Paulo,149,0.055680
114,2016-w26,Sergipe,5,0.001868


In [82]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w22,Acre,1,0.000374,0
1,2016-w22,Alagoas,7,0.002616,6
2,2016-w22,Amazonas,6,0.002242,5
3,2016-w22,Bahia,20,0.007474,18
4,2016-w22,Ceara,4,0.001495,3
...,...,...,...,...,...
111,2016-w26,Roraima,7,0.002616,6
112,2016-w26,Santa Catarina,14,0.005232,12
113,2016-w26,Sao Paulo,149,0.055680,136
114,2016-w26,Sergipe,5,0.001868,4


In [83]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_06_pos_sample_size = pd.merge(df_2016_06_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [84]:
# Step 6: Perform the stratified sampling
df_2016_06_pos_sample = df_2016_06_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2016_06_pos_sample)} rows.")


Sampled 2380 rows.


In [85]:
text_gpt_2016_06_pos = '\n'.join(df_2016_06_pos_sample['text'].astype(str))

### Prompts and loops

In [86]:
prompt_6 = prompt_raw.replace('TWEETS', text_gpt_2016_06_pos)

In [87]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_6}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [88]:
response.choices[0].message.content

'The texts discuss various vaccination initiatives and studies in Brazil. Highlights include the availability of the H1N1 vaccine for children, significant vaccination coverage achievements in various municipalities, and promising research on cancer vaccines. Additionally, there are mentions of the development of vaccines for diseases like dengue and zika, emphasizing the importance of vaccination in public health.'

In [89]:
#summary = []
#dataset = []

In [90]:
dataset.append("2016_06_positive")

In [91]:
summary.append(response.choices[0].message.content)

In [92]:
df_2016_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  6 non-null      object
 1   summary  6 non-null      object
dtypes: object(2)
memory usage: 224.0+ bytes


In [93]:
df_2016_pos.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_positive,The sentences discuss various aspects of vacci...
1,2016_02_positive,The excerpts discuss the urgent need for a vac...
2,2016_03_positive,"In Brazil, there is a strong emphasis on vacci..."
3,2016_04_positive,The text discusses various vaccination campaig...
4,2016_05_positive,The text discusses various vaccination campaig...
5,2016_06_positive,The texts discuss various vaccination initiati...


In [94]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcrw9u7GXD87ClXYJUd9v1UAb2jJ at 0x2100182a4f0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The texts discuss various vaccination initiatives and studies in Brazil. Highlights include the availability of the H1N1 vaccine for children, significant vaccination coverage achievements in various municipalities, and promising research on cancer vaccines. Additionally, there are mentions of the development of vaccines for diseases like dengue and zika, emphasizing the importance of vaccination in public health.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862616,
  "id": "chatcmpl-AUcrw9u7GXD87ClXYJUd9v1UAb2jJ",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 67,
    "completion_tokens_details": {
      "accepted_

## July

In [95]:
# Get monthly data
df_2016_07_pos = pd.read_csv('../data/local/df_pos_2016-07.csv').reset_index()
df_2016_07_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2181 entries, 0 to 2180
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2181 non-null   int64  
 1   LocationCode            2181 non-null   object 
 2   LocationName            2181 non-null   object 
 3   StatesCode              2181 non-null   object 
 4   ...4                    2181 non-null   int64  
 5   id                      2181 non-null   float64
 6   text                    2181 non-null   object 
 7   label                   2181 non-null   object 
 8   in_reply_to_status_id   94 non-null     float64
 9   in_reply_to_user_id     108 non-null    float64
 10  quoted_user_id          18 non-null     float64
 11  quoted_status_id        18 non-null     float64
 12  retweeted_user_id       665 non-null    float64
 13  retweeted_status_id     665 non-null    float64
 14  created_at              2181 non-null   

### Representative sample based on week and location

In [96]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_07_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_07_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w27,Acre,2,0.000917
1,2016-w27,Alagoas,5,0.002293
2,2016-w27,Amapa,2,0.000917
3,2016-w27,Amazonas,11,0.005044
4,2016-w27,Bahia,20,0.009170
...,...,...,...,...
107,2016-w31,Roraima,1,0.000459
108,2016-w31,Santa Catarina,2,0.000917
109,2016-w31,Sao Paulo,27,0.012380
110,2016-w31,Sergipe,4,0.001834


In [97]:
# Step 3: Define the target sample size
#target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
#group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
#group_sizes
# Include all samples since there are not many tweets

In [98]:
# Step 5: Merge this sample size information back to the original DataFrame
#df_2016_07_pos_sample_size = pd.merge(df_2016_07_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
 #                              on=['year_week', 'LocationName'], how='left')

In [99]:
# Step 6: Perform the stratified sampling
#df_2016_07_pos_sample = df_2016_07_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
#    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2016_07_pos_sample = df_2016_07_pos
# Step 7: Check the result
print(f"Sampled {len(df_2016_07_pos_sample)} rows.")


Sampled 2181 rows.


In [100]:
text_gpt_2016_07_pos = '\n'.join(df_2016_07_pos_sample['text'].astype(str))

### Prompts and loops

In [101]:
prompt_7 = prompt_raw.replace('TWEETS', text_gpt_2016_07_pos)

In [102]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_7}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [103]:
response.choices[0].message.content

"The text discusses various aspects of vaccination, including the development and testing of vaccines against diseases like Zika, dengue, hepatitis, and HPV. It emphasizes the importance of vaccination for public health, mentions specific vaccination campaigns in Brazil, and highlights personal anecdotes about vaccination experiences. There's also discussion about the necessity of keeping vaccination records up to date and the role of vaccines in preventing outbreaks of diseases like yellow fever and rabies."

In [104]:
#summary = []
#dataset = []

In [105]:
dataset.append("2016_07_positive")

In [106]:
summary.append(response.choices[0].message.content)

In [107]:
df_2016_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  7 non-null      object
 1   summary  7 non-null      object
dtypes: object(2)
memory usage: 240.0+ bytes


In [108]:
df_2016_pos.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_positive,The sentences discuss various aspects of vacci...
1,2016_02_positive,The excerpts discuss the urgent need for a vac...
2,2016_03_positive,"In Brazil, there is a strong emphasis on vacci..."
3,2016_04_positive,The text discusses various vaccination campaig...
4,2016_05_positive,The text discusses various vaccination campaig...
5,2016_06_positive,The texts discuss various vaccination initiati...
6,2016_07_positive,The text discusses various aspects of vaccinat...


In [109]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcs1HAdZqxJ7J01bke7P9w1r2Xmx at 0x2107f2aa680> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various aspects of vaccination, including the development and testing of vaccines against diseases like Zika, dengue, hepatitis, and HPV. It emphasizes the importance of vaccination for public health, mentions specific vaccination campaigns in Brazil, and highlights personal anecdotes about vaccination experiences. There's also discussion about the necessity of keeping vaccination records up to date and the role of vaccines in preventing outbreaks of diseases like yellow fever and rabies.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862621,
  "id": "chatcmpl-AUcs1HAdZqxJ7J01bke7P9w1r2Xmx",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1"

## August

In [110]:
# Get monthly data
df_2016_08_pos = pd.read_csv('../data/local/df_pos_2016-08.csv').reset_index()
df_2016_08_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1958 entries, 0 to 1957
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1958 non-null   int64  
 1   LocationCode            1958 non-null   object 
 2   LocationName            1958 non-null   object 
 3   StatesCode              1958 non-null   object 
 4   ...4                    1958 non-null   int64  
 5   id                      1958 non-null   float64
 6   text                    1958 non-null   object 
 7   label                   1958 non-null   object 
 8   in_reply_to_status_id   92 non-null     float64
 9   in_reply_to_user_id     98 non-null     float64
 10  quoted_user_id          25 non-null     float64
 11  quoted_status_id        25 non-null     float64
 12  retweeted_user_id       429 non-null    float64
 13  retweeted_status_id     429 non-null    float64
 14  created_at              1958 non-null   

### Representative sample based on week and location

In [111]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_08_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_08_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w31,Alagoas,7,0.003575
1,2016-w31,Amapa,3,0.001532
2,2016-w31,Amazonas,12,0.006129
3,2016-w31,Bahia,31,0.015832
4,2016-w31,Ceara,7,0.003575
...,...,...,...,...
98,2016-w35,Rondonia,6,0.003064
99,2016-w35,Roraima,5,0.002554
100,2016-w35,Santa Catarina,4,0.002043
101,2016-w35,Sao Paulo,62,0.031665


In [112]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w31,Alagoas,7,0.003575,8
1,2016-w31,Amapa,3,0.001532,3
2,2016-w31,Amazonas,12,0.006129,15
3,2016-w31,Bahia,31,0.015832,38
4,2016-w31,Ceara,7,0.003575,8
...,...,...,...,...,...
98,2016-w35,Rondonia,6,0.003064,7
99,2016-w35,Roraima,5,0.002554,6
100,2016-w35,Santa Catarina,4,0.002043,5
101,2016-w35,Sao Paulo,62,0.031665,77


In [113]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_08_pos_sample_size = pd.merge(df_2016_08_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [115]:
# Step 6: Perform the stratified sampling
#df_2016_08_pos_sample = df_2016_08_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2016_08_pos_sample = df_2016_08_pos
# Step 7: Check the result
print(f"Sampled {len(df_2016_08_pos_sample)} rows.")


Sampled 1958 rows.


In [116]:
text_gpt_2016_08_pos = '\n'.join(df_2016_08_pos_sample['text'].astype(str))

### Prompts and loops

In [117]:
prompt_8 = prompt_raw.replace('TWEETS', text_gpt_2016_08_pos)

In [118]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_8}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [119]:
response.choices[0].message.content

'The texts discuss various vaccines currently in development or available in Brazil, including those against Zika, dengue, and esquistossomose. The U.S. has initiated human trials for a Zika vaccine, while Brazil is rolling out dengue vaccination campaigns. There are also mentions of vaccination against diseases like HPV, rubella, and rabies, emphasizing the importance of vaccination for public health.'

In [120]:
#summary = []
#dataset = []

In [121]:
dataset.append("2016_08_positive")

In [122]:
summary.append(response.choices[0].message.content)

In [123]:
df_2016_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  8 non-null      object
 1   summary  8 non-null      object
dtypes: object(2)
memory usage: 256.0+ bytes


In [124]:
df_2016_pos.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_positive,The sentences discuss various aspects of vacci...
1,2016_02_positive,The excerpts discuss the urgent need for a vac...
2,2016_03_positive,"In Brazil, there is a strong emphasis on vacci..."
3,2016_04_positive,The text discusses various vaccination campaig...
4,2016_05_positive,The text discusses various vaccination campaig...
5,2016_06_positive,The texts discuss various vaccination initiati...
6,2016_07_positive,The text discusses various aspects of vaccinat...
7,2016_08_positive,The texts discuss various vaccines currently i...


In [125]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcsVPzxuQjLt4TYqWiH5wIfDdgRD at 0x2100133ed60> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The texts discuss various vaccines currently in development or available in Brazil, including those against Zika, dengue, and esquistossomose. The U.S. has initiated human trials for a Zika vaccine, while Brazil is rolling out dengue vaccination campaigns. There are also mentions of vaccination against diseases like HPV, rubella, and rabies, emphasizing the importance of vaccination for public health.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862651,
  "id": "chatcmpl-AUcsVPzxuQjLt4TYqWiH5wIfDdgRD",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 78,
    "completion_tokens_details": {
      "accepted_prediction_to

## September

In [126]:
# Get monthly data
df_2016_09_pos = pd.read_csv('../data/local/df_pos_2016-09.csv').reset_index()
df_2016_09_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3200 entries, 0 to 3199
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3200 non-null   int64  
 1   LocationCode            3200 non-null   object 
 2   LocationName            3200 non-null   object 
 3   StatesCode              3200 non-null   object 
 4   ...4                    3200 non-null   int64  
 5   id                      3200 non-null   float64
 6   text                    3200 non-null   object 
 7   label                   3200 non-null   object 
 8   in_reply_to_status_id   125 non-null    float64
 9   in_reply_to_user_id     130 non-null    float64
 10  quoted_user_id          65 non-null     float64
 11  quoted_status_id        65 non-null     float64
 12  retweeted_user_id       1000 non-null   float64
 13  retweeted_status_id     1000 non-null   float64
 14  created_at              3200 non-null   

### Representative sample based on week and location

In [127]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_09_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_09_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w35,Amapa,3,0.000937
1,2016-w35,Amazonas,1,0.000313
2,2016-w35,Bahia,2,0.000625
3,2016-w35,Ceara,1,0.000313
4,2016-w35,Espirito Santo,3,0.000937
...,...,...,...,...
122,2016-w40,Pernambuco,2,0.000625
123,2016-w40,Rio Grande Do Sul,6,0.001875
124,2016-w40,Santa Catarina,3,0.000937
125,2016-w40,Sao Paulo,11,0.003438


In [128]:
# Step 3: Define the target sample size
#target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
#group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
#group_sizes
# Taking all tweets since there were not enough for sampling

In [129]:
# Step 5: Merge this sample size information back to the original DataFrame
#df_2016_09_pos_sample_size = pd.merge(df_2016_09_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
 #                              on=['year_week', 'LocationName'], how='left')

In [130]:
# Step 6: Perform the stratified sampling
#df_2016_09_pos_sample = df_2016_09_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2016_09_pos_sample = df_2016_09_pos
# Step 7: Check the result
print(f"Sampled {len(df_2016_09_pos_sample)} rows.")


Sampled 3200 rows.


In [131]:
text_gpt_2016_09_pos = '\n'.join(df_2016_09_pos_sample['text'].astype(str))

### Prompts and loops

In [132]:
prompt_9 = prompt_raw.replace('TWEETS', text_gpt_2016_09_pos)

In [133]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_9}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [134]:
response.choices[0].message.content

'Brazilian researchers have developed a vaccine against cocaine addiction, which is currently being tested on animals. Meanwhile, a national vaccination campaign for children and adolescents, featuring Zé Gotinha and Carreta Furacão, begins on September 19, aiming to update vaccination records and promote health awareness.'

In [135]:
#summary = []
#dataset = []

In [136]:
dataset.append("2016_09_positive")

In [137]:
summary.append(response.choices[0].message.content)

In [138]:
df_2016_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  9 non-null      object
 1   summary  9 non-null      object
dtypes: object(2)
memory usage: 272.0+ bytes


In [139]:
df_2016_pos.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_positive,The sentences discuss various aspects of vacci...
1,2016_02_positive,The excerpts discuss the urgent need for a vac...
2,2016_03_positive,"In Brazil, there is a strong emphasis on vacci..."
3,2016_04_positive,The text discusses various vaccination campaig...
4,2016_05_positive,The text discusses various vaccination campaig...
5,2016_06_positive,The texts discuss various vaccination initiati...
6,2016_07_positive,The text discusses various aspects of vaccinat...
7,2016_08_positive,The texts discuss various vaccines currently i...
8,2016_09_positive,Brazilian researchers have developed a vaccine...


In [140]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcsca0RCxcej0FRJPkJPfvJPwkaX at 0x2100133dc70> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Brazilian researchers have developed a vaccine against cocaine addiction, which is currently being tested on animals. Meanwhile, a national vaccination campaign for children and adolescents, featuring Z\u00e9 Gotinha and Carreta Furac\u00e3o, begins on September 19, aiming to update vaccination records and promote health awareness.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862658,
  "id": "chatcmpl-AUcsca0RCxcej0FRJPkJPfvJPwkaX",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 58,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "reasoning_tokens": 0,
      "

## October

In [141]:
# Get monthly data
df_2016_10_pos = pd.read_csv('../data/local/df_pos_2016-10.csv').reset_index()
df_2016_10_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1985 entries, 0 to 1984
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1985 non-null   int64  
 1   LocationCode            1985 non-null   object 
 2   LocationName            1985 non-null   object 
 3   StatesCode              1985 non-null   object 
 4   ...4                    1985 non-null   int64  
 5   id                      1985 non-null   float64
 6   text                    1985 non-null   object 
 7   label                   1985 non-null   object 
 8   in_reply_to_status_id   105 non-null    float64
 9   in_reply_to_user_id     113 non-null    float64
 10  quoted_user_id          37 non-null     float64
 11  quoted_status_id        37 non-null     float64
 12  retweeted_user_id       585 non-null    float64
 13  retweeted_status_id     585 non-null    float64
 14  created_at              1985 non-null   

### Representative sample based on week and location

In [142]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_10_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_10_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w40,Alagoas,2,0.001008
1,2016-w40,Amapa,1,0.000504
2,2016-w40,Amazonas,15,0.007557
3,2016-w40,Bahia,17,0.008564
4,2016-w40,Distrito Federal,4,0.002015
...,...,...,...,...
100,2016-w44,Rio Grande Do Sul,7,0.003526
101,2016-w44,Rondonia,2,0.001008
102,2016-w44,Santa Catarina,1,0.000504
103,2016-w44,Sao Paulo,31,0.015617


In [143]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w40,Alagoas,2,0.001008,2
1,2016-w40,Amapa,1,0.000504,1
2,2016-w40,Amazonas,15,0.007557,18
3,2016-w40,Bahia,17,0.008564,20
4,2016-w40,Distrito Federal,4,0.002015,4
...,...,...,...,...,...
100,2016-w44,Rio Grande Do Sul,7,0.003526,8
101,2016-w44,Rondonia,2,0.001008,2
102,2016-w44,Santa Catarina,1,0.000504,1
103,2016-w44,Sao Paulo,31,0.015617,38


In [144]:
# Step 5: Merge this sample size information back to the original DataFrame
#df_2016_10_pos_sample_size = pd.merge(df_2016_10_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
 #                              on=['year_week', 'LocationName'], how='left')
# Taking all tweets for not having enough tweets for sampling

In [145]:
# Step 6: Perform the stratified sampling
#df_2016_10_pos_sample = df_2016_10_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2016_10_pos_sample = df_2016_10_pos
# Step 7: Check the result
print(f"Sampled {len(df_2016_10_pos_sample)} rows.")


Sampled 1985 rows.


In [146]:
text_gpt_2016_10_pos = '\n'.join(df_2016_10_pos_sample['text'].astype(str))

### Prompts and loops

In [147]:
prompt_10 = prompt_raw.replace('TWEETS', text_gpt_2016_10_pos)

In [148]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_10}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [149]:
response.choices[0].message.content

'The sentences discuss various vaccination campaigns and recommendations in Brazil, emphasizing the importance of vaccinations for children, especially against HPV and rabies. Specific age groups, such as girls aged 9-13 and boys aged 12-13, are highlighted for HPV vaccination. Additionally, there are mentions of new vaccine developments, including those for dengue and cancer, as well as the role of organizations like Fiocruz in these initiatives. The overall message stresses the necessity of maintaining updated vaccination records to ensure health and prevent diseases.'

In [150]:
#summary = []
#dataset = []

In [151]:
dataset.append("2016_10_positive")

In [152]:
summary.append(response.choices[0].message.content)

In [153]:
df_2016_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  10 non-null     object
 1   summary  10 non-null     object
dtypes: object(2)
memory usage: 288.0+ bytes


In [154]:
df_2016_pos.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_positive,The sentences discuss various aspects of vacci...
1,2016_02_positive,The excerpts discuss the urgent need for a vac...
2,2016_03_positive,"In Brazil, there is a strong emphasis on vacci..."
3,2016_04_positive,The text discusses various vaccination campaig...
4,2016_05_positive,The text discusses various vaccination campaig...
5,2016_06_positive,The texts discuss various vaccination initiati...
6,2016_07_positive,The text discusses various aspects of vaccinat...
7,2016_08_positive,The texts discuss various vaccines currently i...
8,2016_09_positive,Brazilian researchers have developed a vaccine...
9,2016_10_positive,The sentences discuss various vaccination camp...


In [155]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcsiN7nf5hkLJ60VYOxQZA775Avh at 0x210021d5a90> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The sentences discuss various vaccination campaigns and recommendations in Brazil, emphasizing the importance of vaccinations for children, especially against HPV and rabies. Specific age groups, such as girls aged 9-13 and boys aged 12-13, are highlighted for HPV vaccination. Additionally, there are mentions of new vaccine developments, including those for dengue and cancer, as well as the role of organizations like Fiocruz in these initiatives. The overall message stresses the necessity of maintaining updated vaccination records to ensure health and prevent diseases.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862664,
  "id": "chatcmpl-AUcsiN7nf5hkLJ60VYOxQZA775Avh",
  "model": "gpt-4o-mini-2024-07-18",
  "obje

## November

In [156]:
# Get monthly data
df_2016_11_pos = pd.read_csv('../data/local/df_pos_2016-11.csv').reset_index()
df_2016_11_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1228 entries, 0 to 1227
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1228 non-null   int64  
 1   LocationCode            1228 non-null   object 
 2   LocationName            1228 non-null   object 
 3   StatesCode              1228 non-null   object 
 4   ...4                    1228 non-null   int64  
 5   id                      1228 non-null   float64
 6   text                    1228 non-null   object 
 7   label                   1228 non-null   object 
 8   in_reply_to_status_id   104 non-null    float64
 9   in_reply_to_user_id     110 non-null    float64
 10  quoted_user_id          51 non-null     float64
 11  quoted_status_id        51 non-null     float64
 12  retweeted_user_id       348 non-null    float64
 13  retweeted_status_id     348 non-null    float64
 14  created_at              1228 non-null   

### Representative sample based on week and location

In [157]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_11_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_11_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w44,Alagoas,2,0.001629
1,2016-w44,Amazonas,4,0.003257
2,2016-w44,Bahia,2,0.001629
3,2016-w44,Ceara,4,0.003257
4,2016-w44,Distrito Federal,1,0.000814
...,...,...,...,...
101,2016-w48,Roraima,1,0.000814
102,2016-w48,Santa Catarina,5,0.004072
103,2016-w48,Sao Paulo,57,0.046417
104,2016-w48,Sergipe,2,0.001629


In [158]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w44,Alagoas,2,0.001629,3
1,2016-w44,Amazonas,4,0.003257,7
2,2016-w44,Bahia,2,0.001629,3
3,2016-w44,Ceara,4,0.003257,7
4,2016-w44,Distrito Federal,1,0.000814,1
...,...,...,...,...,...
101,2016-w48,Roraima,1,0.000814,1
102,2016-w48,Santa Catarina,5,0.004072,9
103,2016-w48,Sao Paulo,57,0.046417,113
104,2016-w48,Sergipe,2,0.001629,3


In [159]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_11_pos_sample_size = pd.merge(df_2016_11_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [160]:
# Step 6: Perform the stratified sampling
#df_2016_11_pos_sample = df_2016_11_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2016_11_pos_sample = df_2016_11_pos
# Step 7: Check the result
print(f"Sampled {len(df_2016_11_pos_sample)} rows.")


Sampled 1228 rows.


In [161]:
text_gpt_2016_11_pos = '\n'.join(df_2016_11_pos_sample['text'].astype(str))

### Prompts and loops

In [162]:
prompt_11 = prompt_raw.replace('TWEETS', text_gpt_2016_11_pos)

In [163]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_11}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [164]:
response.choices[0].message.content

'The content discusses various vaccination campaigns and initiatives in Brazil, emphasizing the importance of vaccinating both animals and humans against diseases like foot-and-mouth disease, rabies, HPV, and dengue. It highlights upcoming vaccination days, availability of vaccines, and the need for public awareness in maintaining vaccination schedules for pets and livestock.'

In [165]:
#summary = []
#dataset = []

In [166]:
dataset.append("2016_11_positive")

In [167]:
summary.append(response.choices[0].message.content)

In [168]:
df_2016_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  11 non-null     object
 1   summary  11 non-null     object
dtypes: object(2)
memory usage: 304.0+ bytes


In [169]:
df_2016_pos.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_positive,The sentences discuss various aspects of vacci...
1,2016_02_positive,The excerpts discuss the urgent need for a vac...
2,2016_03_positive,"In Brazil, there is a strong emphasis on vacci..."
3,2016_04_positive,The text discusses various vaccination campaig...
4,2016_05_positive,The text discusses various vaccination campaig...
5,2016_06_positive,The texts discuss various vaccination initiati...
6,2016_07_positive,The text discusses various aspects of vaccinat...
7,2016_08_positive,The texts discuss various vaccines currently i...
8,2016_09_positive,Brazilian researchers have developed a vaccine...
9,2016_10_positive,The sentences discuss various vaccination camp...


In [170]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcsniIbTl3MAgoj7hMXz75CaAbk9 at 0x21001d0c860> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The content discusses various vaccination campaigns and initiatives in Brazil, emphasizing the importance of vaccinating both animals and humans against diseases like foot-and-mouth disease, rabies, HPV, and dengue. It highlights upcoming vaccination days, availability of vaccines, and the need for public awareness in maintaining vaccination schedules for pets and livestock.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862669,
  "id": "chatcmpl-AUcsniIbTl3MAgoj7hMXz75CaAbk9",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_f8dd36be86",
  "usage": {
    "completion_tokens": 62,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_toke

## December

In [171]:
# Get monthly data
df_2016_12_pos = pd.read_csv('../data/local/df_pos_2016-12.csv').reset_index()
df_2016_12_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1293 entries, 0 to 1292
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1293 non-null   int64  
 1   LocationCode            1293 non-null   object 
 2   LocationName            1293 non-null   object 
 3   StatesCode              1293 non-null   object 
 4   ...4                    1293 non-null   int64  
 5   id                      1293 non-null   float64
 6   text                    1293 non-null   object 
 7   label                   1293 non-null   object 
 8   in_reply_to_status_id   84 non-null     float64
 9   in_reply_to_user_id     91 non-null     float64
 10  quoted_user_id          42 non-null     float64
 11  quoted_status_id        42 non-null     float64
 12  retweeted_user_id       343 non-null    float64
 13  retweeted_status_id     343 non-null    float64
 14  created_at              1293 non-null   

### Representative sample based on week and location

In [172]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2016_12_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2016_12_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2016-w48,Amapa,1,0.000773
1,2016-w48,Bahia,2,0.001547
2,2016-w48,Distrito Federal,1,0.000773
3,2016-w48,Espirito Santo,1,0.000773
4,2016-w48,Goias,1,0.000773
...,...,...,...,...
106,2016-w53,Parana,3,0.002320
107,2016-w53,Pernambuco,2,0.001547
108,2016-w53,Rio Grande Do Sul,1,0.000773
109,2016-w53,Santa Catarina,2,0.001547


In [173]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2016-w48,Amapa,1,0.000773,1
1,2016-w48,Bahia,2,0.001547,3
2,2016-w48,Distrito Federal,1,0.000773,1
3,2016-w48,Espirito Santo,1,0.000773,1
4,2016-w48,Goias,1,0.000773,1
...,...,...,...,...,...
106,2016-w53,Parana,3,0.002320,5
107,2016-w53,Pernambuco,2,0.001547,3
108,2016-w53,Rio Grande Do Sul,1,0.000773,1
109,2016-w53,Santa Catarina,2,0.001547,3


In [174]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2016_12_pos_sample_size = pd.merge(df_2016_12_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [175]:
# Step 6: Perform the stratified sampling
#df_2016_12_pos_sample = df_2016_12_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2016_12_pos_sample = df_2016_12_pos
# Step 7: Check the result
print(f"Sampled {len(df_2016_12_pos_sample)} rows.")


Sampled 1293 rows.


In [176]:
text_gpt_2016_12_pos = '\n'.join(df_2016_12_pos_sample['text'].astype(str))

### Prompts and loops

In [177]:
prompt_12 = prompt_raw.replace('TWEETS', text_gpt_2016_12_pos)

In [178]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_12}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [179]:
response.choices[0].message.content

'The text discusses various vaccination efforts in Brazil, including the introduction of a dengue vaccine being tested by the Butantan Institute, the high vaccination rates against foot-and-mouth disease in livestock, and the importance of vaccinations for both humans and pets. It also highlights the effectiveness of the Ebola vaccine, which has shown 100% efficacy in trials. The text emphasizes the significance of vaccinations in preventing diseases and the ongoing campaigns to promote vaccination awareness.'

In [180]:
#summary = []
#dataset = []

In [181]:
dataset.append("2016_12_positive")

In [182]:
summary.append(response.choices[0].message.content)

In [183]:
df_2016_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2016_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  12 non-null     object
 1   summary  12 non-null     object
dtypes: object(2)
memory usage: 320.0+ bytes


In [184]:
df_2016_pos.tail(14)

Unnamed: 0,dataset,summary
0,2016_01_positive,The sentences discuss various aspects of vacci...
1,2016_02_positive,The excerpts discuss the urgent need for a vac...
2,2016_03_positive,"In Brazil, there is a strong emphasis on vacci..."
3,2016_04_positive,The text discusses various vaccination campaig...
4,2016_05_positive,The text discusses various vaccination campaig...
5,2016_06_positive,The texts discuss various vaccination initiati...
6,2016_07_positive,The text discusses various aspects of vaccinat...
7,2016_08_positive,The texts discuss various vaccines currently i...
8,2016_09_positive,Brazilian researchers have developed a vaccine...
9,2016_10_positive,The sentences discuss various vaccination camp...


In [185]:
response

<OpenAIObject chat.completion id=chatcmpl-AUcsrujZSJJV28KyTrSaAZ4pR8kPH at 0x21001dfb400> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination efforts in Brazil, including the introduction of a dengue vaccine being tested by the Butantan Institute, the high vaccination rates against foot-and-mouth disease in livestock, and the importance of vaccinations for both humans and pets. It also highlights the effectiveness of the Ebola vaccine, which has shown 100% efficacy in trials. The text emphasizes the significance of vaccinations in preventing diseases and the ongoing campaigns to promote vaccination awareness.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731862673,
  "id": "chatcmpl-AUcsrujZSJJV28KyTrSaAZ4pR8kPH",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1

## All 2016 positive

In [186]:
df_2016_pos.to_csv('../data/summary_2016_positive_tweets.csv')