# Import libraries and data

In [1]:
# Import libraries
import os
import openai
import pandas as pd
import numpy as np
import openpyxl

In [2]:
# Get OpenAI API secret key (change this file with your own OpenAI key)
open_ai_key = open('../data/local/openai_key.txt', 'r').read()
openai.api_key = open_ai_key
#openai.Model.list()

In [3]:
# Prompt for GPT
prompt_raw = "I would like to summarise in maximum 300 characters and in English what the following sentences in Portuguese talk about. \
Do not use aspects mentioned only in one sentence as part of the summary: TWEETS"

# 2013 positive

## January

In [4]:
# Get monthly data
df_2013_01_pos = pd.read_csv('../data/local/df_pos_2013-01.csv').reset_index()
df_2013_01_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2516 entries, 0 to 2515
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2516 non-null   int64  
 1   LocationCode            2516 non-null   object 
 2   LocationName            2516 non-null   object 
 3   StatesCode              2516 non-null   object 
 4   ...4                    2516 non-null   int64  
 5   id                      2516 non-null   float64
 6   text                    2516 non-null   object 
 7   label                   2516 non-null   object 
 8   in_reply_to_status_id   82 non-null     float64
 9   in_reply_to_user_id     116 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       495 non-null    float64
 13  retweeted_status_id     495 non-null    float64
 14  created_at              2516 non-null   

### Representative sample based on week and location

In [5]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_01_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_01_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w01,Acre,5,0.001987
1,2013-w01,Alagoas,17,0.006757
2,2013-w01,Amapa,5,0.001987
3,2013-w01,Amazonas,15,0.005962
4,2013-w01,Bahia,55,0.021860
...,...,...,...,...
104,2013-w05,Rondonia,3,0.001192
105,2013-w05,Santa Catarina,12,0.004769
106,2013-w05,Sao Paulo,84,0.033386
107,2013-w05,Sergipe,9,0.003577


In [6]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w01,Acre,5,0.001987,4
1,2013-w01,Alagoas,17,0.006757,16
2,2013-w01,Amapa,5,0.001987,4
3,2013-w01,Amazonas,15,0.005962,14
4,2013-w01,Bahia,55,0.021860,53
...,...,...,...,...,...
104,2013-w05,Rondonia,3,0.001192,2
105,2013-w05,Santa Catarina,12,0.004769,11
106,2013-w05,Sao Paulo,84,0.033386,81
107,2013-w05,Sergipe,9,0.003577,8


In [7]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_01_pos_sample_size = pd.merge(df_2013_01_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [8]:
# Step 6: Perform the stratified sampling
#df_2013_01_pos_sample = df_2013_01_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_01_pos_sample = df_2013_01_pos
# Step 7: Check the result
print(f"Sampled {len(df_2013_01_pos_sample)} rows.")


Sampled 2516 rows.


In [9]:
text_gpt_2013_01_pos = '\n'.join(df_2013_01_pos_sample['text'].astype(str))

### Prompts and loops

In [10]:
prompt_1 = prompt_raw.replace('TWEETS', text_gpt_2013_01_pos)

In [11]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_1}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [12]:
response.choices[0].message.content

"Spanish scientists have developed a vaccine that temporarily controls HIV and another that uses the virus from infected patients. Vaccination against HPV is now recommended for both genders to prevent anal cancer, while a therapeutic vaccine for Alzheimer's is also being researched. Additionally, a vaccine for addiction to cocaine is expected to be available in three years."

In [13]:
summary = []
dataset = []

In [14]:
dataset.append("2013_01_positive")

In [15]:
summary.append(response.choices[0].message.content)

In [16]:
df_2013_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  1 non-null      object
 1   summary  1 non-null      object
dtypes: object(2)
memory usage: 144.0+ bytes


In [17]:
df_2013_pos.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_positive,Spanish scientists have developed a vaccine th...


In [18]:
response

<OpenAIObject chat.completion id=chatcmpl-AUejBGQd2dHIW3E62cGJUWOUuySrU at 0x1f5c8270c70> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Spanish scientists have developed a vaccine that temporarily controls HIV and another that uses the virus from infected patients. Vaccination against HPV is now recommended for both genders to prevent anal cancer, while a therapeutic vaccine for Alzheimer's is also being researched. Additionally, a vaccine for addiction to cocaine is expected to be available in three years.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869761,
  "id": "chatcmpl-AUejBGQd2dHIW3E62cGJUWOUuySrU",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 63,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_token

## February

In [19]:
# Get monthly data
df_2013_02_pos = pd.read_csv('../data/local/df_pos_2013-02.csv').reset_index()
df_2013_02_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279 entries, 0 to 278
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   279 non-null    int64  
 1   LocationCode            279 non-null    object 
 2   LocationName            279 non-null    object 
 3   StatesCode              279 non-null    object 
 4   ...4                    279 non-null    int64  
 5   id                      279 non-null    float64
 6   text                    279 non-null    object 
 7   label                   279 non-null    object 
 8   in_reply_to_status_id   17 non-null     float64
 9   in_reply_to_user_id     19 non-null     float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       72 non-null     float64
 13  retweeted_status_id     72 non-null     float64
 14  created_at              279 non-null    ob

### Representative sample based on week and location

In [20]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_02_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_02_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w05,Acre,3,0.010753
1,2013-w05,Alagoas,1,0.003584
2,2013-w05,Amazonas,4,0.014337
3,2013-w05,Bahia,6,0.021505
4,2013-w05,Ceara,5,0.017921
5,2013-w05,Distrito Federal,6,0.021505
6,2013-w05,Espirito Santo,2,0.007168
7,2013-w05,Goias,8,0.028674
8,2013-w05,Maranhao,4,0.014337
9,2013-w05,Mato Grosso,1,0.003584


In [21]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w05,Acre,3,0.010753,26
1,2013-w05,Alagoas,1,0.003584,8
2,2013-w05,Amazonas,4,0.014337,35
3,2013-w05,Bahia,6,0.021505,52
4,2013-w05,Ceara,5,0.017921,43
5,2013-w05,Distrito Federal,6,0.021505,52
6,2013-w05,Espirito Santo,2,0.007168,17
7,2013-w05,Goias,8,0.028674,70
8,2013-w05,Maranhao,4,0.014337,35
9,2013-w05,Mato Grosso,1,0.003584,8


In [22]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_02_pos_sample_size = pd.merge(df_2013_02_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [23]:
# Step 6: Perform the stratified sampling
#df_2013_02_pos_sample = df_2013_02_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_02_pos_sample = df_2013_02_pos
# Step 7: Check the result
print(f"Sampled {len(df_2013_02_pos_sample)} rows.")


Sampled 279 rows.


In [24]:
text_gpt_2013_02_pos = '\n'.join(df_2013_02_pos_sample['text'].astype(str))

### Prompts and loops

In [25]:
prompt_2 = prompt_raw.replace('TWEETS', text_gpt_2013_02_pos)

In [26]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_2}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [27]:
response.choices[0].message.content

"The text discusses various vaccination initiatives and developments. SESI plans to vaccinate over 900,000 workers against the flu. A new HIV vaccine is being tested on humans in France, alongside Cuba's recently developed lung cancer vaccine. Additionally, a free HPV vaccine has been approved for women aged 9 to 45."

In [28]:
#summary = []
#dataset = []

In [29]:
dataset.append("2013_02_positive")

In [30]:
summary.append(response.choices[0].message.content)

In [31]:
df_2013_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  2 non-null      object
 1   summary  2 non-null      object
dtypes: object(2)
memory usage: 160.0+ bytes


In [32]:
df_2013_pos.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_positive,Spanish scientists have developed a vaccine th...
1,2013_02_positive,The text discusses various vaccination initiat...


In [33]:
response

<OpenAIObject chat.completion id=chatcmpl-AUejGt6C7DPG6F9tzJnlptr9ft4Gg at 0x1f5c8960950> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination initiatives and developments. SESI plans to vaccinate over 900,000 workers against the flu. A new HIV vaccine is being tested on humans in France, alongside Cuba's recently developed lung cancer vaccine. Additionally, a free HPV vaccine has been approved for women aged 9 to 45.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869766,
  "id": "chatcmpl-AUejGt6C7DPG6F9tzJnlptr9ft4Gg",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 64,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "reasoning_tokens": 0,
      "rejected_predict

## March

In [34]:
# Get monthly data
df_2013_03_pos = pd.read_csv('../data/local/df_pos_2013-03.csv').reset_index()
df_2013_03_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258 entries, 0 to 257
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   258 non-null    int64  
 1   LocationCode            258 non-null    object 
 2   LocationName            258 non-null    object 
 3   StatesCode              258 non-null    object 
 4   ...4                    258 non-null    int64  
 5   id                      258 non-null    float64
 6   text                    258 non-null    object 
 7   label                   258 non-null    object 
 8   in_reply_to_status_id   18 non-null     float64
 9   in_reply_to_user_id     19 non-null     float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       0 non-null      float64
 13  retweeted_status_id     0 non-null      float64
 14  created_at              258 non-null    ob

### Representative sample based on week and location

In [35]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_03_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_03_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w11,Paraiba,1,0.003876
1,2013-w13,Alagoas,3,0.011628
2,2013-w13,Amazonas,11,0.042636
3,2013-w13,Bahia,20,0.077519
4,2013-w13,Ceara,5,0.01938
5,2013-w13,Distrito Federal,10,0.03876
6,2013-w13,Espirito Santo,1,0.003876
7,2013-w13,Goias,4,0.015504
8,2013-w13,Maranhao,12,0.046512
9,2013-w13,Mato Grosso,2,0.007752


In [36]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w11,Paraiba,1,0.003876,9
1,2013-w13,Alagoas,3,0.011628,28
2,2013-w13,Amazonas,11,0.042636,104
3,2013-w13,Bahia,20,0.077519,189
4,2013-w13,Ceara,5,0.01938,47
5,2013-w13,Distrito Federal,10,0.03876,94
6,2013-w13,Espirito Santo,1,0.003876,9
7,2013-w13,Goias,4,0.015504,37
8,2013-w13,Maranhao,12,0.046512,113
9,2013-w13,Mato Grosso,2,0.007752,18


In [37]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_03_pos_sample_size = pd.merge(df_2013_03_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [38]:
# Step 6: Perform the stratified sampling
#df_2013_03_pos_sample = df_2013_03_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_03_pos_sample = df_2013_03_pos
# Step 7: Check the result
print(f"Sampled {len(df_2013_03_pos_sample)} rows.")


Sampled 258 rows.


In [39]:
text_gpt_2013_03_pos = '\n'.join(df_2013_03_pos_sample['text'].astype(str))

### Prompts and loops

In [40]:
prompt_3 = prompt_raw.replace('TWEETS', text_gpt_2013_03_pos)

In [41]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_3}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [42]:
response.choices[0].message.content

'The text discusses various vaccination campaigns in Brazil, particularly targeting the flu, which aims to immunize over 31 million people in April. It also highlights the development of new vaccines, including those for foot-and-mouth disease and synthetic vaccines. Additionally, there are mentions of HPV vaccination and progress toward eradicating polio.'

In [43]:
#summary = []
#dataset = []

In [44]:
dataset.append("2013_03_positive")

In [45]:
summary.append(response.choices[0].message.content)

In [46]:
df_2013_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  3 non-null      object
 1   summary  3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes


In [47]:
df_2013_pos.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_positive,Spanish scientists have developed a vaccine th...
1,2013_02_positive,The text discusses various vaccination initiat...
2,2013_03_positive,The text discusses various vaccination campaig...


In [48]:
response

<OpenAIObject chat.completion id=chatcmpl-AUejIoOtQ9DezCAp50o7BLjT2SEkz at 0x1f5c8805db0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns in Brazil, particularly targeting the flu, which aims to immunize over 31 million people in April. It also highlights the development of new vaccines, including those for foot-and-mouth disease and synthetic vaccines. Additionally, there are mentions of HPV vaccination and progress toward eradicating polio.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869768,
  "id": "chatcmpl-AUejIoOtQ9DezCAp50o7BLjT2SEkz",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 64,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "reaso

## April

In [49]:
# Get monthly data
df_2013_04_pos = pd.read_csv('../data/local/df_pos_2013-04.csv').reset_index()
df_2013_04_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3431 entries, 0 to 3430
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3431 non-null   int64  
 1   LocationCode            3431 non-null   object 
 2   LocationName            3431 non-null   object 
 3   StatesCode              3431 non-null   object 
 4   ...4                    3431 non-null   int64  
 5   id                      3431 non-null   float64
 6   text                    3431 non-null   object 
 7   label                   3431 non-null   object 
 8   in_reply_to_status_id   179 non-null    float64
 9   in_reply_to_user_id     251 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       0 non-null      float64
 13  retweeted_status_id     0 non-null      float64
 14  created_at              3431 non-null   

### Representative sample based on week and location

In [50]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_04_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_04_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w13,Alagoas,4,0.001166
1,2013-w13,Amapa,1,0.000291
2,2013-w13,Amazonas,4,0.001166
3,2013-w13,Bahia,6,0.001749
4,2013-w13,Ceara,3,0.000874
...,...,...,...,...
133,2013-w18,Rio Grande Do Sul,11,0.003206
134,2013-w18,Santa Catarina,5,0.001457
135,2013-w18,Sao Paulo,19,0.005538
136,2013-w18,Sergipe,1,0.000291


In [51]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w13,Alagoas,4,0.001166,2
1,2013-w13,Amapa,1,0.000291,0
2,2013-w13,Amazonas,4,0.001166,2
3,2013-w13,Bahia,6,0.001749,4
4,2013-w13,Ceara,3,0.000874,2
...,...,...,...,...,...
133,2013-w18,Rio Grande Do Sul,11,0.003206,7
134,2013-w18,Santa Catarina,5,0.001457,3
135,2013-w18,Sao Paulo,19,0.005538,13
136,2013-w18,Sergipe,1,0.000291,0


In [52]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_04_pos_sample_size = pd.merge(df_2013_04_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [53]:
# Step 6: Perform the stratified sampling
#df_2013_04_pos_sample = df_2013_04_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_04_pos_sample = df_2013_04_pos
# Step 7: Check the result
print(f"Sampled {len(df_2013_04_pos_sample)} rows.")


Sampled 3431 rows.


In [54]:
text_gpt_2013_04_pos = '\n'.join(df_2013_04_pos_sample['text'].astype(str))

### Prompts and loops

In [55]:
prompt_4 = prompt_raw.replace('TWEETS', text_gpt_2013_04_pos)

In [56]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_4}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [57]:
response.choices[0].message.content

'Recent news highlights significant advancements in vaccine research, particularly against AIDS, where scientists are making promising strides. Additionally, there are ongoing vaccination campaigns across Brazil for various diseases, including rabies and influenza, with specific emphasis on protecting vulnerable groups such as the elderly, children, and pregnant women. The HPV vaccine is also being integrated into national health initiatives. These efforts reflect a broader commitment to public health and disease prevention through vaccination.'

In [58]:
#summary = []
#dataset = []

In [59]:
dataset.append("2013_04_positive")

In [60]:
summary.append(response.choices[0].message.content)

In [61]:
df_2013_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  4 non-null      object
 1   summary  4 non-null      object
dtypes: object(2)
memory usage: 192.0+ bytes


In [62]:
df_2013_pos.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_positive,Spanish scientists have developed a vaccine th...
1,2013_02_positive,The text discusses various vaccination initiat...
2,2013_03_positive,The text discusses various vaccination campaig...
3,2013_04_positive,Recent news highlights significant advancement...


In [63]:
response

<OpenAIObject chat.completion id=chatcmpl-AUejPKMBkr3pyc543xaukstZb1aBP at 0x1f5c85ab860> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Recent news highlights significant advancements in vaccine research, particularly against AIDS, where scientists are making promising strides. Additionally, there are ongoing vaccination campaigns across Brazil for various diseases, including rabies and influenza, with specific emphasis on protecting vulnerable groups such as the elderly, children, and pregnant women. The HPV vaccine is also being integrated into national health initiatives. These efforts reflect a broader commitment to public health and disease prevention through vaccination.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869775,
  "id": "chatcmpl-AUejPKMBkr3pyc543xaukstZb1aBP",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  

## May

In [64]:
# Get monthly data
df_2013_05_pos = pd.read_csv('../data/local/df_pos_2013-05.csv').reset_index()
df_2013_05_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2297 entries, 0 to 2296
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2297 non-null   int64  
 1   LocationCode            2297 non-null   object 
 2   LocationName            2297 non-null   object 
 3   StatesCode              2297 non-null   object 
 4   ...4                    2297 non-null   int64  
 5   id                      2297 non-null   float64
 6   text                    2297 non-null   object 
 7   label                   2297 non-null   object 
 8   in_reply_to_status_id   170 non-null    float64
 9   in_reply_to_user_id     206 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       0 non-null      float64
 13  retweeted_status_id     0 non-null      float64
 14  created_at              2297 non-null   

### Representative sample based on week and location

In [65]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_05_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_05_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w18,Acre,2,0.000871
1,2013-w18,Alagoas,1,0.000435
2,2013-w18,Amapa,3,0.001306
3,2013-w18,Amazonas,2,0.000871
4,2013-w18,Bahia,19,0.008272
...,...,...,...,...
112,2013-w22,Rio Grande Do Sul,18,0.007836
113,2013-w22,Santa Catarina,11,0.004789
114,2013-w22,Sao Paulo,64,0.027862
115,2013-w22,Sergipe,2,0.000871


In [66]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w18,Acre,2,0.000871,2
1,2013-w18,Alagoas,1,0.000435,1
2,2013-w18,Amapa,3,0.001306,3
3,2013-w18,Amazonas,2,0.000871,2
4,2013-w18,Bahia,19,0.008272,20
...,...,...,...,...,...
112,2013-w22,Rio Grande Do Sul,18,0.007836,19
113,2013-w22,Santa Catarina,11,0.004789,11
114,2013-w22,Sao Paulo,64,0.027862,68
115,2013-w22,Sergipe,2,0.000871,2


In [67]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_05_pos_sample_size = pd.merge(df_2013_05_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [68]:
# Step 6: Perform the stratified sampling
#df_2013_05_pos_sample = df_2013_05_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_05_pos_sample = df_2013_05_pos
# Step 7: Check the result
print(f"Sampled {len(df_2013_05_pos_sample)} rows.")


Sampled 2297 rows.


In [69]:
text_gpt_2013_05_pos = '\n'.join(df_2013_05_pos_sample['text'].astype(str))

### Prompts and loops

In [70]:
prompt_5 = prompt_raw.replace('TWEETS', text_gpt_2013_05_pos)

In [71]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_5}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [72]:
response.choices[0].message.content

'The passages discuss various vaccination campaigns in Brazil, focusing on the importance of vaccinations against diseases like influenza, HPV, and rabies. They highlight the progress in immunization rates, the promotion of free vaccines for specific groups, and the development of new vaccines, including those targeting heroin addiction and malaria.'

In [73]:
#summary = []
#dataset = []

In [74]:
dataset.append("2013_05_positive")

In [75]:
summary.append(response.choices[0].message.content)

In [76]:
df_2013_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  5 non-null      object
 1   summary  5 non-null      object
dtypes: object(2)
memory usage: 208.0+ bytes


In [77]:
df_2013_pos.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_positive,Spanish scientists have developed a vaccine th...
1,2013_02_positive,The text discusses various vaccination initiat...
2,2013_03_positive,The text discusses various vaccination campaig...
3,2013_04_positive,Recent news highlights significant advancement...
4,2013_05_positive,The passages discuss various vaccination campa...


In [78]:
response

<OpenAIObject chat.completion id=chatcmpl-AUejYQqK9nxicOyC8alXua8LAauDw at 0x1f5c85b4360> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The passages discuss various vaccination campaigns in Brazil, focusing on the importance of vaccinations against diseases like influenza, HPV, and rabies. They highlight the progress in immunization rates, the promotion of free vaccines for specific groups, and the development of new vaccines, including those targeting heroin addiction and malaria.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869784,
  "id": "chatcmpl-AUejYQqK9nxicOyC8alXua8LAauDw",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 59,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "reasoning_to

## June

In [79]:
# Get monthly data
df_2013_06_pos = pd.read_csv('../data/local/df_pos_2013-06.csv').reset_index()
df_2013_06_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4607 entries, 0 to 4606
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   4607 non-null   int64  
 1   LocationCode            4607 non-null   object 
 2   LocationName            4607 non-null   object 
 3   StatesCode              4607 non-null   object 
 4   ...4                    4607 non-null   int64  
 5   id                      4607 non-null   float64
 6   text                    4607 non-null   object 
 7   label                   4607 non-null   object 
 8   in_reply_to_status_id   177 non-null    float64
 9   in_reply_to_user_id     280 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       1023 non-null   float64
 13  retweeted_status_id     1023 non-null   float64
 14  created_at              4607 non-null   

### Representative sample based on week and location

In [80]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_06_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_06_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w22,Alagoas,3,0.000651
1,2013-w22,Amazonas,11,0.002388
2,2013-w22,Bahia,20,0.004341
3,2013-w22,Ceara,10,0.002171
4,2013-w22,Distrito Federal,11,0.002388
...,...,...,...,...
113,2013-w26,Roraima,2,0.000434
114,2013-w26,Santa Catarina,20,0.004341
115,2013-w26,Sao Paulo,137,0.029737
116,2013-w26,Sergipe,6,0.001302


In [81]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w22,Alagoas,3,0.000651,1
1,2013-w22,Amazonas,11,0.002388,5
2,2013-w22,Bahia,20,0.004341,10
3,2013-w22,Ceara,10,0.002171,5
4,2013-w22,Distrito Federal,11,0.002388,5
...,...,...,...,...,...
113,2013-w26,Roraima,2,0.000434,1
114,2013-w26,Santa Catarina,20,0.004341,10
115,2013-w26,Sao Paulo,137,0.029737,72
116,2013-w26,Sergipe,6,0.001302,3


In [82]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_06_pos_sample_size = pd.merge(df_2013_06_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [83]:
# Step 6: Perform the stratified sampling
#df_2013_06_pos_sample = df_2013_06_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_06_pos_sample = df_2013_06_pos
# Step 7: Check the result
print(f"Sampled {len(df_2013_06_pos_sample)} rows.")


Sampled 4607 rows.


In [84]:
text_gpt_2013_06_pos = '\n'.join(df_2013_06_pos_sample['text'].astype(str))

### Prompts and loops

In [85]:
prompt_6 = prompt_raw.replace('TWEETS', text_gpt_2013_06_pos)

In [86]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_6}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [87]:
response.choices[0].message.content

'The text discusses the launch of a national vaccination campaign in Brazil against poliomyelitis (also known as polio or infantile paralysis), which begins on a specific Saturday and aims to vaccinate millions of children aged 6 months to under 5 years. Various municipalities have set goals for vaccination coverage, and there are reminders for parents to participate by taking their children to vaccination posts. Additionally, mentions of a new vaccine for lung cancer developed by Argentina and Cuba are included, highlighting advancements in medical science.'

In [88]:
#summary = []
#dataset = []

In [89]:
dataset.append("2013_06_positive")

In [90]:
summary.append(response.choices[0].message.content)

In [91]:
df_2013_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  6 non-null      object
 1   summary  6 non-null      object
dtypes: object(2)
memory usage: 224.0+ bytes


In [92]:
df_2013_pos.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_positive,Spanish scientists have developed a vaccine th...
1,2013_02_positive,The text discusses various vaccination initiat...
2,2013_03_positive,The text discusses various vaccination campaig...
3,2013_04_positive,Recent news highlights significant advancement...
4,2013_05_positive,The passages discuss various vaccination campa...
5,2013_06_positive,The text discusses the launch of a national va...


In [93]:
response

<OpenAIObject chat.completion id=chatcmpl-AUejhYiiBLPqbgITCDQIPnZaOuROf at 0x1f5c8b534a0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses the launch of a national vaccination campaign in Brazil against poliomyelitis (also known as polio or infantile paralysis), which begins on a specific Saturday and aims to vaccinate millions of children aged 6 months to under 5 years. Various municipalities have set goals for vaccination coverage, and there are reminders for parents to participate by taking their children to vaccination posts. Additionally, mentions of a new vaccine for lung cancer developed by Argentina and Cuba are included, highlighting advancements in medical science.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869793,
  "id": "chatcmpl-AUejhYiiBLPqbgITCDQIPnZaOuROf",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.c

## July

In [94]:
# Get monthly data
df_2013_07_pos = pd.read_csv('../data/local/df_pos_2013-07.csv').reset_index()
df_2013_07_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2528 entries, 0 to 2527
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2528 non-null   int64  
 1   LocationCode            2528 non-null   object 
 2   LocationName            2528 non-null   object 
 3   StatesCode              2528 non-null   object 
 4   ...4                    2528 non-null   int64  
 5   id                      2528 non-null   float64
 6   text                    2528 non-null   object 
 7   label                   2528 non-null   object 
 8   in_reply_to_status_id   124 non-null    float64
 9   in_reply_to_user_id     136 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       621 non-null    float64
 13  retweeted_status_id     621 non-null    float64
 14  created_at              2528 non-null   

### Representative sample based on week and location

In [95]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_07_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_07_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w26,Acre,1,0.000396
1,2013-w26,Alagoas,5,0.001978
2,2013-w26,Amapa,1,0.000396
3,2013-w26,Amazonas,4,0.001582
4,2013-w26,Bahia,20,0.007911
...,...,...,...,...
128,2013-w31,Parana,4,0.001582
129,2013-w31,Rio Grande Do Sul,5,0.001978
130,2013-w31,Santa Catarina,2,0.000791
131,2013-w31,Sao Paulo,27,0.010680


In [96]:
# Step 3: Define the target sample size
#target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
#group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
#group_sizes
# Include all samples since there are not many tweets

In [97]:
# Step 5: Merge this sample size information back to the original DataFrame
#df_2013_07_pos_sample_size = pd.merge(df_2013_07_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
 #                              on=['year_week', 'LocationName'], how='left')

In [98]:
# Step 6: Perform the stratified sampling
#df_2013_07_pos_sample = df_2013_07_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
#    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_07_pos_sample = df_2013_07_pos
# Step 7: Check the result
print(f"Sampled {len(df_2013_07_pos_sample)} rows.")


Sampled 2528 rows.


In [99]:
text_gpt_2013_07_pos = '\n'.join(df_2013_07_pos_sample['text'].astype(str))

### Prompts and loops

In [100]:
prompt_7 = prompt_raw.replace('TWEETS', text_gpt_2013_07_pos)

In [101]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_7}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [102]:
response.choices[0].message.content

'Starting in 2014, the Brazilian public health system (SUS) will offer free HPV vaccinations to girls aged 10 and 11. This initiative aims to immunize over 17,000 girls in Acre and more than 81,000 in Amazonas. The vaccine protects against cervical cancer and related diseases. Additionally, the age limit for hepatitis B vaccination has been raised to 49 years.'

In [103]:
#summary = []
#dataset = []

In [104]:
dataset.append("2013_07_positive")

In [105]:
summary.append(response.choices[0].message.content)

In [106]:
df_2013_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  7 non-null      object
 1   summary  7 non-null      object
dtypes: object(2)
memory usage: 240.0+ bytes


In [107]:
df_2013_pos.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_positive,Spanish scientists have developed a vaccine th...
1,2013_02_positive,The text discusses various vaccination initiat...
2,2013_03_positive,The text discusses various vaccination campaig...
3,2013_04_positive,Recent news highlights significant advancement...
4,2013_05_positive,The passages discuss various vaccination campa...
5,2013_06_positive,The text discusses the launch of a national va...
6,2013_07_positive,"Starting in 2014, the Brazilian public health ..."


In [108]:
response

<OpenAIObject chat.completion id=chatcmpl-AUejyNGefLMhL3qFF4YpL0CzA526L at 0x1f5c8b3f180> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Starting in 2014, the Brazilian public health system (SUS) will offer free HPV vaccinations to girls aged 10 and 11. This initiative aims to immunize over 17,000 girls in Acre and more than 81,000 in Amazonas. The vaccine protects against cervical cancer and related diseases. Additionally, the age limit for hepatitis B vaccination has been raised to 49 years.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869810,
  "id": "chatcmpl-AUejyNGefLMhL3qFF4YpL0CzA526L",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 80,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "r

## August

In [109]:
# Get monthly data
df_2013_08_pos = pd.read_csv('../data/local/df_pos_2013-08.csv').reset_index()
df_2013_08_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3577 entries, 0 to 3576
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3577 non-null   int64  
 1   LocationCode            3577 non-null   object 
 2   LocationName            3577 non-null   object 
 3   StatesCode              3577 non-null   object 
 4   ...4                    3577 non-null   int64  
 5   id                      3577 non-null   float64
 6   text                    3577 non-null   object 
 7   label                   3577 non-null   object 
 8   in_reply_to_status_id   130 non-null    float64
 9   in_reply_to_user_id     193 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       855 non-null    float64
 13  retweeted_status_id     855 non-null    float64
 14  created_at              3577 non-null   

### Representative sample based on week and location

In [110]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_08_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_08_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w31,Alagoas,9,0.002516
1,2013-w31,Amapa,1,0.000280
2,2013-w31,Amazonas,19,0.005312
3,2013-w31,Bahia,26,0.007269
4,2013-w31,Ceara,22,0.006150
...,...,...,...,...
112,2013-w35,Rio Grande Do Sul,38,0.010623
113,2013-w35,Santa Catarina,5,0.001398
114,2013-w35,Sao Paulo,114,0.031870
115,2013-w35,Sergipe,5,0.001398


In [111]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w31,Alagoas,9,0.002516,6
1,2013-w31,Amapa,1,0.000280,0
2,2013-w31,Amazonas,19,0.005312,13
3,2013-w31,Bahia,26,0.007269,17
4,2013-w31,Ceara,22,0.006150,15
...,...,...,...,...,...
112,2013-w35,Rio Grande Do Sul,38,0.010623,26
113,2013-w35,Santa Catarina,5,0.001398,3
114,2013-w35,Sao Paulo,114,0.031870,78
115,2013-w35,Sergipe,5,0.001398,3


In [112]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_08_pos_sample_size = pd.merge(df_2013_08_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [113]:
# Step 6: Perform the stratified sampling
#df_2013_08_pos_sample = df_2013_08_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_08_pos_sample = df_2013_08_pos
# Step 7: Check the result
print(f"Sampled {len(df_2013_08_pos_sample)} rows.")


Sampled 3577 rows.


In [114]:
text_gpt_2013_08_pos = '\n'.join(df_2013_08_pos_sample['text'].astype(str))

### Prompts and loops

In [115]:
prompt_8 = prompt_raw.replace('TWEETS', text_gpt_2013_08_pos)

In [116]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_8}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [117]:
response.choices[0].message.content

"The sentences discuss various vaccination campaigns in Brazil, including updates for children's vaccination records, new vaccines being tested (such as for dengue and AIDS), and community efforts to promote vaccination. Specific campaigns target young children for routine immunizations and include initiatives for animal vaccinations. The importance of vaccines in preventing diseases is emphasized, alongside recent advancements in vaccine development."

In [118]:
#summary = []
#dataset = []

In [119]:
dataset.append("2013_08_positive")

In [120]:
summary.append(response.choices[0].message.content)

In [121]:
df_2013_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  8 non-null      object
 1   summary  8 non-null      object
dtypes: object(2)
memory usage: 256.0+ bytes


In [122]:
df_2013_pos.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_positive,Spanish scientists have developed a vaccine th...
1,2013_02_positive,The text discusses various vaccination initiat...
2,2013_03_positive,The text discusses various vaccination campaig...
3,2013_04_positive,Recent news highlights significant advancement...
4,2013_05_positive,The passages discuss various vaccination campa...
5,2013_06_positive,The text discusses the launch of a national va...
6,2013_07_positive,"Starting in 2014, the Brazilian public health ..."
7,2013_08_positive,The sentences discuss various vaccination camp...


In [123]:
response

<OpenAIObject chat.completion id=chatcmpl-AUekAJgMtb2e8Dq2McGdogKQmXOh8 at 0x1f5c8fc1a90> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The sentences discuss various vaccination campaigns in Brazil, including updates for children's vaccination records, new vaccines being tested (such as for dengue and AIDS), and community efforts to promote vaccination. Specific campaigns target young children for routine immunizations and include initiatives for animal vaccinations. The importance of vaccines in preventing diseases is emphasized, alongside recent advancements in vaccine development.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869822,
  "id": "chatcmpl-AUekAJgMtb2e8Dq2McGdogKQmXOh8",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 68,
    "completio

## September

In [124]:
# Get monthly data
df_2013_09_pos = pd.read_csv('../data/local/df_pos_2013-09.csv').reset_index()
df_2013_09_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2963 entries, 0 to 2962
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2963 non-null   int64  
 1   LocationCode            2963 non-null   object 
 2   LocationName            2963 non-null   object 
 3   StatesCode              2963 non-null   object 
 4   ...4                    2963 non-null   int64  
 5   id                      2963 non-null   float64
 6   text                    2963 non-null   object 
 7   label                   2963 non-null   object 
 8   in_reply_to_status_id   96 non-null     float64
 9   in_reply_to_user_id     109 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       675 non-null    float64
 13  retweeted_status_id     675 non-null    float64
 14  created_at              2963 non-null   

### Representative sample based on week and location

In [125]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_09_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_09_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w35,Alagoas,3,0.001012
1,2013-w35,Amazonas,3,0.001012
2,2013-w35,Bahia,8,0.002700
3,2013-w35,Ceara,5,0.001687
4,2013-w35,Distrito Federal,5,0.001687
...,...,...,...,...
111,2013-w39,Roraima,3,0.001012
112,2013-w39,Santa Catarina,4,0.001350
113,2013-w39,Sao Paulo,100,0.033750
114,2013-w39,Sergipe,11,0.003712


In [126]:
# Step 3: Define the target sample size
#target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
#group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
#group_sizes
# Taking all tweets since there were not enough for sampling

In [127]:
# Step 5: Merge this sample size information back to the original DataFrame
#df_2013_09_pos_sample_size = pd.merge(df_2013_09_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
 #                              on=['year_week', 'LocationName'], how='left')

In [128]:
# Step 6: Perform the stratified sampling
#df_2013_09_pos_sample = df_2013_09_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_09_pos_sample = df_2013_09_pos
# Step 7: Check the result
print(f"Sampled {len(df_2013_09_pos_sample)} rows.")


Sampled 2963 rows.


In [129]:
text_gpt_2013_09_pos = '\n'.join(df_2013_09_pos_sample['text'].astype(str))

### Prompts and loops

In [130]:
prompt_9 = prompt_raw.replace('TWEETS', text_gpt_2013_09_pos)

In [131]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_9}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [132]:
response.choices[0].message.content

"The text discusses the recent introduction of a varicella (chickenpox) vaccine into Brazil's public healthcare system (SUS), available for children aged 15 months. This vaccine is part of the national vaccination schedule, which aims to enhance public health by preventing chickenpox along with other diseases."

In [133]:
#summary = []
#dataset = []

In [134]:
dataset.append("2013_09_positive")

In [135]:
summary.append(response.choices[0].message.content)

In [136]:
df_2013_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  9 non-null      object
 1   summary  9 non-null      object
dtypes: object(2)
memory usage: 272.0+ bytes


In [137]:
df_2013_pos.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_positive,Spanish scientists have developed a vaccine th...
1,2013_02_positive,The text discusses various vaccination initiat...
2,2013_03_positive,The text discusses various vaccination campaig...
3,2013_04_positive,Recent news highlights significant advancement...
4,2013_05_positive,The passages discuss various vaccination campa...
5,2013_06_positive,The text discusses the launch of a national va...
6,2013_07_positive,"Starting in 2014, the Brazilian public health ..."
7,2013_08_positive,The sentences discuss various vaccination camp...
8,2013_09_positive,The text discusses the recent introduction of ...


In [138]:
response

<OpenAIObject chat.completion id=chatcmpl-AUekL85krp6HFZ5gCXQUENmFOeJkz at 0x1f5c91e4630> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses the recent introduction of a varicella (chickenpox) vaccine into Brazil's public healthcare system (SUS), available for children aged 15 months. This vaccine is part of the national vaccination schedule, which aims to enhance public health by preventing chickenpox along with other diseases.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869833,
  "id": "chatcmpl-AUekL85krp6HFZ5gCXQUENmFOeJkz",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 60,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "reasoning_tokens": 0,
      "rejected_prediction_tok

## October

In [139]:
# Get monthly data
df_2013_10_pos = pd.read_csv('../data/local/df_pos_2013-10.csv').reset_index()
df_2013_10_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1833 entries, 0 to 1832
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1833 non-null   int64  
 1   LocationCode            1833 non-null   object 
 2   LocationName            1833 non-null   object 
 3   StatesCode              1833 non-null   object 
 4   ...4                    1833 non-null   int64  
 5   id                      1833 non-null   float64
 6   text                    1833 non-null   object 
 7   label                   1833 non-null   object 
 8   in_reply_to_status_id   68 non-null     float64
 9   in_reply_to_user_id     86 non-null     float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       430 non-null    float64
 13  retweeted_status_id     430 non-null    float64
 14  created_at              1833 non-null   

### Representative sample based on week and location

In [140]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_10_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_10_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w40,Acre,2,0.001091
1,2013-w40,Alagoas,6,0.003273
2,2013-w40,Amazonas,8,0.004364
3,2013-w40,Bahia,34,0.018549
4,2013-w40,Ceara,13,0.007092
...,...,...,...,...
106,2013-w44,Piaui,1,0.000546
107,2013-w44,Rio Grande Do Sul,17,0.009274
108,2013-w44,Santa Catarina,5,0.002728
109,2013-w44,Sao Paulo,53,0.028914


In [141]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w40,Acre,2,0.001091,2
1,2013-w40,Alagoas,6,0.003273,8
2,2013-w40,Amazonas,8,0.004364,10
3,2013-w40,Bahia,34,0.018549,45
4,2013-w40,Ceara,13,0.007092,17
...,...,...,...,...,...
106,2013-w44,Piaui,1,0.000546,1
107,2013-w44,Rio Grande Do Sul,17,0.009274,22
108,2013-w44,Santa Catarina,5,0.002728,6
109,2013-w44,Sao Paulo,53,0.028914,70


In [142]:
# Step 5: Merge this sample size information back to the original DataFrame
#df_2013_10_pos_sample_size = pd.merge(df_2013_10_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
 #                              on=['year_week', 'LocationName'], how='left')
# Taking all tweets for not having enough tweets for sampling

In [143]:
# Step 6: Perform the stratified sampling
#df_2013_10_pos_sample = df_2013_10_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_10_pos_sample = df_2013_10_pos
# Step 7: Check the result
print(f"Sampled {len(df_2013_10_pos_sample)} rows.")


Sampled 1833 rows.


In [144]:
text_gpt_2013_10_pos = '\n'.join(df_2013_10_pos_sample['text'].astype(str))

### Prompts and loops

In [145]:
prompt_10 = prompt_raw.replace('TWEETS', text_gpt_2013_10_pos)

In [146]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_10}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [147]:
response.choices[0].message.content

'The text discusses various aspects of vaccination in Brazil, highlighting initiatives such as the National Vaccination Day, the introduction of new vaccines, particularly against HPV, dengue, and prostate cancer, as well as the importance of maintaining vaccination schedules for both children and adults. It emphasizes the role of vaccines in disease prevention and public health.'

In [148]:
#summary = []
#dataset = []

In [149]:
dataset.append("2013_10_positive")

In [150]:
summary.append(response.choices[0].message.content)

In [151]:
df_2013_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  10 non-null     object
 1   summary  10 non-null     object
dtypes: object(2)
memory usage: 288.0+ bytes


In [152]:
df_2013_pos.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_positive,Spanish scientists have developed a vaccine th...
1,2013_02_positive,The text discusses various vaccination initiat...
2,2013_03_positive,The text discusses various vaccination campaig...
3,2013_04_positive,Recent news highlights significant advancement...
4,2013_05_positive,The passages discuss various vaccination campa...
5,2013_06_positive,The text discusses the launch of a national va...
6,2013_07_positive,"Starting in 2014, the Brazilian public health ..."
7,2013_08_positive,The sentences discuss various vaccination camp...
8,2013_09_positive,The text discusses the recent introduction of ...
9,2013_10_positive,The text discusses various aspects of vaccinat...


In [153]:
response

<OpenAIObject chat.completion id=chatcmpl-AUekSbJg9rw0xFhEzZ0IkPaFOF1Od at 0x1f5c91e4b80> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various aspects of vaccination in Brazil, highlighting initiatives such as the National Vaccination Day, the introduction of new vaccines, particularly against HPV, dengue, and prostate cancer, as well as the importance of maintaining vaccination schedules for both children and adults. It emphasizes the role of vaccines in disease prevention and public health.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869840,
  "id": "chatcmpl-AUekSbJg9rw0xFhEzZ0IkPaFOF1Od",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 64,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_

## November

In [154]:
# Get monthly data
df_2013_11_pos = pd.read_csv('../data/local/df_pos_2013-11.csv').reset_index()
df_2013_11_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1466 entries, 0 to 1465
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1466 non-null   int64  
 1   LocationCode            1466 non-null   object 
 2   LocationName            1466 non-null   object 
 3   StatesCode              1466 non-null   object 
 4   ...4                    1466 non-null   int64  
 5   id                      1466 non-null   float64
 6   text                    1466 non-null   object 
 7   label                   1466 non-null   object 
 8   in_reply_to_status_id   74 non-null     float64
 9   in_reply_to_user_id     86 non-null     float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       393 non-null    float64
 13  retweeted_status_id     393 non-null    float64
 14  created_at              1466 non-null   

### Representative sample based on week and location

In [155]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_11_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_11_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w44,Acre,1,0.000682
1,2013-w44,Alagoas,4,0.002729
2,2013-w44,Amapa,1,0.000682
3,2013-w44,Amazonas,6,0.004093
4,2013-w44,Bahia,15,0.010232
...,...,...,...,...
102,2013-w48,Pernambuco,12,0.008186
103,2013-w48,Piaui,2,0.001364
104,2013-w48,Rio Grande Do Sul,14,0.009550
105,2013-w48,Rondonia,1,0.000682


In [156]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w44,Acre,1,0.000682,1
1,2013-w44,Alagoas,4,0.002729,6
2,2013-w44,Amapa,1,0.000682,1
3,2013-w44,Amazonas,6,0.004093,10
4,2013-w44,Bahia,15,0.010232,25
...,...,...,...,...,...
102,2013-w48,Pernambuco,12,0.008186,20
103,2013-w48,Piaui,2,0.001364,3
104,2013-w48,Rio Grande Do Sul,14,0.009550,23
105,2013-w48,Rondonia,1,0.000682,1


In [157]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_11_pos_sample_size = pd.merge(df_2013_11_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [158]:
# Step 6: Perform the stratified sampling
#df_2013_11_pos_sample = df_2013_11_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_11_pos_sample = df_2013_11_pos
# Step 7: Check the result
print(f"Sampled {len(df_2013_11_pos_sample)} rows.")


Sampled 1466 rows.


In [159]:
text_gpt_2013_11_pos = '\n'.join(df_2013_11_pos_sample['text'].astype(str))

### Prompts and loops

In [160]:
prompt_11 = prompt_raw.replace('TWEETS', text_gpt_2013_11_pos)

In [161]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_11}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [162]:
response.choices[0].message.content

'The text discusses various advancements and campaigns related to vaccines, including research on a vaccine for HIV, free vaccinations for hepatitis B, and initiatives to vaccinate children against polio. It highlights the importance of vaccination in preventing diseases, the development of new vaccines, and ongoing public health efforts in Brazil and globally.'

In [163]:
#summary = []
#dataset = []

In [164]:
dataset.append("2013_11_positive")

In [165]:
summary.append(response.choices[0].message.content)

In [166]:
df_2013_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  11 non-null     object
 1   summary  11 non-null     object
dtypes: object(2)
memory usage: 304.0+ bytes


In [167]:
df_2013_pos.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_positive,Spanish scientists have developed a vaccine th...
1,2013_02_positive,The text discusses various vaccination initiat...
2,2013_03_positive,The text discusses various vaccination campaig...
3,2013_04_positive,Recent news highlights significant advancement...
4,2013_05_positive,The passages discuss various vaccination campa...
5,2013_06_positive,The text discusses the launch of a national va...
6,2013_07_positive,"Starting in 2014, the Brazilian public health ..."
7,2013_08_positive,The sentences discuss various vaccination camp...
8,2013_09_positive,The text discusses the recent introduction of ...
9,2013_10_positive,The text discusses various aspects of vaccinat...


In [168]:
response

<OpenAIObject chat.completion id=chatcmpl-AUekZJgJbpNojJXcJdiIbX6dx9eHc at 0x1f5c9474ea0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various advancements and campaigns related to vaccines, including research on a vaccine for HIV, free vaccinations for hepatitis B, and initiatives to vaccinate children against polio. It highlights the importance of vaccination in preventing diseases, the development of new vaccines, and ongoing public health efforts in Brazil and globally.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869847,
  "id": "chatcmpl-AUekZJgJbpNojJXcJdiIbX6dx9eHc",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 61,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "

## December

In [169]:
# Get monthly data
df_2013_12_pos = pd.read_csv('../data/local/df_pos_2013-12.csv').reset_index()
df_2013_12_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1359 entries, 0 to 1358
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1359 non-null   int64  
 1   LocationCode            1359 non-null   object 
 2   LocationName            1359 non-null   object 
 3   StatesCode              1359 non-null   object 
 4   ...4                    1359 non-null   int64  
 5   id                      1359 non-null   float64
 6   text                    1359 non-null   object 
 7   label                   1359 non-null   object 
 8   in_reply_to_status_id   70 non-null     float64
 9   in_reply_to_user_id     76 non-null     float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       361 non-null    float64
 13  retweeted_status_id     361 non-null    float64
 14  created_at              1359 non-null   

### Representative sample based on week and location

In [170]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2013_12_pos.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2013_12_pos)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2013-w48,Alagoas,7,0.005151
1,2013-w48,Amapa,1,0.000736
2,2013-w48,Bahia,3,0.002208
3,2013-w48,Ceara,1,0.000736
4,2013-w48,Espirito Santo,1,0.000736
...,...,...,...,...
100,2013-w53,Minas Gerais,7,0.005151
101,2013-w53,Paraiba,2,0.001472
102,2013-w53,Parana,1,0.000736
103,2013-w53,Rio Grande Do Sul,4,0.002943


In [171]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2013-w48,Alagoas,7,0.005151,12
1,2013-w48,Amapa,1,0.000736,1
2,2013-w48,Bahia,3,0.002208,5
3,2013-w48,Ceara,1,0.000736,1
4,2013-w48,Espirito Santo,1,0.000736,1
...,...,...,...,...,...
100,2013-w53,Minas Gerais,7,0.005151,12
101,2013-w53,Paraiba,2,0.001472,3
102,2013-w53,Parana,1,0.000736,1
103,2013-w53,Rio Grande Do Sul,4,0.002943,7


In [172]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2013_12_pos_sample_size = pd.merge(df_2013_12_pos, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [173]:
# Step 6: Perform the stratified sampling
#df_2013_12_pos_sample = df_2013_12_pos_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2013_12_pos_sample = df_2013_12_pos
# Step 7: Check the result
print(f"Sampled {len(df_2013_12_pos_sample)} rows.")


Sampled 1359 rows.


In [174]:
text_gpt_2013_12_pos = '\n'.join(df_2013_12_pos_sample['text'].astype(str))

### Prompts and loops

In [175]:
prompt_12 = prompt_raw.replace('TWEETS', text_gpt_2013_12_pos)

In [176]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_12}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [177]:
response.choices[0].message.content

'The sentences discuss various topics related to vaccination, including the introduction of new vaccines, the importance of vaccination campaigns, and the benefits of vaccines against diseases such as cancer, HPV, and rabies. They emphasize the need for vaccinations in both humans and animals, and highlight recent advancements in vaccine development. Additionally, the phrases imply a metaphorical use of "vaccination" against negativity or envy in social contexts.'

In [178]:
#summary = []
#dataset = []

In [179]:
dataset.append("2013_12_positive")

In [180]:
summary.append(response.choices[0].message.content)

In [181]:
df_2013_pos = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2013_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  12 non-null     object
 1   summary  12 non-null     object
dtypes: object(2)
memory usage: 320.0+ bytes


In [182]:
df_2013_pos.tail(14)

Unnamed: 0,dataset,summary
0,2013_01_positive,Spanish scientists have developed a vaccine th...
1,2013_02_positive,The text discusses various vaccination initiat...
2,2013_03_positive,The text discusses various vaccination campaig...
3,2013_04_positive,Recent news highlights significant advancement...
4,2013_05_positive,The passages discuss various vaccination campa...
5,2013_06_positive,The text discusses the launch of a national va...
6,2013_07_positive,"Starting in 2014, the Brazilian public health ..."
7,2013_08_positive,The sentences discuss various vaccination camp...
8,2013_09_positive,The text discusses the recent introduction of ...
9,2013_10_positive,The text discusses various aspects of vaccinat...


In [183]:
response

<OpenAIObject chat.completion id=chatcmpl-AUekdariqnacx22XcuJNN3W1dL9zg at 0x1f5c9584a40> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The sentences discuss various topics related to vaccination, including the introduction of new vaccines, the importance of vaccination campaigns, and the benefits of vaccines against diseases such as cancer, HPV, and rabies. They emphasize the need for vaccinations in both humans and animals, and highlight recent advancements in vaccine development. Additionally, the phrases imply a metaphorical use of \"vaccination\" against negativity or envy in social contexts.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731869851,
  "id": "chatcmpl-AUekdariqnacx22XcuJNN3W1dL9zg",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 81,


## All 2013 positive

In [184]:
df_2013_pos.to_csv('../data/summary_2013_positive_tweets.csv')