# Import libraries and data

In [1]:
# Import libraries
import os
import openai
import pandas as pd
import numpy as np
import openpyxl

In [2]:
# Get OpenAI API secret key (change this file with your own OpenAI key)
# The file is read inside a `with` block so the handle is always closed, and
# surrounding whitespace (e.g. a trailing newline added by an editor) is
# stripped so it does not end up inside the key.
with open('../data/local/openai_key.txt', 'r') as key_file:
    open_ai_key = key_file.read().strip()
openai.api_key = open_ai_key
#openai.Model.list()

In [3]:
# Prompt template for GPT; the TWEETS placeholder is later replaced by the sampled texts
prompt_raw = (
    "I would like to summarise in maximum 300 characters and in English "
    "what the following sentences in Portuguese talk about. "
    "Do not use aspects mentioned only in one sentence as part of the summary: TWEETS"
)

# 2017 neutral

## January

In [4]:
# Load the January 2017 file; reset_index() turns the existing row labels into an 'index' column
df_2017_01_neu = pd.read_csv('../data/local/df_neu_2017-01.csv')
df_2017_01_neu = df_2017_01_neu.reset_index()
df_2017_01_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6097 entries, 0 to 6096
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   6097 non-null   int64  
 1   LocationCode            6097 non-null   object 
 2   LocationName            6097 non-null   object 
 3   StatesCode              6097 non-null   object 
 4   ...4                    6097 non-null   int64  
 5   id                      6097 non-null   float64
 6   text                    6097 non-null   object 
 7   label                   6097 non-null   object 
 8   in_reply_to_status_id   458 non-null    float64
 9   in_reply_to_user_id     490 non-null    float64
 10  quoted_user_id          101 non-null    float64
 11  quoted_status_id        101 non-null    float64
 12  retweeted_user_id       1200 non-null   float64
 13  retweeted_status_id     1200 non-null   float64
 14  created_at              6097 non-null   

### Representative sample based on week and location

In [5]:
# Step 1: Count the rows in every (year_week, LocationName) stratum
group_sizes = (
    df_2017_01_neu
    .groupby(['year_week', 'LocationName'])
    .size()
    .reset_index(name='group_size')
)

# Step 2: Express each stratum as a share of the month's total row count
total_rows = df_2017_01_neu.shape[0]
group_sizes['proportion'] = group_sizes['group_size'].div(total_rows)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w01,Acre,3,0.000492
1,2017-w01,Alagoas,10,0.001640
2,2017-w01,Amapa,3,0.000492
3,2017-w01,Amazonas,15,0.002460
4,2017-w01,Bahia,66,0.010825
...,...,...,...,...
116,2017-w05,Roraima,3,0.000492
117,2017-w05,Santa Catarina,6,0.000984
118,2017-w05,Sao Paulo,89,0.014597
119,2017-w05,Sergipe,3,0.000492


In [6]:
# Step 3: Overall number of rows to draw for the month
target_sample_size = 2450

# Step 4: Allocate the target proportionally to each stratum, rounded down to whole rows.
# Because of the rounding down, the allocations can add up to less than the target
# and strata whose share is below 1/target_sample_size get 0 rows.
group_sizes['sample_size'] = (
    np.floor(group_sizes['proportion'].mul(target_sample_size)).astype(int)
)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w01,Acre,3,0.000492,1
1,2017-w01,Alagoas,10,0.001640,4
2,2017-w01,Amapa,3,0.000492,1
3,2017-w01,Amazonas,15,0.002460,6
4,2017-w01,Bahia,66,0.010825,26
...,...,...,...,...,...
116,2017-w05,Roraima,3,0.000492,1
117,2017-w05,Santa Catarina,6,0.000984,2
118,2017-w05,Sao Paulo,89,0.014597,35
119,2017-w05,Sergipe,3,0.000492,1


In [7]:
# Step 5: Attach each stratum's allocated sample size to every row of that stratum
df_2017_01_neu_sample_size = df_2017_01_neu.merge(
    group_sizes[['year_week', 'LocationName', 'sample_size']],
    how='left',
    on=['year_week', 'LocationName'],
)

In [8]:
# Step 6: Perform the stratified sampling
# Each stratum contributes its allocated number of rows, drawn with a fixed seed
# so the sample is reproducible. The allocation is capped at the stratum's own
# size: if it were larger (e.g. a month with fewer rows than the target sample
# size), DataFrame.sample() would raise a ValueError because it samples without
# replacement.
df_2017_01_neu_sample = df_2017_01_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(
        n=min(int(group['sample_size'].iloc[0]), len(group)),
        random_state=42,
    )
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_01_neu_sample)} rows.")


Sampled 2393 rows.


In [9]:
# Join the sampled texts, one per line, into a single string for the prompt
text_gpt_2017_01_neu = '\n'.join(
    df_2017_01_neu_sample['text'].astype(str).tolist()
)

### Prompts and loops

In [10]:
# Build the January prompt by substituting the sampled texts for the TWEETS placeholder
prompt_1 = prompt_raw.replace('TWEETS', text_gpt_2017_01_neu)

In [11]:
# Send the January prompt as a single user message and keep the raw API response
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": prompt_1}],
    max_tokens=400,  # upper bound on the length of the reply, in tokens
    temperature=0.7,  # sampling randomness of the reply
)

In [12]:
# Show the summary text returned by the model
response.choices[0].message.content

'The sentences discuss the importance and logistics of vaccinations, particularly focusing on the HPV vaccine being made available for boys aged 12 to 13 in Brazil. There are also mentions of the ongoing vaccination campaigns against yellow fever, with increased public concern leading to higher demand for vaccines. The narrative reflects personal experiences, humor, and societal attitudes towards vaccinations.'

In [13]:
# Accumulators filled month by month: summary texts and their matching dataset labels
summary, dataset = [], []

In [14]:
# Record the label for the January summary (re-running this cell appends the label again)
dataset.append("2017_01_neutral")

In [15]:
# Store the summary text of the latest response (re-running this cell appends it again)
summary.append(response.choices[0].message.content)

In [16]:
# Assemble the label/summary pairs collected so far into a two-column table
df_2017_neu = pd.DataFrame(
    [(label, text) for label, text in zip(dataset, summary)],
    columns=['dataset', 'summary'],
)
df_2017_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  1 non-null      object
 1   summary  1 non-null      object
dtypes: object(2)
memory usage: 144.0+ bytes


In [17]:
# Display the collected summaries (at most the last 14 rows)
df_2017_neu.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_neuitive,The sentences discuss the importance and logis...


In [18]:
# Inspect the full API response object (finish reason, model version, token usage)
response

<OpenAIObject chat.completion id=chatcmpl-AUBbk8u3uhbTPeiNUclxuzrCTXVl1 at 0x2159229bd60> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The sentences discuss the importance and logistics of vaccinations, particularly focusing on the HPV vaccine being made available for boys aged 12 to 13 in Brazil. There are also mentions of the ongoing vaccination campaigns against yellow fever, with increased public concern leading to higher demand for vaccines. The narrative reflects personal experiences, humor, and societal attitudes towards vaccinations.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731757824,
  "id": "chatcmpl-AUBbk8u3uhbTPeiNUclxuzrCTXVl1",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 68,
    "completion_tokens_details": {
      "accepted_predi

## February

In [19]:
# Load the February 2017 file; reset_index() turns the existing row labels into an 'index' column
df_2017_02_neu = pd.read_csv('../data/local/df_neu_2017-02.csv')
df_2017_02_neu = df_2017_02_neu.reset_index()
df_2017_02_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4971 entries, 0 to 4970
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   4971 non-null   int64  
 1   LocationCode            4971 non-null   object 
 2   LocationName            4971 non-null   object 
 3   StatesCode              4971 non-null   object 
 4   ...4                    4971 non-null   int64  
 5   id                      4971 non-null   float64
 6   text                    4971 non-null   object 
 7   label                   4971 non-null   object 
 8   in_reply_to_status_id   367 non-null    float64
 9   in_reply_to_user_id     396 non-null    float64
 10  quoted_user_id          77 non-null     float64
 11  quoted_status_id        77 non-null     float64
 12  retweeted_user_id       921 non-null    float64
 13  retweeted_status_id     921 non-null    float64
 14  created_at              4971 non-null   

### Representative sample based on week and location

In [20]:
# Step 1: Count the rows in every (year_week, LocationName) stratum
group_sizes = (
    df_2017_02_neu
    .groupby(['year_week', 'LocationName'])
    .size()
    .reset_index(name='group_size')
)

# Step 2: Express each stratum as a share of the month's total row count
total_rows = df_2017_02_neu.shape[0]
group_sizes['proportion'] = group_sizes['group_size'].div(total_rows)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w05,Acre,4,0.000805
1,2017-w05,Alagoas,17,0.003420
2,2017-w05,Amapa,2,0.000402
3,2017-w05,Amazonas,14,0.002816
4,2017-w05,Bahia,72,0.014484
...,...,...,...,...
111,2017-w09,Pernambuco,1,0.000201
112,2017-w09,Rio Grande Do Sul,5,0.001006
113,2017-w09,Santa Catarina,1,0.000201
114,2017-w09,Sao Paulo,16,0.003219


In [21]:
# Step 3: Overall number of rows to draw for the month
target_sample_size = 2450

# Step 4: Allocate the target proportionally to each stratum, rounded down to whole rows.
# Because of the rounding down, the allocations can add up to less than the target
# and strata whose share is below 1/target_sample_size get 0 rows.
group_sizes['sample_size'] = (
    np.floor(group_sizes['proportion'].mul(target_sample_size)).astype(int)
)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w05,Acre,4,0.000805,1
1,2017-w05,Alagoas,17,0.003420,8
2,2017-w05,Amapa,2,0.000402,0
3,2017-w05,Amazonas,14,0.002816,6
4,2017-w05,Bahia,72,0.014484,35
...,...,...,...,...,...
111,2017-w09,Pernambuco,1,0.000201,0
112,2017-w09,Rio Grande Do Sul,5,0.001006,2
113,2017-w09,Santa Catarina,1,0.000201,0
114,2017-w09,Sao Paulo,16,0.003219,7


In [22]:
# Step 5: Attach each stratum's allocated sample size to every row of that stratum
df_2017_02_neu_sample_size = df_2017_02_neu.merge(
    group_sizes[['year_week', 'LocationName', 'sample_size']],
    how='left',
    on=['year_week', 'LocationName'],
)

In [23]:
# Step 6: Perform the stratified sampling
# Each stratum contributes its allocated number of rows, drawn with a fixed seed
# so the sample is reproducible. The allocation is capped at the stratum's own
# size: if it were larger (e.g. a month with fewer rows than the target sample
# size), DataFrame.sample() would raise a ValueError because it samples without
# replacement.
df_2017_02_neu_sample = df_2017_02_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(
        n=min(int(group['sample_size'].iloc[0]), len(group)),
        random_state=42,
    )
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_02_neu_sample)} rows.")


Sampled 2377 rows.


In [24]:
# Join the sampled texts, one per line, into a single string for the prompt
text_gpt_2017_02_neu = '\n'.join(
    df_2017_02_neu_sample['text'].astype(str).tolist()
)

### Prompts and loops

In [25]:
# Build the February prompt by substituting the sampled texts for the TWEETS placeholder
prompt_2 = prompt_raw.replace('TWEETS', text_gpt_2017_02_neu)

In [26]:
# Send the February prompt as a single user message and keep the raw API response
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": prompt_2}],
    max_tokens=400,  # upper bound on the length of the reply, in tokens
    temperature=0.7,  # sampling randomness of the reply
)

In [27]:
# Show the summary text returned by the model
response.choices[0].message.content

"The text discusses the importance of vaccination against yellow fever, especially in relation to travel requirements, health recommendations, and the necessity of having proof of vaccination for certain destinations like Panama and Nicaragua. It also highlights the ongoing efforts and challenges in providing vaccines, the public's reactions to vaccination, and the various health guidelines associated with the vaccine, such as contraindications for specific populations."

In [28]:
#summary = []
#dataset = []

In [29]:
# Record the label for the February summary (re-running this cell appends the label again)
dataset.append("2017_02_neutral")

In [30]:
# Store the summary text of the latest response (re-running this cell appends it again)
summary.append(response.choices[0].message.content)

In [31]:
# Assemble the label/summary pairs collected so far into a two-column table
df_2017_neu = pd.DataFrame(
    [(label, text) for label, text in zip(dataset, summary)],
    columns=['dataset', 'summary'],
)
df_2017_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  2 non-null      object
 1   summary  2 non-null      object
dtypes: object(2)
memory usage: 160.0+ bytes


In [32]:
# Display the collected summaries (at most the last 14 rows)
df_2017_neu.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_neuitive,The sentences discuss the importance and logis...
1,2017_02_neuitive,The text discusses the importance of vaccinati...


In [33]:
# Inspect the full API response object (finish reason, model version, token usage)
response

<OpenAIObject chat.completion id=chatcmpl-AUBbpizn4gC1GjatsP9jkpRtoFgRD at 0x21592367d10> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses the importance of vaccination against yellow fever, especially in relation to travel requirements, health recommendations, and the necessity of having proof of vaccination for certain destinations like Panama and Nicaragua. It also highlights the ongoing efforts and challenges in providing vaccines, the public's reactions to vaccination, and the various health guidelines associated with the vaccine, such as contraindications for specific populations.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731757829,
  "id": "chatcmpl-AUBbpizn4gC1GjatsP9jkpRtoFgRD",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens":

## March

In [34]:
# Load the March 2017 file; reset_index() turns the existing row labels into an 'index' column
df_2017_03_neu = pd.read_csv('../data/local/df_neu_2017-03.csv')
df_2017_03_neu = df_2017_03_neu.reset_index()
df_2017_03_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16569 entries, 0 to 16568
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   16569 non-null  int64  
 1   LocationCode            16569 non-null  object 
 2   LocationName            16569 non-null  object 
 3   StatesCode              16569 non-null  object 
 4   ...4                    16569 non-null  int64  
 5   id                      16569 non-null  float64
 6   text                    16569 non-null  object 
 7   label                   16569 non-null  object 
 8   in_reply_to_status_id   743 non-null    float64
 9   in_reply_to_user_id     814 non-null    float64
 10  quoted_user_id          169 non-null    float64
 11  quoted_status_id        169 non-null    float64
 12  retweeted_user_id       7181 non-null   float64
 13  retweeted_status_id     7181 non-null   float64
 14  created_at              16569 non-null

### Representative sample based on week and location

In [35]:
# Step 1: Count the rows in every (year_week, LocationName) stratum
group_sizes = (
    df_2017_03_neu
    .groupby(['year_week', 'LocationName'])
    .size()
    .reset_index(name='group_size')
)

# Step 2: Express each stratum as a share of the month's total row count
total_rows = df_2017_03_neu.shape[0]
group_sizes['proportion'] = group_sizes['group_size'].div(total_rows)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w09,Acre,5,0.000302
1,2017-w09,Alagoas,7,0.000422
2,2017-w09,Amapa,1,0.000060
3,2017-w09,Amazonas,14,0.000845
4,2017-w09,Bahia,38,0.002293
...,...,...,...,...
117,2017-w13,Roraima,24,0.001448
118,2017-w13,Santa Catarina,151,0.009113
119,2017-w13,Sao Paulo,917,0.055344
120,2017-w13,Sergipe,25,0.001509


In [36]:
# Step 3: Overall number of rows to draw for the month
target_sample_size = 2450

# Step 4: Allocate the target proportionally to each stratum, rounded down to whole rows.
# Because of the rounding down, the allocations can add up to less than the target
# and strata whose share is below 1/target_sample_size get 0 rows.
group_sizes['sample_size'] = (
    np.floor(group_sizes['proportion'].mul(target_sample_size)).astype(int)
)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w09,Acre,5,0.000302,0
1,2017-w09,Alagoas,7,0.000422,1
2,2017-w09,Amapa,1,0.000060,0
3,2017-w09,Amazonas,14,0.000845,2
4,2017-w09,Bahia,38,0.002293,5
...,...,...,...,...,...
117,2017-w13,Roraima,24,0.001448,3
118,2017-w13,Santa Catarina,151,0.009113,22
119,2017-w13,Sao Paulo,917,0.055344,135
120,2017-w13,Sergipe,25,0.001509,3


In [37]:
# Step 5: Attach each stratum's allocated sample size to every row of that stratum
df_2017_03_neu_sample_size = df_2017_03_neu.merge(
    group_sizes[['year_week', 'LocationName', 'sample_size']],
    how='left',
    on=['year_week', 'LocationName'],
)

In [38]:
# Step 6: Perform the stratified sampling
# Each stratum contributes its allocated number of rows, drawn with a fixed seed
# so the sample is reproducible. The allocation is capped at the stratum's own
# size: if it were larger (e.g. a month with fewer rows than the target sample
# size), DataFrame.sample() would raise a ValueError because it samples without
# replacement.
df_2017_03_neu_sample = df_2017_03_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(
        n=min(int(group['sample_size'].iloc[0]), len(group)),
        random_state=42,
    )
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_03_neu_sample)} rows.")


Sampled 2394 rows.


In [39]:
# Join the sampled texts, one per line, into a single string for the prompt
text_gpt_2017_03_neu = '\n'.join(
    df_2017_03_neu_sample['text'].astype(str).tolist()
)

### Prompts and loops

In [40]:
# Build the March prompt by substituting the sampled texts for the TWEETS placeholder
prompt_3 = prompt_raw.replace('TWEETS', text_gpt_2017_03_neu)

In [41]:
# Send the March prompt as a single user message and keep the raw API response
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": prompt_3}],
    max_tokens=400,  # upper bound on the length of the reply, in tokens
    temperature=0.7,  # sampling randomness of the reply
)

In [42]:
# Show the summary text returned by the model
response.choices[0].message.content

"The text discusses the ongoing vaccination campaigns in Brazil, particularly focusing on the yellow fever vaccine. It mentions the government's efforts to increase vaccine availability, changes in vaccination schedules, public reactions to vaccination, and the challenges faced, such as long queues and incidents affecting vaccination sites. There are also references to the necessity of vaccination due to health risks and requirements for travel."

In [43]:
#summary = []
#dataset = []

In [44]:
# Record the label for the March summary (re-running this cell appends the label again)
dataset.append("2017_03_neutral")

In [45]:
# Store the summary text of the latest response (re-running this cell appends it again)
summary.append(response.choices[0].message.content)

In [46]:
# Assemble the label/summary pairs collected so far into a two-column table
df_2017_neu = pd.DataFrame(
    [(label, text) for label, text in zip(dataset, summary)],
    columns=['dataset', 'summary'],
)
df_2017_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  3 non-null      object
 1   summary  3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes


In [47]:
# Display the collected summaries (at most the last 14 rows)
df_2017_neu.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_neuitive,The sentences discuss the importance and logis...
1,2017_02_neuitive,The text discusses the importance of vaccinati...
2,2017_03_neuitive,The text discusses the ongoing vaccination cam...


In [48]:
# Inspect the full API response object (finish reason, model version, token usage)
response

<OpenAIObject chat.completion id=chatcmpl-AUBbu68XSv122hLTYKeHpXlBaIEJT at 0x21592384040> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses the ongoing vaccination campaigns in Brazil, particularly focusing on the yellow fever vaccine. It mentions the government's efforts to increase vaccine availability, changes in vaccination schedules, public reactions to vaccination, and the challenges faced, such as long queues and incidents affecting vaccination sites. There are also references to the necessity of vaccination due to health risks and requirements for travel.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731757834,
  "id": "chatcmpl-AUBbu68XSv122hLTYKeHpXlBaIEJT",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_9b78b61c52",
  "usage": {
    "completion_tokens": 71,
    "completion_toke

## April

In [49]:
# Load the April 2017 file; reset_index() turns the existing row labels into an 'index' column
df_2017_04_neu = pd.read_csv('../data/local/df_neu_2017-04.csv')
df_2017_04_neu = df_2017_04_neu.reset_index()
df_2017_04_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11464 entries, 0 to 11463
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   11464 non-null  int64  
 1   LocationCode            11464 non-null  object 
 2   LocationName            11464 non-null  object 
 3   StatesCode              11464 non-null  object 
 4   ...4                    11464 non-null  int64  
 5   id                      11464 non-null  float64
 6   text                    11464 non-null  object 
 7   label                   11464 non-null  object 
 8   in_reply_to_status_id   428 non-null    float64
 9   in_reply_to_user_id     464 non-null    float64
 10  quoted_user_id          141 non-null    float64
 11  quoted_status_id        141 non-null    float64
 12  retweeted_user_id       4449 non-null   float64
 13  retweeted_status_id     4449 non-null   float64
 14  created_at              11464 non-null

### Representative sample based on week and location

In [50]:
# Step 1: Count the rows in every (year_week, LocationName) stratum
group_sizes = (
    df_2017_04_neu
    .groupby(['year_week', 'LocationName'])
    .size()
    .reset_index(name='group_size')
)

# Step 2: Express each stratum as a share of the month's total row count
total_rows = df_2017_04_neu.shape[0]
group_sizes['proportion'] = group_sizes['group_size'].div(total_rows)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w13,Acre,8,0.000698
1,2017-w13,Alagoas,20,0.001745
2,2017-w13,Amapa,9,0.000785
3,2017-w13,Amazonas,41,0.003576
4,2017-w13,Bahia,122,0.010642
...,...,...,...,...
141,2017-w18,Rio Grande Do Sul,13,0.001134
142,2017-w18,Rondonia,1,0.000087
143,2017-w18,Santa Catarina,2,0.000174
144,2017-w18,Sao Paulo,14,0.001221


In [51]:
# Step 3: Overall number of rows to draw for the month
target_sample_size = 2450

# Step 4: Allocate the target proportionally to each stratum, rounded down to whole rows.
# Because of the rounding down, the allocations can add up to less than the target
# and strata whose share is below 1/target_sample_size get 0 rows.
group_sizes['sample_size'] = (
    np.floor(group_sizes['proportion'].mul(target_sample_size)).astype(int)
)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w13,Acre,8,0.000698,1
1,2017-w13,Alagoas,20,0.001745,4
2,2017-w13,Amapa,9,0.000785,1
3,2017-w13,Amazonas,41,0.003576,8
4,2017-w13,Bahia,122,0.010642,26
...,...,...,...,...,...
141,2017-w18,Rio Grande Do Sul,13,0.001134,2
142,2017-w18,Rondonia,1,0.000087,0
143,2017-w18,Santa Catarina,2,0.000174,0
144,2017-w18,Sao Paulo,14,0.001221,2


In [52]:
# Step 5: Attach each stratum's allocated sample size to every row of that stratum
df_2017_04_neu_sample_size = df_2017_04_neu.merge(
    group_sizes[['year_week', 'LocationName', 'sample_size']],
    how='left',
    on=['year_week', 'LocationName'],
)

In [53]:
# Step 6: Perform the stratified sampling
# Each stratum contributes its allocated number of rows, drawn with a fixed seed
# so the sample is reproducible. The allocation is capped at the stratum's own
# size: if it were larger (e.g. a month with fewer rows than the target sample
# size), DataFrame.sample() would raise a ValueError because it samples without
# replacement.
df_2017_04_neu_sample = df_2017_04_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(
        n=min(int(group['sample_size'].iloc[0]), len(group)),
        random_state=42,
    )
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_04_neu_sample)} rows.")


Sampled 2380 rows.


In [54]:
# Join the sampled texts, one per line, into a single string for the prompt
text_gpt_2017_04_neu = '\n'.join(
    df_2017_04_neu_sample['text'].astype(str).tolist()
)

### Prompts and loops

In [55]:
# Build the April prompt by substituting the sampled texts for the TWEETS placeholder
prompt_4 = prompt_raw.replace('TWEETS', text_gpt_2017_04_neu)

In [57]:
# Send the April prompt as a single user message and keep the raw API response
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": prompt_4}],
    max_tokens=400,  # upper bound on the length of the reply, in tokens
    temperature=0.7,  # sampling randomness of the reply
)

In [59]:
# Show the summary text returned by the model
response.choices[0].message.content

"The text discusses the vaccination campaigns against yellow fever and rabies in Brazil, emphasizing the need for immunization, the introduction of a single dose for yellow fever, and the public's mixed feelings about vaccination. It highlights the ongoing efforts to increase vaccination rates amidst public health concerns."

In [60]:
#summary = []
#dataset = []

In [61]:
# Record the label for the April summary (re-running this cell appends the label again)
dataset.append("2017_04_neutral")

In [62]:
# Store the summary text of the latest response (re-running this cell appends it again)
summary.append(response.choices[0].message.content)

In [63]:
# Assemble the label/summary pairs collected so far into a two-column table
df_2017_neu = pd.DataFrame(
    [(label, text) for label, text in zip(dataset, summary)],
    columns=['dataset', 'summary'],
)
df_2017_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  4 non-null      object
 1   summary  4 non-null      object
dtypes: object(2)
memory usage: 192.0+ bytes


In [64]:
# Display the collected summaries (at most the last 14 rows)
df_2017_neu.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_neuitive,The sentences discuss the importance and logis...
1,2017_02_neuitive,The text discusses the importance of vaccinati...
2,2017_03_neuitive,The text discusses the ongoing vaccination cam...
3,2017_04_neuitive,The text discusses the vaccination campaigns a...


In [65]:
# Inspect the full API response object (finish reason, model version, token usage)
response

<OpenAIObject chat.completion id=chatcmpl-AUBg0IvyHvYTr1pafckjGtW9WSv90 at 0x21596e24f40> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses the vaccination campaigns against yellow fever and rabies in Brazil, emphasizing the need for immunization, the introduction of a single dose for yellow fever, and the public's mixed feelings about vaccination. It highlights the ongoing efforts to increase vaccination rates amidst public health concerns.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731758088,
  "id": "chatcmpl-AUBg0IvyHvYTr1pafckjGtW9WSv90",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 55,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "reasoning_tokens": 0,
      "rejected_

## May

In [66]:
# Load the May 2017 file; reset_index() turns the existing row labels into an 'index' column
df_2017_05_neu = pd.read_csv('../data/local/df_neu_2017-05.csv')
df_2017_05_neu = df_2017_05_neu.reset_index()
df_2017_05_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9630 entries, 0 to 9629
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   9630 non-null   int64  
 1   LocationCode            9630 non-null   object 
 2   LocationName            9630 non-null   object 
 3   StatesCode              9630 non-null   object 
 4   ...4                    9630 non-null   int64  
 5   id                      9630 non-null   float64
 6   text                    9630 non-null   object 
 7   label                   9630 non-null   object 
 8   in_reply_to_status_id   467 non-null    float64
 9   in_reply_to_user_id     497 non-null    float64
 10  quoted_user_id          148 non-null    float64
 11  quoted_status_id        148 non-null    float64
 12  retweeted_user_id       1735 non-null   float64
 13  retweeted_status_id     1735 non-null   float64
 14  created_at              9630 non-null   

### Representative sample based on week and location

In [67]:
# Step 1: Count the rows in every (year_week, LocationName) stratum
group_sizes = (
    df_2017_05_neu
    .groupby(['year_week', 'LocationName'])
    .size()
    .reset_index(name='group_size')
)

# Step 2: Express each stratum as a share of the month's total row count
total_rows = df_2017_05_neu.shape[0]
group_sizes['proportion'] = group_sizes['group_size'].div(total_rows)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w18,Acre,3,0.000312
1,2017-w18,Alagoas,28,0.002908
2,2017-w18,Amapa,2,0.000208
3,2017-w18,Amazonas,34,0.003531
4,2017-w18,Bahia,96,0.009969
...,...,...,...,...
120,2017-w22,Roraima,1,0.000104
121,2017-w22,Santa Catarina,26,0.002700
122,2017-w22,Sao Paulo,123,0.012773
123,2017-w22,Sergipe,8,0.000831


In [68]:
# Step 3: Overall number of rows to draw for the month
target_sample_size = 2450

# Step 4: Allocate the target proportionally to each stratum, rounded down to whole rows.
# Because of the rounding down, the allocations can add up to less than the target
# and strata whose share is below 1/target_sample_size get 0 rows.
group_sizes['sample_size'] = (
    np.floor(group_sizes['proportion'].mul(target_sample_size)).astype(int)
)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w18,Acre,3,0.000312,0
1,2017-w18,Alagoas,28,0.002908,7
2,2017-w18,Amapa,2,0.000208,0
3,2017-w18,Amazonas,34,0.003531,8
4,2017-w18,Bahia,96,0.009969,24
...,...,...,...,...,...
120,2017-w22,Roraima,1,0.000104,0
121,2017-w22,Santa Catarina,26,0.002700,6
122,2017-w22,Sao Paulo,123,0.012773,31
123,2017-w22,Sergipe,8,0.000831,2


In [69]:
# Step 5: Attach each stratum's allocated sample size to every row of that stratum
df_2017_05_neu_sample_size = df_2017_05_neu.merge(
    group_sizes[['year_week', 'LocationName', 'sample_size']],
    how='left',
    on=['year_week', 'LocationName'],
)

In [70]:
# Step 6: Perform the stratified sampling
# Each stratum contributes its allocated number of rows, drawn with a fixed seed
# so the sample is reproducible. The allocation is capped at the stratum's own
# size: if it were larger (e.g. a month with fewer rows than the target sample
# size), DataFrame.sample() would raise a ValueError because it samples without
# replacement.
df_2017_05_neu_sample = df_2017_05_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(
        n=min(int(group['sample_size'].iloc[0]), len(group)),
        random_state=42,
    )
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_05_neu_sample)} rows.")


Sampled 2395 rows.


In [71]:
# Join the sampled texts, one per line, into a single string for the prompt
text_gpt_2017_05_neu = '\n'.join(
    df_2017_05_neu_sample['text'].astype(str).tolist()
)

### Prompts and loops

In [72]:
# Build the May prompt by substituting the sampled texts for the TWEETS placeholder
prompt_5 = prompt_raw.replace('TWEETS', text_gpt_2017_05_neu)

In [73]:
# Send the May prompt as a single user message and keep the raw API response
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": prompt_5}],
    max_tokens=400,  # upper bound on the length of the reply, in tokens
    temperature=0.7,  # sampling randomness of the reply
)

In [74]:
# Show the summary text returned by the model
response.choices[0].message.content

'The text discusses the ongoing vaccination campaigns in Brazil, particularly against influenza (the flu) and foot-and-mouth disease (aftosa). It highlights low participation rates in flu vaccinations, with some regions achieving less than 30% coverage. There are ongoing efforts to encourage vaccination among various groups, including children, pregnant women, and livestock. The campaigns have been extended multiple times due to low demand and are crucial for public health.'

In [75]:
#summary = []
#dataset = []

In [76]:
# Record the label for the May summary (re-running this cell appends the label again)
dataset.append("2017_05_neutral")

In [77]:
# Store the summary text of the latest response (re-running this cell appends it again)
summary.append(response.choices[0].message.content)

In [78]:
# Assemble the label/summary pairs collected so far into a two-column table
df_2017_neu = pd.DataFrame(
    [(label, text) for label, text in zip(dataset, summary)],
    columns=['dataset', 'summary'],
)
df_2017_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  5 non-null      object
 1   summary  5 non-null      object
dtypes: object(2)
memory usage: 208.0+ bytes


In [79]:
# Display the collected summaries (at most the last 14 rows)
df_2017_neu.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_neuitive,The sentences discuss the importance and logis...
1,2017_02_neuitive,The text discusses the importance of vaccinati...
2,2017_03_neuitive,The text discusses the ongoing vaccination cam...
3,2017_04_neuitive,The text discusses the vaccination campaigns a...
4,2017_05_neuitive,The text discusses the ongoing vaccination cam...


In [80]:
# Inspect the full API response object (finish reason, model version, token usage)
response

<OpenAIObject chat.completion id=chatcmpl-AUBhWDcZ93xY6QpMtkvt1VupLYBSH at 0x21596e8ccc0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses the ongoing vaccination campaigns in Brazil, particularly against influenza (the flu) and foot-and-mouth disease (aftosa). It highlights low participation rates in flu vaccinations, with some regions achieving less than 30% coverage. There are ongoing efforts to encourage vaccination among various groups, including children, pregnant women, and livestock. The campaigns have been extended multiple times due to low demand and are crucial for public health.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731758182,
  "id": "chatcmpl-AUBhWDcZ93xY6QpMtkvt1VupLYBSH",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_toke

## June

In [81]:
# Load the June 2017 file; reset_index() turns the existing row labels into an 'index' column
df_2017_06_neu = pd.read_csv('../data/local/df_neu_2017-06.csv')
df_2017_06_neu = df_2017_06_neu.reset_index()
df_2017_06_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6673 entries, 0 to 6672
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   6673 non-null   int64  
 1   LocationCode            6673 non-null   object 
 2   LocationName            6673 non-null   object 
 3   StatesCode              6673 non-null   object 
 4   ...4                    6673 non-null   int64  
 5   id                      6673 non-null   float64
 6   text                    6673 non-null   object 
 7   label                   6673 non-null   object 
 8   in_reply_to_status_id   507 non-null    float64
 9   in_reply_to_user_id     540 non-null    float64
 10  quoted_user_id          138 non-null    float64
 11  quoted_status_id        138 non-null    float64
 12  retweeted_user_id       1670 non-null   float64
 13  retweeted_status_id     1670 non-null   float64
 14  created_at              6673 non-null   

### Representative sample based on week and location

In [82]:
# Step 1: Count the rows in every (year_week, LocationName) stratum
group_sizes = (
    df_2017_06_neu
    .groupby(['year_week', 'LocationName'])
    .size()
    .reset_index(name='group_size')
)

# Step 2: Express each stratum as a share of the month's total row count
total_rows = df_2017_06_neu.shape[0]
group_sizes['proportion'] = group_sizes['group_size'].div(total_rows)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w22,Acre,3,0.000450
1,2017-w22,Alagoas,22,0.003297
2,2017-w22,Amapa,12,0.001798
3,2017-w22,Amazonas,19,0.002847
4,2017-w22,Bahia,40,0.005994
...,...,...,...,...
121,2017-w26,Roraima,2,0.000300
122,2017-w26,Santa Catarina,28,0.004196
123,2017-w26,Sao Paulo,144,0.021579
124,2017-w26,Sergipe,3,0.000450


In [83]:
# Step 3: Overall number of rows to draw for the month
target_sample_size = 2450

# Step 4: Allocate the target proportionally to each stratum, rounded down to whole rows.
# Because of the rounding down, the allocations can add up to less than the target
# and strata whose share is below 1/target_sample_size get 0 rows.
group_sizes['sample_size'] = (
    np.floor(group_sizes['proportion'].mul(target_sample_size)).astype(int)
)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w22,Acre,3,0.000450,1
1,2017-w22,Alagoas,22,0.003297,8
2,2017-w22,Amapa,12,0.001798,4
3,2017-w22,Amazonas,19,0.002847,6
4,2017-w22,Bahia,40,0.005994,14
...,...,...,...,...,...
121,2017-w26,Roraima,2,0.000300,0
122,2017-w26,Santa Catarina,28,0.004196,10
123,2017-w26,Sao Paulo,144,0.021579,52
124,2017-w26,Sergipe,3,0.000450,1


In [84]:
# Step 5: Attach each stratum's allocated sample size to every row of that stratum
df_2017_06_neu_sample_size = df_2017_06_neu.merge(
    group_sizes[['year_week', 'LocationName', 'sample_size']],
    how='left',
    on=['year_week', 'LocationName'],
)

In [85]:
# Step 6: Perform the stratified sampling
# Each stratum contributes its allocated number of rows, drawn with a fixed seed
# so the sample is reproducible. The allocation is capped at the stratum's own
# size: if it were larger (e.g. a month with fewer rows than the target sample
# size), DataFrame.sample() would raise a ValueError because it samples without
# replacement.
df_2017_06_neu_sample = df_2017_06_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(
        n=min(int(group['sample_size'].iloc[0]), len(group)),
        random_state=42,
    )
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_06_neu_sample)} rows.")


Sampled 2390 rows.


In [86]:
# Join the sampled texts, one per line, into a single string for the prompt
text_gpt_2017_06_neu = '\n'.join(
    df_2017_06_neu_sample['text'].astype(str).tolist()
)

### Prompts and loops

In [87]:
# Build the June prompt by substituting the sampled texts for the TWEETS placeholder
prompt_6 = prompt_raw.replace('TWEETS', text_gpt_2017_06_neu)

In [88]:
# Send the June prompt as a single user message and keep the raw API response
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": prompt_6}],
    max_tokens=400,  # upper bound on the length of the reply, in tokens
    temperature=0.7,  # sampling randomness of the reply
)

In [89]:
# Show the summary text returned by the model
response.choices[0].message.content

'The text discusses various vaccination campaigns in Brazil, focusing primarily on the flu vaccine and its availability to the general population. The government has extended the vaccination periods due to low uptake and emphasizes the importance of immunization against diseases like H1N1 and yellow fever. Specific campaigns for foot-and-mouth disease vaccination for livestock and HPV vaccination for boys and girls are also mentioned. The texts reflect public reactions, experiences, and challenges related to vaccination.'

In [90]:
# Reset of the accumulator lists, kept commented out. `summary` and `dataset`
# are appended to below but are not created anywhere in this month's cells, so
# they must already exist in the kernel (presumably initialised once in an
# earlier section -- verify). Running these two lines would discard the entries
# collected for the previous months.
#summary = []
#dataset = []

In [91]:
# Record the label for the June summary. append() adds a new entry on every run,
# so re-running this cell duplicates the label and shifts its pairing with `summary`.
dataset.append("2017_06_neutral")

In [92]:
# Store the summary text of the latest API response. Run once per month,
# together with the matching dataset.append(...): the two lists are paired by position.
summary.append(response.choices[0].message.content)

In [93]:
# Pair each dataset label with its summary, one row per month processed so far.
# The columns are passed as a dict so pandas raises ValueError when the two
# lists differ in length (e.g. an append cell was re-run or skipped); zip()
# would silently drop the unmatched tail and hide the misalignment.
df_2017_neu = pd.DataFrame({'dataset': dataset, 'summary': summary})
df_2017_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  6 non-null      object
 1   summary  6 non-null      object
dtypes: object(2)
memory usage: 224.0+ bytes


In [94]:
# Preview the collected summaries; tail(14) returns at most the last 14 rows,
# so every month gathered so far is shown while the table is that short.
df_2017_neu.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_neuitive,The sentences discuss the importance and logis...
1,2017_02_neuitive,The text discusses the importance of vaccinati...
2,2017_03_neuitive,The text discusses the ongoing vaccination cam...
3,2017_04_neuitive,The text discusses the vaccination campaigns a...
4,2017_05_neuitive,The text discusses the ongoing vaccination cam...
5,2017_06_neuitive,The text discusses various vaccination campaig...


In [95]:
# Display the full response object for reference (finish_reason, model, usage, ...).
response

<OpenAIObject chat.completion id=chatcmpl-AUBhhCEWrWDMtMkG5kMABEX8Hy4mA at 0x215978a0220> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns in Brazil, focusing primarily on the flu vaccine and its availability to the general population. The government has extended the vaccination periods due to low uptake and emphasizes the importance of immunization against diseases like H1N1 and yellow fever. Specific campaigns for foot-and-mouth disease vaccination for livestock and HPV vaccination for boys and girls are also mentioned. The texts reflect public reactions, experiences, and challenges related to vaccination.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731758193,
  "id": "chatcmpl-AUBhhCEWrWDMtMkG5kMABEX8Hy4mA",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "f

## July

In [96]:
# Load the neutral-labelled tweets for July 2017. reset_index() is called
# without drop=True, so the default RangeIndex is also kept as an `index` column.
df_2017_07_neu = (
    pd.read_csv('../data/local/df_neu_2017-07.csv')
    .reset_index()
)
# Print the column overview (names, non-null counts, dtypes).
df_2017_07_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3022 entries, 0 to 3021
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3022 non-null   int64  
 1   LocationCode            3022 non-null   object 
 2   LocationName            3022 non-null   object 
 3   StatesCode              3022 non-null   object 
 4   ...4                    3022 non-null   int64  
 5   id                      3022 non-null   float64
 6   text                    3022 non-null   object 
 7   label                   3022 non-null   object 
 8   in_reply_to_status_id   346 non-null    float64
 9   in_reply_to_user_id     363 non-null    float64
 10  quoted_user_id          69 non-null     float64
 11  quoted_status_id        69 non-null     float64
 12  retweeted_user_id       503 non-null    float64
 13  retweeted_status_id     503 non-null    float64
 14  created_at              3022 non-null   

### Representative sample based on week and location

In [97]:
# Steps 1-2: count the rows in every (year_week, LocationName) stratum and
# express each count as a fraction of all rows loaded for July.
total_rows = len(df_2017_07_neu)
group_sizes = (
    df_2017_07_neu
    .groupby(['year_week', 'LocationName'])
    .size()
    .reset_index(name='group_size')
    .assign(proportion=lambda counts: counts['group_size'] / total_rows)
)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w26,Alagoas,1,0.000331
1,2017-w26,Amazonas,1,0.000331
2,2017-w26,Bahia,11,0.003640
3,2017-w26,Ceara,2,0.000662
4,2017-w26,Distrito Federal,3,0.000993
...,...,...,...,...
127,2017-w31,Rio De Janeiro,2,0.000662
128,2017-w31,Rio Grande Do Sul,5,0.001655
129,2017-w31,Santa Catarina,2,0.000662
130,2017-w31,Sao Paulo,23,0.007611


In [98]:
# Step 3: overall number of tweets to draw for the month.
target_sample_size = 2450

# Step 4: proportional allocation -- each stratum receives its share of the
# target, rounded down to whole rows. Because every quota is rounded down, the
# quotas add up to at most target_sample_size, and a stratum whose share is
# worth less than one row gets a quota of 0.
group_sizes['sample_size'] = np.floor(
    group_sizes['proportion'].mul(target_sample_size)
).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w26,Alagoas,1,0.000331,0
1,2017-w26,Amazonas,1,0.000331,0
2,2017-w26,Bahia,11,0.003640,8
3,2017-w26,Ceara,2,0.000662,1
4,2017-w26,Distrito Federal,3,0.000993,2
...,...,...,...,...,...
127,2017-w31,Rio De Janeiro,2,0.000662,1
128,2017-w31,Rio Grande Do Sul,5,0.001655,4
129,2017-w31,Santa Catarina,2,0.000662,1
130,2017-w31,Sao Paulo,23,0.007611,18


In [99]:
# Step 5: attach each stratum's quota to its tweets. The left join keeps every
# row of the July frame and adds one column, `sample_size`, looked up by
# (year_week, LocationName).
df_2017_07_neu_sample_size = df_2017_07_neu.merge(
    group_sizes[['year_week', 'LocationName', 'sample_size']],
    how='left',
    on=['year_week', 'LocationName'],
)

In [100]:
# Step 6: Perform the stratified sampling.
# Every (year_week, LocationName) stratum is sampled without replacement using
# the quota merged in as `sample_size` (constant within a stratum, so the first
# value is read). Strata are iterated explicitly instead of going through
# groupby.apply, so the grouping columns stay in the sampled frame however the
# installed pandas treats them inside apply. The quota is capped at the
# stratum's row count because DataFrame.sample raises ValueError when asked
# for more rows than exist (possible when a month has fewer rows than the target).
sampled_strata = [
    stratum.sample(n=min(int(stratum['sample_size'].iloc[0]), len(stratum)), random_state=42)
    for _, stratum in df_2017_07_neu_sample_size.groupby(['year_week', 'LocationName'])
]
df_2017_07_neu_sample = (
    pd.concat(sampled_strata) if sampled_strata else df_2017_07_neu_sample_size.iloc[0:0]
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_07_neu_sample)} rows.")


Sampled 2377 rows.


In [101]:
# Build one newline-delimited string from the sampled July tweets (each value
# is passed through str() first) for substitution into the prompt.
text_gpt_2017_07_neu = '\n'.join(str(tweet) for tweet in df_2017_07_neu_sample['text'])

### Prompts and loops

In [102]:
# Fill the prompt template: the TWEETS placeholder in prompt_raw is replaced by
# the newline-joined July tweet sample.
prompt_7 = prompt_raw.replace('TWEETS', text_gpt_2017_07_neu)

In [103]:
# Send the filled-in July prompt to the chat model as a single user message and
# keep the whole response object; the summary text is read from it further down.
response = openai.ChatCompletion.create(
    temperature=0.7,  # non-zero, so repeated runs can word the summary differently
    max_tokens=400,   # upper bound on the length of the reply, in tokens
    messages=[dict(role="user", content=prompt_7)],
    model="gpt-4o-mini",
)

In [104]:
# Show the generated summary (message text of the first returned choice).
response.choices[0].message.content

'The texts discuss various vaccination events and campaigns in Brazil, including the BCG vaccine against tuberculosis, vaccination against rabies for pets, and immunization against diseases like HPV and yellow fever. It highlights the importance of vaccination, public health initiatives, and personal experiences related to getting vaccinated. Additionally, there are mentions of vaccine scarcity and public perceptions about vaccines.'

In [105]:
# Reset of the accumulator lists, kept commented out. `summary` and `dataset`
# are appended to below but are not created anywhere in this month's cells, so
# they must already exist in the kernel (presumably initialised once in an
# earlier section -- verify). Running these two lines would discard the entries
# collected for the previous months.
#summary = []
#dataset = []

In [106]:
# Record the label for the July summary. append() adds a new entry on every run,
# so re-running this cell duplicates the label and shifts its pairing with `summary`.
dataset.append("2017_07_neutral")

In [107]:
# Store the summary text of the latest API response. Run once per month,
# together with the matching dataset.append(...): the two lists are paired by position.
summary.append(response.choices[0].message.content)

In [108]:
# Pair each dataset label with its summary, one row per month processed so far.
# The columns are passed as a dict so pandas raises ValueError when the two
# lists differ in length (e.g. an append cell was re-run or skipped); zip()
# would silently drop the unmatched tail and hide the misalignment.
df_2017_neu = pd.DataFrame({'dataset': dataset, 'summary': summary})
df_2017_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  7 non-null      object
 1   summary  7 non-null      object
dtypes: object(2)
memory usage: 240.0+ bytes


In [109]:
# Preview the collected summaries; tail(14) returns at most the last 14 rows,
# so every month gathered so far is shown while the table is that short.
df_2017_neu.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_neuitive,The sentences discuss the importance and logis...
1,2017_02_neuitive,The text discusses the importance of vaccinati...
2,2017_03_neuitive,The text discusses the ongoing vaccination cam...
3,2017_04_neuitive,The text discusses the vaccination campaigns a...
4,2017_05_neuitive,The text discusses the ongoing vaccination cam...
5,2017_06_neuitive,The text discusses various vaccination campaig...
6,2017_07_neuitive,The texts discuss various vaccination events a...


In [110]:
# Display the full response object for reference (finish_reason, model, usage, ...).
response

<OpenAIObject chat.completion id=chatcmpl-AUBhmfNtCBU2SyrAafdLB1yEPAyrl at 0x215978ce310> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The texts discuss various vaccination events and campaigns in Brazil, including the BCG vaccine against tuberculosis, vaccination against rabies for pets, and immunization against diseases like HPV and yellow fever. It highlights the importance of vaccination, public health initiatives, and personal experiences related to getting vaccinated. Additionally, there are mentions of vaccine scarcity and public perceptions about vaccines.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731758198,
  "id": "chatcmpl-AUBhmfNtCBU2SyrAafdLB1yEPAyrl",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 70,
    "completion_tokens_details": 

## August

In [111]:
# Load the neutral-labelled tweets for August 2017. reset_index() is called
# without drop=True, so the default RangeIndex is also kept as an `index` column.
df_2017_08_neu = (
    pd.read_csv('../data/local/df_neu_2017-08.csv')
    .reset_index()
)
# Print the column overview (names, non-null counts, dtypes).
df_2017_08_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3877 entries, 0 to 3876
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3877 non-null   int64  
 1   LocationCode            3877 non-null   object 
 2   LocationName            3877 non-null   object 
 3   StatesCode              3877 non-null   object 
 4   ...4                    3877 non-null   int64  
 5   id                      3877 non-null   float64
 6   text                    3877 non-null   object 
 7   label                   3877 non-null   object 
 8   in_reply_to_status_id   411 non-null    float64
 9   in_reply_to_user_id     429 non-null    float64
 10  quoted_user_id          126 non-null    float64
 11  quoted_status_id        126 non-null    float64
 12  retweeted_user_id       774 non-null    float64
 13  retweeted_status_id     774 non-null    float64
 14  created_at              3877 non-null   

### Representative sample based on week and location

In [112]:
# Steps 1-2: count the rows in every (year_week, LocationName) stratum and
# express each count as a fraction of all rows loaded for August.
total_rows = len(df_2017_08_neu)
group_sizes = (
    df_2017_08_neu
    .groupby(['year_week', 'LocationName'])
    .size()
    .reset_index(name='group_size')
    .assign(proportion=lambda counts: counts['group_size'] / total_rows)
)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w31,Acre,1,0.000258
1,2017-w31,Alagoas,4,0.001032
2,2017-w31,Amapa,2,0.000516
3,2017-w31,Amazonas,9,0.002321
4,2017-w31,Bahia,22,0.005674
...,...,...,...,...
111,2017-w35,Rio Grande Do Sul,92,0.023730
112,2017-w35,Roraima,1,0.000258
113,2017-w35,Santa Catarina,9,0.002321
114,2017-w35,Sao Paulo,170,0.043848


In [113]:
# Step 3: overall number of tweets to draw for the month.
target_sample_size = 2450

# Step 4: proportional allocation -- each stratum receives its share of the
# target, rounded down to whole rows. Because every quota is rounded down, the
# quotas add up to at most target_sample_size, and a stratum whose share is
# worth less than one row gets a quota of 0.
group_sizes['sample_size'] = np.floor(
    group_sizes['proportion'].mul(target_sample_size)
).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w31,Acre,1,0.000258,0
1,2017-w31,Alagoas,4,0.001032,2
2,2017-w31,Amapa,2,0.000516,1
3,2017-w31,Amazonas,9,0.002321,5
4,2017-w31,Bahia,22,0.005674,13
...,...,...,...,...,...
111,2017-w35,Rio Grande Do Sul,92,0.023730,58
112,2017-w35,Roraima,1,0.000258,0
113,2017-w35,Santa Catarina,9,0.002321,5
114,2017-w35,Sao Paulo,170,0.043848,107


In [114]:
# Step 5: attach each stratum's quota to its tweets. The left join keeps every
# row of the August frame and adds one column, `sample_size`, looked up by
# (year_week, LocationName).
df_2017_08_neu_sample_size = df_2017_08_neu.merge(
    group_sizes[['year_week', 'LocationName', 'sample_size']],
    how='left',
    on=['year_week', 'LocationName'],
)

In [115]:
# Step 6: Perform the stratified sampling.
# Every (year_week, LocationName) stratum is sampled without replacement using
# the quota merged in as `sample_size` (constant within a stratum, so the first
# value is read). Strata are iterated explicitly instead of going through
# groupby.apply, so the grouping columns stay in the sampled frame however the
# installed pandas treats them inside apply. The quota is capped at the
# stratum's row count because DataFrame.sample raises ValueError when asked
# for more rows than exist (possible when a month has fewer rows than the target).
sampled_strata = [
    stratum.sample(n=min(int(stratum['sample_size'].iloc[0]), len(stratum)), random_state=42)
    for _, stratum in df_2017_08_neu_sample_size.groupby(['year_week', 'LocationName'])
]
df_2017_08_neu_sample = (
    pd.concat(sampled_strata) if sampled_strata else df_2017_08_neu_sample_size.iloc[0:0]
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_08_neu_sample)} rows.")


Sampled 2388 rows.


In [116]:
# Build one newline-delimited string from the sampled August tweets (each value
# is passed through str() first) for substitution into the prompt.
text_gpt_2017_08_neu = '\n'.join(str(tweet) for tweet in df_2017_08_neu_sample['text'])

### Prompts and loops

In [117]:
# Fill the prompt template: the TWEETS placeholder in prompt_raw is replaced by
# the newline-joined August tweet sample.
prompt_8 = prompt_raw.replace('TWEETS', text_gpt_2017_08_neu)

In [118]:
# Send the filled-in August prompt to the chat model as a single user message and
# keep the whole response object; the summary text is read from it further down.
response = openai.ChatCompletion.create(
    temperature=0.7,  # non-zero, so repeated runs can word the summary differently
    max_tokens=400,   # upper bound on the length of the reply, in tokens
    messages=[dict(role="user", content=prompt_8)],
    model="gpt-4o-mini",
)

In [119]:
# Show the generated summary (message text of the first returned choice).
response.choices[0].message.content

'The text discusses various aspects of vaccination in Brazil, including the upcoming Zika vaccine, low demand for HPV vaccines, personal experiences with vaccination, and campaigns for rabies vaccination for pets. It highlights societal attitudes towards vaccination, including fears, humor, and the significance of vaccination for public health.'

In [120]:
# Reset of the accumulator lists, kept commented out. `summary` and `dataset`
# are appended to below but are not created anywhere in this month's cells, so
# they must already exist in the kernel (presumably initialised once in an
# earlier section -- verify). Running these two lines would discard the entries
# collected for the previous months.
#summary = []
#dataset = []

In [121]:
# Record the label for the August summary. append() adds a new entry on every run,
# so re-running this cell duplicates the label and shifts its pairing with `summary`.
dataset.append("2017_08_neutral")

In [122]:
# Store the summary text of the latest API response. Run once per month,
# together with the matching dataset.append(...): the two lists are paired by position.
summary.append(response.choices[0].message.content)

In [123]:
# Pair each dataset label with its summary, one row per month processed so far.
# The columns are passed as a dict so pandas raises ValueError when the two
# lists differ in length (e.g. an append cell was re-run or skipped); zip()
# would silently drop the unmatched tail and hide the misalignment.
df_2017_neu = pd.DataFrame({'dataset': dataset, 'summary': summary})
df_2017_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  8 non-null      object
 1   summary  8 non-null      object
dtypes: object(2)
memory usage: 256.0+ bytes


In [124]:
# Preview the collected summaries; tail(14) returns at most the last 14 rows,
# so every month gathered so far is shown while the table is that short.
df_2017_neu.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_neuitive,The sentences discuss the importance and logis...
1,2017_02_neuitive,The text discusses the importance of vaccinati...
2,2017_03_neuitive,The text discusses the ongoing vaccination cam...
3,2017_04_neuitive,The text discusses the vaccination campaigns a...
4,2017_05_neuitive,The text discusses the ongoing vaccination cam...
5,2017_06_neuitive,The text discusses various vaccination campaig...
6,2017_07_neuitive,The texts discuss various vaccination events a...
7,2017_08_neuitive,The text discusses various aspects of vaccinat...


In [125]:
# Display the full response object for reference (finish_reason, model, usage, ...).
response

<OpenAIObject chat.completion id=chatcmpl-AUBhrvtboN5vUGoAJ19YPdtNlIMSy at 0x215984980e0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various aspects of vaccination in Brazil, including the upcoming Zika vaccine, low demand for HPV vaccines, personal experiences with vaccination, and campaigns for rabies vaccination for pets. It highlights societal attitudes towards vaccination, including fears, humor, and the significance of vaccination for public health.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731758203,
  "id": "chatcmpl-AUBhrvtboN5vUGoAJ19YPdtNlIMSy",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 58,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "reasoning_tokens"

## September

In [126]:
# Load the neutral-labelled tweets for September 2017. reset_index() is called
# without drop=True, so the default RangeIndex is also kept as an `index` column.
df_2017_09_neu = (
    pd.read_csv('../data/local/df_neu_2017-09.csv')
    .reset_index()
)
# Print the column overview (names, non-null counts, dtypes).
df_2017_09_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5057 entries, 0 to 5056
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   5057 non-null   int64  
 1   LocationCode            5057 non-null   object 
 2   LocationName            5057 non-null   object 
 3   StatesCode              5057 non-null   object 
 4   ...4                    5057 non-null   int64  
 5   id                      5057 non-null   float64
 6   text                    5057 non-null   object 
 7   label                   5057 non-null   object 
 8   in_reply_to_status_id   445 non-null    float64
 9   in_reply_to_user_id     462 non-null    float64
 10  quoted_user_id          136 non-null    float64
 11  quoted_status_id        136 non-null    float64
 12  retweeted_user_id       1559 non-null   float64
 13  retweeted_status_id     1559 non-null   float64
 14  created_at              5057 non-null   

### Representative sample based on week and location

In [127]:
# Steps 1-2: count the rows in every (year_week, LocationName) stratum and
# express each count as a fraction of all rows loaded for September.
total_rows = len(df_2017_09_neu)
group_sizes = (
    df_2017_09_neu
    .groupby(['year_week', 'LocationName'])
    .size()
    .reset_index(name='group_size')
    .assign(proportion=lambda counts: counts['group_size'] / total_rows)
)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w35,Acre,2,0.000395
1,2017-w35,Amazonas,3,0.000593
2,2017-w35,Bahia,8,0.001582
3,2017-w35,Ceara,3,0.000593
4,2017-w35,Distrito Federal,4,0.000791
...,...,...,...,...
117,2017-w39,Roraima,4,0.000791
118,2017-w39,Santa Catarina,9,0.001780
119,2017-w39,Sao Paulo,117,0.023136
120,2017-w39,Sergipe,5,0.000989


In [128]:
# Step 3: overall number of tweets to draw for the month.
target_sample_size = 2450

# Step 4: proportional allocation -- each stratum receives its share of the
# target, rounded down to whole rows. Because every quota is rounded down, the
# quotas add up to at most target_sample_size, and a stratum whose share is
# worth less than one row gets a quota of 0.
group_sizes['sample_size'] = np.floor(
    group_sizes['proportion'].mul(target_sample_size)
).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w35,Acre,2,0.000395,0
1,2017-w35,Amazonas,3,0.000593,1
2,2017-w35,Bahia,8,0.001582,3
3,2017-w35,Ceara,3,0.000593,1
4,2017-w35,Distrito Federal,4,0.000791,1
...,...,...,...,...,...
117,2017-w39,Roraima,4,0.000791,1
118,2017-w39,Santa Catarina,9,0.001780,4
119,2017-w39,Sao Paulo,117,0.023136,56
120,2017-w39,Sergipe,5,0.000989,2


In [129]:
# Step 5: attach each stratum's quota to its tweets. The left join keeps every
# row of the September frame and adds one column, `sample_size`, looked up by
# (year_week, LocationName).
df_2017_09_neu_sample_size = df_2017_09_neu.merge(
    group_sizes[['year_week', 'LocationName', 'sample_size']],
    how='left',
    on=['year_week', 'LocationName'],
)

In [130]:
# Step 6: Perform the stratified sampling.
# Every (year_week, LocationName) stratum is sampled without replacement using
# the quota merged in as `sample_size` (constant within a stratum, so the first
# value is read). Strata are iterated explicitly instead of going through
# groupby.apply, so the grouping columns stay in the sampled frame however the
# installed pandas treats them inside apply. The quota is capped at the
# stratum's row count because DataFrame.sample raises ValueError when asked
# for more rows than exist (possible when a month has fewer rows than the target).
sampled_strata = [
    stratum.sample(n=min(int(stratum['sample_size'].iloc[0]), len(stratum)), random_state=42)
    for _, stratum in df_2017_09_neu_sample_size.groupby(['year_week', 'LocationName'])
]
df_2017_09_neu_sample = (
    pd.concat(sampled_strata) if sampled_strata else df_2017_09_neu_sample_size.iloc[0:0]
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_09_neu_sample)} rows.")


Sampled 2380 rows.


In [131]:
# Build one newline-delimited string from the sampled September tweets (each
# value is passed through str() first) for substitution into the prompt.
text_gpt_2017_09_neu = '\n'.join(str(tweet) for tweet in df_2017_09_neu_sample['text'])

### Prompts and loops

In [132]:
# Fill the prompt template: the TWEETS placeholder in prompt_raw is replaced by
# the newline-joined September tweet sample.
prompt_9 = prompt_raw.replace('TWEETS', text_gpt_2017_09_neu)

In [133]:
# Send the filled-in September prompt to the chat model as a single user message
# and keep the whole response object; the summary text is read from it further down.
response = openai.ChatCompletion.create(
    temperature=0.7,  # non-zero, so repeated runs can word the summary differently
    max_tokens=400,   # upper bound on the length of the reply, in tokens
    messages=[dict(role="user", content=prompt_9)],
    model="gpt-4o-mini",
)

In [134]:
# Show the generated summary (message text of the first returned choice).
response.choices[0].message.content

'The text discusses various vaccination campaigns for both humans and animals in Brazil, highlighting the importance of vaccinations against diseases like rabies, HPV, and yellow fever. It features personal anecdotes about the challenges and experiences of taking pets or children to get vaccinated, as well as reminders about campaign deadlines and locations for vaccination clinics. Overall, it emphasizes public health awareness and the need for updated vaccination records.'

In [135]:
# Reset of the accumulator lists, kept commented out. `summary` and `dataset`
# are appended to below but are not created anywhere in this month's cells, so
# they must already exist in the kernel (presumably initialised once in an
# earlier section -- verify). Running these two lines would discard the entries
# collected for the previous months.
#summary = []
#dataset = []

In [136]:
# Record the label for the September summary. append() adds a new entry on every run,
# so re-running this cell duplicates the label and shifts its pairing with `summary`.
dataset.append("2017_09_neutral")

In [137]:
# Store the summary text of the latest API response. Run once per month,
# together with the matching dataset.append(...): the two lists are paired by position.
summary.append(response.choices[0].message.content)

In [138]:
# Pair each dataset label with its summary, one row per month processed so far.
# The columns are passed as a dict so pandas raises ValueError when the two
# lists differ in length (e.g. an append cell was re-run or skipped); zip()
# would silently drop the unmatched tail and hide the misalignment.
df_2017_neu = pd.DataFrame({'dataset': dataset, 'summary': summary})
df_2017_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  9 non-null      object
 1   summary  9 non-null      object
dtypes: object(2)
memory usage: 272.0+ bytes


In [139]:
# Preview the collected summaries; tail(14) returns at most the last 14 rows,
# so every month gathered so far is shown while the table is that short.
df_2017_neu.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_neuitive,The sentences discuss the importance and logis...
1,2017_02_neuitive,The text discusses the importance of vaccinati...
2,2017_03_neuitive,The text discusses the ongoing vaccination cam...
3,2017_04_neuitive,The text discusses the vaccination campaigns a...
4,2017_05_neuitive,The text discusses the ongoing vaccination cam...
5,2017_06_neuitive,The text discusses various vaccination campaig...
6,2017_07_neuitive,The texts discuss various vaccination events a...
7,2017_08_neuitive,The text discusses various aspects of vaccinat...
8,2017_09_neuitive,The text discusses various vaccination campaig...


In [140]:
# Display the full response object for reference (finish_reason, model, usage, ...).
response

<OpenAIObject chat.completion id=chatcmpl-AUBhydnJmDymtbYp5Ds8YbnKeZx0C at 0x21597f1ee50> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns for both humans and animals in Brazil, highlighting the importance of vaccinations against diseases like rabies, HPV, and yellow fever. It features personal anecdotes about the challenges and experiences of taking pets or children to get vaccinated, as well as reminders about campaign deadlines and locations for vaccination clinics. Overall, it emphasizes public health awareness and the need for updated vaccination records.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731758210,
  "id": "chatcmpl-AUBhydnJmDymtbYp5Ds8YbnKeZx0C",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_token

## October

In [141]:
# Load the neutral-labelled tweets for October 2017. reset_index() is called
# without drop=True, so the default RangeIndex is also kept as an `index` column.
df_2017_10_neu = (
    pd.read_csv('../data/local/df_neu_2017-10.csv')
    .reset_index()
)
# Print the column overview (names, non-null counts, dtypes).
df_2017_10_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3435 entries, 0 to 3434
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3435 non-null   int64  
 1   LocationCode            3435 non-null   object 
 2   LocationName            3435 non-null   object 
 3   StatesCode              3435 non-null   object 
 4   ...4                    3435 non-null   int64  
 5   id                      3435 non-null   float64
 6   text                    3435 non-null   object 
 7   label                   3435 non-null   object 
 8   in_reply_to_status_id   365 non-null    float64
 9   in_reply_to_user_id     385 non-null    float64
 10  quoted_user_id          76 non-null     float64
 11  quoted_status_id        76 non-null     float64
 12  retweeted_user_id       556 non-null    float64
 13  retweeted_status_id     556 non-null    float64
 14  created_at              3435 non-null   

### Representative sample based on week and location

In [142]:
# Steps 1-2: count the rows in every (year_week, LocationName) stratum and
# express each count as a fraction of all rows loaded for October.
total_rows = len(df_2017_10_neu)
group_sizes = (
    df_2017_10_neu
    .groupby(['year_week', 'LocationName'])
    .size()
    .reset_index(name='group_size')
    .assign(proportion=lambda counts: counts['group_size'] / total_rows)
)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w40,Acre,5,0.001456
1,2017-w40,Alagoas,5,0.001456
2,2017-w40,Amapa,2,0.000582
3,2017-w40,Amazonas,22,0.006405
4,2017-w40,Bahia,35,0.010189
...,...,...,...,...
110,2017-w44,Rondonia,1,0.000291
111,2017-w44,Roraima,1,0.000291
112,2017-w44,Santa Catarina,5,0.001456
113,2017-w44,Sao Paulo,100,0.029112


In [143]:
# Step 3: overall number of tweets to draw for the month.
target_sample_size = 2450

# Step 4: proportional allocation -- each stratum receives its share of the
# target, rounded down to whole rows. Because every quota is rounded down, the
# quotas add up to at most target_sample_size, and a stratum whose share is
# worth less than one row gets a quota of 0.
group_sizes['sample_size'] = np.floor(
    group_sizes['proportion'].mul(target_sample_size)
).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w40,Acre,5,0.001456,3
1,2017-w40,Alagoas,5,0.001456,3
2,2017-w40,Amapa,2,0.000582,1
3,2017-w40,Amazonas,22,0.006405,15
4,2017-w40,Bahia,35,0.010189,24
...,...,...,...,...,...
110,2017-w44,Rondonia,1,0.000291,0
111,2017-w44,Roraima,1,0.000291,0
112,2017-w44,Santa Catarina,5,0.001456,3
113,2017-w44,Sao Paulo,100,0.029112,71


In [144]:
# Step 5: attach each stratum's quota to its tweets. The left join keeps every
# row of the October frame and adds one column, `sample_size`, looked up by
# (year_week, LocationName).
df_2017_10_neu_sample_size = df_2017_10_neu.merge(
    group_sizes[['year_week', 'LocationName', 'sample_size']],
    how='left',
    on=['year_week', 'LocationName'],
)

In [145]:
# Step 6: Perform the stratified sampling.
# Every (year_week, LocationName) stratum is sampled without replacement using
# the quota merged in as `sample_size` (constant within a stratum, so the first
# value is read). Strata are iterated explicitly instead of going through
# groupby.apply, so the grouping columns stay in the sampled frame however the
# installed pandas treats them inside apply. The quota is capped at the
# stratum's row count because DataFrame.sample raises ValueError when asked
# for more rows than exist (possible when a month has fewer rows than the target).
sampled_strata = [
    stratum.sample(n=min(int(stratum['sample_size'].iloc[0]), len(stratum)), random_state=42)
    for _, stratum in df_2017_10_neu_sample_size.groupby(['year_week', 'LocationName'])
]
df_2017_10_neu_sample = (
    pd.concat(sampled_strata) if sampled_strata else df_2017_10_neu_sample_size.iloc[0:0]
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_10_neu_sample)} rows.")


Sampled 2386 rows.


In [146]:
# Build one newline-delimited string from the sampled October tweets (each
# value is passed through str() first) for substitution into the prompt.
text_gpt_2017_10_neu = '\n'.join(str(tweet) for tweet in df_2017_10_neu_sample['text'])

### Prompts and loops

In [147]:
# Fill the prompt template: the TWEETS placeholder in prompt_raw is replaced by
# the newline-joined October tweet sample.
prompt_10 = prompt_raw.replace('TWEETS', text_gpt_2017_10_neu)

In [148]:
# Send the filled-in October prompt to the chat model as a single user message
# and keep the whole response object; the summary text is read from it further down.
response = openai.ChatCompletion.create(
    temperature=0.7,  # non-zero, so repeated runs can word the summary differently
    max_tokens=400,   # upper bound on the length of the reply, in tokens
    messages=[dict(role="user", content=prompt_10)],
    model="gpt-4o-mini",
)

In [149]:
# Show the generated summary (message text of the first returned choice).
response.choices[0].message.content

'The text discusses various aspects of vaccination, including the importance of updating vaccination records for children, the approval of plans to phase out cattle vaccination against foot-and-mouth disease, and the launch of rabies vaccination campaigns for pets. There are also references to misconceptions about vaccines, such as false claims regarding HPV vaccines. Additionally, it highlights the urgency of getting vaccinated due to outbreaks, especially in relation to diseases like measles and yellow fever. Overall, the content focuses on public health initiatives, vaccination campaigns, and common public attitudes toward vaccinations.'

In [150]:
# Reset of the accumulator lists, kept commented out. `summary` and `dataset`
# are appended to below but are not created anywhere in this month's cells, so
# they must already exist in the kernel (presumably initialised once in an
# earlier section -- verify). Running these two lines would discard the entries
# collected for the previous months.
#summary = []
#dataset = []

In [151]:
# Record the label for the October summary. append() adds a new entry on every run,
# so re-running this cell duplicates the label and shifts its pairing with `summary`.
dataset.append("2017_10_neutral")

In [152]:
# Store the summary text of the latest API response. Run once per month,
# together with the matching dataset.append(...): the two lists are paired by position.
summary.append(response.choices[0].message.content)

In [153]:
# Pair each dataset label with its summary, one row per month processed so far.
# The columns are passed as a dict so pandas raises ValueError when the two
# lists differ in length (e.g. an append cell was re-run or skipped); zip()
# would silently drop the unmatched tail and hide the misalignment.
df_2017_neu = pd.DataFrame({'dataset': dataset, 'summary': summary})
df_2017_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  10 non-null     object
 1   summary  10 non-null     object
dtypes: object(2)
memory usage: 288.0+ bytes


In [154]:
# Preview the collected summaries; tail(14) returns at most the last 14 rows,
# so every month gathered so far is shown while the table is that short.
df_2017_neu.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_neuitive,The sentences discuss the importance and logis...
1,2017_02_neuitive,The text discusses the importance of vaccinati...
2,2017_03_neuitive,The text discusses the ongoing vaccination cam...
3,2017_04_neuitive,The text discusses the vaccination campaigns a...
4,2017_05_neuitive,The text discusses the ongoing vaccination cam...
5,2017_06_neuitive,The text discusses various vaccination campaig...
6,2017_07_neuitive,The texts discuss various vaccination events a...
7,2017_08_neuitive,The text discusses various aspects of vaccinat...
8,2017_09_neuitive,The text discusses various vaccination campaig...
9,2017_10_neuitive,The text discusses various aspects of vaccinat...


In [155]:
# Display the full response object for reference (finish_reason, model, usage, ...).
response

<OpenAIObject chat.completion id=chatcmpl-AUBi5FVrkZNypZ5nPWy6sCLUNHEL7 at 0x21597f3e810> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various aspects of vaccination, including the importance of updating vaccination records for children, the approval of plans to phase out cattle vaccination against foot-and-mouth disease, and the launch of rabies vaccination campaigns for pets. There are also references to misconceptions about vaccines, such as false claims regarding HPV vaccines. Additionally, it highlights the urgency of getting vaccinated due to outbreaks, especially in relation to diseases like measles and yellow fever. Overall, the content focuses on public health initiatives, vaccination campaigns, and common public attitudes toward vaccinations.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731758217,
  "id": "chatcmpl-AUBi5

## November

In [156]:
# Load the neutral-labelled tweets for November 2017. reset_index() is called
# without drop=True, so the default RangeIndex is also kept as an `index` column.
df_2017_11_neu = (
    pd.read_csv('../data/local/df_neu_2017-11.csv')
    .reset_index()
)
# Print the column overview (names, non-null counts, dtypes).
df_2017_11_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3203 entries, 0 to 3202
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3203 non-null   int64  
 1   LocationCode            3203 non-null   object 
 2   LocationName            3203 non-null   object 
 3   StatesCode              3203 non-null   object 
 4   ...4                    3203 non-null   int64  
 5   id                      3203 non-null   float64
 6   text                    3203 non-null   object 
 7   label                   3203 non-null   object 
 8   in_reply_to_status_id   361 non-null    float64
 9   in_reply_to_user_id     379 non-null    float64
 10  quoted_user_id          91 non-null     float64
 11  quoted_status_id        91 non-null     float64
 12  retweeted_user_id       690 non-null    float64
 13  retweeted_status_id     690 non-null    float64
 14  created_at              3203 non-null   

### Representative sample based on week and location

In [157]:
# Steps 1-2: count the rows in every (year_week, LocationName) stratum and
# express each count as a fraction of all rows loaded for November.
total_rows = len(df_2017_11_neu)
group_sizes = (
    df_2017_11_neu
    .groupby(['year_week', 'LocationName'])
    .size()
    .reset_index(name='group_size')
    .assign(proportion=lambda counts: counts['group_size'] / total_rows)
)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w44,Acre,3,0.000937
1,2017-w44,Alagoas,1,0.000312
2,2017-w44,Amapa,5,0.001561
3,2017-w44,Amazonas,6,0.001873
4,2017-w44,Bahia,16,0.004995
...,...,...,...,...
116,2017-w48,Roraima,1,0.000312
117,2017-w48,Santa Catarina,14,0.004371
118,2017-w48,Sao Paulo,114,0.035592
119,2017-w48,Sergipe,5,0.001561


In [158]:
# Step 3: overall number of tweets to draw for the month.
target_sample_size = 2450

# Step 4: proportional allocation -- each stratum receives its share of the
# target, rounded down to whole rows. Because every quota is rounded down, the
# quotas add up to at most target_sample_size, and a stratum whose share is
# worth less than one row gets a quota of 0.
group_sizes['sample_size'] = np.floor(
    group_sizes['proportion'].mul(target_sample_size)
).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w44,Acre,3,0.000937,2
1,2017-w44,Alagoas,1,0.000312,0
2,2017-w44,Amapa,5,0.001561,3
3,2017-w44,Amazonas,6,0.001873,4
4,2017-w44,Bahia,16,0.004995,12
...,...,...,...,...,...
116,2017-w48,Roraima,1,0.000312,0
117,2017-w48,Santa Catarina,14,0.004371,10
118,2017-w48,Sao Paulo,114,0.035592,87
119,2017-w48,Sergipe,5,0.001561,3


In [159]:
# Step 5: attach each stratum's quota to its tweets. The left join keeps every
# row of the November frame and adds one column, `sample_size`, looked up by
# (year_week, LocationName).
df_2017_11_neu_sample_size = df_2017_11_neu.merge(
    group_sizes[['year_week', 'LocationName', 'sample_size']],
    how='left',
    on=['year_week', 'LocationName'],
)

In [160]:
# Step 6: Perform the stratified sampling.
# Every (year_week, LocationName) stratum is sampled without replacement using
# the quota merged in as `sample_size` (constant within a stratum, so the first
# value is read). Strata are iterated explicitly instead of going through
# groupby.apply, so the grouping columns stay in the sampled frame however the
# installed pandas treats them inside apply. The quota is capped at the
# stratum's row count because DataFrame.sample raises ValueError when asked
# for more rows than exist (possible when a month has fewer rows than the target).
sampled_strata = [
    stratum.sample(n=min(int(stratum['sample_size'].iloc[0]), len(stratum)), random_state=42)
    for _, stratum in df_2017_11_neu_sample_size.groupby(['year_week', 'LocationName'])
]
df_2017_11_neu_sample = (
    pd.concat(sampled_strata) if sampled_strata else df_2017_11_neu_sample_size.iloc[0:0]
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_11_neu_sample)} rows.")


Sampled 2388 rows.


In [161]:
# Build one newline-delimited string from the sampled November tweets (each
# value is passed through str() first) for substitution into the prompt.
text_gpt_2017_11_neu = '\n'.join(str(tweet) for tweet in df_2017_11_neu_sample['text'])

### Prompts and loops

In [162]:
# Fill the prompt template: the TWEETS placeholder in prompt_raw is replaced by
# the newline-joined November tweet sample.
prompt_11 = prompt_raw.replace('TWEETS', text_gpt_2017_11_neu)

In [163]:
# Send the filled-in November prompt to the chat model as a single user message
# and keep the whole response object; the summary text is read from it further down.
response = openai.ChatCompletion.create(
    temperature=0.7,  # non-zero, so repeated runs can word the summary differently
    max_tokens=400,   # upper bound on the length of the reply, in tokens
    messages=[dict(role="user", content=prompt_11)],
    model="gpt-4o-mini",
)

In [164]:
# Show the generated summary (message text of the first returned choice).
response.choices[0].message.content

'The text is primarily about vaccination campaigns in Brazil, focusing on various diseases such as foot-and-mouth disease and dengue. It highlights the start of vaccination phases, the importance of keeping vaccination records up to date, and the need for public participation. Additionally, there are mentions of specific vaccination events and recommendations from health authorities regarding who should receive certain vaccines. There is also a discussion about the challenges faced in maintaining adequate vaccination coverage among the population.'

In [165]:
#summary = []
#dataset = []

In [166]:
# Record the label for this month's summary; it is paired positionally with the summary appended next.
# NOTE(review): assumes `dataset` is the running list of labels; re-running this cell appends the label again.
dataset.append("2017_11_neutral")

In [167]:
# Store the model's summary for this month, positionally matching the label added to `dataset`.
# NOTE(review): `response` is reused for every month, so this must run right after this month's API call.
summary.append(response.choices[0].message.content)

In [168]:
# Rebuild the running 2017 summary table from the parallel `dataset` / `summary` lists.
# zip() silently truncates to the shorter list, which would mis-pair labels and summaries
# if one of the append cells was run more often than the other, so check alignment first.
assert len(dataset) == len(summary), (
    f"dataset ({len(dataset)}) and summary ({len(summary)}) lists are out of sync"
)
df_2017_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2017_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  11 non-null     object
 1   summary  11 non-null     object
dtypes: object(2)
memory usage: 304.0+ bytes


In [169]:
# Display the accumulated summaries; tail(14) returns every row while the table has fewer than 14 rows
df_2017_neu.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_neuitive,The sentences discuss the importance and logis...
1,2017_02_neuitive,The text discusses the importance of vaccinati...
2,2017_03_neuitive,The text discusses the ongoing vaccination cam...
3,2017_04_neuitive,The text discusses the vaccination campaigns a...
4,2017_05_neuitive,The text discusses the ongoing vaccination cam...
5,2017_06_neuitive,The text discusses various vaccination campaig...
6,2017_07_neuitive,The texts discuss various vaccination events a...
7,2017_08_neuitive,The text discusses various aspects of vaccinat...
8,2017_09_neuitive,The text discusses various vaccination campaig...
9,2017_10_neuitive,The text discusses various aspects of vaccinat...


In [170]:
# Inspect the full API response object (finish_reason, model, message content, ...)
response

<OpenAIObject chat.completion id=chatcmpl-AUBiDoMtebOQihWRRsETVjXkW7yww at 0x21598d579f0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text is primarily about vaccination campaigns in Brazil, focusing on various diseases such as foot-and-mouth disease and dengue. It highlights the start of vaccination phases, the importance of keeping vaccination records up to date, and the need for public participation. Additionally, there are mentions of specific vaccination events and recommendations from health authorities regarding who should receive certain vaccines. There is also a discussion about the challenges faced in maintaining adequate vaccination coverage among the population.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731758225,
  "id": "chatcmpl-AUBiDoMtebOQihWRRsETVjXkW7yww",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",

## December

In [171]:
# Get monthly data
df_2017_12_neu = pd.read_csv('../data/local/df_neu_2017-12.csv').reset_index()
df_2017_12_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2791 entries, 0 to 2790
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2791 non-null   int64  
 1   LocationCode            2791 non-null   object 
 2   LocationName            2791 non-null   object 
 3   StatesCode              2791 non-null   object 
 4   ...4                    2791 non-null   int64  
 5   id                      2791 non-null   float64
 6   text                    2791 non-null   object 
 7   label                   2791 non-null   object 
 8   in_reply_to_status_id   313 non-null    float64
 9   in_reply_to_user_id     317 non-null    float64
 10  quoted_user_id          69 non-null     float64
 11  quoted_status_id        69 non-null     float64
 12  retweeted_user_id       1001 non-null   float64
 13  retweeted_status_id     1001 non-null   float64
 14  created_at              2791 non-null   

### Representative sample based on week and location

In [172]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2017_12_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2017_12_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2017-w48,Acre,4,0.001433
1,2017-w48,Alagoas,1,0.000358
2,2017-w48,Amazonas,1,0.000358
3,2017-w48,Bahia,2,0.000717
4,2017-w48,Ceara,3,0.001075
...,...,...,...,...
121,2017-w53,Minas Gerais,17,0.006091
122,2017-w53,Paraiba,1,0.000358
123,2017-w53,Parana,1,0.000358
124,2017-w53,Rio Grande Do Sul,1,0.000358


In [173]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2017-w48,Acre,4,0.001433,3
1,2017-w48,Alagoas,1,0.000358,0
2,2017-w48,Amazonas,1,0.000358,0
3,2017-w48,Bahia,2,0.000717,1
4,2017-w48,Ceara,3,0.001075,2
...,...,...,...,...,...
121,2017-w53,Minas Gerais,17,0.006091,14
122,2017-w53,Paraiba,1,0.000358,0
123,2017-w53,Parana,1,0.000358,0
124,2017-w53,Rio Grande Do Sul,1,0.000358,0


In [174]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2017_12_neu_sample_size = pd.merge(df_2017_12_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [175]:
# Step 6: Perform the stratified sampling
df_2017_12_neu_sample = df_2017_12_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2017_12_neu_sample)} rows.")


Sampled 2382 rows.


In [176]:
# Concatenate the sampled tweets into one newline-separated string for the prompt
text_gpt_2017_12_neu = df_2017_12_neu_sample['text'].astype(str).str.cat(sep='\n')

### Prompts and loops

In [177]:
# Fill the TWEETS placeholder of the shared prompt template with the sampled December tweets
prompt_12 = prompt_raw.replace('TWEETS', text_gpt_2017_12_neu)

In [178]:
# Send the December prompt as a single user message and keep the raw completion.
# temperature=0.7 controls the randomness of the output, so repeated runs can word
# the summary differently; max_tokens=400 caps the length of the reply.
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": prompt_12}],
    max_tokens=400,
    temperature=0.7,
)

In [179]:
# Show the generated summary text (first choice of the chat completion).
# NOTE(review): the prompt requests at most 300 characters, but the reply length is not checked here.
response.choices[0].message.content

'The text discusses various aspects of vaccination, including the importance and requirements for vaccines against diseases like rabies, dengue, and yellow fever. It highlights recommendations from health authorities such as Anvisa regarding who should receive specific vaccines and the implications of vaccination for both humans and pets. There are personal anecdotes about experiences with vaccinations, awareness campaigns, and the emotional responses associated with them. Additionally, the text mentions the expansion of vaccination services in pharmacies and the ongoing public health initiatives to combat diseases.'

In [180]:
#summary = []
#dataset = []

In [181]:
# Record the label for this month's summary; it is paired positionally with the summary appended next.
# NOTE(review): assumes `dataset` is the running list of labels; re-running this cell appends the label again.
dataset.append("2017_12_neutral")

In [182]:
# Store the model's summary for this month, positionally matching the label added to `dataset`.
# NOTE(review): `response` is reused for every month, so this must run right after this month's API call.
summary.append(response.choices[0].message.content)

In [183]:
# Rebuild the running 2017 summary table from the parallel `dataset` / `summary` lists.
# zip() silently truncates to the shorter list, which would mis-pair labels and summaries
# if one of the append cells was run more often than the other, so check alignment first.
assert len(dataset) == len(summary), (
    f"dataset ({len(dataset)}) and summary ({len(summary)}) lists are out of sync"
)
df_2017_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2017_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  12 non-null     object
 1   summary  12 non-null     object
dtypes: object(2)
memory usage: 320.0+ bytes


In [189]:
# Display the accumulated summaries; tail(14) returns every row while the table has fewer than 14 rows
df_2017_neu.tail(14)

Unnamed: 0,dataset,summary
0,2017_01_neutral,The sentences discuss the importance and logis...
1,2017_02_neutral,The text discusses the importance of vaccinati...
2,2017_03_neutral,The text discusses the ongoing vaccination cam...
3,2017_04_neutral,The text discusses the vaccination campaigns a...
4,2017_05_neutral,The text discusses the ongoing vaccination cam...
5,2017_06_neutral,The text discusses various vaccination campaig...
6,2017_07_neutral,The texts discuss various vaccination events a...
7,2017_08_neutral,The text discusses various aspects of vaccinat...
8,2017_09_neutral,The text discusses various vaccination campaig...
9,2017_10_neutral,The text discusses various aspects of vaccinat...


In [185]:
# Inspect the full API response object (finish_reason, model, message content, ...)
response

<OpenAIObject chat.completion id=chatcmpl-AUBiLZ1Vci4N7Rzs5KRxA0X4wJKaW at 0x21599f7c310> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various aspects of vaccination, including the importance and requirements for vaccines against diseases like rabies, dengue, and yellow fever. It highlights recommendations from health authorities such as Anvisa regarding who should receive specific vaccines and the implications of vaccination for both humans and pets. There are personal anecdotes about experiences with vaccinations, awareness campaigns, and the emotional responses associated with them. Additionally, the text mentions the expansion of vaccination services in pharmacies and the ongoing public health initiatives to combat diseases.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731758233,
  "id": "chatcmpl-AUBiLZ1Vci4N7Rzs5KRxA0X4wJKaW

## All 2017 neutral

In [188]:
# Normalise the labels before export: some months were recorded with the typo "neuitive"
# instead of "neutral". Literal (non-regex) replacement; it is a no-op when no label
# contains the typo, so the cell is safe to re-run and works on a fresh kernel.
df_2017_neu['dataset'] = df_2017_neu['dataset'].str.replace('neuitive', 'neutral', regex=False)

In [190]:
# Persist the monthly 2017 neutral-tweet summaries to CSV.
# NOTE(review): with to_csv defaults the DataFrame index is written as an unnamed first column — confirm downstream readers expect it.
df_2017_neu.to_csv('../data/summary_2017_neutral_tweets.csv')