# Import libraries and data

In [1]:
# Import libraries
import os
import openai
import pandas as pd
import numpy as np
import openpyxl

In [2]:
# Get OpenAI API secret key (change this file with your own OpenAI key)
open_ai_key = open('../data/local/openai_key.txt', 'r').read()
openai.api_key = open_ai_key
#openai.Model.list()

In [3]:
# Prompt for GPT
prompt_raw = "I would like to summarise in maximum 300 characters and in English what the following sentences in Portuguese talk about. \
Do not use aspects mentioned only in one sentence as part of the summary: TWEETS"

# 2014 neutral

## January

In [4]:
# Get monthly data
df_2014_01_neu = pd.read_csv('../data/local/df_neu_2014-01.csv').reset_index()
df_2014_01_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2275 entries, 0 to 2274
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2275 non-null   int64  
 1   LocationCode            2275 non-null   object 
 2   LocationName            2275 non-null   object 
 3   StatesCode              2275 non-null   object 
 4   ...4                    2275 non-null   int64  
 5   id                      2275 non-null   float64
 6   text                    2275 non-null   object 
 7   label                   2275 non-null   object 
 8   in_reply_to_status_id   236 non-null    float64
 9   in_reply_to_user_id     258 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       388 non-null    float64
 13  retweeted_status_id     388 non-null    float64
 14  created_at              2275 non-null   

### Representative sample based on week and location

In [5]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_01_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_01_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w01,Acre,3,0.001319
1,2014-w01,Amapa,1,0.000440
2,2014-w01,Amazonas,9,0.003956
3,2014-w01,Bahia,20,0.008791
4,2014-w01,Ceara,8,0.003516
...,...,...,...,...
99,2014-w05,Rondonia,2,0.000879
100,2014-w05,Roraima,7,0.003077
101,2014-w05,Santa Catarina,10,0.004396
102,2014-w05,Sao Paulo,84,0.036923


In [6]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w01,Acre,3,0.001319,3
1,2014-w01,Amapa,1,0.000440,1
2,2014-w01,Amazonas,9,0.003956,9
3,2014-w01,Bahia,20,0.008791,21
4,2014-w01,Ceara,8,0.003516,8
...,...,...,...,...,...
99,2014-w05,Rondonia,2,0.000879,2
100,2014-w05,Roraima,7,0.003077,7
101,2014-w05,Santa Catarina,10,0.004396,10
102,2014-w05,Sao Paulo,84,0.036923,90


In [7]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_01_neu_sample_size = pd.merge(df_2014_01_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [8]:
# Step 6: Perform the stratified sampling
#df_2014_01_neu_sample = df_2014_01_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_01_neu_sample = df_2014_01_neu
# Step 7: Check the result
print(f"Sampled {len(df_2014_01_neu_sample)} rows.")


Sampled 2275 rows.


In [9]:
text_gpt_2014_01_neu = '\n'.join(df_2014_01_neu_sample['text'].astype(str))

### Prompts and loops

In [10]:
prompt_1 = prompt_raw.replace('TWEETS', text_gpt_2014_01_neu)

In [11]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_1}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [12]:
response.choices[0].message.content

'The text discusses various aspects of vaccination, focusing particularly on the HPV vaccine being introduced in Brazil. The Ministry of Health plans to offer the HPV vaccine for girls aged 11 to 13 starting March 10. Additionally, there are mentions of other vaccinations, personal experiences, and cultural perceptions surrounding vaccines, including humor and societal commentary about the necessity and timing of vaccinations. Overall, the text highlights the importance of vaccination programs and public health initiatives in Brazil.'

In [13]:
summary = []
dataset = []

In [14]:
dataset.append("2014_01_neutral")

In [15]:
summary.append(response.choices[0].message.content)

In [16]:
df_2014_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  1 non-null      object
 1   summary  1 non-null      object
dtypes: object(2)
memory usage: 144.0+ bytes


In [17]:
df_2014_neu.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_neutral,The text discusses various aspects of vaccinat...


In [18]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdGK7n0tJPktGcxleHMmKNCOL8Q7 at 0x1263d355ef0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various aspects of vaccination, focusing particularly on the HPV vaccine being introduced in Brazil. The Ministry of Health plans to offer the HPV vaccine for girls aged 11 to 13 starting March 10. Additionally, there are mentions of other vaccinations, personal experiences, and cultural perceptions surrounding vaccines, including humor and societal commentary about the necessity and timing of vaccinations. Overall, the text highlights the importance of vaccination programs and public health initiatives in Brazil.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864128,
  "id": "chatcmpl-AUdGK7n0tJPktGcxleHMmKNCOL8Q7",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fin

## February

In [19]:
# Get monthly data
df_2014_02_neu = pd.read_csv('../data/local/df_neu_2014-02.csv').reset_index()
df_2014_02_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1736 entries, 0 to 1735
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1736 non-null   int64  
 1   LocationCode            1736 non-null   object 
 2   LocationName            1736 non-null   object 
 3   StatesCode              1736 non-null   object 
 4   ...4                    1736 non-null   int64  
 5   id                      1736 non-null   float64
 6   text                    1736 non-null   object 
 7   label                   1736 non-null   object 
 8   in_reply_to_status_id   145 non-null    float64
 9   in_reply_to_user_id     189 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       221 non-null    float64
 13  retweeted_status_id     221 non-null    float64
 14  created_at              1736 non-null   

### Representative sample based on week and location

In [20]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_02_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_02_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w05,Alagoas,2,0.001152
1,2014-w05,Amazonas,5,0.002880
2,2014-w05,Bahia,13,0.007488
3,2014-w05,Ceara,15,0.008641
4,2014-w05,Distrito Federal,12,0.006912
...,...,...,...,...
107,2014-w09,Rio Grande Do Sul,21,0.012097
108,2014-w09,Santa Catarina,2,0.001152
109,2014-w09,Sao Paulo,31,0.017857
110,2014-w09,Sergipe,7,0.004032


In [21]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w05,Alagoas,2,0.001152,2
1,2014-w05,Amazonas,5,0.002880,7
2,2014-w05,Bahia,13,0.007488,18
3,2014-w05,Ceara,15,0.008641,21
4,2014-w05,Distrito Federal,12,0.006912,16
...,...,...,...,...,...
107,2014-w09,Rio Grande Do Sul,21,0.012097,29
108,2014-w09,Santa Catarina,2,0.001152,2
109,2014-w09,Sao Paulo,31,0.017857,43
110,2014-w09,Sergipe,7,0.004032,9


In [22]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_02_neu_sample_size = pd.merge(df_2014_02_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [23]:
# Step 6: Perform the stratified sampling
#df_2014_02_neu_sample = df_2014_02_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_02_neu_sample = df_2014_02_neu
# Step 7: Check the result
print(f"Sampled {len(df_2014_02_neu_sample)} rows.")


Sampled 1736 rows.


In [24]:
text_gpt_2014_02_neu = '\n'.join(df_2014_02_neu_sample['text'].astype(str))

### Prompts and loops

In [25]:
prompt_2 = prompt_raw.replace('TWEETS', text_gpt_2014_02_neu)

In [26]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_2}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [27]:
response.choices[0].message.content

'The text discusses various vaccination campaigns and events in Brazil, highlighting the rollout of vaccines against diseases like HPV, HIV, and measles, as well as the importance of vaccination for public health. It mentions the involvement of government initiatives, community reactions, and specific dates for vaccination drives. Additionally, it touches on controversies surrounding certain vaccines and individual experiences related to vaccination.'

In [28]:
#summary = []
#dataset = []

In [29]:
dataset.append("2014_02_neutral")

In [30]:
summary.append(response.choices[0].message.content)

In [31]:
df_2014_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  2 non-null      object
 1   summary  2 non-null      object
dtypes: object(2)
memory usage: 160.0+ bytes


In [32]:
df_2014_neu.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_neutral,The text discusses various aspects of vaccinat...
1,2014_02_neutral,The text discusses various vaccination campaig...


In [33]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdGSK9fQqiIDba2Mhgcus4DBFwN7 at 0x1263ce4c630> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns and events in Brazil, highlighting the rollout of vaccines against diseases like HPV, HIV, and measles, as well as the importance of vaccination for public health. It mentions the involvement of government initiatives, community reactions, and specific dates for vaccination drives. Additionally, it touches on controversies surrounding certain vaccines and individual experiences related to vaccination.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864136,
  "id": "chatcmpl-AUdGSK9fQqiIDba2Mhgcus4DBFwN7",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_9b78b61c52",
  "usage": {
    "completion_tokens": 72,
    "completion

## March

In [34]:
# Get monthly data
df_2014_03_neu = pd.read_csv('../data/local/df_neu_2014-03.csv').reset_index()
df_2014_03_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5260 entries, 0 to 5259
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   5260 non-null   int64  
 1   LocationCode            5260 non-null   object 
 2   LocationName            5260 non-null   object 
 3   StatesCode              5260 non-null   object 
 4   ...4                    5260 non-null   int64  
 5   id                      5260 non-null   float64
 6   text                    5260 non-null   object 
 7   label                   5260 non-null   object 
 8   in_reply_to_status_id   434 non-null    float64
 9   in_reply_to_user_id     506 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       694 non-null    float64
 13  retweeted_status_id     694 non-null    float64
 14  created_at              5260 non-null   

### Representative sample based on week and location

In [35]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_03_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_03_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w09,Acre,1,0.000190
1,2014-w09,Alagoas,9,0.001711
2,2014-w09,Amapa,2,0.000380
3,2014-w09,Amazonas,5,0.000951
4,2014-w09,Bahia,19,0.003612
...,...,...,...,...
111,2014-w13,Roraima,5,0.000951
112,2014-w13,Santa Catarina,28,0.005323
113,2014-w13,Sao Paulo,155,0.029468
114,2014-w13,Sergipe,4,0.000760


In [36]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w09,Acre,1,0.000190,0
1,2014-w09,Alagoas,9,0.001711,4
2,2014-w09,Amapa,2,0.000380,0
3,2014-w09,Amazonas,5,0.000951,2
4,2014-w09,Bahia,19,0.003612,8
...,...,...,...,...,...
111,2014-w13,Roraima,5,0.000951,2
112,2014-w13,Santa Catarina,28,0.005323,13
113,2014-w13,Sao Paulo,155,0.029468,72
114,2014-w13,Sergipe,4,0.000760,1


In [37]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_03_neu_sample_size = pd.merge(df_2014_03_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [38]:
# Step 6: Perform the stratified sampling
df_2014_03_neu_sample = df_2014_03_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2014_03_neu_sample)} rows.")


Sampled 2382 rows.


In [39]:
text_gpt_2014_03_neu = '\n'.join(df_2014_03_neu_sample['text'].astype(str))

### Prompts and loops

In [40]:
prompt_3 = prompt_raw.replace('TWEETS', text_gpt_2014_03_neu)

In [41]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_3}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [42]:
response.choices[0].message.content

'The upcoming vaccination campaign against HPV (Human Papillomavirus) will start next week for girls aged 11 to 13 across Brazil. The initiative aims to prevent cervical cancer and is part of the national health program. However, the vaccine has sparked controversy and divided opinions among parents and healthcare professionals regarding its necessity and safety.'

In [43]:
#summary = []
#dataset = []

In [44]:
dataset.append("2014_03_neutral")

In [45]:
summary.append(response.choices[0].message.content)

In [46]:
df_2014_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  3 non-null      object
 1   summary  3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes


In [47]:
df_2014_neu.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_neutral,The text discusses various aspects of vaccinat...
1,2014_02_neutral,The text discusses various vaccination campaig...
2,2014_03_neutral,The upcoming vaccination campaign against HPV ...


In [48]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdGaYklB0DVBhq8C8yTPlnGBvIv3 at 0x1263d8e7450> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The upcoming vaccination campaign against HPV (Human Papillomavirus) will start next week for girls aged 11 to 13 across Brazil. The initiative aims to prevent cervical cancer and is part of the national health program. However, the vaccine has sparked controversy and divided opinions among parents and healthcare professionals regarding its necessity and safety.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864144,
  "id": "chatcmpl-AUdGaYklB0DVBhq8C8yTPlnGBvIv3",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 65,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
     

## April

In [49]:
# Get monthly data
df_2014_04_neu = pd.read_csv('../data/local/df_neu_2014-04.csv').reset_index()
df_2014_04_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5838 entries, 0 to 5837
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   5838 non-null   int64  
 1   LocationCode            5838 non-null   object 
 2   LocationName            5838 non-null   object 
 3   StatesCode              5838 non-null   object 
 4   ...4                    5838 non-null   int64  
 5   id                      5838 non-null   float64
 6   text                    5838 non-null   object 
 7   label                   5838 non-null   object 
 8   in_reply_to_status_id   219 non-null    float64
 9   in_reply_to_user_id     259 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       787 non-null    float64
 13  retweeted_status_id     787 non-null    float64
 14  created_at              5838 non-null   

### Representative sample based on week and location

In [50]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_04_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_04_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w13,Alagoas,8,0.001370
1,2014-w13,Amapa,1,0.000171
2,2014-w13,Amazonas,1,0.000171
3,2014-w13,Bahia,17,0.002912
4,2014-w13,Ceara,5,0.000856
...,...,...,...,...
137,2014-w18,Pernambuco,12,0.002055
138,2014-w18,Rio Grande Do Sul,35,0.005995
139,2014-w18,Santa Catarina,6,0.001028
140,2014-w18,Sao Paulo,30,0.005139


In [51]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w13,Alagoas,8,0.001370,3
1,2014-w13,Amapa,1,0.000171,0
2,2014-w13,Amazonas,1,0.000171,0
3,2014-w13,Bahia,17,0.002912,7
4,2014-w13,Ceara,5,0.000856,2
...,...,...,...,...,...
137,2014-w18,Pernambuco,12,0.002055,5
138,2014-w18,Rio Grande Do Sul,35,0.005995,14
139,2014-w18,Santa Catarina,6,0.001028,2
140,2014-w18,Sao Paulo,30,0.005139,12


In [52]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_04_neu_sample_size = pd.merge(df_2014_04_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [53]:
# Step 6: Perform the stratified sampling
df_2014_04_neu_sample = df_2014_04_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2014_04_neu_sample)} rows.")


Sampled 2377 rows.


In [54]:
text_gpt_2014_04_neu = '\n'.join(df_2014_04_neu_sample['text'].astype(str))

### Prompts and loops

In [55]:
prompt_4 = prompt_raw.replace('TWEETS', text_gpt_2014_04_neu)

In [56]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_4}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [57]:
response.choices[0].message.content

'The provided sentences discuss various vaccination campaigns in Brazil, focusing on the HPV and flu vaccines. Key points include the ongoing vaccination efforts against HPV for girls aged 11 to 13, extension of measles vaccination campaigns, and the upcoming flu vaccination period starting April 22. Concerns about vaccine hesitancy and the importance of vaccination for public health are also mentioned. The overall tone reflects a mix of humor, urgency, and community involvement in vaccination activities.'

In [58]:
#summary = []
#dataset = []

In [59]:
dataset.append("2014_04_neutral")

In [60]:
summary.append(response.choices[0].message.content)

In [61]:
df_2014_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  4 non-null      object
 1   summary  4 non-null      object
dtypes: object(2)
memory usage: 192.0+ bytes


In [62]:
df_2014_neu.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_neutral,The text discusses various aspects of vaccinat...
1,2014_02_neutral,The text discusses various vaccination campaig...
2,2014_03_neutral,The upcoming vaccination campaign against HPV ...
3,2014_04_neutral,The provided sentences discuss various vaccina...


In [63]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdGh9q3rPuIM3UDiZjeToy8CbOWf at 0x1263da55b30> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The provided sentences discuss various vaccination campaigns in Brazil, focusing on the HPV and flu vaccines. Key points include the ongoing vaccination efforts against HPV for girls aged 11 to 13, extension of measles vaccination campaigns, and the upcoming flu vaccination period starting April 22. Concerns about vaccine hesitancy and the importance of vaccination for public health are also mentioned. The overall tone reflects a mix of humor, urgency, and community involvement in vaccination activities.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864151,
  "id": "chatcmpl-AUdGh9q3rPuIM3UDiZjeToy8CbOWf",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
 

## May

In [64]:
# Get monthly data
df_2014_05_neu = pd.read_csv('../data/local/df_neu_2014-05.csv').reset_index()
df_2014_05_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6664 entries, 0 to 6663
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   6664 non-null   int64  
 1   LocationCode            6664 non-null   object 
 2   LocationName            6664 non-null   object 
 3   StatesCode              6664 non-null   object 
 4   ...4                    6664 non-null   int64  
 5   id                      6664 non-null   float64
 6   text                    6664 non-null   object 
 7   label                   6664 non-null   object 
 8   in_reply_to_status_id   269 non-null    float64
 9   in_reply_to_user_id     336 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       1422 non-null   float64
 13  retweeted_status_id     1422 non-null   float64
 14  created_at              6664 non-null   

### Representative sample based on week and location

In [65]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_05_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_05_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w18,Acre,12,0.001801
1,2014-w18,Alagoas,28,0.004202
2,2014-w18,Amapa,4,0.000600
3,2014-w18,Amazonas,32,0.004802
4,2014-w18,Bahia,101,0.015156
...,...,...,...,...
116,2014-w22,Rio Grande Do Sul,65,0.009754
117,2014-w22,Roraima,4,0.000600
118,2014-w22,Santa Catarina,13,0.001951
119,2014-w22,Sao Paulo,93,0.013956


In [66]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w18,Acre,12,0.001801,4
1,2014-w18,Alagoas,28,0.004202,10
2,2014-w18,Amapa,4,0.000600,1
3,2014-w18,Amazonas,32,0.004802,11
4,2014-w18,Bahia,101,0.015156,37
...,...,...,...,...,...
116,2014-w22,Rio Grande Do Sul,65,0.009754,23
117,2014-w22,Roraima,4,0.000600,1
118,2014-w22,Santa Catarina,13,0.001951,4
119,2014-w22,Sao Paulo,93,0.013956,34


In [67]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_05_neu_sample_size = pd.merge(df_2014_05_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [68]:
# Step 6: Perform the stratified sampling
df_2014_05_neu_sample = df_2014_05_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2014_05_neu_sample)} rows.")


Sampled 2392 rows.


In [69]:
text_gpt_2014_05_neu = '\n'.join(df_2014_05_neu_sample['text'].astype(str))

### Prompts and loops

In [70]:
prompt_5 = prompt_raw.replace('TWEETS', text_gpt_2014_05_neu)

In [71]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_5}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [72]:
response.choices[0].message.content

'The text discusses various aspects of vaccination campaigns in Brazil, particularly focusing on the influenza (flu) vaccine and the vaccination against foot-and-mouth disease for livestock. It mentions the low turnout for the flu vaccine, the extension of vaccination deadlines, and the importance of reaching vaccination targets for specific populations. Additionally, it addresses public concerns about vaccine safety and efficacy, while promoting awareness through videos and educational materials.'

In [73]:
#summary = []
#dataset = []

In [74]:
dataset.append("2014_05_neutral")

In [75]:
summary.append(response.choices[0].message.content)

In [76]:
df_2014_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  5 non-null      object
 1   summary  5 non-null      object
dtypes: object(2)
memory usage: 208.0+ bytes


In [77]:
df_2014_neu.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_neutral,The text discusses various aspects of vaccinat...
1,2014_02_neutral,The text discusses various vaccination campaig...
2,2014_03_neutral,The upcoming vaccination campaign against HPV ...
3,2014_04_neutral,The provided sentences discuss various vaccina...
4,2014_05_neutral,The text discusses various aspects of vaccinat...


In [78]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdGnr9lYFEchpq1tiMARHMBxGllS at 0x1263ddcf400> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various aspects of vaccination campaigns in Brazil, particularly focusing on the influenza (flu) vaccine and the vaccination against foot-and-mouth disease for livestock. It mentions the low turnout for the flu vaccine, the extension of vaccination deadlines, and the importance of reaching vaccination targets for specific populations. Additionally, it addresses public concerns about vaccine safety and efficacy, while promoting awareness through videos and educational materials.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864157,
  "id": "chatcmpl-AUdGnr9lYFEchpq1tiMARHMBxGllS",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage"

## June

In [79]:
# Get monthly data
df_2014_06_neu = pd.read_csv('../data/local/df_neu_2014-06.csv').reset_index()
df_2014_06_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2049 entries, 0 to 2048
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2049 non-null   int64  
 1   LocationCode            2049 non-null   object 
 2   LocationName            2049 non-null   object 
 3   StatesCode              2049 non-null   object 
 4   ...4                    2049 non-null   int64  
 5   id                      2049 non-null   float64
 6   text                    2049 non-null   object 
 7   label                   2049 non-null   object 
 8   in_reply_to_status_id   203 non-null    float64
 9   in_reply_to_user_id     242 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       372 non-null    float64
 13  retweeted_status_id     372 non-null    float64
 14  created_at              2049 non-null   

### Representative sample based on week and location

In [80]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_06_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_06_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w22,Acre,3,0.001464
1,2014-w22,Alagoas,6,0.002928
2,2014-w22,Amazonas,2,0.000976
3,2014-w22,Bahia,22,0.010737
4,2014-w22,Ceara,3,0.001464
...,...,...,...,...
110,2014-w26,Roraima,1,0.000488
111,2014-w26,Santa Catarina,13,0.006345
112,2014-w26,Sao Paulo,115,0.056125
113,2014-w26,Sergipe,4,0.001952


In [81]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w22,Acre,3,0.001464,3
1,2014-w22,Alagoas,6,0.002928,7
2,2014-w22,Amazonas,2,0.000976,2
3,2014-w22,Bahia,22,0.010737,26
4,2014-w22,Ceara,3,0.001464,3
...,...,...,...,...,...
110,2014-w26,Roraima,1,0.000488,1
111,2014-w26,Santa Catarina,13,0.006345,15
112,2014-w26,Sao Paulo,115,0.056125,137
113,2014-w26,Sergipe,4,0.001952,4


In [82]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_06_neu_sample_size = pd.merge(df_2014_06_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [84]:
# Step 6: Perform the stratified sampling
#df_2014_06_neu_sample = df_2014_06_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_06_neu_sample = df_2014_06_neu
# Step 7: Check the result
print(f"Sampled {len(df_2014_06_neu_sample)} rows.")


Sampled 2049 rows.


In [85]:
text_gpt_2014_06_neu = '\n'.join(df_2014_06_neu_sample['text'].astype(str))

### Prompts and loops

In [86]:
prompt_6 = prompt_raw.replace('TWEETS', text_gpt_2014_06_neu)

In [87]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_6}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [88]:
response.choices[0].message.content

'The text discusses various aspects of vaccination in Brazil, including ongoing campaigns for flu and HPV vaccinations, discrepancies in vaccination rates in different regions, and the importance of maintaining vaccination schedules. It also humorously references the need for vaccinations related to the soccer player Luis Suárez, particularly regarding rabies after his biting incident.'

In [89]:
#summary = []
#dataset = []

In [90]:
dataset.append("2014_06_neutral")

In [91]:
summary.append(response.choices[0].message.content)

In [92]:
df_2014_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  6 non-null      object
 1   summary  6 non-null      object
dtypes: object(2)
memory usage: 224.0+ bytes


In [93]:
df_2014_neu.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_neutral,The text discusses various aspects of vaccinat...
1,2014_02_neutral,The text discusses various vaccination campaig...
2,2014_03_neutral,The upcoming vaccination campaign against HPV ...
3,2014_04_neutral,The provided sentences discuss various vaccina...
4,2014_05_neutral,The text discusses various aspects of vaccinat...
5,2014_06_neutral,The text discusses various aspects of vaccinat...


In [94]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdIjjmQgk0VW3ninfcyIc71nFuFA at 0x12641fb5090> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various aspects of vaccination in Brazil, including ongoing campaigns for flu and HPV vaccinations, discrepancies in vaccination rates in different regions, and the importance of maintaining vaccination schedules. It also humorously references the need for vaccinations related to the soccer player Luis Su\u00e1rez, particularly regarding rabies after his biting incident.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864277,
  "id": "chatcmpl-AUdIjjmQgk0VW3ninfcyIc71nFuFA",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_f8dd36be86",
  "usage": {
    "completion_tokens": 62,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
  

## July

In [95]:
# Get monthly data
df_2014_07_neu = pd.read_csv('../data/local/df_neu_2014-07.csv').reset_index()
df_2014_07_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987 entries, 0 to 1986
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1987 non-null   int64  
 1   LocationCode            1987 non-null   object 
 2   LocationName            1987 non-null   object 
 3   StatesCode              1987 non-null   object 
 4   ...4                    1987 non-null   int64  
 5   id                      1987 non-null   float64
 6   text                    1987 non-null   object 
 7   label                   1987 non-null   object 
 8   in_reply_to_status_id   147 non-null    float64
 9   in_reply_to_user_id     166 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       280 non-null    float64
 13  retweeted_status_id     280 non-null    float64
 14  created_at              1987 non-null   

### Representative sample based on week and location

In [96]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_07_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_07_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w26,Acre,1,0.000503
1,2014-w26,Alagoas,1,0.000503
2,2014-w26,Amapa,10,0.005033
3,2014-w26,Amazonas,1,0.000503
4,2014-w26,Bahia,7,0.003523
...,...,...,...,...
121,2014-w31,Rio Grande Do Sul,17,0.008556
122,2014-w31,Santa Catarina,20,0.010065
123,2014-w31,Sao Paulo,50,0.025164
124,2014-w31,Sergipe,4,0.002013


In [97]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w26,Acre,1,0.000503,1
1,2014-w26,Alagoas,1,0.000503,1
2,2014-w26,Amapa,10,0.005033,12
3,2014-w26,Amazonas,1,0.000503,1
4,2014-w26,Bahia,7,0.003523,8
...,...,...,...,...,...
121,2014-w31,Rio Grande Do Sul,17,0.008556,20
122,2014-w31,Santa Catarina,20,0.010065,24
123,2014-w31,Sao Paulo,50,0.025164,61
124,2014-w31,Sergipe,4,0.002013,4


In [98]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_07_neu_sample_size = pd.merge(df_2014_07_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [100]:
# Step 6: Perform the stratified sampling
#df_2014_07_neu_sample = df_2014_07_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_07_neu_sample = df_2014_07_neu
# Step 7: Check the result
print(f"Sampled {len(df_2014_07_neu_sample)} rows.")


Sampled 1987 rows.


In [101]:
text_gpt_2014_07_neu = '\n'.join(df_2014_07_neu_sample['text'].astype(str))

### Prompts and loops

In [102]:
prompt_7 = prompt_raw.replace('TWEETS', text_gpt_2014_07_neu)

In [103]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_7}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [104]:
response.choices[0].message.content

'The provided Portuguese sentences predominantly discuss various vaccination campaigns in Brazil, such as those against tuberculosis, hepatitis A, and leishmaniasis. They highlight vaccination efforts for children, adults, and animals, along with reminders for individuals to get vaccinated and reports on the coverage and effectiveness of these vaccines. The text also touches on public health initiatives and community reactions to vaccination.'

In [105]:
#summary = []
#dataset = []

In [106]:
dataset.append("2014_07_neutral")

In [107]:
summary.append(response.choices[0].message.content)

In [108]:
df_2014_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  7 non-null      object
 1   summary  7 non-null      object
dtypes: object(2)
memory usage: 240.0+ bytes


In [109]:
df_2014_neu.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_neutral,The text discusses various aspects of vaccinat...
1,2014_02_neutral,The text discusses various vaccination campaig...
2,2014_03_neutral,The upcoming vaccination campaign against HPV ...
3,2014_04_neutral,The provided sentences discuss various vaccina...
4,2014_05_neutral,The text discusses various aspects of vaccinat...
5,2014_06_neutral,The text discusses various aspects of vaccinat...
6,2014_07_neutral,The provided Portuguese sentences predominantl...


In [110]:
response

<OpenAIObject chat.completion id=chatcmpl-AUdJhtdSCJxaBsjfHnLCJh2eE9SYZ at 0x1263e4b8ef0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The provided Portuguese sentences predominantly discuss various vaccination campaigns in Brazil, such as those against tuberculosis, hepatitis A, and leishmaniasis. They highlight vaccination efforts for children, adults, and animals, along with reminders for individuals to get vaccinated and reports on the coverage and effectiveness of these vaccines. The text also touches on public health initiatives and community reactions to vaccination.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731864337,
  "id": "chatcmpl-AUdJhtdSCJxaBsjfHnLCJh2eE9SYZ",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 72,
    "completion_tokens_

## August

In [111]:
# Get monthly data
df_2014_08_neu = pd.read_csv('../data/local/df_neu_2014-08.csv').reset_index()
df_2014_08_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2302 entries, 0 to 2301
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2302 non-null   int64  
 1   LocationCode            2302 non-null   object 
 2   LocationName            2302 non-null   object 
 3   StatesCode              2302 non-null   object 
 4   ...4                    2302 non-null   int64  
 5   id                      2302 non-null   float64
 6   text                    2302 non-null   object 
 7   label                   2302 non-null   object 
 8   in_reply_to_status_id   153 non-null    float64
 9   in_reply_to_user_id     162 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       365 non-null    float64
 13  retweeted_status_id     365 non-null    float64
 14  created_at              2302 non-null   

### Representative sample based on week and location

In [112]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_08_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_08_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w31,Acre,1,0.000434
1,2014-w31,Alagoas,1,0.000434
2,2014-w31,Amapa,3,0.001303
3,2014-w31,Amazonas,6,0.002606
4,2014-w31,Bahia,25,0.010860
...,...,...,...,...
107,2014-w35,Roraima,3,0.001303
108,2014-w35,Santa Catarina,13,0.005647
109,2014-w35,Sao Paulo,98,0.042572
110,2014-w35,Sergipe,1,0.000434


In [113]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w31,Acre,1,0.000434,1
1,2014-w31,Alagoas,1,0.000434,1
2,2014-w31,Amapa,3,0.001303,3
3,2014-w31,Amazonas,6,0.002606,6
4,2014-w31,Bahia,25,0.010860,26
...,...,...,...,...,...
107,2014-w35,Roraima,3,0.001303,3
108,2014-w35,Santa Catarina,13,0.005647,13
109,2014-w35,Sao Paulo,98,0.042572,104
110,2014-w35,Sergipe,1,0.000434,1


In [114]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_08_neu_sample_size = pd.merge(df_2014_08_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [116]:
# Step 6: Perform the stratified sampling
#df_2014_08_neu_sample = df_2014_08_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_08_neu_sample = df_2014_08_neu
# Step 7: Check the result
print(f"Sampled {len(df_2014_08_neu_sample)} rows.")


Sampled 2302 rows.


In [117]:
text_gpt_2014_08_neu = '\n'.join(df_2014_08_neu_sample['text'].astype(str))

### Prompts and loops

In [118]:
prompt_8 = prompt_raw.replace('TWEETS', text_gpt_2014_08_neu)

In [119]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_8}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [120]:
response.choices[0].message.content

'The text discusses various vaccination campaigns in Brazil, including those for rabies, HPV, hepatitis A, and Ebola. It highlights efforts by health authorities to vaccinate children and animals, the development of vaccines, and the importance of public health initiatives. There are also humorous remarks about the vaccination experience and societal attitudes towards vaccines.'

In [121]:
#summary = []
#dataset = []

In [122]:
dataset.append("2014_08_neutral")

In [123]:
summary.append(response.choices[0].message.content)

In [124]:
df_2014_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  8 non-null      object
 1   summary  8 non-null      object
dtypes: object(2)
memory usage: 256.0+ bytes


In [125]:
df_2014_neu.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_neutral,The text discusses various aspects of vaccinat...
1,2014_02_neutral,The text discusses various vaccination campaig...
2,2014_03_neutral,The upcoming vaccination campaign against HPV ...
3,2014_04_neutral,The provided sentences discuss various vaccina...
4,2014_05_neutral,The text discusses various aspects of vaccinat...
5,2014_06_neutral,The text discusses various aspects of vaccinat...
6,2014_07_neutral,The provided Portuguese sentences predominantl...
7,2014_08_neutral,The text discusses various vaccination campaig...


In [126]:
response

<OpenAIObject chat.completion id=chatcmpl-AUeRs2HelTQZZRJsVTCc0KoBHW315 at 0x12643e72400> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns in Brazil, including those for rabies, HPV, hepatitis A, and Ebola. It highlights efforts by health authorities to vaccinate children and animals, the development of vaccines, and the importance of public health initiatives. There are also humorous remarks about the vaccination experience and societal attitudes towards vaccines.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731868688,
  "id": "chatcmpl-AUeRs2HelTQZZRJsVTCc0KoBHW315",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 64,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_to

## September

In [127]:
# Get monthly data
df_2014_09_neu = pd.read_csv('../data/local/df_neu_2014-09.csv').reset_index()
df_2014_09_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3069 entries, 0 to 3068
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3069 non-null   int64  
 1   LocationCode            3069 non-null   object 
 2   LocationName            3069 non-null   object 
 3   StatesCode              3069 non-null   object 
 4   ...4                    3069 non-null   int64  
 5   id                      3069 non-null   float64
 6   text                    3069 non-null   object 
 7   label                   3069 non-null   object 
 8   in_reply_to_status_id   184 non-null    float64
 9   in_reply_to_user_id     207 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       568 non-null    float64
 13  retweeted_status_id     568 non-null    float64
 14  created_at              3069 non-null   

### Representative sample based on week and location

In [128]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_09_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_09_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w35,Alagoas,3,0.000978
1,2014-w35,Amapa,2,0.000652
2,2014-w35,Amazonas,8,0.002607
3,2014-w35,Bahia,39,0.012708
4,2014-w35,Ceara,31,0.010101
...,...,...,...,...
113,2014-w39,Roraima,8,0.002607
114,2014-w39,Santa Catarina,23,0.007494
115,2014-w39,Sao Paulo,121,0.039427
116,2014-w39,Sergipe,38,0.012382


In [129]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w35,Alagoas,3,0.000978,2
1,2014-w35,Amapa,2,0.000652,1
2,2014-w35,Amazonas,8,0.002607,6
3,2014-w35,Bahia,39,0.012708,31
4,2014-w35,Ceara,31,0.010101,24
...,...,...,...,...,...
113,2014-w39,Roraima,8,0.002607,6
114,2014-w39,Santa Catarina,23,0.007494,18
115,2014-w39,Sao Paulo,121,0.039427,96
116,2014-w39,Sergipe,38,0.012382,30


In [130]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_09_neu_sample_size = pd.merge(df_2014_09_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [131]:
# Step 6: Perform the stratified sampling
df_2014_09_neu_sample = df_2014_09_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2014_09_neu_sample)} rows.")


Sampled 2382 rows.


In [132]:
text_gpt_2014_09_neu = '\n'.join(df_2014_09_neu_sample['text'].astype(str))

### Prompts and loops

In [133]:
prompt_9 = prompt_raw.replace('TWEETS', text_gpt_2014_09_neu)

In [134]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_9}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [135]:
response.choices[0].message.content

'The text discusses the vaccination campaign against HPV for girls aged 11 to 13, emphasizing the importance of receiving the second dose at designated health centers and schools. Various regions in Brazil are implementing these campaigns, with some areas experiencing low participation rates. Additionally, it mentions the availability of vaccines for rabies and hepatitis A, along with other vaccination efforts for animals.'

In [136]:
#summary = []
#dataset = []

In [137]:
dataset.append("2014_09_neutral")

In [138]:
summary.append(response.choices[0].message.content)

In [139]:
df_2014_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  9 non-null      object
 1   summary  9 non-null      object
dtypes: object(2)
memory usage: 272.0+ bytes


In [140]:
df_2014_neu.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_neutral,The text discusses various aspects of vaccinat...
1,2014_02_neutral,The text discusses various vaccination campaig...
2,2014_03_neutral,The upcoming vaccination campaign against HPV ...
3,2014_04_neutral,The provided sentences discuss various vaccina...
4,2014_05_neutral,The text discusses various aspects of vaccinat...
5,2014_06_neutral,The text discusses various aspects of vaccinat...
6,2014_07_neutral,The provided Portuguese sentences predominantl...
7,2014_08_neutral,The text discusses various vaccination campaig...
8,2014_09_neutral,The text discusses the vaccination campaign ag...


In [141]:
response

<OpenAIObject chat.completion id=chatcmpl-AUeS25TuFdbTt55PWzV7w7BKDaKx0 at 0x1264406b040> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses the vaccination campaign against HPV for girls aged 11 to 13, emphasizing the importance of receiving the second dose at designated health centers and schools. Various regions in Brazil are implementing these campaigns, with some areas experiencing low participation rates. Additionally, it mentions the availability of vaccines for rabies and hepatitis A, along with other vaccination efforts for animals.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731868698,
  "id": "chatcmpl-AUeS25TuFdbTt55PWzV7w7BKDaKx0",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_f8dd36be86",
  "usage": {
    "completion_tokens": 72,
    "completion_tokens_details": {
      "a

## October

In [142]:
# Get monthly data
df_2014_10_neu = pd.read_csv('../data/local/df_neu_2014-10.csv').reset_index()
df_2014_10_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4861 entries, 0 to 4860
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   4861 non-null   int64  
 1   LocationCode            4861 non-null   object 
 2   LocationName            4861 non-null   object 
 3   StatesCode              4861 non-null   object 
 4   ...4                    4861 non-null   int64  
 5   id                      4861 non-null   float64
 6   text                    4861 non-null   object 
 7   label                   4861 non-null   object 
 8   in_reply_to_status_id   369 non-null    float64
 9   in_reply_to_user_id     386 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       1445 non-null   float64
 13  retweeted_status_id     1445 non-null   float64
 14  created_at              4861 non-null   

### Representative sample based on week and location

In [143]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_10_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_10_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w40,Acre,1,0.000206
1,2014-w40,Alagoas,5,0.001029
2,2014-w40,Amapa,11,0.002263
3,2014-w40,Amazonas,10,0.002057
4,2014-w40,Bahia,35,0.007200
...,...,...,...,...
113,2014-w44,Roraima,5,0.001029
114,2014-w44,Santa Catarina,7,0.001440
115,2014-w44,Sao Paulo,112,0.023041
116,2014-w44,Sergipe,7,0.001440


In [144]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w40,Acre,1,0.000206,0
1,2014-w40,Alagoas,5,0.001029,2
2,2014-w40,Amapa,11,0.002263,5
3,2014-w40,Amazonas,10,0.002057,5
4,2014-w40,Bahia,35,0.007200,17
...,...,...,...,...,...
113,2014-w44,Roraima,5,0.001029,2
114,2014-w44,Santa Catarina,7,0.001440,3
115,2014-w44,Sao Paulo,112,0.023041,56
116,2014-w44,Sergipe,7,0.001440,3


In [145]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_10_neu_sample_size = pd.merge(df_2014_10_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [146]:
# Step 6: Perform the stratified sampling
#df_2014_10_neu_sample = df_2014_10_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_10_neu_sample = df_2014_10_neu
# Step 7: Check the result
print(f"Sampled {len(df_2014_10_neu_sample)} rows.")


Sampled 4861 rows.


In [147]:
text_gpt_2014_10_neu = '\n'.join(df_2014_10_neu_sample['text'].astype(str))

### Prompts and loops

In [148]:
prompt_10 = prompt_raw.replace('TWEETS', text_gpt_2014_10_neu)

In [149]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_10}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [150]:
response.choices[0].message.content

"The texts discuss various aspects of vaccination, focusing on the controversy surrounding the inclusion of horse vaccines as health expenditures in Minas Gerais, particularly under Aécio Neves' government. Critics highlight this as an inappropriate allocation of health funds, while supporters argue it relates to public health measures. Additionally, there are mentions of vaccination campaigns for both animals and humans, including efforts against rabies and HPV, and the ongoing research and testing of an Ebola vaccine. Overall, the conversation reflects societal attitudes towards vaccination and public health policy."

In [151]:
#summary = []
#dataset = []

In [152]:
dataset.append("2014_10_neutral")

In [153]:
summary.append(response.choices[0].message.content)

In [154]:
df_2014_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  10 non-null     object
 1   summary  10 non-null     object
dtypes: object(2)
memory usage: 288.0+ bytes


In [155]:
df_2014_neu.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_neutral,The text discusses various aspects of vaccinat...
1,2014_02_neutral,The text discusses various vaccination campaig...
2,2014_03_neutral,The upcoming vaccination campaign against HPV ...
3,2014_04_neutral,The provided sentences discuss various vaccina...
4,2014_05_neutral,The text discusses various aspects of vaccinat...
5,2014_06_neutral,The text discusses various aspects of vaccinat...
6,2014_07_neutral,The provided Portuguese sentences predominantl...
7,2014_08_neutral,The text discusses various vaccination campaig...
8,2014_09_neutral,The text discusses the vaccination campaign ag...
9,2014_10_neutral,The texts discuss various aspects of vaccinati...


In [156]:
response

<OpenAIObject chat.completion id=chatcmpl-AUeSCUWZqZtZEBNqZL8NJ64KQ8ThA at 0x126440f10e0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The texts discuss various aspects of vaccination, focusing on the controversy surrounding the inclusion of horse vaccines as health expenditures in Minas Gerais, particularly under A\u00e9cio Neves' government. Critics highlight this as an inappropriate allocation of health funds, while supporters argue it relates to public health measures. Additionally, there are mentions of vaccination campaigns for both animals and humans, including efforts against rabies and HPV, and the ongoing research and testing of an Ebola vaccine. Overall, the conversation reflects societal attitudes towards vaccination and public health policy.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731868708,
  "id": "chatcmpl-AUeSCUWZqZtZEBNqZL8NJ6

## November

In [157]:
# Get monthly data
df_2014_11_neu = pd.read_csv('../data/local/df_neu_2014-11.csv').reset_index()
df_2014_11_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6348 entries, 0 to 6347
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   6348 non-null   int64  
 1   LocationCode            6348 non-null   object 
 2   LocationName            6348 non-null   object 
 3   StatesCode              6348 non-null   object 
 4   ...4                    6348 non-null   int64  
 5   id                      6348 non-null   float64
 6   text                    6348 non-null   object 
 7   label                   6348 non-null   object 
 8   in_reply_to_status_id   163 non-null    float64
 9   in_reply_to_user_id     189 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       944 non-null    float64
 13  retweeted_status_id     944 non-null    float64
 14  created_at              6348 non-null   

### Representative sample based on week and location

In [158]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_11_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_11_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w44,Acre,3,0.000473
1,2014-w44,Alagoas,12,0.001890
2,2014-w44,Amapa,4,0.000630
3,2014-w44,Amazonas,12,0.001890
4,2014-w44,Bahia,52,0.008192
...,...,...,...,...
119,2014-w48,Roraima,8,0.001260
120,2014-w48,Santa Catarina,18,0.002836
121,2014-w48,Sao Paulo,125,0.019691
122,2014-w48,Sergipe,11,0.001733


In [159]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w44,Acre,3,0.000473,1
1,2014-w44,Alagoas,12,0.001890,4
2,2014-w44,Amapa,4,0.000630,1
3,2014-w44,Amazonas,12,0.001890,4
4,2014-w44,Bahia,52,0.008192,20
...,...,...,...,...,...
119,2014-w48,Roraima,8,0.001260,3
120,2014-w48,Santa Catarina,18,0.002836,6
121,2014-w48,Sao Paulo,125,0.019691,48
122,2014-w48,Sergipe,11,0.001733,4


In [160]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_11_neu_sample_size = pd.merge(df_2014_11_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [161]:
# Step 6: Perform the stratified sampling
df_2014_11_neu_sample = df_2014_11_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2014_11_neu_sample)} rows.")


Sampled 2390 rows.


In [162]:
text_gpt_2014_11_neu = '\n'.join(df_2014_11_neu_sample['text'].astype(str))

### Prompts and loops

In [163]:
prompt_11 = prompt_raw.replace('TWEETS', text_gpt_2014_11_neu)

In [164]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_11}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [165]:
response.choices[0].message.content

"The text discusses various vaccination campaigns in Brazil, emphasizing the registration of a new flu vaccine by Anvisa and the government's plans to immunize millions of children against polio and measles. It also highlights the start of vaccination efforts for these diseases, alongside campaigns for rabies vaccination in animals, including details about specific dates and goals for these initiatives."

In [166]:
#summary = []
#dataset = []

In [167]:
dataset.append("2014_11_neutral")

In [168]:
summary.append(response.choices[0].message.content)

In [169]:
df_2014_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  11 non-null     object
 1   summary  11 non-null     object
dtypes: object(2)
memory usage: 304.0+ bytes


In [170]:
df_2014_neu.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_neutral,The text discusses various aspects of vaccinat...
1,2014_02_neutral,The text discusses various vaccination campaig...
2,2014_03_neutral,The upcoming vaccination campaign against HPV ...
3,2014_04_neutral,The provided sentences discuss various vaccina...
4,2014_05_neutral,The text discusses various aspects of vaccinat...
5,2014_06_neutral,The text discusses various aspects of vaccinat...
6,2014_07_neutral,The provided Portuguese sentences predominantl...
7,2014_08_neutral,The text discusses various vaccination campaig...
8,2014_09_neutral,The text discusses the vaccination campaign ag...
9,2014_10_neutral,The texts discuss various aspects of vaccinati...


In [171]:
response

<OpenAIObject chat.completion id=chatcmpl-AUeSVV3Yu50zB8MwblpJjmigMKBTx at 0x12643926ea0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various vaccination campaigns in Brazil, emphasizing the registration of a new flu vaccine by Anvisa and the government's plans to immunize millions of children against polio and measles. It also highlights the start of vaccination efforts for these diseases, alongside campaigns for rabies vaccination in animals, including details about specific dates and goals for these initiatives.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731868727,
  "id": "chatcmpl-AUeSVV3Yu50zB8MwblpJjmigMKBTx",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 69,
    "completion_tokens_details": {
      "accepted_prediction_t

## December

In [172]:
# Get monthly data
df_2014_12_neu = pd.read_csv('../data/local/df_neu_2014-12.csv').reset_index()
df_2014_12_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2918 entries, 0 to 2917
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   2918 non-null   int64  
 1   LocationCode            2918 non-null   object 
 2   LocationName            2918 non-null   object 
 3   StatesCode              2918 non-null   object 
 4   ...4                    2918 non-null   int64  
 5   id                      2918 non-null   float64
 6   text                    2918 non-null   object 
 7   label                   2918 non-null   object 
 8   in_reply_to_status_id   116 non-null    float64
 9   in_reply_to_user_id     128 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       544 non-null    float64
 13  retweeted_status_id     544 non-null    float64
 14  created_at              2918 non-null   

### Representative sample based on week and location

In [173]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2014_12_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2014_12_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2014-w48,Acre,1,0.000343
1,2014-w48,Alagoas,7,0.002399
2,2014-w48,Amapa,1,0.000343
3,2014-w48,Amazonas,6,0.002056
4,2014-w48,Bahia,18,0.006169
...,...,...,...,...
126,2014-w53,Pernambuco,3,0.001028
127,2014-w53,Rio Grande Do Sul,3,0.001028
128,2014-w53,Roraima,3,0.001028
129,2014-w53,Sao Paulo,5,0.001714


In [174]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2014-w48,Acre,1,0.000343,0
1,2014-w48,Alagoas,7,0.002399,5
2,2014-w48,Amapa,1,0.000343,0
3,2014-w48,Amazonas,6,0.002056,5
4,2014-w48,Bahia,18,0.006169,15
...,...,...,...,...,...
126,2014-w53,Pernambuco,3,0.001028,2
127,2014-w53,Rio Grande Do Sul,3,0.001028,2
128,2014-w53,Roraima,3,0.001028,2
129,2014-w53,Sao Paulo,5,0.001714,4


In [175]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2014_12_neu_sample_size = pd.merge(df_2014_12_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [176]:
# Step 6: Perform the stratified sampling
#df_2014_12_neu_sample = df_2014_12_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2014_12_neu_sample = df_2014_12_neu
# Step 7: Check the result
print(f"Sampled {len(df_2014_12_neu_sample)} rows.")


Sampled 2918 rows.


In [177]:
text_gpt_2014_12_neu = '\n'.join(df_2014_12_neu_sample['text'].astype(str))

### Prompts and loops

In [178]:
prompt_12 = prompt_raw.replace('TWEETS', text_gpt_2014_12_neu)

In [179]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_12}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [180]:
response.choices[0].message.content

'The text discusses the extension and completion of vaccination campaigns against poliomyelitis and measles in various regions of Brazil. It highlights that many states, including Acre and Amapá, did not meet vaccination targets, leading to the need for extensions. The campaigns aimed to vaccinate children under five years old, with various announcements reminding parents of deadlines and vaccination locations. Overall, the urgency for increased vaccination rates and compliance with health guidelines is emphasized, particularly in light of ongoing public health challenges.'

In [181]:
#summary = []
#dataset = []

In [182]:
dataset.append("2014_12_neutral")

In [183]:
summary.append(response.choices[0].message.content)

In [184]:
df_2014_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2014_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  12 non-null     object
 1   summary  12 non-null     object
dtypes: object(2)
memory usage: 320.0+ bytes


In [185]:
df_2014_neu.tail(14)

Unnamed: 0,dataset,summary
0,2014_01_neutral,The text discusses various aspects of vaccinat...
1,2014_02_neutral,The text discusses various vaccination campaig...
2,2014_03_neutral,The upcoming vaccination campaign against HPV ...
3,2014_04_neutral,The provided sentences discuss various vaccina...
4,2014_05_neutral,The text discusses various aspects of vaccinat...
5,2014_06_neutral,The text discusses various aspects of vaccinat...
6,2014_07_neutral,The provided Portuguese sentences predominantl...
7,2014_08_neutral,The text discusses various vaccination campaig...
8,2014_09_neutral,The text discusses the vaccination campaign ag...
9,2014_10_neutral,The texts discuss various aspects of vaccinati...


In [186]:
response

<OpenAIObject chat.completion id=chatcmpl-AUeSicXpbrwFa5tPPkKPTtVvmMTTW at 0x12645c5d8b0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses the extension and completion of vaccination campaigns against poliomyelitis and measles in various regions of Brazil. It highlights that many states, including Acre and Amap\u00e1, did not meet vaccination targets, leading to the need for extensions. The campaigns aimed to vaccinate children under five years old, with various announcements reminding parents of deadlines and vaccination locations. Overall, the urgency for increased vaccination rates and compliance with health guidelines is emphasized, particularly in light of ongoing public health challenges.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731868740,
  "id": "chatcmpl-AUeSicXpbrwFa5tPPkKPTtVvmMTTW",
  "model": "gpt-4o-mini-2024-07-18",

## All 2014 neutral

In [187]:
#df_2014_neu['dataset'] = df_2014_neu['dataset'].str.replace('neuitive','neutral')

In [188]:
df_2014_neu.to_csv('../data/summary_2014_neutral_tweets.csv')