# Import libraries and data

In [39]:
# Import libraries
import os
import openai
import pandas as pd
import numpy as np
import openpyxl

In [40]:
# Get OpenAI API secret key (change this file with your own OpenAI key)
open_ai_key = open('../data/local/openai_key.txt', 'r').read()
openai.api_key = open_ai_key
#openai.Model.list()

In [41]:
# Prompt for GPT
prompt_raw = "I would like to summarise in maximum 300 characters and in English what the following sentences in Portuguese talk about. \
Do not use aspects mentioned only in one sentence as part of the summary: TWEETS"

# 2018 neutral

## January

In [42]:
# Get monthly data
df_2018_01_neu = pd.read_csv('../data/local/df_neu_2018-01.csv').reset_index()
df_2018_01_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11249 entries, 0 to 11248
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   11249 non-null  int64  
 1   LocationCode            11249 non-null  object 
 2   LocationName            11249 non-null  object 
 3   StatesCode              11249 non-null  object 
 4   ...4                    11249 non-null  int64  
 5   id                      11249 non-null  float64
 6   text                    11249 non-null  object 
 7   label                   11249 non-null  object 
 8   in_reply_to_status_id   1172 non-null   float64
 9   in_reply_to_user_id     1219 non-null   float64
 10  quoted_user_id          400 non-null    float64
 11  quoted_status_id        400 non-null    float64
 12  retweeted_user_id       4008 non-null   float64
 13  retweeted_status_id     4008 non-null   float64
 14  created_at              11249 non-null

### Representative sample based on week and location

In [43]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2018_01_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2018_01_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2018-w01,Acre,2,0.000178
1,2018-w01,Amapa,4,0.000356
2,2018-w01,Amazonas,6,0.000533
3,2018-w01,Bahia,11,0.000978
4,2018-w01,Ceara,7,0.000622
...,...,...,...,...
116,2018-w05,Roraima,2,0.000178
117,2018-w05,Santa Catarina,10,0.000889
118,2018-w05,Sao Paulo,357,0.031736
119,2018-w05,Sergipe,6,0.000533


In [44]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2018-w01,Acre,2,0.000178,0
1,2018-w01,Amapa,4,0.000356,0
2,2018-w01,Amazonas,6,0.000533,1
3,2018-w01,Bahia,11,0.000978,2
4,2018-w01,Ceara,7,0.000622,1
...,...,...,...,...,...
116,2018-w05,Roraima,2,0.000178,0
117,2018-w05,Santa Catarina,10,0.000889,2
118,2018-w05,Sao Paulo,357,0.031736,77
119,2018-w05,Sergipe,6,0.000533,1


In [45]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2018_01_neu_sample_size = pd.merge(df_2018_01_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [46]:
# Step 6: Perform the stratified sampling
df_2018_01_neu_sample = df_2018_01_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2018_01_neu_sample)} rows.")


Sampled 2386 rows.


In [47]:
text_gpt_2018_01_neu = '\n'.join(df_2018_01_neu_sample['text'].astype(str))

### Prompts and loops

In [50]:
prompt_1 = prompt_raw.replace('TWEETS', text_gpt_2018_01_neu)

In [51]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_1}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [52]:
response.choices[0].message.content

"The text discusses various aspects of vaccination against yellow fever in Brazil, emphasizing its importance due to recent outbreaks. It highlights the need for individuals to be vaccinated, especially if they live in or travel to affected areas. The text mentions the efficacy of the vaccine, with one dose providing long-term immunity, and encourages people to get vaccinated, while also addressing misconceptions and fears surrounding vaccination. Additionally, it notes the logistical efforts to increase vaccination availability and the public's response to the health crisis."

In [53]:
summary = []
dataset = []

In [54]:
dataset.append("2018_01_neuitive")

In [55]:
summary.append(response.choices[0].message.content)

In [56]:
df_2018_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2018_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  1 non-null      object
 1   summary  1 non-null      object
dtypes: object(2)
memory usage: 144.0+ bytes


In [57]:
df_2018_neu.tail(14)

Unnamed: 0,dataset,summary
0,2018_01_positive,The text discusses various aspects of vaccinat...


In [58]:
response

<OpenAIObject chat.completion id=chatcmpl-AUAfHqjLb9baHGPVuzViRZCOl2XYM at 0x2c315572c70> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various aspects of vaccination against yellow fever in Brazil, emphasizing its importance due to recent outbreaks. It highlights the need for individuals to be vaccinated, especially if they live in or travel to affected areas. The text mentions the efficacy of the vaccine, with one dose providing long-term immunity, and encourages people to get vaccinated, while also addressing misconceptions and fears surrounding vaccination. Additionally, it notes the logistical efforts to increase vaccination availability and the public's response to the health crisis.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731754199,
  "id": "chatcmpl-AUAfHqjLb9baHGPVuzViRZCOl2XYM",
  "model": "gpt-4o-mini-2024-07-18",
 

## February

In [59]:
# Get monthly data
df_2018_02_neu = pd.read_csv('../data/local/df_neu_2018-02.csv').reset_index()
df_2018_02_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4108 entries, 0 to 4107
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   4108 non-null   int64  
 1   LocationCode            4108 non-null   object 
 2   LocationName            4108 non-null   object 
 3   StatesCode              4108 non-null   object 
 4   ...4                    4108 non-null   int64  
 5   id                      4108 non-null   float64
 6   text                    4108 non-null   object 
 7   label                   4108 non-null   object 
 8   in_reply_to_status_id   363 non-null    float64
 9   in_reply_to_user_id     384 non-null    float64
 10  quoted_user_id          222 non-null    float64
 11  quoted_status_id        222 non-null    float64
 12  retweeted_user_id       1657 non-null   float64
 13  retweeted_status_id     1657 non-null   float64
 14  created_at              4108 non-null   

### Representative sample based on week and location

In [60]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2018_02_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2018_02_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2018-w05,Acre,6,0.001461
1,2018-w05,Alagoas,7,0.001704
2,2018-w05,Amapa,3,0.000730
3,2018-w05,Amazonas,30,0.007303
4,2018-w05,Bahia,61,0.014849
...,...,...,...,...
112,2018-w09,Rondonia,1,0.000243
113,2018-w09,Roraima,3,0.000730
114,2018-w09,Santa Catarina,5,0.001217
115,2018-w09,Sao Paulo,78,0.018987


In [61]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2018-w05,Acre,6,0.001461,3
1,2018-w05,Alagoas,7,0.001704,4
2,2018-w05,Amapa,3,0.000730,1
3,2018-w05,Amazonas,30,0.007303,17
4,2018-w05,Bahia,61,0.014849,36
...,...,...,...,...,...
112,2018-w09,Rondonia,1,0.000243,0
113,2018-w09,Roraima,3,0.000730,1
114,2018-w09,Santa Catarina,5,0.001217,2
115,2018-w09,Sao Paulo,78,0.018987,46


In [62]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2018_02_neu_sample_size = pd.merge(df_2018_02_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [63]:
# Step 6: Perform the stratified sampling
df_2018_02_neu_sample = df_2018_02_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2018_02_neu_sample)} rows.")


Sampled 2383 rows.


In [64]:
text_gpt_2018_02_neu = '\n'.join(df_2018_02_neu_sample['text'].astype(str))

### Prompts and loops

In [65]:
prompt_2 = prompt_raw.replace('TWEETS', text_gpt_2018_02_neu)

In [66]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_2}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [67]:
response.choices[0].message.content

'The text discusses the importance of vaccination against yellow fever, highlighting a significant increase in vaccine demand due to health concerns and recent fatalities. It emphasizes the efficacy and safety of the vaccine, mentions the need for public awareness campaigns, and criticizes misinformation surrounding vaccinations. Various locations are organizing vaccination events and encouraging residents to get vaccinated to prevent outbreaks.'

In [68]:
#summary = []
#dataset = []

In [69]:
dataset.append("2018_02_neuitive")

In [70]:
summary.append(response.choices[0].message.content)

In [71]:
df_2018_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2018_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  2 non-null      object
 1   summary  2 non-null      object
dtypes: object(2)
memory usage: 160.0+ bytes


In [72]:
df_2018_neu.tail(14)

Unnamed: 0,dataset,summary
0,2018_01_positive,The text discusses various aspects of vaccinat...
1,2018_02_positive,The text discusses the importance of vaccinati...


In [73]:
response

<OpenAIObject chat.completion id=chatcmpl-AUAfL0pm49DfhoUqb8c1iBZEaaLRY at 0x2c315b17bd0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses the importance of vaccination against yellow fever, highlighting a significant increase in vaccine demand due to health concerns and recent fatalities. It emphasizes the efficacy and safety of the vaccine, mentions the need for public awareness campaigns, and criticizes misinformation surrounding vaccinations. Various locations are organizing vaccination events and encouraging residents to get vaccinated to prevent outbreaks.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731754203,
  "id": "chatcmpl-AUAfL0pm49DfhoUqb8c1iBZEaaLRY",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_9b78b61c52",
  "usage": {
    "completion_tokens": 67,
    "completion_toke

## March

In [74]:
# Get monthly data
df_2018_03_neu = pd.read_csv('../data/local/df_neu_2018-03.csv').reset_index()
df_2018_03_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3302 entries, 0 to 3301
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3302 non-null   int64  
 1   LocationCode            3302 non-null   object 
 2   LocationName            3302 non-null   object 
 3   StatesCode              3302 non-null   object 
 4   ...4                    3302 non-null   int64  
 5   id                      3302 non-null   float64
 6   text                    3302 non-null   object 
 7   label                   3302 non-null   object 
 8   in_reply_to_status_id   325 non-null    float64
 9   in_reply_to_user_id     340 non-null    float64
 10  quoted_user_id          121 non-null    float64
 11  quoted_status_id        121 non-null    float64
 12  retweeted_user_id       967 non-null    float64
 13  retweeted_status_id     967 non-null    float64
 14  created_at              3302 non-null   

### Representative sample based on week and location

In [75]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2018_03_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2018_03_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2018-w09,Acre,1,0.000303
1,2018-w09,Alagoas,4,0.001211
2,2018-w09,Amazonas,12,0.003634
3,2018-w09,Bahia,13,0.003937
4,2018-w09,Ceara,3,0.000909
...,...,...,...,...
110,2018-w13,Rondonia,1,0.000303
111,2018-w13,Roraima,2,0.000606
112,2018-w13,Santa Catarina,10,0.003028
113,2018-w13,Sao Paulo,137,0.041490


In [76]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2018-w09,Acre,1,0.000303,0
1,2018-w09,Alagoas,4,0.001211,2
2,2018-w09,Amazonas,12,0.003634,8
3,2018-w09,Bahia,13,0.003937,9
4,2018-w09,Ceara,3,0.000909,2
...,...,...,...,...,...
110,2018-w13,Rondonia,1,0.000303,0
111,2018-w13,Roraima,2,0.000606,1
112,2018-w13,Santa Catarina,10,0.003028,7
113,2018-w13,Sao Paulo,137,0.041490,101


In [79]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2018_03_neu_sample_size = pd.merge(df_2018_03_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [80]:
# Step 6: Perform the stratified sampling
df_2018_03_neu_sample = df_2018_03_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2018_03_neu_sample)} rows.")


Sampled 2384 rows.


In [81]:
text_gpt_2018_03_neu = '\n'.join(df_2018_03_neu_sample['text'].astype(str))

### Prompts and loops

In [82]:
prompt_3 = prompt_raw.replace('TWEETS', text_gpt_2018_03_neu)

In [83]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_3}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [84]:
response.choices[0].message.content

'The provided text discusses various vaccination campaigns in Brazil, particularly focusing on influenza, yellow fever, and measles. It highlights initiatives by local governments and the Ministry of Health to increase vaccination coverage, address public health concerns, and combat misinformation about vaccines. The importance of vaccination as a preventive measure against infectious diseases is emphasized throughout.'

In [85]:
#summary = []
#dataset = []

In [86]:
dataset.append("2018_03_neuitive")

In [87]:
summary.append(response.choices[0].message.content)

In [88]:
df_2018_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2018_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  3 non-null      object
 1   summary  3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes


In [89]:
df_2018_neu.tail(14)

Unnamed: 0,dataset,summary
0,2018_01_positive,The text discusses various aspects of vaccinat...
1,2018_02_positive,The text discusses the importance of vaccinati...
2,2018_03_positive,The provided text discusses various vaccinatio...


In [90]:
response

<OpenAIObject chat.completion id=chatcmpl-AUAg9WS7B3HfGobMvJnTzMC5eAsXs at 0x2c311d55680> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The provided text discusses various vaccination campaigns in Brazil, particularly focusing on influenza, yellow fever, and measles. It highlights initiatives by local governments and the Ministry of Health to increase vaccination coverage, address public health concerns, and combat misinformation about vaccines. The importance of vaccination as a preventive measure against infectious diseases is emphasized throughout.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731754253,
  "id": "chatcmpl-AUAg9WS7B3HfGobMvJnTzMC5eAsXs",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 64,
    "completion_tokens_details": {
      "accep

## April

In [91]:
# Get monthly data
df_2018_04_neu = pd.read_csv('../data/local/df_neu_2018-04.csv').reset_index()
df_2018_04_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3358 entries, 0 to 3357
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3358 non-null   int64  
 1   LocationCode            3358 non-null   object 
 2   LocationName            3358 non-null   object 
 3   StatesCode              3358 non-null   object 
 4   ...4                    3358 non-null   int64  
 5   id                      3358 non-null   float64
 6   text                    3358 non-null   object 
 7   label                   3358 non-null   object 
 8   in_reply_to_status_id   333 non-null    float64
 9   in_reply_to_user_id     342 non-null    float64
 10  quoted_user_id          115 non-null    float64
 11  quoted_status_id        115 non-null    float64
 12  retweeted_user_id       997 non-null    float64
 13  retweeted_status_id     997 non-null    float64
 14  created_at              3358 non-null   

### Representative sample based on week and location

In [92]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2018_04_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2018_04_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2018-w13,Amazonas,2,0.000596
1,2018-w13,Bahia,3,0.000893
2,2018-w13,Ceara,2,0.000596
3,2018-w13,Distrito Federal,2,0.000596
4,2018-w13,Goias,3,0.000893
...,...,...,...,...
124,2018-w18,Rio Grande Do Sul,2,0.000596
125,2018-w18,Rondonia,1,0.000298
126,2018-w18,Roraima,1,0.000298
127,2018-w18,Santa Catarina,5,0.001489


In [93]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2018-w13,Amazonas,2,0.000596,1
1,2018-w13,Bahia,3,0.000893,2
2,2018-w13,Ceara,2,0.000596,1
3,2018-w13,Distrito Federal,2,0.000596,1
4,2018-w13,Goias,3,0.000893,2
...,...,...,...,...,...
124,2018-w18,Rio Grande Do Sul,2,0.000596,1
125,2018-w18,Rondonia,1,0.000298,0
126,2018-w18,Roraima,1,0.000298,0
127,2018-w18,Santa Catarina,5,0.001489,3


In [94]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2018_04_neu_sample_size = pd.merge(df_2018_04_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [95]:
# Step 6: Perform the stratified sampling
df_2018_04_neu_sample = df_2018_04_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2018_04_neu_sample)} rows.")


Sampled 2380 rows.


In [96]:
text_gpt_2018_04_neu = '\n'.join(df_2018_04_neu_sample['text'].astype(str))

### Prompts and loops

In [97]:
prompt_4 = prompt_raw.replace('TWEETS', text_gpt_2018_04_neu)

In [98]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_4}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [99]:
response.choices[0].message.content

'Cuba has developed a cancer vaccine that has reportedly cured over 4,000 patients. Various discussions emphasize the importance of vaccination, addressing issues like vaccine hesitancy and the need for public health campaigns. There are mentions of other vaccines for diseases like flu, diabetes, and HPV, along with the social responsibility of vaccination to prevent outbreaks.'

In [100]:
#summary = []
#dataset = []

In [101]:
dataset.append("2018_04_neuitive")

In [102]:
summary.append(response.choices[0].message.content)

In [103]:
df_2018_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2018_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  4 non-null      object
 1   summary  4 non-null      object
dtypes: object(2)
memory usage: 192.0+ bytes


In [104]:
df_2018_neu.tail(14)

Unnamed: 0,dataset,summary
0,2018_01_positive,The text discusses various aspects of vaccinat...
1,2018_02_positive,The text discusses the importance of vaccinati...
2,2018_03_positive,The provided text discusses various vaccinatio...
3,2018_04_positive,Cuba has developed a cancer vaccine that has r...


In [105]:
response

<OpenAIObject chat.completion id=chatcmpl-AUAgJSSOPneo2LzMdFMf5piAwRJ9d at 0x2c311df7e50> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Cuba has developed a cancer vaccine that has reportedly cured over 4,000 patients. Various discussions emphasize the importance of vaccination, addressing issues like vaccine hesitancy and the need for public health campaigns. There are mentions of other vaccines for diseases like flu, diabetes, and HPV, along with the social responsibility of vaccination to prevent outbreaks.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731754263,
  "id": "chatcmpl-AUAgJSSOPneo2LzMdFMf5piAwRJ9d",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_9b78b61c52",
  "usage": {
    "completion_tokens": 68,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_to

## May

In [106]:
# Get monthly data
df_2018_05_neu = pd.read_csv('../data/local/df_neu_2018-05.csv').reset_index()
df_2018_05_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3425 entries, 0 to 3424
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3425 non-null   int64  
 1   LocationCode            3425 non-null   object 
 2   LocationName            3425 non-null   object 
 3   StatesCode              3425 non-null   object 
 4   ...4                    3425 non-null   int64  
 5   id                      3425 non-null   float64
 6   text                    3425 non-null   object 
 7   label                   3425 non-null   object 
 8   in_reply_to_status_id   508 non-null    float64
 9   in_reply_to_user_id     527 non-null    float64
 10  quoted_user_id          58 non-null     float64
 11  quoted_status_id        58 non-null     float64
 12  retweeted_user_id       937 non-null    float64
 13  retweeted_status_id     937 non-null    float64
 14  created_at              3425 non-null   

### Representative sample based on week and location

In [107]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2018_05_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2018_05_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2018-w18,Alagoas,3,0.000876
1,2018-w18,Amapa,7,0.002044
2,2018-w18,Amazonas,5,0.001460
3,2018-w18,Bahia,16,0.004672
4,2018-w18,Ceara,30,0.008759
...,...,...,...,...
113,2018-w22,Roraima,1,0.000292
114,2018-w22,Santa Catarina,2,0.000584
115,2018-w22,Sao Paulo,61,0.017810
116,2018-w22,Sergipe,5,0.001460


In [108]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2018-w18,Alagoas,3,0.000876,2
1,2018-w18,Amapa,7,0.002044,5
2,2018-w18,Amazonas,5,0.001460,3
3,2018-w18,Bahia,16,0.004672,11
4,2018-w18,Ceara,30,0.008759,21
...,...,...,...,...,...
113,2018-w22,Roraima,1,0.000292,0
114,2018-w22,Santa Catarina,2,0.000584,1
115,2018-w22,Sao Paulo,61,0.017810,43
116,2018-w22,Sergipe,5,0.001460,3


In [109]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2018_05_neu_sample_size = pd.merge(df_2018_05_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [110]:
# Step 6: Perform the stratified sampling
df_2018_05_neu_sample = df_2018_05_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2018_05_neu_sample)} rows.")


Sampled 2398 rows.


In [111]:
text_gpt_2018_05_neu = '\n'.join(df_2018_05_neu_sample['text'].astype(str))

### Prompts and loops

In [112]:
prompt_5 = prompt_raw.replace('TWEETS', text_gpt_2018_05_neu)

In [113]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_5}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [114]:
response.choices[0].message.content

'The provided text discusses various vaccination campaigns in Brazil, particularly against influenza (the flu), highlighting events like "Dia D" where a large number of vaccination posts are set up to immunize the public. It mentions the importance of vaccinating children, pregnant women, and other priority groups, and emphasizes successful immunization efforts, including milestones and statistics. The text also touches on related health initiatives, such as vaccinations against other diseases like yellow fever and HPV, and the recognition of Brazil as free from foot-and-mouth disease due to successful vaccination programs.'

In [115]:
#summary = []
#dataset = []

In [116]:
dataset.append("2018_05_neuitive")

In [117]:
summary.append(response.choices[0].message.content)

In [118]:
df_2018_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2018_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  5 non-null      object
 1   summary  5 non-null      object
dtypes: object(2)
memory usage: 208.0+ bytes


In [119]:
df_2018_neu.tail(14)

Unnamed: 0,dataset,summary
0,2018_01_positive,The text discusses various aspects of vaccinat...
1,2018_02_positive,The text discusses the importance of vaccinati...
2,2018_03_positive,The provided text discusses various vaccinatio...
3,2018_04_positive,Cuba has developed a cancer vaccine that has r...
4,2018_05_positive,The provided text discusses various vaccinatio...


In [120]:
response

<OpenAIObject chat.completion id=chatcmpl-AUAgSXC9PVwB2z8iE5dkrygw722PX at 0x2c315a3e7c0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The provided text discusses various vaccination campaigns in Brazil, particularly against influenza (the flu), highlighting events like \"Dia D\" where a large number of vaccination posts are set up to immunize the public. It mentions the importance of vaccinating children, pregnant women, and other priority groups, and emphasizes successful immunization efforts, including milestones and statistics. The text also touches on related health initiatives, such as vaccinations against other diseases like yellow fever and HPV, and the recognition of Brazil as free from foot-and-mouth disease due to successful vaccination programs.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731754272,
  "id": "chatcmpl-AUAgSXC9PVwB2z8iE5d

## June

In [121]:
# Get monthly data
df_2018_06_neu = pd.read_csv('../data/local/df_neu_2018-06.csv').reset_index()
df_2018_06_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3967 entries, 0 to 3966
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3967 non-null   int64  
 1   LocationCode            3967 non-null   object 
 2   LocationName            3967 non-null   object 
 3   StatesCode              3967 non-null   object 
 4   ...4                    3967 non-null   int64  
 5   id                      3967 non-null   float64
 6   text                    3967 non-null   object 
 7   label                   3967 non-null   object 
 8   in_reply_to_status_id   332 non-null    float64
 9   in_reply_to_user_id     337 non-null    float64
 10  quoted_user_id          899 non-null    float64
 11  quoted_status_id        899 non-null    float64
 12  retweeted_user_id       1878 non-null   float64
 13  retweeted_status_id     1878 non-null   float64
 14  created_at              3967 non-null   

### Representative sample based on week and location

In [122]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2018_06_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2018_06_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2018-w22,Acre,3,0.000756
1,2018-w22,Alagoas,8,0.002017
2,2018-w22,Amazonas,4,0.001008
3,2018-w22,Bahia,12,0.003025
4,2018-w22,Ceara,5,0.001260
...,...,...,...,...
115,2018-w26,Roraima,2,0.000504
116,2018-w26,Santa Catarina,35,0.008823
117,2018-w26,Sao Paulo,272,0.068566
118,2018-w26,Sergipe,11,0.002773


In [123]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2018-w22,Acre,3,0.000756,1
1,2018-w22,Alagoas,8,0.002017,4
2,2018-w22,Amazonas,4,0.001008,2
3,2018-w22,Bahia,12,0.003025,7
4,2018-w22,Ceara,5,0.001260,3
...,...,...,...,...,...
115,2018-w26,Roraima,2,0.000504,1
116,2018-w26,Santa Catarina,35,0.008823,21
117,2018-w26,Sao Paulo,272,0.068566,167
118,2018-w26,Sergipe,11,0.002773,6


In [124]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2018_06_neu_sample_size = pd.merge(df_2018_06_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [125]:
# Step 6: Perform the stratified sampling
df_2018_06_neu_sample = df_2018_06_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2018_06_neu_sample)} rows.")


Sampled 2394 rows.


In [126]:
text_gpt_2018_06_neu = '\n'.join(df_2018_06_neu_sample['text'].astype(str))

### Prompts and loops

In [130]:
prompt_6 = prompt_raw.replace('TWEETS', text_gpt_2018_06_neu)

In [131]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_6}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [132]:
response.choices[0].message.content

'The text discusses the ongoing vaccination campaigns in Brazil, particularly for the flu and yellow fever. It highlights the importance of vaccinations for public health, emphasizing that many people are still unvaccinated despite the availability of free vaccines. There is a call to action for people to vaccinate their children and themselves to prevent the resurgence of diseases like polio and measles, which are linked to declining vaccination rates due to misinformation. The text also mentions specific vaccination events and initiatives aimed at increasing public awareness and participation in vaccination programs.'

In [133]:
#summary = []
#dataset = []

In [134]:
dataset.append("2018_06_neuitive")

In [135]:
summary.append(response.choices[0].message.content)

In [136]:
df_2018_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2018_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  6 non-null      object
 1   summary  6 non-null      object
dtypes: object(2)
memory usage: 224.0+ bytes


In [137]:
df_2018_neu.tail(14)

Unnamed: 0,dataset,summary
0,2018_01_positive,The text discusses various aspects of vaccinat...
1,2018_02_positive,The text discusses the importance of vaccinati...
2,2018_03_positive,The provided text discusses various vaccinatio...
3,2018_04_positive,Cuba has developed a cancer vaccine that has r...
4,2018_05_positive,The provided text discusses various vaccinatio...
5,2018_06_positive,The text discusses the ongoing vaccination cam...


In [138]:
response

<OpenAIObject chat.completion id=chatcmpl-AUAhN8ctfq60IhPRU2gbMO7glK9I2 at 0x2c315cf9b80> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses the ongoing vaccination campaigns in Brazil, particularly for the flu and yellow fever. It highlights the importance of vaccinations for public health, emphasizing that many people are still unvaccinated despite the availability of free vaccines. There is a call to action for people to vaccinate their children and themselves to prevent the resurgence of diseases like polio and measles, which are linked to declining vaccination rates due to misinformation. The text also mentions specific vaccination events and initiatives aimed at increasing public awareness and participation in vaccination programs.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731754329,
  "id": "chatcmpl-AUAhN8ctfq60IhPRU2gbMO7glK

## July

In [139]:
# Get monthly data
df_2018_07_neu = pd.read_csv('../data/local/df_neu_2018-07.csv').reset_index()
df_2018_07_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9281 entries, 0 to 9280
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   9281 non-null   int64  
 1   LocationCode            9281 non-null   object 
 2   LocationName            9281 non-null   object 
 3   StatesCode              9281 non-null   object 
 4   ...4                    9281 non-null   int64  
 5   id                      9281 non-null   float64
 6   text                    9281 non-null   object 
 7   label                   9281 non-null   object 
 8   in_reply_to_status_id   661 non-null    float64
 9   in_reply_to_user_id     680 non-null    float64
 10  quoted_user_id          2401 non-null   float64
 11  quoted_status_id        2401 non-null   float64
 12  retweeted_user_id       5087 non-null   float64
 13  retweeted_status_id     5087 non-null   float64
 14  created_at              9281 non-null   

### Representative sample based on week and location

In [140]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2018_07_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2018_07_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2018-w26,Acre,10,0.001077
1,2018-w26,Alagoas,7,0.000754
2,2018-w26,Amapa,2,0.000215
3,2018-w26,Amazonas,22,0.002370
4,2018-w26,Bahia,49,0.005280
...,...,...,...,...
146,2018-w31,Rondonia,3,0.000323
147,2018-w31,Santa Catarina,9,0.000970
148,2018-w31,Sao Paulo,81,0.008728
149,2018-w31,Sergipe,4,0.000431


In [141]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2018-w26,Acre,10,0.001077,2
1,2018-w26,Alagoas,7,0.000754,1
2,2018-w26,Amapa,2,0.000215,0
3,2018-w26,Amazonas,22,0.002370,5
4,2018-w26,Bahia,49,0.005280,12
...,...,...,...,...,...
146,2018-w31,Rondonia,3,0.000323,0
147,2018-w31,Santa Catarina,9,0.000970,2
148,2018-w31,Sao Paulo,81,0.008728,21
149,2018-w31,Sergipe,4,0.000431,1


In [142]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2018_07_neu_sample_size = pd.merge(df_2018_07_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [143]:
# Step 6: Perform the stratified sampling
df_2018_07_neu_sample = df_2018_07_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2018_07_neu_sample)} rows.")


Sampled 2380 rows.


In [144]:
text_gpt_2018_07_neu = '\n'.join(df_2018_07_neu_sample['text'].astype(str))

### Prompts and loops

In [145]:
prompt_7 = prompt_raw.replace('TWEETS', text_gpt_2018_07_neu)

In [146]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_7}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [147]:
response.choices[0].message.content

'The text emphasizes the critical importance of vaccinating children to prevent the resurgence of diseases like polio and measles. It urges parents to discuss the value of vaccination with others, highlighting the societal responsibility in maintaining public health. It also notes the precarious state of support for disabled individuals in a country facing potential outbreaks due to low vaccination rates.'

In [148]:
#summary = []
#dataset = []

In [149]:
dataset.append("2018_07_neuitive")

In [150]:
summary.append(response.choices[0].message.content)

In [151]:
df_2018_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2018_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  7 non-null      object
 1   summary  7 non-null      object
dtypes: object(2)
memory usage: 240.0+ bytes


In [152]:
df_2018_neu.tail(14)

Unnamed: 0,dataset,summary
0,2018_01_positive,The text discusses various aspects of vaccinat...
1,2018_02_positive,The text discusses the importance of vaccinati...
2,2018_03_positive,The provided text discusses various vaccinatio...
3,2018_04_positive,Cuba has developed a cancer vaccine that has r...
4,2018_05_positive,The provided text discusses various vaccinatio...
5,2018_06_positive,The text discusses the ongoing vaccination cam...
6,2018_07_positive,The text emphasizes the critical importance of...


In [153]:
response

<OpenAIObject chat.completion id=chatcmpl-AUAhXKR2HXtwqiPUntYLkfAY8zdU1 at 0x2c315d71590> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text emphasizes the critical importance of vaccinating children to prevent the resurgence of diseases like polio and measles. It urges parents to discuss the value of vaccination with others, highlighting the societal responsibility in maintaining public health. It also notes the precarious state of support for disabled individuals in a country facing potential outbreaks due to low vaccination rates.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731754339,
  "id": "chatcmpl-AUAhXKR2HXtwqiPUntYLkfAY8zdU1",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 68,
    "completion_tokens_details": {
      "accepted_prediction

## August

In [154]:
# Get monthly data
df_2018_08_neu = pd.read_csv('../data/local/df_neu_2018-08.csv').reset_index()
df_2018_08_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7154 entries, 0 to 7153
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   7154 non-null   int64  
 1   LocationCode            7154 non-null   object 
 2   LocationName            7154 non-null   object 
 3   StatesCode              7154 non-null   object 
 4   ...4                    7154 non-null   int64  
 5   id                      7154 non-null   float64
 6   text                    7154 non-null   object 
 7   label                   7154 non-null   object 
 8   in_reply_to_status_id   430 non-null    float64
 9   in_reply_to_user_id     443 non-null    float64
 10  quoted_user_id          299 non-null    float64
 11  quoted_status_id        299 non-null    float64
 12  retweeted_user_id       2082 non-null   float64
 13  retweeted_status_id     2082 non-null   float64
 14  created_at              7154 non-null   

### Representative sample based on week and location

In [155]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2018_08_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2018_08_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2018-w31,Acre,5,0.000699
1,2018-w31,Alagoas,18,0.002516
2,2018-w31,Amapa,3,0.000419
3,2018-w31,Amazonas,24,0.003355
4,2018-w31,Bahia,92,0.012860
...,...,...,...,...
118,2018-w35,Roraima,2,0.000280
119,2018-w35,Santa Catarina,22,0.003075
120,2018-w35,Sao Paulo,205,0.028655
121,2018-w35,Sergipe,8,0.001118


In [156]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2018-w31,Acre,5,0.000699,1
1,2018-w31,Alagoas,18,0.002516,6
2,2018-w31,Amapa,3,0.000419,1
3,2018-w31,Amazonas,24,0.003355,8
4,2018-w31,Bahia,92,0.012860,31
...,...,...,...,...,...
118,2018-w35,Roraima,2,0.000280,0
119,2018-w35,Santa Catarina,22,0.003075,7
120,2018-w35,Sao Paulo,205,0.028655,70
121,2018-w35,Sergipe,8,0.001118,2


In [157]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2018_08_neu_sample_size = pd.merge(df_2018_08_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [158]:
# Step 6: Perform the stratified sampling
df_2018_08_neu_sample = df_2018_08_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2018_08_neu_sample)} rows.")


Sampled 2391 rows.


In [159]:
text_gpt_2018_08_neu = '\n'.join(df_2018_08_neu_sample['text'].astype(str))

### Prompts and loops

In [160]:
prompt_8 = prompt_raw.replace('TWEETS', text_gpt_2018_08_neu)

In [161]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_8}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [162]:
response.choices[0].message.content

'The text discusses the National Vaccination Campaign in Brazil against measles and poliomyelitis running from August 6 to 31. It emphasizes the urgency of vaccinating children aged 1 to 5 years, highlighting that even previously vaccinated children should receive the doses. The goal is to reach a 95% vaccination coverage to prevent outbreaks of these diseases, which were previously considered eradicated. The campaign includes various outreach efforts and community mobilization to ensure participation.'

In [163]:
#summary = []
#dataset = []

In [164]:
dataset.append("2018_08_neuitive")

In [165]:
summary.append(response.choices[0].message.content)

In [166]:
df_2018_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2018_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  8 non-null      object
 1   summary  8 non-null      object
dtypes: object(2)
memory usage: 256.0+ bytes


In [167]:
df_2018_neu.tail(14)

Unnamed: 0,dataset,summary
0,2018_01_positive,The text discusses various aspects of vaccinat...
1,2018_02_positive,The text discusses the importance of vaccinati...
2,2018_03_positive,The provided text discusses various vaccinatio...
3,2018_04_positive,Cuba has developed a cancer vaccine that has r...
4,2018_05_positive,The provided text discusses various vaccinatio...
5,2018_06_positive,The text discusses the ongoing vaccination cam...
6,2018_07_positive,The text emphasizes the critical importance of...
7,2018_08_positive,The text discusses the National Vaccination Ca...


In [168]:
response

<OpenAIObject chat.completion id=chatcmpl-AUAhrTqBbB4YAI3jZBKAvazpRM2gT at 0x2c3165c3e00> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses the National Vaccination Campaign in Brazil against measles and poliomyelitis running from August 6 to 31. It emphasizes the urgency of vaccinating children aged 1 to 5 years, highlighting that even previously vaccinated children should receive the doses. The goal is to reach a 95% vaccination coverage to prevent outbreaks of these diseases, which were previously considered eradicated. The campaign includes various outreach efforts and community mobilization to ensure participation.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731754359,
  "id": "chatcmpl-AUAhrTqBbB4YAI3jZBKAvazpRM2gT",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "u

## September

In [169]:
# Get monthly data
df_2018_09_neu = pd.read_csv('../data/local/df_neu_2018-09.csv').reset_index()
df_2018_09_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7183 entries, 0 to 7182
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   7183 non-null   int64  
 1   LocationCode            7183 non-null   object 
 2   LocationName            7183 non-null   object 
 3   StatesCode              7183 non-null   object 
 4   ...4                    7183 non-null   int64  
 5   id                      7183 non-null   float64
 6   text                    7183 non-null   object 
 7   label                   7183 non-null   object 
 8   in_reply_to_status_id   507 non-null    float64
 9   in_reply_to_user_id     522 non-null    float64
 10  quoted_user_id          816 non-null    float64
 11  quoted_status_id        816 non-null    float64
 12  retweeted_user_id       2499 non-null   float64
 13  retweeted_status_id     2499 non-null   float64
 14  created_at              7183 non-null   

### Representative sample based on week and location

In [170]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2018_09_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2018_09_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2018-w35,Acre,7,0.000975
1,2018-w35,Alagoas,3,0.000418
2,2018-w35,Amapa,4,0.000557
3,2018-w35,Amazonas,13,0.001810
4,2018-w35,Bahia,30,0.004177
...,...,...,...,...
119,2018-w39,Roraima,7,0.000975
120,2018-w39,Santa Catarina,23,0.003202
121,2018-w39,Sao Paulo,392,0.054573
122,2018-w39,Sergipe,5,0.000696


In [171]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2018-w35,Acre,7,0.000975,2
1,2018-w35,Alagoas,3,0.000418,1
2,2018-w35,Amapa,4,0.000557,1
3,2018-w35,Amazonas,13,0.001810,4
4,2018-w35,Bahia,30,0.004177,10
...,...,...,...,...,...
119,2018-w39,Roraima,7,0.000975,2
120,2018-w39,Santa Catarina,23,0.003202,7
121,2018-w39,Sao Paulo,392,0.054573,133
122,2018-w39,Sergipe,5,0.000696,1


In [172]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2018_09_neu_sample_size = pd.merge(df_2018_09_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [173]:
# Step 6: Perform the stratified sampling
df_2018_09_neu_sample = df_2018_09_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2018_09_neu_sample)} rows.")


Sampled 2395 rows.


In [174]:
text_gpt_2018_09_neu = '\n'.join(df_2018_09_neu_sample['text'].astype(str))

### Prompts and loops

In [175]:
prompt_9 = prompt_raw.replace('TWEETS', text_gpt_2018_09_neu)

In [176]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_9}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [177]:
response.choices[0].message.content

'Researchers in the U.S. are developing a vaccine against acne, currently in testing phases and shown to be effective in inhibiting the bacteria responsible for skin inflammation, predominantly affecting teenagers. Additionally, there are ongoing vaccination campaigns targeting measles and polio, with significant coverage achieved across various regions.'

In [178]:
#summary = []
#dataset = []

In [179]:
dataset.append("2018_09_neuitive")

In [180]:
summary.append(response.choices[0].message.content)

In [181]:
df_2018_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2018_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  9 non-null      object
 1   summary  9 non-null      object
dtypes: object(2)
memory usage: 272.0+ bytes


In [182]:
df_2018_neu.tail(14)

Unnamed: 0,dataset,summary
0,2018_01_positive,The text discusses various aspects of vaccinat...
1,2018_02_positive,The text discusses the importance of vaccinati...
2,2018_03_positive,The provided text discusses various vaccinatio...
3,2018_04_positive,Cuba has developed a cancer vaccine that has r...
4,2018_05_positive,The provided text discusses various vaccinatio...
5,2018_06_positive,The text discusses the ongoing vaccination cam...
6,2018_07_positive,The text emphasizes the critical importance of...
7,2018_08_positive,The text discusses the National Vaccination Ca...
8,2018_09_positive,Researchers in the U.S. are developing a vacci...


In [183]:
response

<OpenAIObject chat.completion id=chatcmpl-AUAi1LPRiXZ5pIlgS6wt2rnHpccKm at 0x2c317ca2d60> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Researchers in the U.S. are developing a vaccine against acne, currently in testing phases and shown to be effective in inhibiting the bacteria responsible for skin inflammation, predominantly affecting teenagers. Additionally, there are ongoing vaccination campaigns targeting measles and polio, with significant coverage achieved across various regions.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731754369,
  "id": "chatcmpl-AUAi1LPRiXZ5pIlgS6wt2rnHpccKm",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 58,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "reasoni

## October

In [184]:
# Get monthly data
df_2018_10_neu = pd.read_csv('../data/local/df_neu_2018-10.csv').reset_index()
df_2018_10_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7922 entries, 0 to 7921
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   7922 non-null   int64  
 1   LocationCode            7922 non-null   object 
 2   LocationName            7922 non-null   object 
 3   StatesCode              7922 non-null   object 
 4   ...4                    7922 non-null   int64  
 5   id                      7922 non-null   float64
 6   text                    7922 non-null   object 
 7   label                   7922 non-null   object 
 8   in_reply_to_status_id   595 non-null    float64
 9   in_reply_to_user_id     610 non-null    float64
 10  quoted_user_id          479 non-null    float64
 11  quoted_status_id        479 non-null    float64
 12  retweeted_user_id       4287 non-null   float64
 13  retweeted_status_id     4287 non-null   float64
 14  created_at              7922 non-null   

### Representative sample based on week and location

In [185]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2018_10_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2018_10_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2018-w40,Acre,9,0.001136
1,2018-w40,Alagoas,21,0.002651
2,2018-w40,Amapa,9,0.001136
3,2018-w40,Amazonas,48,0.006059
4,2018-w40,Bahia,85,0.010730
...,...,...,...,...
118,2018-w44,Roraima,1,0.000126
119,2018-w44,Santa Catarina,17,0.002146
120,2018-w44,Sao Paulo,78,0.009846
121,2018-w44,Sergipe,5,0.000631


In [186]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2018-w40,Acre,9,0.001136,2
1,2018-w40,Alagoas,21,0.002651,6
2,2018-w40,Amapa,9,0.001136,2
3,2018-w40,Amazonas,48,0.006059,14
4,2018-w40,Bahia,85,0.010730,26
...,...,...,...,...,...
118,2018-w44,Roraima,1,0.000126,0
119,2018-w44,Santa Catarina,17,0.002146,5
120,2018-w44,Sao Paulo,78,0.009846,24
121,2018-w44,Sergipe,5,0.000631,1


In [187]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2018_10_neu_sample_size = pd.merge(df_2018_10_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [188]:
# Step 6: Perform the stratified sampling
df_2018_10_neu_sample = df_2018_10_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2018_10_neu_sample)} rows.")


Sampled 2385 rows.


In [189]:
text_gpt_2018_10_neu = '\n'.join(df_2018_10_neu_sample['text'].astype(str))

### Prompts and loops

In [190]:
prompt_10 = prompt_raw.replace('TWEETS', text_gpt_2018_10_neu)

In [191]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_10}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [192]:
response.choices[0].message.content

'The provided text discusses the importance of vaccination and the impact of misinformation on public health. It emphasizes the need for vaccination against various diseases, warns against the spread of fake news regarding vaccines, and highlights successful vaccination campaigns for both humans and animals. The text also touches on the societal responsibilities of parents to vaccinate their children and the potential consequences of failing to do so. Overall, it promotes vaccination as a critical measure for preventing diseases and protecting public health.'

In [193]:
#summary = []
#dataset = []

In [194]:
dataset.append("2018_10_neuitive")

In [195]:
summary.append(response.choices[0].message.content)

In [196]:
df_2018_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2018_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  10 non-null     object
 1   summary  10 non-null     object
dtypes: object(2)
memory usage: 288.0+ bytes


In [197]:
df_2018_neu.tail(14)

Unnamed: 0,dataset,summary
0,2018_01_positive,The text discusses various aspects of vaccinat...
1,2018_02_positive,The text discusses the importance of vaccinati...
2,2018_03_positive,The provided text discusses various vaccinatio...
3,2018_04_positive,Cuba has developed a cancer vaccine that has r...
4,2018_05_positive,The provided text discusses various vaccinatio...
5,2018_06_positive,The text discusses the ongoing vaccination cam...
6,2018_07_positive,The text emphasizes the critical importance of...
7,2018_08_positive,The text discusses the National Vaccination Ca...
8,2018_09_positive,Researchers in the U.S. are developing a vacci...
9,2018_10_positive,The provided text discusses the importance of ...


In [198]:
response

<OpenAIObject chat.completion id=chatcmpl-AUAiDiHQKyDwgD77J9HyduezebE2q at 0x2c318663ae0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The provided text discusses the importance of vaccination and the impact of misinformation on public health. It emphasizes the need for vaccination against various diseases, warns against the spread of fake news regarding vaccines, and highlights successful vaccination campaigns for both humans and animals. The text also touches on the societal responsibilities of parents to vaccinate their children and the potential consequences of failing to do so. Overall, it promotes vaccination as a critical measure for preventing diseases and protecting public health.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731754381,
  "id": "chatcmpl-AUAiDiHQKyDwgD77J9HyduezebE2q",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.c

## November

In [199]:
# Get monthly data
df_2018_11_neu = pd.read_csv('../data/local/df_neu_2018-11.csv').reset_index()
df_2018_11_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6917 entries, 0 to 6916
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   6917 non-null   int64  
 1   LocationCode            6917 non-null   object 
 2   LocationName            6917 non-null   object 
 3   StatesCode              6917 non-null   object 
 4   ...4                    6917 non-null   int64  
 5   id                      6917 non-null   float64
 6   text                    6917 non-null   object 
 7   label                   6917 non-null   object 
 8   in_reply_to_status_id   535 non-null    float64
 9   in_reply_to_user_id     547 non-null    float64
 10  quoted_user_id          1057 non-null   float64
 11  quoted_status_id        1057 non-null   float64
 12  retweeted_user_id       3751 non-null   float64
 13  retweeted_status_id     3751 non-null   float64
 14  created_at              6917 non-null   

### Representative sample based on week and location

In [200]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2018_11_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2018_11_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2018-w44,Acre,5,0.000723
1,2018-w44,Alagoas,25,0.003614
2,2018-w44,Amapa,1,0.000145
3,2018-w44,Amazonas,16,0.002313
4,2018-w44,Bahia,18,0.002602
...,...,...,...,...
115,2018-w48,Roraima,3,0.000434
116,2018-w48,Santa Catarina,14,0.002024
117,2018-w48,Sao Paulo,145,0.020963
118,2018-w48,Sergipe,1,0.000145


In [201]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2018-w44,Acre,5,0.000723,1
1,2018-w44,Alagoas,25,0.003614,8
2,2018-w44,Amapa,1,0.000145,0
3,2018-w44,Amazonas,16,0.002313,5
4,2018-w44,Bahia,18,0.002602,6
...,...,...,...,...,...
115,2018-w48,Roraima,3,0.000434,1
116,2018-w48,Santa Catarina,14,0.002024,4
117,2018-w48,Sao Paulo,145,0.020963,51
118,2018-w48,Sergipe,1,0.000145,0


In [202]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2018_11_neu_sample_size = pd.merge(df_2018_11_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [203]:
# Step 6: Perform the stratified sampling
df_2018_11_neu_sample = df_2018_11_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2018_11_neu_sample)} rows.")


Sampled 2392 rows.


In [204]:
text_gpt_2018_11_neu = '\n'.join(df_2018_11_neu_sample['text'].astype(str))

### Prompts and loops

In [205]:
prompt_11 = prompt_raw.replace('TWEETS', text_gpt_2018_11_neu)

In [206]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_11}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [207]:
response.choices[0].message.content

'The provided Portuguese text discusses various aspects of vaccination, emphasizing its importance in preventing diseases such as measles, yellow fever, and rabies. It highlights the need for adults and children to stay up to date with their vaccinations, as well as the consequences of neglecting immunizations. The text also mentions campaigns aimed at increasing vaccination coverage, addressing misinformation about vaccines, and promoting public health initiatives. Additionally, it touches on the role of vaccination in animal health and the societal responsibility of parents to vaccinate their children.'

In [208]:
#summary = []
#dataset = []

In [209]:
dataset.append("2018_11_neuitive")

In [210]:
summary.append(response.choices[0].message.content)

In [211]:
df_2018_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2018_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  11 non-null     object
 1   summary  11 non-null     object
dtypes: object(2)
memory usage: 304.0+ bytes


In [212]:
df_2018_neu.tail(14)

Unnamed: 0,dataset,summary
0,2018_01_positive,The text discusses various aspects of vaccinat...
1,2018_02_positive,The text discusses the importance of vaccinati...
2,2018_03_positive,The provided text discusses various vaccinatio...
3,2018_04_positive,Cuba has developed a cancer vaccine that has r...
4,2018_05_positive,The provided text discusses various vaccinatio...
5,2018_06_positive,The text discusses the ongoing vaccination cam...
6,2018_07_positive,The text emphasizes the critical importance of...
7,2018_08_positive,The text discusses the National Vaccination Ca...
8,2018_09_positive,Researchers in the U.S. are developing a vacci...
9,2018_10_positive,The provided text discusses the importance of ...


In [213]:
response

<OpenAIObject chat.completion id=chatcmpl-AUAiSgQwAETawwr0nNOSfqEqg67eQ at 0x2c3113a6b80> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The provided Portuguese text discusses various aspects of vaccination, emphasizing its importance in preventing diseases such as measles, yellow fever, and rabies. It highlights the need for adults and children to stay up to date with their vaccinations, as well as the consequences of neglecting immunizations. The text also mentions campaigns aimed at increasing vaccination coverage, addressing misinformation about vaccines, and promoting public health initiatives. Additionally, it touches on the role of vaccination in animal health and the societal responsibility of parents to vaccinate their children.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731754396,
  "id": "chatcmpl-AUAiSgQwAETawwr0nNOSfqEqg67eQ",
  "model"

## December

In [214]:
# Get monthly data
df_2018_12_neu = pd.read_csv('../data/local/df_neu_2018-12.csv').reset_index()
df_2018_12_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5926 entries, 0 to 5925
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   5926 non-null   int64  
 1   LocationCode            5926 non-null   object 
 2   LocationName            5926 non-null   object 
 3   StatesCode              5926 non-null   object 
 4   ...4                    5926 non-null   int64  
 5   id                      5926 non-null   float64
 6   text                    5926 non-null   object 
 7   label                   5926 non-null   object 
 8   in_reply_to_status_id   565 non-null    float64
 9   in_reply_to_user_id     576 non-null    float64
 10  quoted_user_id          306 non-null    float64
 11  quoted_status_id        306 non-null    float64
 12  retweeted_user_id       3602 non-null   float64
 13  retweeted_status_id     3602 non-null   float64
 14  created_at              5926 non-null   

### Representative sample based on week and location

In [215]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2018_12_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2018_12_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2018-w48,Alagoas,5,0.000844
1,2018-w48,Amapa,2,0.000337
2,2018-w48,Amazonas,9,0.001519
3,2018-w48,Bahia,13,0.002194
4,2018-w48,Ceara,9,0.001519
...,...,...,...,...
133,2018-w53,Paraiba,8,0.001350
134,2018-w53,Parana,6,0.001012
135,2018-w53,Pernambuco,1,0.000169
136,2018-w53,Rio Grande Do Sul,8,0.001350


In [216]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2018-w48,Alagoas,5,0.000844,2
1,2018-w48,Amapa,2,0.000337,0
2,2018-w48,Amazonas,9,0.001519,3
3,2018-w48,Bahia,13,0.002194,5
4,2018-w48,Ceara,9,0.001519,3
...,...,...,...,...,...
133,2018-w53,Paraiba,8,0.001350,3
134,2018-w53,Parana,6,0.001012,2
135,2018-w53,Pernambuco,1,0.000169,0
136,2018-w53,Rio Grande Do Sul,8,0.001350,3


In [217]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2018_12_neu_sample_size = pd.merge(df_2018_12_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [218]:
# Step 6: Perform the stratified sampling
df_2018_12_neu_sample = df_2018_12_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2018_12_neu_sample)} rows.")


Sampled 2381 rows.


In [219]:
text_gpt_2018_12_neu = '\n'.join(df_2018_12_neu_sample['text'].astype(str))

### Prompts and loops

In [220]:
prompt_12 = prompt_raw.replace('TWEETS', text_gpt_2018_12_neu)

In [221]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_12}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [222]:
response.choices[0].message.content

"The provided Portuguese sentences discuss various topics, primarily focusing on the importance of vaccinations for children and pets, the author's favorite books, and the controversial issue of anti-vaccination sentiments. The author expresses a strong belief in the necessity of vaccines to protect public health and criticizes misinformation surrounding vaccines. They also share personal anecdotes, mention their favorite book series, and advocate for responsible pet ownership and adoption."

In [223]:
#summary = []
#dataset = []

In [224]:
dataset.append("2018_12_neuitive")

In [225]:
summary.append(response.choices[0].message.content)

In [226]:
df_2018_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2018_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  12 non-null     object
 1   summary  12 non-null     object
dtypes: object(2)
memory usage: 320.0+ bytes


In [227]:
df_2018_neu.tail(14)

Unnamed: 0,dataset,summary
0,2018_01_positive,The text discusses various aspects of vaccinat...
1,2018_02_positive,The text discusses the importance of vaccinati...
2,2018_03_positive,The provided text discusses various vaccinatio...
3,2018_04_positive,Cuba has developed a cancer vaccine that has r...
4,2018_05_positive,The provided text discusses various vaccinatio...
5,2018_06_positive,The text discusses the ongoing vaccination cam...
6,2018_07_positive,The text emphasizes the critical importance of...
7,2018_08_positive,The text discusses the National Vaccination Ca...
8,2018_09_positive,Researchers in the U.S. are developing a vacci...
9,2018_10_positive,The provided text discusses the importance of ...


In [228]:
response

<OpenAIObject chat.completion id=chatcmpl-AUAigTsJ6MCtgJTxca7WSjjrw0hG7 at 0x2c315d52810> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The provided Portuguese sentences discuss various topics, primarily focusing on the importance of vaccinations for children and pets, the author's favorite books, and the controversial issue of anti-vaccination sentiments. The author expresses a strong belief in the necessity of vaccines to protect public health and criticizes misinformation surrounding vaccines. They also share personal anecdotes, mention their favorite book series, and advocate for responsible pet ownership and adoption.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731754410,
  "id": "chatcmpl-AUAigTsJ6MCtgJTxca7WSjjrw0hG7",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
   

## All 2018 neutral

In [229]:
df_2018_neu.to_csv('../data/summary_2018_neutral_tweets.csv')