# Import libraries and data

In [146]:
# Import libraries
import os
import openai
import pandas as pd
import numpy as np
import openpyxl

In [147]:
# Get OpenAI API secret key (change this file with your own OpenAI key)
open_ai_key = open('../data/local/openai_key.txt', 'r').read()
openai.api_key = open_ai_key
#openai.Model.list()

In [148]:
# Prompt for GPT
prompt_raw = "I would like to summarise in maximum 300 characters and in English what the following sentences in Portuguese talk about. \
Do not use aspects mentioned only in one sentence as part of the summary: TWEETS"

# 2015 neutral

## January

In [149]:
# Get monthly data
df_2015_01_neu = pd.read_csv('../data/local/df_neu_2015-01.csv').reset_index()
df_2015_01_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482 entries, 0 to 1481
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   1482 non-null   int64  
 1   LocationCode            1482 non-null   object 
 2   LocationName            1482 non-null   object 
 3   StatesCode              1482 non-null   object 
 4   ...4                    1482 non-null   int64  
 5   id                      1482 non-null   float64
 6   text                    1482 non-null   object 
 7   label                   1482 non-null   object 
 8   in_reply_to_status_id   168 non-null    float64
 9   in_reply_to_user_id     174 non-null    float64
 10  quoted_user_id          0 non-null      float64
 11  quoted_status_id        0 non-null      float64
 12  retweeted_user_id       284 non-null    float64
 13  retweeted_status_id     284 non-null    float64
 14  created_at              1482 non-null   

### Representative sample based on week and location

In [150]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_01_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_01_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2015-w01,Acre,1,0.000675
1,2015-w01,Alagoas,4,0.002699
2,2015-w01,Amapa,4,0.002699
3,2015-w01,Amazonas,8,0.005398
4,2015-w01,Bahia,14,0.009447
...,...,...,...,...
100,2015-w05,Rio Grande Do Sul,13,0.008772
101,2015-w05,Rondonia,1,0.000675
102,2015-w05,Santa Catarina,6,0.004049
103,2015-w05,Sao Paulo,32,0.021592


In [151]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2015-w01,Acre,1,0.000675,1
1,2015-w01,Alagoas,4,0.002699,6
2,2015-w01,Amapa,4,0.002699,6
3,2015-w01,Amazonas,8,0.005398,13
4,2015-w01,Bahia,14,0.009447,23
...,...,...,...,...,...
100,2015-w05,Rio Grande Do Sul,13,0.008772,21
101,2015-w05,Rondonia,1,0.000675,1
102,2015-w05,Santa Catarina,6,0.004049,9
103,2015-w05,Sao Paulo,32,0.021592,52


In [152]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_01_neu_sample_size = pd.merge(df_2015_01_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [153]:
# Step 6: Perform the stratified sampling
#df_2015_01_neu_sample = df_2015_01_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2015_01_neu_sample = df_2015_01_neu
# Step 7: Check the result
print(f"Sampled {len(df_2015_01_neu_sample)} rows.")


Sampled 1482 rows.


In [154]:
text_gpt_2015_01_neu = '\n'.join(df_2015_01_neu_sample['text'].astype(str))

### Prompts and loops

In [155]:
prompt_1 = prompt_raw.replace('TWEETS', text_gpt_2015_01_neu)

In [156]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_1}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

KeyboardInterrupt: 

In [None]:
response.choices[0].message.content

In [None]:
summary = []
dataset = []

In [None]:
dataset.append("2015_01_neutral")

In [None]:
summary.append(response.choices[0].message.content)

In [None]:
df_2015_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_neu.info()

In [None]:
df_2015_neu.tail(14)

In [None]:
response

## February

In [None]:
# Get monthly data
df_2015_02_neu = pd.read_csv('../data/local/df_neu_2015-02.csv').reset_index()
df_2015_02_neu.info()

### Representative sample based on week and location

In [None]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_02_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_02_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

In [None]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

In [None]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_02_neu_sample_size = pd.merge(df_2015_02_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [None]:
# Step 6: Perform the stratified sampling
#df_2015_02_neu_sample = df_2015_02_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2015_02_neu_sample = df_2015_02_neu
# Step 7: Check the result
print(f"Sampled {len(df_2015_02_neu_sample)} rows.")


In [None]:
text_gpt_2015_02_neu = '\n'.join(df_2015_02_neu_sample['text'].astype(str))

### Prompts and loops

In [None]:
prompt_2 = prompt_raw.replace('TWEETS', text_gpt_2015_02_neu)

In [None]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_2}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [None]:
response.choices[0].message.content

In [None]:
#summary = []
#dataset = []

In [None]:
dataset.append("2015_02_neutral")

In [None]:
summary.append(response.choices[0].message.content)

In [None]:
df_2015_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_neu.info()

In [None]:
df_2015_neu.tail(14)

In [None]:
response

## March

In [None]:
# Get monthly data
df_2015_03_neu = pd.read_csv('../data/local/df_neu_2015-03.csv').reset_index()
df_2015_03_neu.info()

### Representative sample based on week and location

In [None]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_03_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_03_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

In [None]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

In [None]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_03_neu_sample_size = pd.merge(df_2015_03_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [None]:
# Step 6: Perform the stratified sampling
df_2015_03_neu_sample = df_2015_03_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2015_03_neu_sample)} rows.")


In [None]:
text_gpt_2015_03_neu = '\n'.join(df_2015_03_neu_sample['text'].astype(str))

### Prompts and loops

In [None]:
prompt_3 = prompt_raw.replace('TWEETS', text_gpt_2015_03_neu)

In [None]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_3}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [None]:
response.choices[0].message.content

In [None]:
#summary = []
#dataset = []

In [None]:
dataset.append("2015_03_neutral")

In [None]:
summary.append(response.choices[0].message.content)

In [None]:
df_2015_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_neu.info()

In [None]:
df_2015_neu.tail(14)

In [None]:
response

## April

In [None]:
# Get monthly data
df_2015_04_neu = pd.read_csv('../data/local/df_neu_2015-04.csv').reset_index()
df_2015_04_neu.info()

### Representative sample based on week and location

In [None]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_04_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_04_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

In [None]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

In [None]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_04_neu_sample_size = pd.merge(df_2015_04_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [None]:
# Step 6: Perform the stratified sampling
df_2015_04_neu_sample = df_2015_04_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2015_04_neu_sample)} rows.")


In [None]:
text_gpt_2015_04_neu = '\n'.join(df_2015_04_neu_sample['text'].astype(str))

### Prompts and loops

In [None]:
prompt_4 = prompt_raw.replace('TWEETS', text_gpt_2015_04_neu)

In [None]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_4}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [None]:
response.choices[0].message.content

In [None]:
#summary = []
#dataset = []

In [None]:
dataset.append("2015_04_neutral")

In [None]:
summary.append(response.choices[0].message.content)

In [None]:
df_2015_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_neu.info()

In [None]:
df_2015_neu.tail(14)

In [None]:
response

## May

In [None]:
# Get monthly data
df_2015_05_neu = pd.read_csv('../data/local/df_neu_2015-05.csv').reset_index()
df_2015_05_neu.info()

### Representative sample based on week and location

In [None]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_05_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_05_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

In [None]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

In [None]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_05_neu_sample_size = pd.merge(df_2015_05_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [None]:
# Step 6: Perform the stratified sampling
df_2015_05_neu_sample = df_2015_05_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2015_05_neu_sample)} rows.")


In [None]:
text_gpt_2015_05_neu = '\n'.join(df_2015_05_neu_sample['text'].astype(str))

### Prompts and loops

In [None]:
prompt_5 = prompt_raw.replace('TWEETS', text_gpt_2015_05_neu)

In [None]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_5}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [None]:
response.choices[0].message.content

In [None]:
#summary = []
#dataset = []

In [None]:
dataset.append("2015_05_neutral")

In [None]:
summary.append(response.choices[0].message.content)

In [None]:
df_2015_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_neu.info()

In [None]:
df_2015_neu.tail(14)

In [None]:
response

## June

In [None]:
# Get monthly data
df_2015_06_neu = pd.read_csv('../data/local/df_neu_2015-06.csv').reset_index()
df_2015_06_neu.info()

### Representative sample based on week and location

In [None]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_06_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_06_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

In [None]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

In [None]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_06_neu_sample_size = pd.merge(df_2015_06_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [None]:
# Step 6: Perform the stratified sampling
df_2015_06_neu_sample = df_2015_06_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2015_06_neu_sample)} rows.")


In [None]:
text_gpt_2015_06_neu = '\n'.join(df_2015_06_neu_sample['text'].astype(str))

### Prompts and loops

In [None]:
prompt_6 = prompt_raw.replace('TWEETS', text_gpt_2015_06_neu)

In [None]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_6}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [None]:
response.choices[0].message.content

In [None]:
#summary = []
#dataset = []

In [None]:
dataset.append("2015_06_neutral")

In [None]:
summary.append(response.choices[0].message.content)

In [None]:
df_2015_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_neu.info()

In [None]:
df_2015_neu.tail(14)

In [None]:
response

## July

In [None]:
# Get monthly data
df_2015_07_neu = pd.read_csv('../data/local/df_neu_2015-07.csv').reset_index()
df_2015_07_neu.info()

### Representative sample based on week and location

In [None]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_07_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_07_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

In [None]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

In [None]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_07_neu_sample_size = pd.merge(df_2015_07_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [None]:
# Step 6: Perform the stratified sampling
df_2015_07_neu_sample = df_2015_07_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2015_07_neu_sample)} rows.")


In [None]:
text_gpt_2015_07_neu = '\n'.join(df_2015_07_neu_sample['text'].astype(str))

### Prompts and loops

In [None]:
prompt_7 = prompt_raw.replace('TWEETS', text_gpt_2015_07_neu)

In [None]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_7}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [None]:
response.choices[0].message.content

In [None]:
#summary = []
#dataset = []

In [None]:
dataset.append("2015_07_neutral")

In [None]:
summary.append(response.choices[0].message.content)

In [None]:
df_2015_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_neu.info()

In [None]:
df_2015_neu.tail(14)

In [None]:
response

## August

In [None]:
# Get monthly data
df_2015_08_neu = pd.read_csv('../data/local/df_neu_2015-08.csv').reset_index()
df_2015_08_neu.info()

### Representative sample based on week and location

In [None]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_08_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_08_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

In [None]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

In [None]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_08_neu_sample_size = pd.merge(df_2015_08_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [None]:
# Step 6: Perform the stratified sampling
df_2015_08_neu_sample = df_2015_08_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2015_08_neu_sample)} rows.")


In [None]:
text_gpt_2015_08_neu = '\n'.join(df_2015_08_neu_sample['text'].astype(str))

### Prompts and loops

In [None]:
prompt_8 = prompt_raw.replace('TWEETS', text_gpt_2015_08_neu)

In [None]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_8}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [None]:
response.choices[0].message.content

In [None]:
#summary = []
#dataset = []

In [None]:
dataset.append("2015_08_neutral")

In [None]:
summary.append(response.choices[0].message.content)

In [None]:
df_2015_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_neu.info()

In [None]:
df_2015_neu.tail(14)

In [None]:
response

## September

In [None]:
# Get monthly data
df_2015_09_neu = pd.read_csv('../data/local/df_neu_2015-09.csv').reset_index()
df_2015_09_neu.info()

### Representative sample based on week and location

In [None]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_09_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_09_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

In [None]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

In [None]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_09_neu_sample_size = pd.merge(df_2015_09_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [None]:
# Step 6: Perform the stratified sampling
df_2015_09_neu_sample = df_2015_09_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2015_09_neu_sample)} rows.")


In [None]:
text_gpt_2015_09_neu = '\n'.join(df_2015_09_neu_sample['text'].astype(str))

### Prompts and loops

In [None]:
prompt_9 = prompt_raw.replace('TWEETS', text_gpt_2015_09_neu)

In [None]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_9}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [None]:
response.choices[0].message.content

In [None]:
#summary = []
#dataset = []

In [None]:
dataset.append("2015_09_neutral")

In [None]:
summary.append(response.choices[0].message.content)

In [None]:
df_2015_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_neu.info()

In [None]:
df_2015_neu.tail(14)

In [None]:
response

## October

In [None]:
# Get monthly data
df_2015_10_neu = pd.read_csv('../data/local/df_neu_2015-10.csv').reset_index()
df_2015_10_neu.info()

### Representative sample based on week and location

In [None]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_10_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_10_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

In [None]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

In [157]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_10_neu_sample_size = pd.merge(df_2015_10_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [158]:
# Step 6: Perform the stratified sampling
#df_2015_10_neu_sample = df_2015_10_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2015_10_neu_sample = df_2015_10_neu
# Step 7: Check the result
print(f"Sampled {len(df_2015_10_neu_sample)} rows.")


Sampled 2230 rows.


In [159]:
text_gpt_2015_10_neu = '\n'.join(df_2015_10_neu_sample['text'].astype(str))

### Prompts and loops

In [160]:
prompt_10 = prompt_raw.replace('TWEETS', text_gpt_2015_10_neu)

In [161]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_10}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [162]:
response.choices[0].message.content

'The text primarily discusses various vaccination campaigns and their importance in preventing diseases for both humans and animals. It highlights the rollout of vaccines against diseases like HPV, dengue, and rabies, as well as mentions specific events and campaigns in different regions. There are also references to personal experiences with vaccinations, humor related to vaccination fears, and the role of vaccination in public health initiatives.'

In [163]:
#summary = []
#dataset = []

In [164]:
dataset.append("2015_10_neutral")

In [165]:
summary.append(response.choices[0].message.content)

In [166]:
df_2015_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  10 non-null     object
 1   summary  10 non-null     object
dtypes: object(2)
memory usage: 288.0+ bytes


In [167]:
df_2015_neu.tail(14)

Unnamed: 0,dataset,summary
0,2015_01_neutral,The text discusses various aspects of vaccinat...
1,2015_02_neutral,The text discusses various aspects of vaccinat...
2,2015_03_neutral,A vaccination campaign against HPV is starting...
3,2015_04_neutral,The text discusses various vaccination campaig...
4,2015_05_neutral,The sentences discuss the ongoing vaccination ...
5,2015_06_neutral,The texts discuss the vaccination campaigns ag...
6,2015_07_neutral,A vaccination campaign against rabies for dogs...
7,2015_08_neutral,The text discusses multiple vaccination campai...
8,2015_09_neutral,The texts discuss various vaccination campaign...
9,2015_10_neutral,The text primarily discusses various vaccinati...


In [168]:
response

<OpenAIObject chat.completion id=chatcmpl-AUd9L37kzWssNTB1ACPsIHZtBXgT1 at 0x2d5c718f9f0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text primarily discusses various vaccination campaigns and their importance in preventing diseases for both humans and animals. It highlights the rollout of vaccines against diseases like HPV, dengue, and rabies, as well as mentions specific events and campaigns in different regions. There are also references to personal experiences with vaccinations, humor related to vaccination fears, and the role of vaccination in public health initiatives.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731863695,
  "id": "chatcmpl-AUd9L37kzWssNTB1ACPsIHZtBXgT1",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_9b78b61c52",
  "usage": {
    "completion_tokens": 74,
    "completion_t

## November

In [169]:
# Get monthly data
df_2015_11_neu = pd.read_csv('../data/local/df_neu_2015-11.csv').reset_index()
df_2015_11_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3179 entries, 0 to 3178
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   3179 non-null   int64  
 1   LocationCode            3179 non-null   object 
 2   LocationName            3179 non-null   object 
 3   StatesCode              3179 non-null   object 
 4   ...4                    3179 non-null   int64  
 5   id                      3179 non-null   float64
 6   text                    3179 non-null   object 
 7   label                   3179 non-null   object 
 8   in_reply_to_status_id   174 non-null    float64
 9   in_reply_to_user_id     216 non-null    float64
 10  quoted_user_id          19 non-null     float64
 11  quoted_status_id        19 non-null     float64
 12  retweeted_user_id       334 non-null    float64
 13  retweeted_status_id     334 non-null    float64
 14  created_at              3179 non-null   

### Representative sample based on week and location

In [170]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_11_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_11_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2015-w44,Alagoas,5,0.001573
1,2015-w44,Amapa,2,0.000629
2,2015-w44,Amazonas,9,0.002831
3,2015-w44,Bahia,56,0.017616
4,2015-w44,Ceara,16,0.005033
...,...,...,...,...
110,2015-w48,Roraima,7,0.002202
111,2015-w48,Santa Catarina,4,0.001258
112,2015-w48,Sao Paulo,39,0.012268
113,2015-w48,Sergipe,1,0.000315


In [171]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2015-w44,Alagoas,5,0.001573,3
1,2015-w44,Amapa,2,0.000629,1
2,2015-w44,Amazonas,9,0.002831,6
3,2015-w44,Bahia,56,0.017616,43
4,2015-w44,Ceara,16,0.005033,12
...,...,...,...,...,...
110,2015-w48,Roraima,7,0.002202,5
111,2015-w48,Santa Catarina,4,0.001258,3
112,2015-w48,Sao Paulo,39,0.012268,30
113,2015-w48,Sergipe,1,0.000315,0


In [172]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_11_neu_sample_size = pd.merge(df_2015_11_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [173]:
# Step 6: Perform the stratified sampling
df_2015_11_neu_sample = df_2015_11_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
    lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
).reset_index(drop=True)

# Step 7: Check the result
print(f"Sampled {len(df_2015_11_neu_sample)} rows.")


Sampled 2387 rows.


In [174]:
text_gpt_2015_11_neu = '\n'.join(df_2015_11_neu_sample['text'].astype(str))

### Prompts and loops

In [175]:
prompt_11 = prompt_raw.replace('TWEETS', text_gpt_2015_11_neu)

In [176]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_11}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [177]:
response.choices[0].message.content

'The text discusses various campaigns and phases of vaccination against foot-and-mouth disease affecting livestock in Brazil, emphasizing the importance of vaccinating the herd to maintain disease-free status. It highlights deadlines for vaccination, awareness campaigns for rural producers, and the impact on meat exportation.'

In [178]:
#summary = []
#dataset = []

In [179]:
dataset.append("2015_11_neutral")

In [180]:
summary.append(response.choices[0].message.content)

In [181]:
df_2015_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  11 non-null     object
 1   summary  11 non-null     object
dtypes: object(2)
memory usage: 304.0+ bytes


In [182]:
df_2015_neu.tail(14)

Unnamed: 0,dataset,summary
0,2015_01_neutral,The text discusses various aspects of vaccinat...
1,2015_02_neutral,The text discusses various aspects of vaccinat...
2,2015_03_neutral,A vaccination campaign against HPV is starting...
3,2015_04_neutral,The text discusses various vaccination campaig...
4,2015_05_neutral,The sentences discuss the ongoing vaccination ...
5,2015_06_neutral,The texts discuss the vaccination campaigns ag...
6,2015_07_neutral,A vaccination campaign against rabies for dogs...
7,2015_08_neutral,The text discusses multiple vaccination campai...
8,2015_09_neutral,The texts discuss various vaccination campaign...
9,2015_10_neutral,The text primarily discusses various vaccinati...


In [183]:
response

<OpenAIObject chat.completion id=chatcmpl-AUd9SVlelM9qGOtdmczyrZZ9CamQ4 at 0x2d5ca960a40> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various campaigns and phases of vaccination against foot-and-mouth disease affecting livestock in Brazil, emphasizing the importance of vaccinating the herd to maintain disease-free status. It highlights deadlines for vaccination, awareness campaigns for rural producers, and the impact on meat exportation.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731863702,
  "id": "chatcmpl-AUd9SVlelM9qGOtdmczyrZZ9CamQ4",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 53,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_tokens": 0,
      "reasoning_tokens": 0,
      "rejecte

## December

In [184]:
# Get monthly data
df_2015_12_neu = pd.read_csv('../data/local/df_neu_2015-12.csv').reset_index()
df_2015_12_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4472 entries, 0 to 4471
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   4472 non-null   int64  
 1   LocationCode            4472 non-null   object 
 2   LocationName            4472 non-null   object 
 3   StatesCode              4472 non-null   object 
 4   ...4                    4472 non-null   int64  
 5   id                      4472 non-null   float64
 6   text                    4472 non-null   object 
 7   label                   4472 non-null   object 
 8   in_reply_to_status_id   214 non-null    float64
 9   in_reply_to_user_id     392 non-null    float64
 10  quoted_user_id          33 non-null     float64
 11  quoted_status_id        33 non-null     float64
 12  retweeted_user_id       1335 non-null   float64
 13  retweeted_status_id     1335 non-null   float64
 14  created_at              4472 non-null   

### Representative sample based on week and location

In [185]:
# Step 1: Calculate the size of each group (year_week, LocationName)
group_sizes = df_2015_12_neu.groupby(['year_week', 'LocationName']).size().reset_index(name='group_size')

# Step 2: Calculate the proportion of each group in the dataset
total_rows = len(df_2015_12_neu)
group_sizes['proportion'] = group_sizes['group_size'] / total_rows
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion
0,2015-w48,Alagoas,2,0.000447
1,2015-w48,Amazonas,2,0.000447
2,2015-w48,Bahia,5,0.001118
3,2015-w48,Ceara,2,0.000447
4,2015-w48,Espirito Santo,2,0.000447
...,...,...,...,...
122,2015-w53,Parana,1,0.000224
123,2015-w53,Pernambuco,1,0.000224
124,2015-w53,Rio Grande Do Sul,2,0.000447
125,2015-w53,Santa Catarina,1,0.000224


In [186]:
# Step 3: Define the target sample size
target_sample_size = 2450

# Step 4: Calculate the sample size for each group (based on its proportion)
group_sizes['sample_size'] = np.floor(group_sizes['proportion'] * target_sample_size).astype(int)
group_sizes

Unnamed: 0,year_week,LocationName,group_size,proportion,sample_size
0,2015-w48,Alagoas,2,0.000447,1
1,2015-w48,Amazonas,2,0.000447,1
2,2015-w48,Bahia,5,0.001118,2
3,2015-w48,Ceara,2,0.000447,1
4,2015-w48,Espirito Santo,2,0.000447,1
...,...,...,...,...,...
122,2015-w53,Parana,1,0.000224,0
123,2015-w53,Pernambuco,1,0.000224,0
124,2015-w53,Rio Grande Do Sul,2,0.000447,1
125,2015-w53,Santa Catarina,1,0.000224,0


In [187]:
# Step 5: Merge this sample size information back to the original DataFrame
df_2015_12_neu_sample_size = pd.merge(df_2015_12_neu, group_sizes[['year_week', 'LocationName', 'sample_size']], 
                               on=['year_week', 'LocationName'], how='left')

In [188]:
# Step 6: Perform the stratified sampling
#df_2015_12_neu_sample = df_2015_12_neu_sample_size.groupby(['year_week', 'LocationName']).apply(
 #   lambda group: group.sample(n=group['sample_size'].iloc[0], random_state=42)
#).reset_index(drop=True)
df_2015_12_neu_sample = df_2015_12_neu
# Step 7: Check the result
print(f"Sampled {len(df_2015_12_neu_sample)} rows.")


Sampled 4472 rows.


In [189]:
text_gpt_2015_12_neu = '\n'.join(df_2015_12_neu_sample['text'].astype(str))

### Prompts and loops

In [190]:
prompt_12 = prompt_raw.replace('TWEETS', text_gpt_2015_12_neu)

In [191]:
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",  
    messages = [
        {"role": "user", "content": prompt_12}
    ],
    max_tokens=400,  # Maximum number of tokens in the response
    temperature=0.7  # Controls the randomness of the output
)

In [192]:
response.choices[0].message.content

'The text discusses various aspects of vaccination, particularly focusing on the approval and details of the first dengue vaccine in Brazil by Anvisa. It addresses the target demographics for the vaccine, including children up to 14 years and elderly over 60, while noting concerns about the spread of misinformation relating to other vaccines, such as those for rubella and Zika.'

In [193]:
#summary = []
#dataset = []

In [194]:
dataset.append("2015_12_neutral")

In [195]:
summary.append(response.choices[0].message.content)

In [196]:
df_2015_neu = pd.DataFrame(list(zip(dataset, summary)),
                            columns = ['dataset', 'summary'])
df_2015_neu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  12 non-null     object
 1   summary  12 non-null     object
dtypes: object(2)
memory usage: 320.0+ bytes


In [197]:
df_2015_neu.tail(14)

Unnamed: 0,dataset,summary
0,2015_01_neutral,The text discusses various aspects of vaccinat...
1,2015_02_neutral,The text discusses various aspects of vaccinat...
2,2015_03_neutral,A vaccination campaign against HPV is starting...
3,2015_04_neutral,The text discusses various vaccination campaig...
4,2015_05_neutral,The sentences discuss the ongoing vaccination ...
5,2015_06_neutral,The texts discuss the vaccination campaigns ag...
6,2015_07_neutral,A vaccination campaign against rabies for dogs...
7,2015_08_neutral,The text discusses multiple vaccination campai...
8,2015_09_neutral,The texts discuss various vaccination campaign...
9,2015_10_neutral,The text primarily discusses various vaccinati...


In [198]:
response

<OpenAIObject chat.completion id=chatcmpl-AUd9b0bAmun5P4Zsu6I1xw5I7JDkx at 0x2d5cac5bdb0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The text discusses various aspects of vaccination, particularly focusing on the approval and details of the first dengue vaccine in Brazil by Anvisa. It addresses the target demographics for the vaccine, including children up to 14 years and elderly over 60, while noting concerns about the spread of misinformation relating to other vaccines, such as those for rubella and Zika.",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1731863711,
  "id": "chatcmpl-AUd9b0bAmun5P4Zsu6I1xw5I7JDkx",
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_0ba0d124f1",
  "usage": {
    "completion_tokens": 71,
    "completion_tokens_details": {
      "accepted_prediction_tokens": 0,
      "audio_to

## All 2015 neutral

In [199]:
#df_2015_neu['dataset'] = df_2015_neu['dataset'].str.replace('neuitive','neutral')

In [200]:
df_2015_neu.to_csv('../data/summary_2015_neutral_tweets.csv')