## Atividades Práticas Integradoras
Este script é destinado à **coleta** e **estruturação de dados** do fórum online 4chan, visando a construção de **datasets** para análise posterior.<br />
 
A priori este script objetivará a coleta de dados apenas da board /pol/, dado a natureza do estudo que consumirá esses datasets. <br />

### Regras da API: 

 1. Uma requisição por segundo; 
 2. Atualização do thread deve ser definida
    para um mínimo de 10 segundos, de preferência mais alto; 
 3. Uso do 'If-Modified-Since' nas requisições para não solicitar dados de
    threads sem atualização.



#### Script

In [1]:
import requests
import json
import os
import schedule
import time
import pandas as pd
import warnings

from bs4 import BeautifulSoup

warnings.filterwarnings('ignore')

In [2]:
def job():
    # cria o diretório 'images', se ele não existir
    if not os.path.exists('images'):
        os.makedirs('images')

    board_url = 'https://a.4cdn.org/pol/threads.json'
    last_modified = {}

    all_posts_data = []
    # carrega os dados existentes, se não existir, inicializa uma lista vazia
    if os.path.exists('pol_posts_data.json'):
        with open('pol_posts_data.json', 'r') as f:
            all_posts_data = json.load(f)

    response = requests.get(board_url)
    data = response.json()

    # percorre cada página de dados
    for page in data:
        # percorre cada thread na página
        for thread in page['threads']:
            thread_number = thread['no']
            headers = {}
            if str(thread_number) in last_modified:
                headers['If-Modified-Since'] = last_modified[str(thread_number)]

            thread_url = f'https://a.4cdn.org/pol/thread/{thread_number}.json'
            response = requests.get(thread_url, headers=headers)

            if response.status_code == 304:
                continue

            thread_data = response.json()

            # percorre cada post na thread
            for post in thread_data['posts']:
                post_data = {}
                post_data['thread_no'] = thread_number
                post_data['post_no'] = post.get('no', '')
                post_data['comment'] = post.get('com', '')
                post_data['image_replies'] = post.get('images', 0)
                post_data['timestamp'] = post.get('time', '')
                post_data['reply_to'] = post.get('resto', '')  # captura a quem o post está respondendo

                # verifica se o post possui imagem
                if 'tim' in post and 'ext' in post:
                    image_name = str(thread_number) + '_' + str(post['no']) + post['ext']
                    image_url = f'https://i.4cdn.org/pol/{str(post["tim"])}{post["ext"]}'

                    # realiza download da imagem
                    image_response = requests.get(image_url)
                    with open(f'images/{image_name}', 'wb') as f:
                        f.write(image_response.content)

                    post_data['image_id'] = image_name

                all_posts_data.append(post_data)

            last_modified[str(thread_number)] = response.headers.get('Last-Modified')

            time.sleep(1)

        # salva os dados a cada 60 segundos
        with open('pol_posts_data.json', 'w') as f:
            json.dump(all_posts_data, f)
        time.sleep(60)


In [4]:
job()

schedule.every(60).seconds.do(job)

while True:
    schedule.run_pending()
    time.sleep(1)

KeyboardInterrupt: 

In [5]:
all_posts_data = []
if os.path.exists('pol_posts_data.json'):
    with open('pol_posts_data.json', 'r') as f:
        all_posts_data = json.load(f)

# remove as tags HTML do campo 'comment'
for post in all_posts_data:
    soup = BeautifulSoup(post['comment'], "html.parser")
    post['comment'] = soup.get_text()

df_posts = pd.DataFrame(all_posts_data)

df_posts.to_csv('df_posts.csv', index=False)

In [6]:
df_posts

Unnamed: 0,thread_no,post_no,comment,image_replies,timestamp,reply_to,image_id
0,124205675,124205675,"This board is for the discussion of news, worl...",0,1493993226,0,124205675_124205675.jpg
1,259848258,259848258,Check the catalog before posting a new thread!...,0,1590952059,0,259848258_259848258.jpg
2,432105412,432105412,"I present you Javier Milei, the next president...",18,1687714898,0,432105412_432105412.jpg
3,432105412,432105680,>Replace pesos with dollars to end with inflat...,0,1687715038,432105412,432105412_432105680.jpg
4,432105412,432105682,>Milei was born in 1970 to a bus driver.[4] Mi...,0,1687715039,432105412,
...,...,...,...,...,...,...,...
5073,432129133,432132334,"In america they have juneteenth, another kneel...",0,1687728720,432129133,
5074,432129133,432132512,>>432132334,0,1687728812,432129133,432129133_432132512.png
5075,432129133,432132794,>>432129133Their buildings aren't even ESG-cer...,0,1687728967,432129133,
5076,432129133,432133167,>>432129133>when Italians don’t treat their di...,0,1687729175,432129133,
