In [None]:
import os
import json
import re
import requests
import logging
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from dotenv import load_dotenv

# Load environment variables from the .env file (PROXY_HOST / PROXY_PORT are
# read later through os.getenv when the proxy mapping is built).
load_dotenv()

# Configure the root logger at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class GetHtml:
    """Fetch the pages listed in a groups JSON file and look for recent dates.

    The JSON file is loaded into a DataFrame with one row per online link.
    Each link is requested through a SOCKS5 proxy, and the text of every
    successful response is scanned for date strings that fall inside the last
    ``days`` days.

    Attributes:
        json_file: Path of the JSON file read by ``load_data``.
        days: Size, in days, of the look-back window used by ``extract_dates``.
        df: DataFrame holding the links and, after ``fetch_data``, the results.
    """

    # Columns produced from the JSON file (positional: index, 1st field, 2nd field).
    _BASE_COLUMNS = ('grupo', 'link_grupo', 'links_online')
    # Columns filled in by fetch_data.
    _RESULT_COLUMNS = ('response_status_code', 'html', 'infos', 'datas', 'qtd_datas')

    # Candidate date shapes searched for in the page text (compiled once).
    _DATE_PATTERNS = tuple(re.compile(pattern) for pattern in (
        r'\d{4}-\d{2}-\d{2}',           # YYYY-MM-DD
        r'\d{2}/\d{2}/\d{4}',           # DD/MM/YYYY
        r'\d{1,2} de \w+ de \d{4}',     # DD de <month> de YYYY
        r'\d{4}\.\d{2}\.\d{2}',         # YYYY.MM.DD
        r'\w+ \d{1,2}, \d{4}',          # Month DD, YYYY
        r'\d{2}-\d{2}-\d{4}',           # DD-MM-YYYY
        r'\w{3} \d{1,2}, \d{4}',        # Abbreviated month DD, YYYY
        r'\d{1,2} \w{3} \d{4}',         # DD Mon YYYY
        r'\d{4}/\d{2}/\d{2}',           # YYYY/MM/DD
        r'\d{8}',                       # YYYYMMDD
        r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}',  # ISO 8601
        r'\w+ de \d{4}',                # <month> de YYYY
        r'\w+ \d{1,2}(?:st|nd|rd|th), \d{4}',  # Month DDth, YYYY
        r'\d{4}\w{3}\d{2}',             # YYYYMonDD
        r'\d{2}/\d{2}/\d{2}',           # DD/MM/YY
        r'\w{3}, \d{4}',                # Mon, YYYY
        r'\d{4}-\d{2}',                 # YYYY-MM
        r'\d{4}, \w{3} \d{1,2}(?:st|nd|rd|th)',  # YYYY, Mon DDth
        r'\d{2}\.\d{2}\.\d{4}',         # DD.MM.YYYY
        r'\d{4}年\d{1,2}月\d{1,2}日',   # Chinese format
    ))

    # strptime formats tried, in order, on every regex match. Matches that fit
    # none of these (e.g. ISO timestamps, bare YYYYMMDD, two-digit years) are
    # discarded, exactly as before.
    # NOTE(review): %B / %b are locale-dependent, so '%d de %B de %Y' only
    # parses Portuguese month names when a Portuguese LC_TIME locale is active.
    _DATE_FORMATS = (
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%d de %B de %Y',
        '%Y.%m.%d',
        '%B %d, %Y',
        '%b %d, %Y',
        '%d-%m-%Y',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d %b %Y',
    )

    def __init__(self, json_file='groups.json', days=7):
        """Store the configuration; no file or network access happens here.

        Args:
            json_file: Path of the JSON file with the groups and their links.
            days: Number of days to look back when filtering extracted dates.
        """
        self.json_file = json_file
        self.days = days
        self.df = pd.DataFrame()

    def load_data(self):
        """Load the JSON file into ``self.df``, one row per online link.

        The JSON object is read with ``orient='index'`` and the
        ``online_links`` lists are exploded into separate rows. Column names
        are assigned positionally, so each entry is expected to carry exactly
        two fields, the second of which is ``online_links``.

        An empty JSON object yields an empty frame with the expected columns
        instead of raising ``KeyError``.
        """
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(self.json_file, 'r', encoding='utf-8') as file:
            data = json.load(file)

        if not data:
            self.df = pd.DataFrame(
                columns=[*self._BASE_COLUMNS, *self._RESULT_COLUMNS]
            )
            return

        frame = (
            pd.DataFrame.from_dict(data, orient='index')
            .explode('online_links')
            .reset_index()  # explode repeats index labels; make them unique
        )
        frame.columns = list(self._BASE_COLUMNS)

        # Placeholder columns that fetch_data fills in row by row.
        for column in self._RESULT_COLUMNS:
            frame[column] = None

        self.df = frame

    @staticmethod
    def _parse_date(date_str):
        """Return ``date_str`` as a ``date`` using the first matching format, else None."""
        for fmt in GetHtml._DATE_FORMATS:
            try:
                return datetime.strptime(date_str, fmt).date()
            except ValueError:
                continue
        return None

    @staticmethod
    def extract_dates(html, days):
        """Extract date strings from ``html`` that fall within the last ``days`` days.

        Args:
            html: Markup (or plain text) to scan.
            days: Size of the look-back window. A date is kept when it lies
                between ``today - days`` and today, both inclusive.

        Returns:
            The matching date strings exactly as they appear in the text, in
            pattern order and then in order of appearance. A given stretch of
            text is reported once even if several patterns match it.
        """
        if not html:
            return []

        text = BeautifulSoup(html, 'html.parser').get_text()

        # Parsed values have day granularity, so compare calendar dates: using
        # the current time of day would drop dates exactly `days` days old.
        today = datetime.now().date()
        oldest = today - timedelta(days=days)

        seen_spans = set()
        collected_dates = []
        for pattern in GetHtml._DATE_PATTERNS:
            for match in pattern.finditer(text):
                span = match.span()
                if span in seen_spans:
                    # Same text already handled through another pattern.
                    continue
                seen_spans.add(span)

                date_str = match.group()
                parsed = GetHtml._parse_date(date_str)
                if parsed is not None and oldest <= parsed <= today:
                    collected_dates.append(date_str)

        return collected_dates

    def fetch_data(self):
        """Request every link in ``self.df`` and record the outcome per row.

        Requests go through a ``socks5h`` proxy built from the ``PROXY_HOST``
        and ``PROXY_PORT`` environment variables, with a 10 second timeout.
        For each row the status code is stored; on HTTP 200 the body, the
        extracted dates, their count and a has-dates flag are stored as well.
        Request failures are logged and recorded as ``'erro'``.
        """
        proxy_host = os.getenv('PROXY_HOST')
        proxy_port = os.getenv('PROXY_PORT')
        if not proxy_host or not proxy_port:
            # The proxy mapping is still built as before; this only makes the
            # misconfiguration visible instead of failing silently per row.
            logger.warning(
                "PROXY_HOST ou PROXY_PORT não definidos; "
                "as requisições via proxy provavelmente falharão"
            )

        proxy_url = f"socks5h://{proxy_host}:{proxy_port}"
        proxies = {'http': proxy_url, 'https': proxy_url}

        for index, row in self.df.iterrows():
            url = row['links_online']
            try:
                response = requests.get(url, proxies=proxies, timeout=10)
                self.df.at[index, 'response_status_code'] = response.status_code

                if response.status_code == 200:
                    logger.info(f"Acesso bem-sucedido ao link {url}")
                    self.df.at[index, 'html'] = response.text
                    datas = self.extract_dates(response.text, self.days)
                    self.df.at[index, 'infos'] = bool(datas)
                    self.df.at[index, 'datas'] = datas
                    self.df.at[index, 'qtd_datas'] = len(datas)
                else:
                    logger.warning(f"Status {response.status_code} ao acessar {url}")

            except requests.RequestException as e:
                logger.error(f"Erro ao acessar {url}: {e}")
                self.df.at[index, 'response_status_code'] = 'erro'

    def run(self):
        """Load the links, fetch them and return the resulting DataFrame."""
        self.load_data()
        self.fetch_data()
        return self.df

# Usage example: load the groups file, fetch every link and log the result.
# `result_df` stays at module level so the following cell can display it.
if __name__ == "__main__":
    get_html = GetHtml()
    result_df = get_html.run()
    logger.info(f"DataFrame gerado:\n{result_df}")


In [2]:
# Display the frame produced by GetHtml.run() in the cell above.
result_df

Unnamed: 0,grupo,link_grupo,links_online,response_status_code,html,infos,datas,qtd_datas
0,lorenz,https://cti.fyi/groups/lorenz.html,http://lorenzmlwpzgxq736jzseuterytjueszsvznuib...,erro,,,,
1,ragnarlocker,https://cti.fyi/groups/ragnarlocker.html,http://rgleaktxuey67yrgspmhvtnrqtgogur35lwdrup...,200,<!DOCTYPE html>\r\n<html>\r\n<style>\r\n\r\n\r...,False,[],0
2,ragnarlocker,https://cti.fyi/groups/ragnarlocker.html,http://ragnarnwvli32xnmwudsvhbl7klzmofxeylyhcq...,200,<!DOCTYPE html>\r\n<html>\r\n<style>\r\n\r\n\r...,False,[],0
3,clop,https://cti.fyi/groups/clop.html,http://santat7kpllt6iyvqbr7q4amdv6dzrh6paatvyr...,200,<!DOCTYPE html>\n<html lang=en><title>Access Q...,False,[],0
4,clop,https://cti.fyi/groups/clop.html,http://toznnag5o3ambca56s2yacteu7q7x2avrfherzm...,erro,,,,
...,...,...,...,...,...,...,...,...
194,dispossessor,https://cti.fyi/groups/dispossessor.html,http://e27z5kd2rjsern2gpgukhcioysqlfquxgf7rxpv...,erro,,,,
195,nullbulge,https://cti.fyi/groups/nullbulge.html,https://nullbulge.se/blog.html,403,,,,
196,nullbulge,https://cti.fyi/groups/nullbulge.html,https://nullbulge.co/blog.html,403,,,,
197,lynx,https://cti.fyi/groups/lynx.html,http://lynxblog.net/api/v1/blog/get/announceme...,200,"{""type"":true,""message"":""Success: got announcem...",False,[],0
