In [None]:
import pandas as pd
import json
import requests
import re
import dateparser
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# Send both HTTP and HTTPS requests through the local Tor SOCKS5 endpoint.
proxies = dict.fromkeys(('http', 'https'), 'socks5h://localhost:9050')

# Load the group catalogue from disk.
with open('grupos.json') as file:
    data = json.load(file)

# Build one row per (group, online link) and pre-create the result columns,
# all initialised to None, that the scraping loop fills in later.
df = (
    pd.DataFrame.from_dict(data, orient='index')
    .explode('links_online')
    .reset_index()
    .set_axis(['grupo', 'link_grupo', 'links_online'], axis=1)
    .assign(**dict.fromkeys([
        'response_status_code',
        'html',
        'info_D-7',
        'datas_D-7',
        'qtd_datas_D-7',
    ]))
)




In [None]:
# Regular expressions for the date spellings searched for in the page text.
# Each pattern appears once: a repeated pattern would report every matching
# date twice. Only matches that also parse with one of DATE_FORMATS can be
# returned by extract_dates(); the remaining patterns are kept so that a
# parser can be added for them later.
DATE_PATTERNS = [
    r'\d{4}-\d{2}-\d{2}',           # YYYY-MM-DD
    r'\d{2}/\d{2}/\d{4}',           # DD/MM/YYYY (MM/DD/YYYY text looks the same)
    r'\d{1,2} de \w+ de \d{4}',     # DD de <month> de YYYY
    r'\d{4}\.\d{2}\.\d{2}',         # YYYY.MM.DD
    r'\w+ \d{1,2}, \d{4}',          # Month DD, YYYY

    # Additional formats
    r'\d{2}-\d{2}-\d{4}',           # DD-MM-YYYY
    r'\w{3} \d{1,2}, \d{4}',        # Abbreviated month DD, YYYY
    r'\d{1,2} \w{3} \d{4}',         # DD Mon YYYY
    r'\d{4}/\d{2}/\d{2}',           # YYYY/MM/DD
    r'\d{8}',                       # YYYYMMDD or DDMMYYYY (no delimiters)
    r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})',  # ISO 8601
    r'\w+ de \d{4}',                # <month> de YYYY
    r'\w+ \d{1,2}(?:st|nd|rd|th), \d{4}',  # Month + ordinal day (Oct 1st, 2024)
    r'\d{4}\w{3}\d{2}',             # YYYYMonDD (2024Oct31)
    r'\d{2}/\d{2}/\d{2}',           # DD/MM/YY (10/12/24)
    r'\w{3}, \d{4}',                # Abbreviated month and year (Oct, 2024)
    r'\d{4}-\d{2}',                 # Year and month (2024-10)
    r'\d{4}, \w{3} \d{1,2}(?:st|nd|rd|th)',  # Year, month, ordinal day (2024, Oct 10th)

    # International formats
    r'\d{2}\.\d{2}\.\d{4}',         # German (DD.MM.YYYY)
    r'\d{4}年\d{1,2}月\d{1,2}日',     # Chinese (YYYY年MM月DD日)
]

# strptime formats tried, in order, on every regex match.
# NOTE(review): '%B' is locale-dependent, so Portuguese month names presumably
# only parse when the process runs under a Portuguese locale -- confirm.
DATE_FORMATS = ['%Y-%m-%d', '%d/%m/%Y', '%d de %B de %Y', '%Y.%m.%d', '%B %d, %Y']

_COMPILED_DATE_PATTERNS = [re.compile(pattern) for pattern in DATE_PATTERNS]


def _parse_known_date(date_str):
    """Parse ``date_str`` with the first DATE_FORMATS entry that fits, else return None."""
    for fmt in DATE_FORMATS:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            continue  # try the next format
    return None


def extract_dates(html, days=7, now=None):
    """Return the date strings found in ``html`` that fall inside a recent window.

    The visible text of the document is scanned with every DATE_PATTERNS
    regex. A match is kept when it parses with one of DATE_FORMATS and the
    parsed value lies in ``[now - days, now]``.

    Args:
        html: HTML (or any text) to scan. ``None`` or an empty string yields
            an empty list.
        days: Size of the look-back window in days (default 7).
        now: Reference ``datetime`` for the window; defaults to
            ``datetime.now()`` evaluated once per call.

    Returns:
        list[str]: The matched substrings exactly as they appear in the text,
        grouped by pattern (in DATE_PATTERNS order) and in text order within
        a pattern. A given text span is reported at most once even if
        several patterns match it.
    """
    if not html:
        return []

    if now is None:
        now = datetime.now()
    window_start = now - timedelta(days=days)

    text = BeautifulSoup(html, 'html.parser').get_text()

    recent_dates = []
    seen_spans = set()  # (start, end) offsets already handled
    for pattern in _COMPILED_DATE_PATTERNS:
        for match in pattern.finditer(text):
            span = match.span()
            if span in seen_spans:
                continue  # same text already matched by an earlier pattern
            seen_spans.add(span)

            date_str = match.group(0)
            parsed = _parse_known_date(date_str)
            if parsed is not None and window_start <= parsed <= now:
                recent_dates.append(date_str)

    return recent_dates

In [None]:
# Visit every online link through the Tor proxy and record, per row:
# the HTTP status (or 'erro' on a request failure), the page HTML, and the
# recent dates found in it.
for index, url in df['links_online'].items():
    try:
        response = requests.get(url, proxies=proxies, timeout=30)
    except requests.RequestException as e:
        # Mark the row as failed and report the reason instead of silently
        # discarding the exception.
        df.at[index, 'response_status_code'] = 'erro'
        print(f"Falha ao acessar {url}: {e}")
        continue

    # Store the status code in the 'response_status_code' column.
    df.at[index, 'response_status_code'] = response.status_code
    if response.status_code != 200:
        continue  # only successful responses are stored and analysed

    print("Acesso bem-sucedido!")
    df.at[index, 'html'] = response.text
    datas = extract_dates(response.text)
    df.at[index, 'info_D-7'] = bool(datas)  # True when at least one recent date was found
    df.at[index, 'datas_D-7'] = datas
    df.at[index, 'qtd_datas_D-7'] = len(datas)


# NOTE(review): the index is written as an unnamed first column, which comes
# back as 'Unnamed: 0' when the file is re-read -- consider index=False.
df.to_csv("raw_data.csv")

In [None]:
# Show the scrape results table as the cell output.
df

In [None]:
# Optional: reload a previous run instead of scraping again.
# df = pd.read_csv("raw_data.csv", index_col=False)

# Keep only the pages in which at least one recent date was found.
df_disponivel = df[df['info_D-7'] == True]

# De-duplicate in two chained steps: first drop rows whose HTML repeats an
# earlier row, then keep the first remaining row of each group. Previously
# the second step restarted from df_disponivel, which silently discarded
# the HTML de-duplication.
df_to_analize = (
    df_disponivel
    .drop_duplicates(subset='html')
    .drop_duplicates(subset='grupo')
    .reset_index(drop=True)
)

In [63]:
# Show the rows selected for analysis as the cell output.
df_to_analize

Unnamed: 0,grupo,link_grupo,links_online,response_status_code,html,info_D-7,datas_D-7,qtd_datas_D-7
0,ransomhouse,https://cti.fyi/groups/ransomhouse.html,http://zohlm7ahjwegcedoz7lrdrti7bvpofymcayotp7...,200,"{""data"":[{""id"":""a1894b76b7004c75a3a0845799af49...",True,[11/10/2024],1
1,monti,https://cti.fyi/groups/monti.html,http://mblogci3rudehaagbryjznltdp33ojwzkq6hn2p...,200,"<!doctype html>\n<html lang=""en"">\n <head>\n ...",True,[2024-10-07],1
2,play,https://cti.fyi/groups/play.html,http://k7kg3jqxang3wh7hnmaiokchk7qoebupfgoik6r...,200,"<!DOCTYPE html><html lang=""en""><script>functio...",True,"[2024-10-09, 2024-10-11, 2024-10-09, 2024-10-1...",17
3,flocker,https://cti.fyi/groups/flocker.html,http://flock4cvoeqm4c62gyohvmncx6ck2e7ugvyqgyx...,200,"<!DOCTYPE html>\r\n<html lang=""en-US"">\r\n<hea...",True,"[October 10, 2024]",1
4,handala,https://cti.fyi/groups/handala.html,https://handala-hack.to,200,"<!DOCTYPE html>\n<html lang=""en-US"">\n<head>\n...",True,"[2024-10-10, 2024-10-08, 2024-10-08, 2024-10-06]",4


In [None]:
# Pick the stored HTML of the row labelled 4 in the analysis table.
html_content = df_to_analize.loc[4, 'html']

In [67]:
# Inputs for the extraction below: the HTML and the recent dates stored in
# the row labelled 4 of the analysis table.
html_content = df_to_analize.loc[4, 'html']
target_dates = df_to_analize.loc[4, 'datas_D-7']

def extract_info_from_html(html_content, target_dates):
    """Collect title, date and site for posts published on one of ``target_dates``.

    Every ``<li class="wp-block-post">`` element is inspected. The post date
    is the part of its ``<time datetime="...">`` attribute before the ``T``.
    Posts without that attribute are skipped; a missing title link or
    excerpt no longer aborts the whole extraction.

    Args:
        html_content: HTML of the listing page. ``None`` or empty yields ``[]``.
        target_dates: Container of date strings (as produced by splitting the
            ``datetime`` attribute at ``T``) to keep. ``None`` or empty
            yields ``[]``.

    Returns:
        list[dict]: One ``{'title', 'date', 'site'}`` dict per matching post,
        in document order. ``title`` is ``None`` when the post has no
        ``<h2><a>`` link; ``site`` is ``None`` when nothing that looks like a
        host name is found in the excerpt.
    """
    if not html_content or not target_dates:
        return []

    # Built once per call rather than once per post.
    site_pattern = re.compile(r'\b(?:https?://|www\.)?([\w.-]+(?:\.[a-z]{2,}))\b')

    soup = BeautifulSoup(html_content, 'html.parser')

    results = []
    for li in soup.find_all('li', class_='wp-block-post'):
        time_tag = li.find('time')
        datetime_attr = time_tag.get('datetime') if time_tag is not None else None
        if not datetime_attr:
            continue  # no date to compare against target_dates

        date = datetime_attr.split('T')[0]  # keep only the date part
        if date not in target_dates:
            continue

        heading = li.find('h2')
        link = heading.find('a') if heading is not None else None
        title = link.text if link is not None else None

        excerpt = li.find('p', class_='wp-block-post-excerpt__excerpt')
        description = excerpt.text if excerpt is not None else ''

        # First host-like token in the excerpt, if any.
        site_match = site_pattern.search(description)
        site = site_match.group(0) if site_match else None

        results.append({
            'title': title,
            'date': date,
            'site': site
        })

    return results

# Call the function and print each result dict on its own line
results = extract_info_from_html(html_content, target_dates)
for result in results:
    print(result)


{'title': 'Doscast Hacked', 'date': '2024-10-10', 'site': 'doscast.co.il'}
{'title': 'Ambassador of Israel in Germany Emails', 'date': '2024-10-08', 'site': None}
{'title': 'Max Shop Hacked', 'date': '2024-10-08', 'site': None}
{'title': 'IIB ( Israeli Industrial Batteries ) Leaked', 'date': '2024-10-06', 'site': None}
