In [25]:
pip install requests pandas beautifulsoup4 tqdm unicode rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0


In [2]:
import re
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import pandas as pd
from collections import defaultdict

# Sessão persistente para performance
session = requests.Session()

# Armazena as lutas por slug de lutador
lutas_por_lutador = defaultdict(list)

def slugify_name(full_name: str) -> str:
    """
    Converte um nome completo em slug:
    - remove caracteres que não sejam letra, número, espaço ou hífen
    - divide em palavras e junta com hífen
    - transforma em minúsculo
    Ex.: "Márcio de Guerra" → "marcio-de-guerra"
    """
    clean = re.sub(r"[^\w\s-]", "", full_name, flags=re.UNICODE)
    parts = re.split(r"[\s]+", clean.strip())
    return "-".join(parts).lower()

def get_fighters():
    """
    Coleta da página A–Z os hrefs originais (?p=####) de cada lutador,
    garantindo que a URL exista e evitando 404 por slug incorreto.
    """
    url = "https://www.bjjheroes.com/a-z-bjj-fighters-list"
    resp = session.get(url, timeout=10)
    if resp.status_code != 200:
        print("Erro ao acessar a lista de lutadores.")
        return []
    soup = BeautifulSoup(resp.content, 'html.parser')
    fighter_ids = []
    for a in soup.select("td.column-1 a"):
        href = a.get("href")
        if href:
            fighter_ids.append(href)  # ex: "?p=8141"
    return fighter_ids

def scrape_fighter(fighter_id):
    """
    Usa o ID  para acessar o perfil:
    1) extrai o nome real do <h1>
    2) gera um slug amigável via slugify_name()
    3) coleta o histórico de lutas sob esse slug
    Retorna (slug, erro).
    """
    url = f"https://www.bjjheroes.com/bjj-fighters/{fighter_id}"
    try:
        resp = session.get(url, timeout=5)
        if resp.status_code != 200:
            return fighter_id, f"HTTP {resp.status_code}"

        soup = BeautifulSoup(resp.content, 'html.parser')

        # 1) pega nome completo do <h1>
        h1 = soup.find("h1")
        real_name = h1.get_text(strip=True) if h1 else fighter_id
        slug = slugify_name(real_name)

        # 2) localiza o bloco de lutas
        content = soup.find("div", class_="text-content")
        if not content:
            return slug, "sem text-content"
        plug = content.find("div", class_="fighter_info_plug")
        if not plug:
            return slug, "sem fighter_info_plug"
        wrapper = plug.find("div", class_="table-responsive")
        if not wrapper:
            return slug, "sem table-responsive"
        table = wrapper.find("table")
        if not table:
            return slug, "sem <table>"
        tbody = table.find("tbody")
        if not tbody:
            return slug, "sem <tbody>"

        # 3) itera cada linha de luta
        for row in tbody.find_all("tr"):
            cols = row.find_all("td")
            if len(cols) < 8:
                continue
            lutas_por_lutador[slug].append({
                "fighter":     slug,
                "fight_id":    cols[0].get_text(strip=True),
                "opponent":    cols[1].get_text(strip=True),
                "result":      cols[2].get_text(strip=True),
                "method":      cols[3].get_text(strip=True),
                "competition": cols[4].get_text(strip=True),
                "weight":      cols[5].get_text(strip=True),
                "stage":       cols[6].get_text(strip=True),
                "year":        cols[7].get_text(strip=True),
            })

        return slug, None

    except requests.exceptions.RequestException:
        return fighter_id, "erro de rede"

def process_fighters(fighter_ids, max_workers=10):
    """
    Processa em paralelo todos os IDs, exibe progresso e coleta eventuais erros.
    """
    errors = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(scrape_fighter, fid): fid for fid in fighter_ids}
        with tqdm(total=len(futures), desc="Lutadores processados", unit="lutador") as pbar:
            for future in as_completed(futures):
                slug, err = future.result()
                count = len(lutas_por_lutador.get(slug, []))
                pbar.set_postfix_str(f"{slug}: {count} lutas")
                pbar.update(1)
                if err:
                    errors.append(f"{slug} → {err}")

    if errors:
        print("\nErros encontrados:")
        for e in errors:
            print(" •", e)

def salvar_csv(path='historico_lutas.csv'):
    """
    Concatena todas as lutas coletadas em um DataFrame e salva em CSV.
    """
    total = sum(len(v) for v in lutas_por_lutador.values())
    if total == 0:
        print("Nenhuma luta coletada. Verifique os seletores.")
        return
    df = pd.DataFrame([item for sub in lutas_por_lutador.values() for item in sub])
    df.to_csv(path, index=False, encoding='utf-8-sig')
    print(f"✅ {len(df)} lutas salvas em '{path}'.")

if __name__ == "__main__":
    print("📥 Coletando lista de lutadores (IDs)...")
    fighters = get_fighters()
    print(f"🔍 {len(fighters)} perfis encontrados:")
    print(fighters[:20])  # inspecione os primeiros 20 IDs
    process_fighters(fighters, max_workers=10)
    salvar_csv()


📥 Coletando lista de lutadores (IDs)...
🔍 1428 perfis encontrados:
['/?p=8141', '/?p=9246', '/?p=8494', '/?p=390', '/?p=3083', '/?p=14157', '/?p=8814', '/?p=8131', '/?p=1133', '/?p=7478', '/?p=13245', '/?p=6339', '/?p=8968', '/?p=9541', '/?p=13498', '/?p=699', '/?p=1116', '/?p=909', '/?p=4328', '/?p=2189']


Lutadores processados: 100%|██████████| 1428/1428 [05:23<00:00,  4.41lutador/s, vinicius-garcia: 58 lutas]



Erros encontrados:
 • aarae-alexander → sem fighter_info_plug
 • adam-shahir-kayoom → sem fighter_info_plug
 • abmar-barbosa → sem fighter_info_plug
 • admilson-juquinha-brites → sem fighter_info_plug
 • adriana-martins → sem fighter_info_plug
 • adriano-martins → sem fighter_info_plug
 • adriano-silva → sem fighter_info_plug
 • alan-moraes → sem fighter_info_plug
 • beto-carmona → sem fighter_info_plug
 • alberto-crane → sem fighter_info_plug
 • alex-martins → sem fighter_info_plug
 • leka-vieira → sem fighter_info_plug
 • alessandro-capodeferro → sem fighter_info_plug
 • alexandre-baraúna → sem fighter_info_plug
 • alexandre-de-souza → sem fighter_info_plug
 • alexandro-ceconi → sem fighter_info_plug
 • alexandre-paiva-gigi → sem fighter_info_plug
 • alexandre-pulga-pimentel → sem fighter_info_plug
 • alexandre-puga → sem fighter_info_plug
 • alexandre-baiano-santos → sem fighter_info_plug
 • alexsandro-leke-machado → sem fighter_info_plug
 • allan-góes → sem fighter_info_plug
 • al

In [38]:
  df_fights = pd.read_csv('historico_lutas.csv', encoding='utf-8')
  df_fights

Unnamed: 0,fighter,fight_id,opponent,result,method,competition,weight,stage,year
0,adam-ferrara,14004,Joao MiyaoJoao Miyao,L,RNC,No Gi Pan Am.,61KG,SF,2017
1,adam-ferrara,14630,Silvio DuranSilvio Duran,L,Pts: 4x2,Cincinnati Open,70KG,SF,2017
2,adam-ferrara,15144,Thiago MacedoThiago Macedo,L,Choke from back,Houston Open,70KG,F,2018
3,adam-ferrara,15583,Rene Lopez,L,Adv,Chicago Sp. Open,64KG,SF,2018
4,adam-ferrara,16392,Thiago MacedoThiago Macedo,L,"Pts: 2x2, Adv",Miami SPO,70KG,SF,2018
...,...,...,...,...,...,...,...,...,...
55183,vinicius-garcia,22014,Cody Heller,W,,Atlanta SM Open,ABS,4F,2019
55184,vinicius-garcia,23363,Daniel Olivier,W,Canto choke,New Orleans Open,88KG,SF,2020
55185,vinicius-garcia,23371,Joshua Murdock,W,Points,New Orleans Open,ABS,SF,2020
55186,vinicius-garcia,24039,Kyle Raemisch,W,Mounted X choke,F2W 153,85KG,SPF,2020


In [13]:
print(df_fights.isnull().sum())


fighter           0
fight_id          0
opponent        148
result            0
method         1371
competition      20
weight          657
stage           332
year              0
dtype: int64


In [39]:
# prompt: Veja todos o nome dos lutadores que estao na coluna fighter, após isso,  faça a busca apenas pelos nomes que nao estao nessa coluna em opponent, e me mostre os 20 que mais aparecem

from collections import Counter

if df_fights is not None:
  fighters = df_fights['fighter'].unique()
  opponents = df_fights['opponent'].unique()

  # Find opponents not present as fighters
  missing_fighters = set(opponents) - set(fighters)

  # Count the occurrences of the missing fighters in the 'opponent' column
  opponent_counts = df_fights['opponent'].value_counts()
  missing_fighters_counts = opponent_counts[opponent_counts.index.isin(missing_fighters)]

  # Display the top 20 missing fighters
  print("Top 20 Opponents not present in the 'fighter' column:")
  print(missing_fighters_counts.head(100))
else:
  print("DataFrame not loaded correctly, please check the previous code.")


Top 20 Opponents not present in the 'fighter' column:
opponent
Unknown                           399
Leandro LoLeandro Lo              208
Fellipe AndrewFellipe Andrew      206
Gianni GrippoGianni Grippo        182
Adam WardzinskiAdam Wardzinski    168
                                 ... 
JT TorresJT Torres                 71
Mica GalvaoMica Galvao             71
Roosevelt SousaRoosevelt Sousa     71
Cleber SousaCleber Sousa           71
Lucas PinheiroLucas Pinheiro       71
Name: count, Length: 100, dtype: int64


In [40]:
df_fights = (
    df_fights
    .drop(columns=['method', 'stage'])
    .dropna(subset=['opponent'])
)
df_fights = df_fights[df_fights['opponent'] != 'Unknown']

df_fights

Unnamed: 0,fighter,fight_id,opponent,result,competition,weight,year
0,adam-ferrara,14004,Joao MiyaoJoao Miyao,L,No Gi Pan Am.,61KG,2017
1,adam-ferrara,14630,Silvio DuranSilvio Duran,L,Cincinnati Open,70KG,2017
2,adam-ferrara,15144,Thiago MacedoThiago Macedo,L,Houston Open,70KG,2018
3,adam-ferrara,15583,Rene Lopez,L,Chicago Sp. Open,64KG,2018
4,adam-ferrara,16392,Thiago MacedoThiago Macedo,L,Miami SPO,70KG,2018
...,...,...,...,...,...,...,...
55183,vinicius-garcia,22014,Cody Heller,W,Atlanta SM Open,ABS,2019
55184,vinicius-garcia,23363,Daniel Olivier,W,New Orleans Open,88KG,2020
55185,vinicius-garcia,23371,Joshua Murdock,W,New Orleans Open,ABS,2020
55186,vinicius-garcia,24039,Kyle Raemisch,W,F2W 153,85KG,2020


In [17]:
print(df_fights.isnull().sum())

fighter          0
fight_id         0
opponent         0
result           0
competition     20
weight         650
year             0
dtype: int64


In [41]:
# Preenche NaN em competition e weight com "Unknown"
df_fights['competition'].fillna('Unknown', inplace=True)
df_fights['weight'].fillna('Unknown', inplace=True)

# Verifique
print(df_fights.isnull().sum())


fighter        0
fight_id       0
opponent       0
result         0
competition    0
weight         0
year           0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_fights['competition'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_fights['weight'].fillna('Unknown', inplace=True)


In [42]:
import unicodedata

def normalize_text(s):
    if pd.isna(s):
        return s
    # Remove acentos
    s = unicodedata.normalize('NFKD', s)
    s = s.encode('ascii', 'ignore').decode('utf-8')
    # Converte para minúsculas
    return s.lower().strip()

# Aplica a normalização em df_unico
df_fights['fighter']  = df_fights['fighter'].apply(normalize_text)
df_fights['opponent'] = df_fights['opponent'].apply(normalize_text)

df_fights['fighter'] = (
    df_fights['fighter']
    .str.replace(r'\s+', '-', regex=True)
    .str.strip('-')
)

df_fights['opponent'] = (
    df_fights['opponent']
    .str.replace(r'\s+', '-', regex=True)
    .str.strip('-')
)
# Opcional: converte para categoria para otimizar memória
df_fights['fighter']  = df_fights['fighter'].astype('category')
df_fights['opponent'] = df_fights['opponent'].astype('category')


def corrigir_nome(nome):
    meio = len(nome) // 2
    if nome[:meio] == nome[meio:]:
        return nome[:meio]
    return nome

df_fights['opponent'] = df_fights['opponent'].apply(corrigir_nome)


df_fights

Unnamed: 0,fighter,fight_id,opponent,result,competition,weight,year
0,adam-ferrara,14004,joao-miyao,L,No Gi Pan Am.,61KG,2017
1,adam-ferrara,14630,silvio-duran,L,Cincinnati Open,70KG,2017
2,adam-ferrara,15144,thiago-macedo,L,Houston Open,70KG,2018
3,adam-ferrara,15583,rene-lopez,L,Chicago Sp. Open,64KG,2018
4,adam-ferrara,16392,thiago-macedo,L,Miami SPO,70KG,2018
...,...,...,...,...,...,...,...
55183,vinicius-garcia,22014,cody-heller,W,Atlanta SM Open,ABS,2019
55184,vinicius-garcia,23363,daniel-olivier,W,New Orleans Open,88KG,2020
55185,vinicius-garcia,23371,joshua-murdock,W,New Orleans Open,ABS,2020
55186,vinicius-garcia,24039,kyle-raemisch,W,F2W 153,85KG,2020


In [43]:
# prompt: Veja todos o nome dos lutadores que estao na coluna fighter, após isso,  faça a busca apenas pelos nomes que nao estao nessa coluna em opponent, e me mostre os 20 que mais aparecem

from collections import Counter

if df_fights is not None:
  fighters = df_fights['fighter'].unique()
  opponents = df_fights['opponent'].unique()

  # Find opponents not present as fighters
  missing_fighters = set(opponents) - set(fighters)

  # Count the occurrences of the missing fighters in the 'opponent' column
  opponent_counts = df_fights['opponent'].value_counts()
  missing_fighters_counts = opponent_counts[opponent_counts.index.isin(missing_fighters)]

  # Display the top 20 missing fighters
  print("Top 20 Opponents not present in the 'fighter' column:")
  print(missing_fighters_counts.head(100))
else:
  print("DataFrame not loaded correctly, please check the previous code.")


Top 20 Opponents not present in the 'fighter' column:
opponent
lucas-barbosa       163
roberto-abreu       130
felipe-pena         126
r.-evangelista      115
osvaldo-moizinho    110
                   ... 
carlos-farias        33
orlando-castillo     32
matheus-onda         32
athos-miranda        32
leandro-lima         32
Name: count, Length: 100, dtype: int64


In [44]:
import pandas as pd


# 1) Dicionário manual mapeando opponent → fighter correto
manual_map = {
    'lucas-barbosa':    'lucas-hulk-barbosa',
    'roberto-abreu':    'roberto-cyborg-abreu',
    'felipe-pena':      'felipe-pena-preguica',
    'r.-evangelista':   'ricardo-evangelista',
    'osvaldo-moizinho': 'osvaldo-queixinho-moizinho',
    'rafael-lovato':    'rafael-lovato-jr',
    'helton-jose':      'helton-jose-junior',
    'marcus-almeida':   'marcus-buchecha-almeida',
    'diego-oliveira':   'diego-pato-oliveira',
    'joao-rocha':       'joao-gabriel-rocha',
    'michael-liera':    'michael-liera-jr',
    'c.-negromonte':    'charles-negromonte',
    'leo-saggioro':     'leonardo-saggioro',
    'vitor-oliveira':   'vitor-henrique-oliveira',
    'augusto-mendes':   'augusto-mendes-tanquinho',
    'mica-galvao':      'micael-galvao',
    'rubens-charles':   'rubens-charles-cobrinha',
    'alexandre-jesus':  'alexandre-de-jesus',
    'wellington-luis':  'wellington-alemao-luis',
    'thiago-sa':        'thiago-sa-fortes',
    'henrique-cardoso': 'henrique-cardoso-ceconi',
    'pedro-alex':       'pedro-alex-bombom',
    'israel-sousa':     'israel-sousa-almeida',
    'lucas-valle':      'lucas-valente',
    'diogo-sampaio':    'diogo-sampaio-moreno',
    'celso-vinicius':   'celso-celsinho-venicius',
    'bruno-matias':     'jose-bruno-matias',
    'gilbert-burns':    'gilbert-burns-durinho',
    'levi-jones':       'levi-jones-leary',
    'yan-lucas':               'yan-pica-pau-lucas',
    'leo-nogueira':            'leonardo-nogueira',
    'marcos-martins':          'marcos-petcho-martins',
    'gabriel-costa':           'gabriel-costa-maranhao',
    'gabriel-henrique':        'gabriel-henrique-oliveira',
    'braga-neto':              'antonio-braga-neto',
    'rodrigo-tatu':            'rodrigo-lopes-tatu',
    'ademir-barreto':          'ademir-barreto-araujo',
    'rafael-anjos':            'rafael-dos-anjos-torres',
    'mikey-musumeci':          'michael-musumeci',
    'valdir-araujo':           'valdir-araujo-bb-monstro',
    'leo-silva':               'leonardo-silva',
    'rodrigo-fajardo':         'rodrigo-francioni',
    'ruan-oliveira':           'ruan-de-oliveira',
    'mauricio-oliveira':       'mauricio-oliveira-neto',
    'yago-souza':              'yago-de-souza',
    'alan-finfou':             'alan-nascimento-finfou',
    'eldar-rafigaev':          'eldar-yakuza-rafigaev',
    'orlando-castillo':        'orlando-sanchez',
    'leandro-lima':            'leandro-rounaud-lima',
    'catriel-oliveira':        'catriel-oliveira-rodrigues',
    'gabriel-oliveira':        'gabriel-de-oliveira',
    'g.-lambertucci':          'guilherme-lambertucci',
    'claudio-mattos':          'claudio-mattos-caloquinha',
    'inacio-santos':           'inacio-dos-santos',
    'bill-cooper':             'bill-the-grill-cooper',
    'luis-oliveira':           'luis-oliveira-cantareira',
    'faisal-alkitbe':          'faisal-al-kitbe',
    'marcio-cruz':             'marcio-cruz-pe-de-pano',
    'marcos-costa':            'marcos-paulo-costa'
}

# 2) Verificação anterior ao replace: quantas ocorrências cada chave tinha
counts_before = df_fights['opponent'].value_counts()
print("Ocorrências antes da substituição:")
print(counts_before.reindex(manual_map.keys(), fill_value=0))

# 3) Aplica apenas essas substituições em opponent
df_fights['opponent'] = df_fights['opponent'].replace(manual_map)

# 4) Converte para categoria (recomendado)
df_fights['opponent'] = df_fights['opponent'].astype('category')

# 5) Verificação após o replace
print("\nTotal de fighters únicos em 'opponent' após mapeamento:", df_fights['opponent'].nunique())

# (Opcional) Confira que as chaves foram realmente substituídas, mostrando zeros
counts_after = df_fights['opponent'].value_counts()
print("\nOcorrências após a substituição (esperado zeros nas chaves):")
print(counts_after.reindex(manual_map.keys(), fill_value=0))

# 6) Se tudo estiver certo, df_limpado agora contém as correções
df_fights


Ocorrências antes da substituição:
opponent
lucas-barbosa        163
roberto-abreu        130
felipe-pena          126
r.-evangelista       115
osvaldo-moizinho     110
rafael-lovato        106
helton-jose          102
marcus-almeida       102
diego-oliveira       100
joao-rocha            99
michael-liera         88
c.-negromonte         86
leo-saggioro          80
vitor-oliveira        78
augusto-mendes        73
mica-galvao           71
rubens-charles        68
alexandre-jesus       67
wellington-luis       67
thiago-sa             66
henrique-cardoso      66
pedro-alex            65
israel-sousa          64
lucas-valle           60
diogo-sampaio         53
celso-vinicius        52
bruno-matias          47
gilbert-burns         47
levi-jones            47
yan-lucas             99
leo-nogueira          50
marcos-martins        49
gabriel-costa         48
gabriel-henrique      44
braga-neto            43
rodrigo-tatu          41
ademir-barreto        38
rafael-anjos          37
mikey-

Unnamed: 0,fighter,fight_id,opponent,result,competition,weight,year
0,adam-ferrara,14004,joao-miyao,L,No Gi Pan Am.,61KG,2017
1,adam-ferrara,14630,silvio-duran,L,Cincinnati Open,70KG,2017
2,adam-ferrara,15144,thiago-macedo,L,Houston Open,70KG,2018
3,adam-ferrara,15583,rene-lopez,L,Chicago Sp. Open,64KG,2018
4,adam-ferrara,16392,thiago-macedo,L,Miami SPO,70KG,2018
...,...,...,...,...,...,...,...
55183,vinicius-garcia,22014,cody-heller,W,Atlanta SM Open,ABS,2019
55184,vinicius-garcia,23363,daniel-olivier,W,New Orleans Open,88KG,2020
55185,vinicius-garcia,23371,joshua-murdock,W,New Orleans Open,ABS,2020
55186,vinicius-garcia,24039,kyle-raemisch,W,F2W 153,85KG,2020


In [29]:
df_fights.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54641 entries, 0 to 55187
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   fighter      54641 non-null  category
 1   fight_id     54641 non-null  int64   
 2   opponent     54641 non-null  category
 3   result       54641 non-null  object  
 4   competition  54641 non-null  object  
 5   weight       54641 non-null  object  
 6   year         54641 non-null  int64   
dtypes: category(2), int64(2), object(3)
memory usage: 3.1+ MB


In [45]:
from rapidfuzz import process, fuzz
import pandas as pd

# 1. Garante que as colunas sejam do tipo string (resolve o erro)
df_fights['opponent'] = df_fights['opponent'].astype(str)
df_fights['fighter'] = df_fights['fighter'].astype(str)

# 2. Nomes únicos da coluna opponent (removendo nulos)
opponent_names = df_fights['opponent'].dropna().unique().tolist()
threshold = 85
matches = {}

# 3. Mapa com nomes corretos (quando opponent == fighter)
opponent_fighter_map = (
    df_fights[df_fights['opponent'] == df_fights['fighter']]
    [['opponent', 'fighter']]
    .drop_duplicates()
    .set_index('opponent')['fighter']
    .to_dict()
)

# 4. Encontra variações ortográficas
for name in opponent_names:
    initial = name.split('-', 1)[0][0]  # primeira letra do primeiro nome
    raw_matches = process.extract(name, opponent_names, scorer=fuzz.token_sort_ratio, limit=10)

    similares = [
        (match, score)
        for match, score, _ in raw_matches
        if score >= threshold
        and match != name
        and match.split('-', 1)[0][0] == initial
    ]

    if similares:
        matches[name] = similares

# 5. Gera sugestões baseadas na coluna 'fighter'
correcoes_sugeridas = {}

for nome_errado, similares in matches.items():
    candidatos_validos = [s for s, _ in similares if s in opponent_fighter_map]

    if candidatos_validos:
        nome_correto = opponent_fighter_map[candidatos_validos[0]]
        correcoes_sugeridas[nome_errado] = nome_correto
    else:
        correcoes_sugeridas[nome_errado] = similares[0][0]

# 6. Exibe as sugestões
print("Sugestões de correção (sem aplicar):")
for errado, certo in correcoes_sugeridas.items():
    print(f"'{errado}' ➜ '{certo}'")


Sugestões de correção (sem aplicar):
'silvio-duran' ➜ 'sivio-duran'
'thiago-macedo' ➜ 'tiago-macedo'
'rene-lopez' ➜ 'renan-lopez'
'rodrigo-francioni' ➜ 'rodrigo-francione'
'rodrigo-oliveira' ➜ 'rodrigo-limera'
'martin-davilla' ➜ 'martin-davila'
'felipe-pena-preguica' ➜ 'felipe-preguica'
'yuri-simoes' ➜ 'yuri-simes'
'igor-silva' ➜ 'igor-da-silva'
'jackson-sousa' ➜ 'jackson-souza'
'abdulbary-guseinov' ➜ 'abdulbari-guseinov'
'basel-fanous' ➜ 'basel-fanos'
'kamil-umiski' ➜ 'kamil-uminski'
'nicolas-penzer' ➜ 'nicholas-penzer'
'pouya-rahman' ➜ 'pouya-rahmani'
'quentin-rosensweig' ➜ 'quentin-rosenzweig'
'neiman-gracie' ➜ 'nieman-gracie'
'joao-soares' ➜ 'jason-soares'
'jonathan-satava' ➜ 'johnathan-satava'
'mahamed-aly' ➜ 'mahamad-aly'
'james-puopolo' ➜ 'james-poupolo'
'victor-hugo' ➜ 'vitor-hugo'
'vitor-schlosser' ➜ 'victor-schlosser'
'ricardo-bastos' ➜ 'rico-bastos'
'joseph-cashin' ➜ 'joe-cashin'
'wellington-peroto' ➜ 'wellington-perotto'
'leo-davila' ➜ 'leo-davilla'
'samir-abdolkader' ➜ 'sa

In [46]:
df_fights['opponent'] = df_fights['opponent'].replace(correcoes_sugeridas)


In [47]:
from rapidfuzz import process, fuzz

cats = df_fights['opponent'].dropna().unique().tolist()
threshold = 85

matches = {}
for cat in cats:
    # inicial do primeiro nome de cat
    cat_initial = cat.split('-', 1)[0][0]

    # busca candidatos
    raw = process.extract(cat, cats, scorer=fuzz.token_sort_ratio, limit=5)

    # só mantém score>=threshold, nomes diferentes, e mesma inicial
    filtered = [
        (match, score)
        for match, score, _ in raw
        if score >= threshold
           and match != cat
           and match.split('-', 1)[0][0] == cat_initial
    ]
    if filtered:
        matches[cat] = filtered

print("Possíveis variações ortográficas em opponent (mesma inicial):")
for cat, vars in matches.items():
    print(f"{cat!r}: {vars}")


Possíveis variações ortográficas em opponent (mesma inicial):
'sivio-duran': [('silvio-duran', 95.65217391304348)]
'tiago-macedo': [('thiago-macedo', 96.0)]
'renan-lopez': [('rene-lopez', 85.71428571428572)]
'rodrigo-francione': [('rodrigo-francioni', 94.11764705882352)]
'rodrigo-limera': [('rodrigo-lima', 92.3076923076923)]
'martin-davila': [('martin-davilla', 96.2962962962963)]
'felipe-preguica': [('felipe-pena-preguica', 85.71428571428572)]
'yuri-simes': [('yuri-simoes', 95.23809523809523)]
'igor-da-silva': [('igor-silva', 86.95652173913044)]
'jackson-souza': [('jackson-sousa', 92.3076923076923)]
'abdulbari-guseinov': [('abdulbary-guseinov', 94.44444444444444)]
'basel-fanos': [('basel-fanous', 95.65217391304348)]
'kamil-uminski': [('kamil-huminski', 96.2962962962963)]
'nicholas-penzer': [('nicolas-penzer', 96.55172413793103), ('nicholas-renier', 86.66666666666667), ('nicholas-reiner', 86.66666666666667)]
'pouya-rahmani': [('pouya-rahman', 96.0)]
'quentin-rosenzweig': [('quentin-rose

In [48]:


# 2) Monte o mapping variant → canonical
mapping = {}
for canonical, variants in matches.items():
    for variant, score in variants:
        mapping[variant] = canonical


# 2) Aplique as substituições na coluna opponent
df_fights['opponent'] = df_fights['opponent'].replace(mapping)


In [49]:
df_fights = df_fights.sort_values('fight_id', ascending=True)
df_fights

Unnamed: 0,fighter,fight_id,opponent,result,competition,weight,year
18768,master-helio-gracie,2,antonio-portugal,W,Unknown,ABS,1932
18769,master-helio-gracie,3,takashi-namiki,D,Unknown,ABS,1932
18770,master-helio-gracie,4,fred-ebert,D,Unknown,ABS,1932
18771,master-helio-gracie,5,wladek-zbyszko,D,Unknown,ABS,1934
18772,master-helio-gracie,6,miyake,W,Unknown,ABS,1934
...,...,...,...,...,...,...,...
51360,manuel-ribamar,58855,seonghyeon-joo,W,Oklahoma CO,ABS,2025
12351,enderson-dias,58856,gabriel-cardoso,W,Oklahoma CO,ABS,2025
47658,vitoria-assis,58857,mona-bailey,W,Oklahoma CO,58KG,2025
11898,emily-fernandez,58858,gabriele-schuck,W,Oklahoma CO,69KG,2025


In [51]:
# Cria uma nova coluna com valores únicos e incrementais
df_fights['id_fight'] = range(1, len(df_fights) + 1)
df_fights = df_fights.drop(columns=['fight_id'])
df_fights

Unnamed: 0,fighter,opponent,result,competition,weight,year,id_fight
18768,master-helio-gracie,antonio-portugal,W,Unknown,ABS,1932,1
18769,master-helio-gracie,takashi-namiki,D,Unknown,ABS,1932,2
18770,master-helio-gracie,fred-ebert,D,Unknown,ABS,1932,3
18771,master-helio-gracie,wladek-zbyszko,D,Unknown,ABS,1934,4
18772,master-helio-gracie,miyake,W,Unknown,ABS,1934,5
...,...,...,...,...,...,...,...
51360,manuel-ribamar,seonghyeon-joo,W,Oklahoma CO,ABS,2025,54637
12351,enderson-dias,gabriel-cardoso,W,Oklahoma CO,ABS,2025,54638
47658,vitoria-assis,mona-bailey,W,Oklahoma CO,58KG,2025,54639
11898,emily-fernandez,gabriele-schuck,W,Oklahoma CO,69KG,2025,54640


In [59]:
# 2) Calcule value_counts e nomeie a Series
weight_counts = df_fights['weight'].value_counts()
weight_counts.name = 'count'
weight_counts.index.name = 'weight_category'

# 3) Converta em DataFrame
weight_counts_df = weight_counts.reset_index()


from rapidfuzz import process, fuzz

cats = df_fights['weight'].dropna().unique().tolist()
threshold = 85

matches = {
    cat: [
        (match, score)
        for match, score, _ in process.extract(cat, cats, scorer=fuzz.token_sort_ratio, limit=5)
        if score >= threshold and match != cat
    ]
    for cat in cats
}

# Exiba só as categorias que têm correspondentes semelhantes
similar = {k: v for k, v in matches.items() if v}
print("Possíveis variações ortográficas em weight:")
for cat, vars in similar.items():
    print(f"{cat!r}: {vars}")


Possíveis variações ortográficas em weight:
'82KG': [('O82KG', 88.88888888888889)]
'76KG': [('O76KG', 88.88888888888889)]
'70KG': [('O70KG', 88.88888888888889)]
'100KG': [('O100KG', 90.9090909090909), ('100KG+', 90.9090909090909)]
'88KG': [('O88KG', 88.88888888888889), ('88K', 85.71428571428572), ('88G', 85.71428571428572)]
'O88KG': [('88KG', 88.88888888888889)]
'77KG': [('U77KG', 88.88888888888889), ('O77KG', 88.88888888888889)]
'99KG': [('O99KG', 88.88888888888889)]
'O99KG': [('99KG', 88.88888888888889)]
'94KG': [('O94KG', 88.88888888888889)]
'O100KG': [('100KG', 90.9090909090909)]
'74KG': [('O74KG', 88.88888888888889)]
'75KG': [('O75KG', 88.88888888888889)]
'O91KG': [('91KG', 88.88888888888889)]
'98KG': [('O98KG', 88.88888888888889)]
'108KG': [('O108KG', 90.9090909090909)]
'O108KG': [('108KG', 90.9090909090909)]
'O75KG': [('75KG', 88.88888888888889)]
'72KG': [('O72KG', 88.88888888888889)]
'81KG': [('O81KG', 88.88888888888889)]
'91KG': [('O91KG', 88.88888888888889)]
'63KG': [('O63KG'

In [60]:
import re

def normalize_weight(w):
    w = str(w).upper().strip()
    # tenta extrair dígitos (p.ex. 62, 100, 88)
    m = re.search(r'(\d+)', w)
    if m:
        return f"{m.group(1)}KG"
    # caso não seja um peso numérico, mantém (ex.: ABS, UNKNOWN)
    return w

# aplica ao seu df_unico
df_fights['weight'] = df_fights['weight'].apply(normalize_weight)

# converte para categoria (opcional, mas economiza memória)
df_fights['weight'] = df_fights['weight'].astype('category')

# confira as novas categorias e frequências
print(df_fights['weight'].value_counts())


weight
ABS      11543
70KG      3788
76KG      3733
82KG      3649
88KG      3515
         ...  
54KG         2
125KG        1
//KG         1
4KG          1
SF           1
Name: count, Length: 80, dtype: int64


In [61]:
standard_kg = [48, 52, 57, 58, 63, 64, 69, 70, 76, 82, 88, 94, 98, 100]

def standardize_weight(w):
    w_str = str(w).upper().strip()
    # Preserve ABS e UNKNOWN como estão
    if w_str in ("ABS", "UNKNOWN"):
        return w_str
    # Tenta extrair o número
    m = re.search(r"(\d+)", w_str)
    if not m:
        # Se não for numérico, vira UNKNOWN
        return "UNKNOWN"
    num = int(m.group(1))
    # Qualquer peso >= 100 vira 100KG (ultra-pesado sem limite)
    if num >= 100:
        return "100KG"
    # Encontre o padrão mais próximo para pesos < 100
    diffs = [(abs(num - std), std) for std in standard_kg if std < 100]
    _, best_std = min(diffs, key=lambda x: x[0])
    return f"{best_std}KG"

# Aplique a padronização
df_fights["weight"] = df_fights["weight"].apply(standardize_weight).astype("category")

# Veja as contagens finais
print(df_fights["weight"].value_counts())


weight
ABS        11543
76KG        8483
82KG        6750
70KG        4701
88KG        4205
94KG        4136
100KG       3955
64KG        2820
69KG        2038
98KG        1445
63KG        1351
57KG        1135
UNKNOWN      746
58KG         614
48KG         468
52KG         251
Name: count, dtype: int64


In [64]:
df_fights = df_fights.sort_values('id_fight', ascending=True)
df_fights

df_fights.to_csv('df_fights_final.csv', index=False)