In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# URL of the page to scrape
url = "https://www.cagematch.net/?id=8&nr=1&page=8"

# Send a GET request to fetch the page's HTML content.
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)

# Text of every 'QuickResultsHeader' <div>; stays empty when the fetch fails
results = []

if response.status_code != 200:
    print(f"Failed to fetch page: {response.status_code}")
else:
    print("Page fetched successfully!")

    # Only parse a successful response; an error page carries no result headers
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all <div> elements with class 'QuickResultsHeader'
    headers = soup.find_all('div', {'class': 'QuickResultsHeader'})

    # Extract the text content of each header
    results = [header.text.strip() for header in headers]

# Display the extracted data
for result in results:
    print(result)

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Initialize a list to store match data
matches = []


base_url = "https://www.cagematch.net/?id=8&nr=1&page=8"

# Marker text that appears in a result line when the title changed hands
_TITLE_CHANGE_MARKER = 'TITLE CHANGE !!!'


def extract_match_time(match_results_text):
    """
    Extract match time, even if it appears after (c) and before TITLE CHANGE.
    Example:
        "Giulia defeats Roxanne Perez (c) (11:14) - TITLE CHANGE !!!" -> "11:14"

    Returns the "M:SS"/"MM:SS" string of the first parenthesised time found,
    or None when the text contains no such time.
    """
    time_match = re.search(r'\((\d{1,2}:\d{2})\)(?:\s+-\s+TITLE CHANGE !!!)?', match_results_text)
    if time_match:
        return time_match.group(1)
    return None


def replace_and_symbols(name):
    """Replace ' & ' and ' and ' with ', ' and trim commas/spaces from both ends."""
    name = name.replace(" & ", ", ")
    name = name.replace(" and ", ", ")
    return name.strip(", ")


def _determine_finish(lower_results):
    """Map a lower-cased result line to a finish label; 'pinfall' is the fallback."""
    if 'by dq' in lower_results:
        return 'DQ'
    if 'by count out' in lower_results or 'by countout' in lower_results:
        return 'countout'
    if 'by submission' in lower_results:
        return 'submission'
    if 'by no contest' in lower_results:
        return 'no contest'
    if 'double count out' in lower_results:
        return 'double count out'
    return 'pinfall'


def _strip_annotations(side_text):
    """Remove champion marks, '(w/ ...)' notes, match times and a trailing dash from one side."""
    side_text = re.sub(r'\(c\)', '', side_text)
    # NOTE(review): this pattern is greedy, so everything between the first "(w/"
    # and the LAST ")" on the side is removed — confirm that is acceptable.
    side_text = re.sub(r'\(w/.*\)', '', side_text)
    side_text = re.sub(r'\(\d{1,2}:\d{2}\)', '', side_text)
    return re.sub(r'\s+-\s*$', '', side_text)


def _parse_match(match):
    """
    Turn one match <li> element into a record dict.

    Returns None when the match should not be recorded: its MatchType mentions
    "Dark", it has no MatchResults span, or the result line does not contain
    " defeat(s) ".
    """
    # Extract match type from <span class="MatchType">
    match_type_elem = match.find('span', class_='MatchType')
    match_type = match_type_elem.get_text(separator=' ', strip=True) if match_type_elem else None

    # Skip if Match Type contains "Dark" (case-insensitive)
    if match_type and re.search(r'\bdark\b', match_type, re.IGNORECASE):
        return None

    # Drop the trailing colon of the match type label
    match_type = re.sub(r':$', '', match_type) if match_type else None

    # Extract match results from <span class="MatchResults">
    match_results_elem = match.find('span', class_='MatchResults')
    if match_results_elem is None:
        return None
    raw_results = match_results_elem.text

    match_time = extract_match_time(raw_results)

    # Remove time, (c), and TITLE CHANGE before looking for the finish keywords
    match_results_text = re.sub(r'\(c\)', '', raw_results)
    match_results_text = re.sub(
        r'\s*\(\d{1,2}:\d{2}\)(?:\s+-\s+TITLE CHANGE !!!)?', '', match_results_text
    ).strip()
    finish = _determine_finish(match_results_text.lower())

    results_split = re.split(r' defeat[s]? ', raw_results, maxsplit=1)
    if len(results_split) != 2:
        return None
    winners_section, losers_section = results_split

    # Detect the title change on the raw text. Checking after the "by ..." clause
    # is stripped (as before) lost the marker whenever the finish was spelled out,
    # e.g. "... (c) by submission (11:14) - TITLE CHANGE !!!".
    title_change = _TITLE_CHANGE_MARKER in losers_section
    losers_section = losers_section.replace(_TITLE_CHANGE_MARKER, '')

    # Clean names
    winners_section = _strip_annotations(winners_section)
    losers_section = _strip_annotations(losers_section)

    # Drop the finish clause ("by submission", "by DQ", ...) from the losing side
    losers_section = re.sub(r'\s+by\s+\w+.*$', '', losers_section).strip()

    return {
        'Match Type': match_type,
        'Winners': replace_and_symbols(winners_section),
        'Losers': replace_and_symbols(losers_section),
        'Time': match_time,
        'Finish': finish,
        'Title Change': title_change
    }


# Loop through offsets from 0 to 27000, incrementing by 100; every offset after
# the first is passed to the site as the "s" query parameter.
for offset in range(0, 27001, 100):
    url = f"{base_url}&s={offset}" if offset > 0 else base_url
    print(f"Scraping URL: {url}")

    # Send a GET request; a network error skips this page instead of aborting the scrape
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch page: {exc}")
        continue
    if response.status_code != 200:
        print(f"Failed to fetch page: {response.status_code}")
        continue

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all <div> elements representing match sections
    match_sections = soup.find_all('div', class_='QuickResultsHeader')

    # Loop through each match section
    for section in match_sections:
        # Skip if "House Show" or "LFG" is mentioned in the section header (case-insensitive)
        if re.search(r'\b(house show|lfg)\b', section.get_text(strip=True), re.IGNORECASE):
            continue

        # The matches of a section are the <li> items of the <ul> that follows its header
        match_list = section.find_next_sibling('ul')
        match_elements = match_list.find_all('li') if match_list is not None else []

        for match in match_elements:
            match_data = _parse_match(match)
            if match_data is not None:
                matches.append(match_data)

# Convert matches to a DataFrame
df = pd.DataFrame(matches)



In [None]:
import re
from bs4 import BeautifulSoup

# Sample <li> markup for a single match whose MatchType starts with "Dark"
html = '''
<li>
  <span class="MatchType">Dark <a href="?id=5&amp;nr=20">WWE Title</a> / <a href="?id=5&amp;nr=3102">WWE Universal Title</a>:</span>
  <span class="MatchResults">
    <a href="?id=2&amp;nr=3686&amp;name=Cody+Rhodes">Cody Rhodes</a> (c) defeats 
    <a href="?id=2&amp;nr=15859&amp;name=Carmelo+Hayes">Carmelo Hayes</a>
  </span>
</li>
'''

# Build the parse tree for the sample
soup = BeautifulSoup(html, 'html.parser')

# Locate the MatchType span and dump it verbatim (debug step 1)
match_type_elem = soup.find('span', class_='MatchType')
print("RAW MatchType Element:", match_type_elem)

# Flatten the span to text, joining nested fragments with single spaces
if match_type_elem:
    match_type_text = match_type_elem.get_text(separator=' ', strip=True)
else:
    match_type_text = ''

# Show the flattened text with repr() so stray whitespace is visible (debug step 2)
print("Extracted MatchType Text:", repr(match_type_text))

# Whole-word, case-insensitive test for "dark"
def is_dark_match(match_type_text):
    """
    Report whether the standalone word 'dark' occurs in the MatchType text.

    The regex match object (or None) is printed for debugging before the
    boolean verdict is returned.
    """
    found = re.search(r'\bdark\b', match_type_text, re.IGNORECASE)
    print("Regex Match Object:", found)
    return found is not None

# Run the check on the MatchType text extracted from the sample HTML and print the verdict
result = is_dark_match(match_type_text)
print("Contains 'Dark':", result)


In [None]:
import pandas as pd
import re

def split_tag_teams_from_columns(df, winners_col='Winners', losers_col='Losers'):
    """
    Splits tag teams into individual names in specified Winners and Losers columns of an existing DataFrame.
    Example: "#DIY (Johnny Gargano, Tommaso Ciampa)" -> "Johnny Gargano, Tommaso Ciampa"

    Participants listed outside any parentheses are kept, so
    "Cody Rhodes, The Usos (Jey Uso, Jimmy Uso)" -> "Cody Rhodes, Jey Uso, Jimmy Uso".
    Values without a parenthesised group, and NaN values, are returned unchanged.

    Args:
        df (pd.DataFrame): The DataFrame to process. It is modified in place.
        winners_col (str): Name of the column containing Winners.
        losers_col (str): Name of the column containing Losers.

    Returns:
        pd.DataFrame: The same DataFrame object with updated Winners and Losers columns.
    """
    # Content of one parenthesised group (the member list of a team)
    member_pattern = re.compile(r'\((.*?)\)')

    def split_top_level(text):
        """Split text on the commas that are not enclosed in parentheses."""
        parts, current, depth = [], [], 0
        for char in text:
            if char == '(':
                depth += 1
            elif char == ')' and depth > 0:
                depth -= 1
            if char == ',' and depth == 0:
                parts.append(''.join(current))
                current = []
            else:
                current.append(char)
        parts.append(''.join(current))
        return parts

    def split_tag_teams(value):
        if pd.isna(value):  # Handle NaN values
            return value
        if not member_pattern.search(value):
            return value  # Nothing to expand

        individuals = []  # Store individual names
        for entry in split_top_level(value):
            groups = member_pattern.findall(entry)
            if groups:
                # "Team (A, B)": keep the members, drop the team name
                for group in groups:
                    individuals.extend(member.strip() for member in group.split(','))
            else:
                # Plain participant listed next to a team: previously dropped, now kept
                individuals.append(entry.strip())

        individuals = [name for name in individuals if name]
        return ', '.join(individuals) if individuals else value

    # Apply the function to Winners and Losers columns
    if winners_col in df.columns:
        df[winners_col] = df[winners_col].apply(split_tag_teams)
    if losers_col in df.columns:
        df[losers_col] = df[losers_col].apply(split_tag_teams)

    return df

# Expand tag-team entries in df's Winners/Losers columns; the helper updates df in place and returns it
split_tag_teams_from_columns(df)




In [None]:
import re

# Normalise one Winners/Losers cell
def clean_column(value):
    """
    Strip bracketed notes and a trailing dash from a participant string.

    Missing values (anything pd.isna reports as NA) are passed through
    untouched; every other value is returned without "[...]" fragments,
    without a dangling "-" at the end, and without surrounding whitespace.
    """
    if pd.isna(value):
        return value

    # Drop bracketed notes such as [2:0] or [Runde 3]
    without_brackets = re.sub(r'\[.*?\]', '', value)

    # Drop a dash left dangling at the end of the text
    without_dash = re.sub(r'\s*-\s*$', '', without_brackets)

    return without_dash.strip()

# Clean the Winners column first, then the Losers column
for _participant_col in ('Winners', 'Losers'):
    df[_participant_col] = df[_participant_col].apply(clean_column)



In [None]:
# Flag rows that look like Multi-Man matches
def is_multi_man(row):
    """
    Return 'Multi-Man' for a multi-participant match row, otherwise None.

    A row qualifies when its lower-cased 'Match Type' contains one of the
    known multi-man keywords, or when the comma-separated 'Winners' field
    holds exactly one entry while 'Losers' holds more than one.
    """
    multi_man_keywords = ('fatal four way', 'triple threat', 'gauntlet', 'battle royal', 'ten man')

    type_text = str(row['Match Type']).lower()
    losing_side = str(row.get('Losers', '')).split(',')
    winning_side = str(row.get('Winners', '')).split(',')

    # Keyword in the match type wins outright
    for keyword in multi_man_keywords:
        if keyword in type_text:
            return 'Multi-Man'

    # One winner against several losers
    if len(winning_side) == 1 and len(losing_side) > 1:
        return 'Multi-Man'

    return None

# Tag every row with its Multi-Man flag (row-wise apply)
df['Multi-Man'] = df.apply(is_multi_man, axis=1)

# Substrings that mark a match type as carrying a stipulation
stipulation_keywords = [
    'hardcore', 'casket', 'ambulance', 'anything goes', 'sudden death',
    'tables', 'devil\'s playground', 'ladder', 'chair', 'chairs', 'bull rope',
    'strap', 'kendo stick', 'singapore cane', 'steel cage', 'no holds barred',
    'hell in a cell', 'street fight', 'falls count anywhere', 'last man standing',
    'i quit', 'submission', 'buried alive', 'inferno', 'punjabi prison',
    'blindfold', 'lumberjack', 'tribal combat', 'second city strap',
    'elimination chamber', 'tower of doom', 'beat the clock', 'three stages of hell',
    'survivors match', 'iron man', 'texas death', 'extreme rules',
    'best two out of three falls', 'death', 'double dog collar', 'pure rules',
    'new japan rambo', 'wargames', 'barbed wire board'
]


def _stipulation_label(match_type):
    """Return 'Stipulation' when the lower-cased match type contains any keyword, else None."""
    lowered = str(match_type).lower()
    for keyword in stipulation_keywords:
        if keyword in lowered:
            return 'Stipulation'
    return None


# Derive the Stipulation column from the Match Type text
df['Stipulation'] = df['Match Type'].apply(_stipulation_label)

# Update Category column logic
def classify_match_type(match_type):
    """
    Classify a match type label into a coarse category.

    Args:
        match_type: The match type text; any value is accepted and converted
            with str() before the case-insensitive checks.

    Returns:
        '#1 Contendership' for a contendership match that is either not part of
        a tournament or is the tournament final; 'Title' for a title match that
        is not a semi final / tournament first round; 'Battle Royal' for battle
        royals; otherwise None.
    """
    match_type = str(match_type).lower()

    # 'final' is a substring of 'semi final' / 'quarter final', so earlier
    # tournament rounds must be ruled out explicitly before treating a match
    # as the final.
    early_rounds = (
        'semi final', 'semi-final', 'semifinal',
        'quarter final', 'quarter-final', 'quarterfinal',
    )
    is_final = 'final' in match_type and not any(stage in match_type for stage in early_rounds)

    if '#1 contendership' in match_type and (is_final or 'tournament' not in match_type):
        return '#1 Contendership'
    elif 'title' in match_type and not any(kw in match_type for kw in ['semi final', 'tournament first round']):
        return 'Title'
    elif 'battle royal' in match_type:
        return 'Battle Royal'
    else:
        return None

# Apply the classification logic
df['Category'] = df['Match Type'].apply(classify_match_type)

# Ensure Tornado Tag is excluded from Multi-Man
df.loc[df['Match Type'].str.contains('tornado tag', case=False, na=False), 'Multi-Man'] = None

# Display results with Match Type, Category, Multi-Man, and Stipulation
df_subset = df[['Match Type', 'Category', 'Multi-Man', 'Stipulation']]

# Keep only the rows that are NOT flagged as Multi-Man. The mask used to be
# notnull(), which selected exactly the Multi-Man rows and contradicted both
# the variable name and the intent stated here.
df_non_multi_man_only = df[df['Multi-Man'].isnull()]



In [None]:
import pandas as pd

# Initialize ELO ratings
# elo_ratings maps wrestler name -> current rating;
# elo_history maps wrestler name -> list of per-match log entries (dicts).
elo_ratings = {}
elo_history = {}


# Helper function to calculate new ELOs
def calculate_elo(winner_elo, loser_avg_elo, k=32):
    """
    Return the rating points gained by a side rated `winner_elo` for beating
    opposition rated `loser_avg_elo`.

    The gain is k * (1 - expected), where expected is the logistic expected
    score 1 / (1 + 10 ** ((loser_avg_elo - winner_elo) / 400)).
    """
    expected = 1 / (1 + 10 ** ((loser_avg_elo - winner_elo) / 400))
    elo_change = k * (1 - expected)
    return elo_change


def _split_names(field):
    """Split a comma-separated participant field into trimmed, non-empty names."""
    if not isinstance(field, str):
        return []  # Missing value (None/NaN): no participants
    return [name.strip() for name in field.split(',') if name.strip()]


def _log_elo_result(wrestler, match_type, opponents, elo_before, elo_change, outcome):
    """Store the wrestler's new rating and append one entry to their history."""
    elo_after = elo_before + elo_change
    elo_ratings[wrestler] = elo_after
    elo_history.setdefault(wrestler, []).append({
        'Match': match_type,
        'Opponents': ', '.join(opponents),
        'ELO Before': elo_before,
        'ELO Change': elo_change,
        'ELO After': elo_after,
        'Win/Loss': outcome
    })


# Update ELO ratings and log history
def update_elo(match_row):
    """
    Apply one match result to the global elo_ratings / elo_history.

    Args:
        match_row: Mapping-like row with 'Winners' and 'Losers' (comma-separated
            names) and 'Match Type'. Unrated wrestlers start at 1000.

    Each winner gains calculate_elo(own rating, average loser rating); each
    loser drops by calculate_elo(average winner rating, own rating), i.e.
    k times the loser's own expected score. Rows with no usable winner or
    loser names are skipped.
    """
    winners = _split_names(match_row['Winners'])
    losers = _split_names(match_row['Losers'])
    if not winners or not losers:
        return

    # Snapshot every pre-match rating first: both sides must be rated against
    # the opposition as it stood BEFORE this match, not against ratings that
    # were already adjusted a few lines earlier.
    winner_elos = [elo_ratings.get(winner, 1000) for winner in winners]
    loser_elos = [elo_ratings.get(loser, 1000) for loser in losers]
    avg_winner_elo = sum(winner_elos) / len(winner_elos)
    avg_loser_elo = sum(loser_elos) / len(loser_elos)

    for winner, winner_elo in zip(winners, winner_elos):
        elo_change = calculate_elo(winner_elo, avg_loser_elo)
        _log_elo_result(winner, match_row['Match Type'], losers, winner_elo, elo_change, 'Win')

    for loser, loser_elo in zip(losers, loser_elos):
        # The loser gives up what the winning side would gain against them:
        # k * (1 - E_winner) == k * E_loser. The previous call used
        # -k * (1 - E_loser), which punished heavy underdogs the most.
        elo_change = -calculate_elo(avg_winner_elo, loser_elo)
        _log_elo_result(loser, match_row['Match Type'], winners, loser_elo, elo_change, 'Loss')

# The rating code reads the match table under the name df_matches (same object as df)
df_matches = df

# Feed the matches to update_elo from the last row up to the first
# (presumably because the scraped table lists the newest results first — verify)
for _, row in df_matches.iloc[::-1].iterrows():
    update_elo(row)


# Look up the logged ELO history of a single wrestler
def get_elo_history(wrestler_name):
    """
    Return the wrestler's logged matches as a DataFrame (one row per entry in
    elo_history), or a "No matches found ..." message string when the name has
    no history.
    """
    if wrestler_name not in elo_history:
        return f"No matches found for {wrestler_name}."

    # One row per logged match, columns taken from the log entry keys
    return pd.DataFrame(elo_history[wrestler_name])

# Example usage: fetch the logged ELO history for 'Giulia'.
# NOTE(review): the variable is named cody_history although the lookup is for 'Giulia' — confirm which was intended.
cody_history = get_elo_history('Giulia')
cody_history

In [None]:
# Leaderboard: one row per wrestler, highest current ELO first
elo_df = pd.DataFrame(list(elo_ratings.items()), columns=['Wrestler', 'ELO'])
elo_df = elo_df.sort_values(by='ELO', ascending=False)
elo_df


In [None]:
# Example usage: fetch the logged ELO history for 'Candice LeRae'
# (a DataFrame, or a message string when the name has no recorded matches)
candice_history = get_elo_history('Candice LeRae')
candice_history

In [None]:
import matplotlib.pyplot as plt

# Flatten the per-wrestler history into one long table
def create_elo_history_df(elo_history):
    """
    Build a DataFrame with one row per logged match per wrestler.

    Columns: 'Wrestler', 'Match' (the entry's 'Match' value), 'ELO' (the
    entry's 'ELO After' value) and 'Match Number' (1-based position of the
    entry in that wrestler's list).
    """
    rows = [
        {
            'Wrestler': name,
            'Match': entry['Match'],
            'ELO': entry['ELO After'],
            'Match Number': number,
        }
        for name, entries in elo_history.items()
        for number, entry in enumerate(entries, start=1)
    ]
    return pd.DataFrame(rows)

# Long-format table of every logged rating
elo_history_df = create_elo_history_df(elo_history)

# One line per wrestler, drawn in order of first appearance in the table
plt.figure(figsize=(14, 8))
for wrestler, wrestler_data in elo_history_df.groupby('Wrestler', sort=False):
    match_numbers = wrestler_data['Match Number']
    ratings = wrestler_data['ELO']
    plt.plot(match_numbers, ratings, label=wrestler)

    # Put the wrestler's name (black, 8pt) at the right-hand end of the line
    plt.text(
        x=match_numbers.iloc[-1],
        y=ratings.iloc[-1],
        s=wrestler,
        fontsize=8,
        ha='left',
        va='center',
        color='black'
    )

# Titles, axis labels and grid
plt.title("ELO Ratings Over Matches")
plt.xlabel("Match Number")
plt.ylabel("ELO Rating")
plt.grid()
plt.tight_layout()
plt.show()


In [None]:
# Example usage: fetch the logged ELO history for 'Gunther'
# (a DataFrame, or a message string when the name has no recorded matches)
history = get_elo_history('Gunther')
history




In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Rebuild the ratings from scratch while tracking the average ELO after each match.
# update_elo mutates the global elo_ratings / elo_history, so replaying the matches
# on top of the already-populated state would count every match a second time and
# corrupt the ratings and histories used by the other cells.
elo_ratings.clear()
elo_history.clear()

# Initialize tracking for average ELO over time
average_elo_over_time = []

# Replay the matches in the same (reversed) order as the main rating pass, so the
# state left behind is identical to a single clean pass.
for _, row in df_matches[::-1].iterrows():
    update_elo(row)  # This function updates ELO ratings for each match

    # Calculate the average ELO after processing this match
    if elo_ratings:  # Guard against division by zero before anyone is rated
        average_elo = sum(elo_ratings.values()) / len(elo_ratings)
        average_elo_over_time.append(average_elo)

# Plot the average ELO over time against the 1000-point starting rating
plt.figure(figsize=(12, 6))
plt.plot(range(len(average_elo_over_time)), average_elo_over_time, label='Average ELO', color='blue')
plt.axhline(y=1000, color='red', linestyle='--', alpha=0.7, label='Baseline (1000 ELO)')
plt.title('Average ELO Over Time', fontsize=16)
plt.xlabel('Match Index', fontsize=12)
plt.ylabel('Average ELO', fontsize=12)
plt.legend(fontsize=10)
plt.grid(alpha=0.5)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Turn the nested history mapping into a flat, plot-ready table
def create_elo_history_df(elo_history):
    """
    Return a DataFrame holding every logged match of every wrestler.

    Each row carries the wrestler's name ('Wrestler'), the logged 'Match'
    value, the rating after that match ('ELO', taken from 'ELO After') and the
    1-based sequence number of the match for that wrestler ('Match Number').
    """
    table_rows = []
    for wrestler_name, logged_matches in elo_history.items():
        for sequence_number, logged in enumerate(logged_matches, start=1):
            row = {}
            row['Wrestler'] = wrestler_name
            row['Match'] = logged['Match']
            row['ELO'] = logged['ELO After']
            row['Match Number'] = sequence_number
            table_rows.append(row)
    return pd.DataFrame(table_rows)

# Long-format table of every logged rating
elo_history_df = create_elo_history_df(elo_history)

# Rank wrestlers by the 'ELO After' value of their last logged entry, highest first
final_elos = {name: log[-1]['ELO After'] for name, log in elo_history.items()}
sorted_wrestlers = sorted(final_elos.items(), key=lambda entry: entry[1], reverse=True)
top_20_wrestlers = [name for name, _ in sorted_wrestlers[:20]]
bottom_20_wrestlers = [name for name, _ in sorted_wrestlers[-20:]]

# Keep only the history rows that belong to the selected wrestlers
selected_rows = elo_history_df['Wrestler'].isin(top_20_wrestlers + bottom_20_wrestlers)
filtered_elo_history_df = elo_history_df[selected_rows]

# One labelled line per selected wrestler
plt.figure(figsize=(14, 8))
for wrestler in filtered_elo_history_df['Wrestler'].unique():
    belongs_to_wrestler = filtered_elo_history_df['Wrestler'] == wrestler
    wrestler_data = filtered_elo_history_df[belongs_to_wrestler]
    match_numbers = wrestler_data['Match Number']
    ratings = wrestler_data['ELO']
    plt.plot(match_numbers, ratings, label=wrestler)

    # Put the wrestler's name (black, 8pt) at the right-hand end of the line
    plt.text(
        x=match_numbers.iloc[-1],
        y=ratings.iloc[-1],
        s=wrestler,
        fontsize=8,
        ha='left',
        va='center',
        color='black'
    )

# Titles, axis labels, grid, and a legend anchored to the right of the axes
plt.title("Top 20 and Bottom 20 ELO Ratings Over Matches")
plt.xlabel("Match Number")
plt.ylabel("ELO Rating")
plt.grid()
plt.tight_layout()
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=8)
plt.show()
