In [1]:
import getpass
import json
import os
import time
from pathlib import Path

import mechanicalsoup
import pandas as pd
import requests
from bs4 import BeautifulSoup

from _DESAdapter import DESAdapter, environment_requires_DES_adapter

In [2]:
def login(email, password):
    """
    Logs in to kenpom.com using user credentials and returns an authenticated session.

    Args:
        email (str): User e-mail for login to kenpom.com.
        password (str): User password for login to kenpom.com.

    Returns:
        session (requests.Session): Session carrying the cookies obtained during login.

    Raises:
        mechanicalsoup.LinkNotFoundError: If the login form cannot be found on the landing page.
        Exception: If the landing page is not HTML or is a Cloudflare page, if the login
            response is not a 200 carrying a PHPSESSID cookie, or if the account
            subscription is reported as expired.
    """

    # Fix for Cloudflare SSL profiling
    session = requests.Session()
    if environment_requires_DES_adapter():
        session.mount('https://kenpom.com/', DESAdapter())

    # Create a StatefulBrowser to manage login
    browser = mechanicalsoup.StatefulBrowser(session)
    browser.set_user_agent('Mozilla/5.0')
    browser.open('https://kenpom.com/index.php')

    if browser.page is None:
        raise Exception('Opening kenpom.com failed - response was not an HTML page')

    # A page without a <title> (or with an empty one) must not crash the Cloudflare check.
    title_tag = browser.page.title
    page_title = (title_tag.string if title_tag is not None else None) or ''
    if 'Cloudflare' in page_title:
        raise Exception('Opening kenpom.com failed - request was intercepted by Cloudflare protection')

    # Select and fill the login form. The bare LinkNotFoundError carries no message,
    # so re-raise the same exception type with a description of what was missing.
    try:
        browser.select_form('form[action="handlers/login_handler.php"]')
    except mechanicalsoup.LinkNotFoundError as err:
        raise mechanicalsoup.LinkNotFoundError(
            'Login form (action="handlers/login_handler.php") was not found on '
            'https://kenpom.com/index.php - the page layout may have changed or the '
            'request may have been blocked before the form was served.'
        ) from err
    browser['email'] = email
    browser['password'] = password
    # Submit login form
    response = browser.submit_selected()

    if response.status_code != 200 or 'PHPSESSID=' not in response.headers.get('set-cookie', ''):
        raise Exception('Logging in to kenpom.com failed - check that the site is available and your credentials are correct.')

    if 'subscription expired' in str(browser.get('https://kenpom.com/index.php').content):
        raise Exception('Logging in to kenpom.com failed - account subscription is expired')

    # NOTE(review): fixed 5 second pause kept from the original; presumably a
    # courtesy delay before the next request - confirm whether it is still needed.
    time.sleep(5)

    # The browser was constructed around `session`, so it normally already holds every
    # cookie. Only copy when the browser ended up with a different session object;
    # copying a jar onto itself would add entries while iterating over it.
    if browser.session is not session:
        for cookie in list(browser.session.cookies):
            # Preserve domain/path so the cookie stays scoped to the site that set it.
            session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path)

    return session
    
def _parse_ratings_table(html, columns):
    """Parses the table with id ``ratings-table`` out of ``html`` into a DataFrame with ``columns``."""
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find('table', {'id': 'ratings-table'})
    if table is None:
        # Without this check the failure surfaces as an opaque AttributeError on None.
        raise Exception("Failed to load data page: no table with id 'ratings-table' was found")

    width = len(columns)
    data = []
    for row in table.find_all('tr')[2:]:  # Skip first two header rows
        cells = row.find_all(['td', 'th'])
        # Rows with fewer cells than expected columns are dropped;
        # cells beyond the expected width are ignored.
        if len(cells) < width:
            continue
        data.append([cell.get_text(strip=True) for cell in cells[:width]])

    return pd.DataFrame(data, columns=columns)


def get_table_data(email, password, table_url):
    """
    Logs in to KenPom and retrieves a table from a specified URL.

    Args:
        email (str): User e-mail for login to kenpom.com.
        password (str): User password for login to kenpom.com.
        table_url (str): URL of the page containing the table to retrieve.

    Returns:
        pd.DataFrame: One row per table row that has at least 21 cells, holding the
            stripped text of its first 21 cells under the column names defined below.

    Raises:
        Exception: If the page does not return HTTP 200 or contains no table with
            id ``ratings-table``. Errors raised by ``login`` propagate unchanged.
    """
    # Names for the 21 leading cells of each table row
    columns = ["Rk", "Team", "Conf", "W-L", "AdjEM", "AdjO", "AdjO_Rk", "AdjD", "AdjD_Rk", "AdjT", "AdjT_Rk", "Luck", "Luck_Rk", "SOS","SOS1","SOS2","SOS3","SOS4","SOS5","SOS6","SOS7"]

    session = login(email, password)
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Referer": "https://kenpom.com/",
        "Accept-Language": "en-US,en;q=0.9",
    }
    response = session.get(table_url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Failed to load data page: {response.status_code}")

    return _parse_ratings_table(response.content, columns)

# Credentials must never be stored in the notebook. They are read from the
# KENPOM_EMAIL / KENPOM_PASSWORD environment variables, with an interactive
# prompt as a fallback. Any password that was previously committed in this
# cell should be considered leaked and rotated.
email = os.environ.get("KENPOM_EMAIL") or input("KenPom e-mail: ")
password = os.environ.get("KENPOM_PASSWORD") or getpass.getpass("KenPom password: ")
table_url = "https://kenpom.com"  # Adjust URL format as needed

table_data = get_table_data(email, password, table_url)
print(table_data.shape)

LinkNotFoundError: 

In [14]:
# Destination for the exported ratings. Defaults to the same Desktop location as
# before, but resolved from the current user's home directory so the notebook is
# not tied to one machine; set KENPOM_OUTPUT_CSV to write somewhere else.
output_csv = Path(
    os.environ.get(
        "KENPOM_OUTPUT_CSV",
        Path.home() / "Desktop" / "lumber" / "ncaab" / "team-stats.csv",
    )
)
# Create the target folder if needed so the export does not fail on a fresh machine.
output_csv.parent.mkdir(parents=True, exist_ok=True)
table_data.to_csv(output_csv)

NameError: name 'table_data' is not defined