In [7]:
import requests
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup

import kenpompy

In [8]:
import requests
from bs4 import BeautifulSoup
from _DESAdapter import DESAdapter, environment_requires_DES_adapter
import mechanicalsoup

def login(email, password):
    """
    Logs in to kenpom.com using user credentials and returns an authenticated session.
    
    Args:
        email (str): User e-mail for login to kenpom.com.
        password (str): User password for login to kenpom.com.
        
    Returns:
        session (requests.Session): Authenticated session with full access to kenpom.com.
    """

    # Fix for Cloudflare SSL profiling
    session = requests.Session()
    if environment_requires_DES_adapter():
        session.mount('https://kenpom.com/', DESAdapter())

    # Create a StatefulBrowser to manage login
    browser = mechanicalsoup.StatefulBrowser(session)
    browser.set_user_agent('Mozilla/5.0')
    browser.open('https://kenpom.com/index.php')

    if 'Cloudflare' in browser.page.title.string:
        raise Exception('Opening kenpom.com failed - request was intercepted by Cloudflare protection')

    # Select and fill the login form
    browser.select_form('form[action="handlers/login_handler.php"]')
    browser['email'] = email
    browser['password'] = password

    # Submit login form
    response = browser.submit_selected()

    if response.status_code != 200 or 'PHPSESSID=' not in response.headers.get('set-cookie', ''):
        raise Exception('Logging in to kenpom.com failed - check that the site is available and your credentials are correct.')
    
    if 'subscription expired' in str(browser.get('https://kenpom.com/index.php').content):
        raise Exception('Logging in to kenpom.com failed - account subscription is expired')

    # Extract cookies from browser to use in the requests session
    for cookie in browser.session.cookies:
        session.cookies.set(cookie.name, cookie.value)
    
    return session
    
def get_table_data(email, password, table_url):
    """
    Logs in to KenPom and retrieves a table from a specified URL.

    Args:
        email (str): User e-mail for login to kenpom.com.
        password (str): User password for login to kenpom.com.
        table_url (str): URL of the page containing the table to retrieve.

    Returns:
        pd.DataFrame: Parsed table data.
    """
    # Define the first 11 columns
    columns = ["Rk", "Team", "Conf", "AdjEM", "AdjO", "AdjO_Rk", "AdjD", "AdjD_Rk", "AdjT", "AdjT_Rk"]

    session = login(email, password)
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Referer": "https://kenpom.com/",
        "Accept-Language": "en-US,en;q=0.9",
    }
    
    response = session.get(table_url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        table = soup.find('table', {'id': 'ratings-table'})

        data = []
        for row in table.find_all('tr')[2:]:  # Skip first two header rows
            row_data = []
            cells = row.find_all(['td', 'th'])

            # Only parse the first 11 cells, then skip to the next row
            for i in range(min(10, len(cells))):  # Ensure no index error
                row_data.append(cells[i].get_text(strip=True))

            # Only add row if it has all 11 columns filled
            if len(row_data) == 10:
                data.append(row_data)

        # Create DataFrame
        return pd.DataFrame(data, columns=columns)
    else:
        raise Exception(f"Failed to load data page: {response.status_code}")

email = "nick@ncaainsiders.com"
password = "dIMMITT2021$"
date = pd.to_datetime("2024-11-06")
formatted_date = date.strftime('%Y-%m-%d')
table_url = f"https://kenpom.com/archive.php?d={formatted_date}"  # Adjust URL format as needed
print(f"Fetching data for {formatted_date}...")

table_data = get_table_data(email, password, table_url)
table_data['date'] = str(date)
print(table_data.shape)

Fetching data for 2024-11-06...


LinkNotFoundError: 

In [4]:
table_data.to_csv("today_stats.csv")

In [9]:
from kenpompy.utils import login

email = "nick@ncaainsiders.com"
password = "dIMMITT2021$"
# Returns an authenticated browser that can then be used to scrape pages that require authorization.
browser = login(email, password)
import kenpompy.summary as kp

# Returns a pandas dataframe containing the efficiency and tempo stats for the current season (https://kenpom.com/summary.php).
eff_stats = kp.get_efficiency(browser)

LinkNotFoundError: 

In [2]:
eff_stats.to_csv("today_stats.csv")

In [6]:
import time

df = pd.DataFrame()
email = "nick@ncaainsiders.com"
password = "dIMMITT2021$"
start_date = "2024-11-06"
end_date = "2024-11-14"

date_range = pd.date_range(start=start_date, end=end_date)

for date in date_range:
        formatted_date = date.strftime('%Y-%m-%d')
        table_url = f"https://kenpom.com/archive.php?d={formatted_date}"  # Adjust URL format as needed
        print(f"Fetching data for {formatted_date}...")

        table_data = get_table_data(email, password, table_url)
        table_data['date'] = str(date)
        print(table_data.shape)
        df = pd.concat([df, table_data])
        time.sleep(1)

Fetching data for 2024-11-06...


LinkNotFoundError: 

Fetching data for 2024-11-06...


LinkNotFoundError: 

In [4]:
df.to_csv("data/2024_11_06.csv")

In [5]:
df

Unnamed: 0,Rk,Team,Conf,AdjEM,AdjO,AdjO_Rk,AdjD,AdjD_Rk,AdjT,AdjT_Rk,date
0,1,Baylor1,B12,+30.10,117.1,5,87.0,4,68.7,171,2022-01-01 00:00:00
1,2,Gonzaga1,WCC,+30.01,119.4,2,89.4,14,73.0,25,2022-01-01 00:00:00
2,3,Houston5,Amer,+27.54,115.0,7,87.5,5,66.3,297,2022-01-01 00:00:00
3,4,Kansas1,B12,+27.21,118.5,3,91.3,31,72.1,39,2022-01-01 00:00:00
4,5,Purdue3,B10,+27.16,121.6,1,94.4,59,68.3,191,2022-01-01 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...
368,Rk,Team,Conf,AdjEM,AdjO,AdjD,AdjT,Rk,AdjEM,AdjO,2024-11-04 00:00:00
369,361,IU Indy,Horz,-21.83,90.3,356,112.2,364,71.0,177,2024-11-04 00:00:00
370,362,Maryland Eastern Shore,MEAC,-22.47,85.5,363,108.0,327,71.0,179,2024-11-04 00:00:00
371,363,Coppin St.,MEAC,-22.58,86.7,362,109.3,349,70.0,268,2024-11-04 00:00:00


In [6]:
df1 = pd.read_csv("data/2018_04_07.csv")
df2 = pd.read_csv("data/2022_01_01.csv")
df3 = pd.read_csv("data/2024_11_06.csv")

In [7]:
df = pd.concat([df1,df2,df3])

In [9]:
df.to_csv("data/data.csv")