<a href="https://colab.research.google.com/github/frankwillard/NBA-Rookie-Success-ML-Model/blob/main/Prospect_Rankings_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load Packages

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import numpy as np

## Scrape Data

In [4]:
# Initialize Dataframe
data = []

# Iterate through year pages and append prospect rankings 
for year in range(2010, 2023):
  print(year)
  url = "https://www.nbadraft.net/ranking/bigboard/?year-ranking=" + str(year)
  res = requests.get(url)
  
  soup = BeautifulSoup(res.text)
  table = soup.find("table")

  for row in table.find_all("tr"):
    cols = row.find_all("td")
    prospect = [col.text for col in cols]
    prospect.append(year)
    data.append(prospect)

2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022


## Clean Data

In [5]:
# Turn nested list into pandas dataframe
rankings = pd.DataFrame(data)

# Rename columns and select relevant columns
rankings.columns = ['Ranking', 'Change', 'Name', 'Height', 'Weight', 'Position', 'School', 'Grade', 'Draft Year']
rankings = rankings[rankings['Name'].notna()][['Name', 'Draft Year', 'Ranking', 'School', 'Grade']]

# Add space between first and last name
rankings['Name'] = rankings['Name'].apply(lambda x: re.sub(r"(\w)([A-Z])", r"\1 \2", x))

# Add categorical variable for most recent basketball experience
rankings['Experience'] = 'College'
rankings.loc[~rankings['Grade'].isin(['Fr.', 'So.', 'Jr.', 'Sr.', 'HSSr.']), 'Experience'] = 'International'
rankings.loc[rankings['Grade'] == 'HSSr.', 'Experience'] = 'High School'

# Change draft year to int type
rankings['Draft Year'] = rankings['Draft Year'].astype('int')

rankings

Unnamed: 0,Name,Draft Year,Ranking,School,Grade,Experience
1,John Wall,2010,1,Kentucky,Fr.,College
2,Evan Turner,2010,2,Ohio St.,Jr.,College
3,Derrick Favors,2010,3,Georgia Tech,Fr.,College
4,Wesley Johnson,2010,4,Syracuse,Jr.,College
5,Greg Monroe,2010,5,Georgetown,So.,College
...,...,...,...,...,...,...
1308,Collin Gillespie,2022,96,Villanova,Sr.,College
1309,Jee Nathan Williams,2022,97,Buffalo,Sr.,College
1310,Buddy Boeheim,2022,98,Syracuse,Sr.,College
1311,Kai Sotto,2022,99,Adelaide 36ers,2002,International
