In [16]:
#We will be using sports-reference.com to gather all of our data. It is a great site with tons of stats for every major sport.

In [17]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

In [18]:
#First we want to scrape the team stats of all available NCAA Men's Basketball teams

In [19]:
#Source pages for the 2019 and 2018 regular-season school stats tables
basic_stats_url19 = 'https://www.sports-reference.com/cbb/seasons/2019-school-stats.html'
basic_stats_url18 = 'https://www.sports-reference.com/cbb/seasons/2018-school-stats.html'

#Header labels for the team-stats csv, one per stat category (school name first)
basic_stats = [
    'SCHOOL', 'RANK', 'GAMES', 'WINS', 'LOSS', 'W/L%', 'SRS', 'SOS',
    'CONF. W', 'CONF. L',
    'HOME W', 'HOME L',
    'AWAY W', 'AWAY L',
    'POINTS SCORED', 'POINTS ALLOWED', 'MINUTES PLAYED',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    'FT', 'FTA', 'FT%',
    'OFFENSIVE REBOUNDS', 'TOTAL REBOUNDS',
    'ASISTS', 'STEALS', 'BLOCKS', 'TURN OVERS', 'PF',
]

In [20]:
#This function scrapes the given webpage and saves the information to a csv file
#Setting it up in this way allows us to grab either basic or advanced stats of every team in any year we choose
def NCAA_scrape_teams(url,csv_name,stats_cols,timeout=30):
    """Scrape one team-stats table from ``url`` and write it to ``csv_name``.

    The page is fetched and parsed first; the csv file is only created (or
    overwritten) once the table rows have been located, so a failed request
    does not wipe out a previously scraped file.

    Parameters
    ----------
    url : str
        Page to download.
    csv_name : str
        Path of the csv file to write. Any existing file is replaced.
    stats_cols : list
        Header row written as the first line of the csv file.
    timeout : float, optional
        Seconds to wait for the server before giving up (passed to
        ``requests.get``).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    AttributeError
        If the page has no element with class ``table_outer_container`` or
        that element has no ``tbody``.
    """
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()  #fail loudly on an error page instead of parsing it
    soup = BeautifulSoup(page.text, 'html.parser')
    data = soup.find(class_='table_outer_container').find('tbody').find_all('tr')

    #Open the output once and keep the writer for every row
    with open(csv_name,'w',newline='') as writefile:
        writer = csv.writer(writefile)
        writer.writerow(stats_cols)

        for row in data:
            #The table is broken up by repeated header rows; those carry no link, so skip them
            link = row.find('a')
            if link is None:
                continue

            #Each output row is the school name followed by its numeric stats
            stats = [link.contents[0]]

            for cell in row.find_all(class_='right'):
                #There are empty cells in the table; they are left out of the row
                if not cell.contents:
                    continue
                stats.append(float(cell.contents[0]))

            writer.writerow(stats)

In [22]:
#Call the function and save all of 2019 and 2018 team statistics to a csv file

#2019 team statistics
NCAA_scrape_teams(
    url=basic_stats_url19,
    csv_name='NCAA_Team_Data_Basic19.csv',
    stats_cols=basic_stats,
)
#2018 team statistics
NCAA_scrape_teams(
    url=basic_stats_url18,
    csv_name='NCAA_Team_Data_Basic18.csv',
    stats_cols=basic_stats,
)

In [23]:
#Now we want to gather the results of each regular season game from any given NCAA season

#2019 Season
#Matchup-finder query for 2019; the url ends in 'offset=' so that NCAA_reg_season_scraper
#can append the result offset of each page it requests
base_games19 ='https://www.sports-reference.com/cbb/play-index/matchup_finder.cgi?request=1&year_min=2019&year_max=2019&school_id=&opp_id=&game_type=A&game_month=&game_location=&game_result=&is_overtime=&comp_school=le&comp_opp=le&rank_school=ANY&rank_opp=ANY&order_by=date_game&order_by_asc=&offset='
#Despite the name this is not a page count: it is the exclusive upper bound of the offset,
#which the scraper steps through in increments of 100 (0, 100, ..., 11600)
#NOTE(review): presumably chosen to cover the number of games in the season - confirm
pages19 = 11700

#2018 Season
#Same query as above with the year filters set to 2018
base_games18 = 'https://www.sports-reference.com/cbb/play-index/matchup_finder.cgi?request=1&year_min=2018&year_max=2018&school_id=&opp_id=&game_type=A&game_month=&game_location=&game_result=&is_overtime=&comp_school=le&comp_opp=le&rank_school=ANY&rank_opp=ANY&order_by=date_game&order_by_asc=&offset='
#Exclusive upper bound of the offset for 2018 (offsets 0, 100, ..., 11500)
pages18 = 11600

In [26]:
def NCAA_reg_season_scraper(base_url, page_num, csv_name, timeout=30):
    """Scrape every page of a paginated game-results table into ``csv_name``.

    One url is requested per offset in ``range(0, page_num, 100)``; the offset
    is appended to ``base_url``. Every table row that contains ``td`` cells
    becomes one csv row; cells that parse as integers are stored as ``int``,
    everything else is stored as text.

    Parameters
    ----------
    base_url : str
        Query url ending right where the offset value belongs.
    page_num : int
        Exclusive upper bound of the offset (not a page count).
    csv_name : str
        Path of the csv file to write. Any existing file is replaced.
    timeout : float, optional
        Seconds to wait for each response (passed to ``requests.get``).

    Raises
    ------
    requests.HTTPError
        If any page answers with a 4xx/5xx status code. Rows from pages
        scraped before the failure remain in the csv file.
    AttributeError
        If a page has no element with class ``table_outer_container`` or
        that element has no ``tbody``.
    """
    #To cycle through the many pages of games, build the url of every page from the base url
    scroll = [base_url + str(offset) for offset in range(0, page_num, 100)]

    #Create the csv file once, write the header, and keep the writer for all pages
    with open(csv_name,'w',newline='') as writefile:
        writer = csv.writer(writefile)
        writer.writerow(['Date','Team','Rank','Away?','Opponent','Opp Rank','W/L','Points','Opp Points','Point Diff.','OT'])

        for url in scroll:
            page = requests.get(url, timeout=timeout)
            page.raise_for_status()  #fail loudly on an error page instead of parsing it
            soup = BeautifulSoup(page.text,'html.parser')
            data = soup.find(class_='table_outer_container').find('tbody').find_all('tr')

            for row in data:
                holder = []
                for cell in row.find_all('td'):
                    text = cell.get_text()
                    try:
                        holder.append(int(text))
                    except ValueError:
                        holder.append(text)

                #Rows without any td cell (the blank rows of the original table) are not written
                if not holder:
                    continue

                #Delete some blank spaces in the scraped data. Indices shift after each
                #deletion, so these remove the cells originally at positions 1, 8 and 12.
                #A row too short for a deletion keeps its remaining cells untouched.
                try:
                    del holder[1]
                    del holder[7]
                    del holder[10]
                except IndexError:
                    pass

                writer.writerow(holder)

In [27]:
#Call the function to save all of the 2019 and 2018 games to their respective csv files

#2019 games
NCAA_reg_season_scraper(
    base_url=base_games19,
    page_num=pages19,
    csv_name='NCAA_Reg_Season19.csv',
)
#2018 games
NCAA_reg_season_scraper(
    base_url=base_games18,
    page_num=pages18,
    csv_name='NCAA_Reg_Season18.csv',
)