<h2>Goal: </h2><br>Aggregate season level NCAA player statistics to career level NCAA player statistics.  Additionally incorporate new data relating to strength of conference.  

Import packages

In [1]:
import pandas as pd
import numpy as np
import getpass
import psycopg2
from urllib.request import urlopen
from bs4 import BeautifulSoup
pd.set_option('display.max_columns', 500)

Create a connection with our database

In [2]:
mypasswd = getpass.getpass()
conn = psycopg2.connect(database = 'cs20_group4',
                              user = 'mwkmr',#replace with pawprint
                              host = 'pgsql.dsa.lan',
                              password = mypasswd)
cursor = conn.cursor()

········


Query that pulls and aggregates data from the database.  See comments in the SQL code for further detail.

In [4]:
cursor = conn.cursor()
query = """

select 
base.*,
most_minutes.assist_percentage,
most_minutes.turnover_percentage,
most_minutes.usage_percentage,
position.position,
team.team_abbreviation,
conference.conference
from
--Base query where all simple aggregations are performed, usually sum or max.  For some percentage fields, the field are here recalculated according to the definitions in the SportsReference package documentation.
	(select 
	sum(assists) as assists,
	sum(blocks) as blocks,
	sum(defensive_rebounds) as defensive_rebounds,
	sum(defensive_win_shares) as defensive_win_shares,
	sum(field_goal_attempts) as field_goal_attempts,
	--field_goal_percentage,
	case when sum(field_goal_attempts) > 0 then
		sum(field_goals)/sum(field_goal_attempts)
		else null end as field_goal_percentage,
	sum(field_goals) as field_goals,
	--free_throw_attempt_rate,
	case when sum(field_goal_attempts) > 0 then
		sum(free_throw_attempts)/sum(field_goal_attempts) 
		else null end as free_throw_attempt_rate,
	sum(free_throw_attempts) as free_throw_attempts,
	--free_throw_percentage,
	case when sum(free_throw_attempts) > 0 then
		sum(free_throws)/sum(free_throw_attempts) 
		else null end as free_throw_percentage,
	sum(free_throws) as free_throws,
	sum(games_played) as games_played,
	sum(games_started) as games_started,
	max(height) as height,
	sum(minutes_played) as minutes_played,
	sum(offensive_rebounds) as offensive_rebounds,
	sum(offensive_win_shares) as offensive_win_shares,
	sum(personal_fouls) as personal_fouls,
	player_id,
	sum(points) as points,
	sum(steals) as steals,
	--three_point_attempt_rate,
	case when sum(field_goal_attempts) > 0 then
		sum(three_point_attempts)/sum(field_goal_attempts) 
		else null end as three_point_attempt_rate,
	sum(three_point_attempts) as three_point_attempts,
	--three_point_percentage,
	case when sum(three_point_attempts) > 0 then
		sum(three_pointers)/sum(three_point_attempts) 
		else null end as three_point_percentage,
	sum(three_pointers) as three_pointers,
	sum(total_rebounds) as total_rebounds,
	sum(turnovers) as turnovers,
	sum(two_point_attempts) as two_point_attempts,
	--two_point_percentage,
	case when sum(two_point_attempts) > 0 then
		sum(two_pointers)/sum(two_point_attempts) 
		else null end as two_point_percentage,
	sum(two_pointers) as two_pointers,
	max(weight) as weight,
	sum(win_shares) as win_shares,
	max(player_season_number) as seasons_played
	from 
	ncaa_player_career_stats
	where
	season >= '2001'
	group by
	player_id) base
--This outer join adds columns for percentages that could not be recaclulated.  Instead, this query takes the value from each of these fields during the season in which a player logged the most minutes of playing time.
left outer join
	(select distinct
	sub1.player_id,
	avg(sub2.assist_percentage) as assist_percentage,
	avg(sub2.turnover_percentage) as turnover_percentage,
	avg(sub2.usage_percentage) as usage_percentage,
	avg(sub2.win_shares_per_40_minutes) as win_shares_per_40_minutes,
	avg(sub2.block_percentage) as block_percentage 
        --In tandem, these two queries filter the dataframe to just the seasons where each player played the most minutes.
		from
			(select
			player_id,
			max(minutes_played) as minutes_played
			from 
			ncaa_player_career_stats
			where
			season >= '2001'
			group by player_id)sub1
		left outer join
			(select
			player_id,
			minutes_played,
			assist_percentage,
			turnover_percentage,
			usage_percentage,
			win_shares_per_40_minutes,
			block_percentage
			from 
			ncaa_player_career_stats)sub2
		on sub1.player_id = sub2.player_id and sub1.minutes_played = sub2.minutes_played
	group by
	sub1.player_id) most_minutes
on base.player_id = most_minutes.player_id		
--Each of the following outer joins brings in a column that is non-numeric.  The mode is used for some of these columns, like school, so if a player transfered, we treat the school he played the most seasons for as his school.  In the case of a tie, a value is arbatrarily chosen.
left outer join
	(select player_id, mode() within group(order by conference) as conference
	from
	ncaa_player_career_stats
	group by
	player_id)conference
on conference.player_id = base.player_id
left outer join
	(select player_id, mode() within group(order by team_abbreviation) as team_abbreviation
	from
	ncaa_player_career_stats
	group by
	player_id)team
on team.player_id = base.player_id
left outer join
	(select player_id, mode() within group(order by position) as position
	from
	ncaa_player_career_stats
	group by
	player_id)position
on position.player_id = base.player_id


;
"""

In [5]:
player_career_stats = pd.read_sql_query(query, con = conn)

In [6]:
player_career_stats

Unnamed: 0,assists,blocks,defensive_rebounds,defensive_win_shares,field_goal_attempts,field_goal_percentage,field_goals,free_throw_attempt_rate,free_throw_attempts,free_throw_percentage,free_throws,games_played,games_started,height,minutes_played,offensive_rebounds,offensive_win_shares,personal_fouls,player_id,points,steals,three_point_attempt_rate,three_point_attempts,three_point_percentage,three_pointers,total_rebounds,turnovers,two_point_attempts,two_point_percentage,two_pointers,weight,win_shares,seasons_played,assist_percentage,turnover_percentage,usage_percentage,position,team_abbreviation,conference
0,146.0,1.0,57.0,0.9,304.0,0.368421,112.0,0.546053,166.0,0.807229,134.0,31.0,25.0,72.0,906.0,12.0,1.2,104.0,aakim-saintil-1,392.0,46.0,0.427632,130.0,0.261538,34.0,69.0,90.0,174.0,0.448276,78.0,165.0,2.1,1.0,0.307,0.190,0.245,Guard,long-island-university,northeast
1,57.0,2.0,46.0,0.8,167.0,0.329341,55.0,0.526946,88.0,0.738636,65.0,31.0,10.0,72.0,705.0,16.0,0.2,102.0,aakim-saintill-1,194.0,32.0,0.419162,70.0,0.271429,19.0,62.0,45.0,97.0,0.371134,36.0,165.0,1.1,1.0,0.165,0.177,0.183,Guard,south-alabama,sun-belt
2,29.0,3.0,22.0,0.1,73.0,0.438356,32.0,0.041096,3.0,0.333333,1.0,65.0,2.0,75.0,359.0,3.0,0.1,40.0,aalim-moor-1,75.0,10.0,0.328767,24.0,0.416667,10.0,25.0,21.0,49.0,0.448980,22.0,194.0,0.3,4.0,0.216,0.259,0.138,Guard,san-jose-state,wac
3,2.0,1.0,8.0,0.1,21.0,0.380952,8.0,0.428571,9.0,0.555556,5.0,17.0,0.0,74.0,73.0,4.0,0.0,7.0,aamahd-walker-1,22.0,5.0,0.238095,5.0,0.200000,1.0,12.0,5.0,16.0,0.437500,7.0,190.0,0.1,2.0,0.063,0.210,0.207,Guard,california-santa-barbara,big-west
4,78.0,2.0,47.0,0.6,202.0,0.361386,73.0,0.143564,29.0,0.965517,28.0,32.0,31.0,70.0,915.0,10.0,0.5,43.0,aamahne-santos-1,217.0,20.0,0.628713,127.0,0.338583,43.0,57.0,49.0,75.0,0.400000,30.0,165.0,1.1,1.0,0.139,0.185,0.138,Guard,jacksonville,atlantic-sun
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34924,33.0,0.0,31.0,0.0,140.0,0.357143,50.0,0.178571,25.0,0.840000,21.0,30.0,15.0,74.0,671.0,8.0,0.3,44.0,zuri-james-1,156.0,19.0,0.671429,94.0,0.372340,35.0,39.0,35.0,46.0,0.326087,15.0,185.0,0.3,1.0,0.098,0.187,0.141,Guard,georgia-southern,southern
34925,121.0,16.0,381.0,2.7,596.0,0.476510,284.0,0.325503,194.0,0.711340,138.0,62.0,49.0,81.0,1721.0,129.0,3.0,180.0,zvonko-buljan-1,759.0,50.0,0.295302,176.0,0.301136,53.0,510.0,196.0,420.0,0.550000,231.0,235.0,5.7,2.0,0.172,0.227,0.276,Forward,texas-christian,mwc
34926,67.0,5.0,81.0,0.2,322.0,0.400621,129.0,0.173913,56.0,0.571429,32.0,51.0,31.0,77.0,,46.0,0.1,81.0,zydrunas-stankus-1,324.0,14.0,0.369565,119.0,0.285714,34.0,127.0,71.0,203.0,0.467980,95.0,195.0,0.1,2.0,,0.143,,Forward,campbell,atlantic-sun
34927,33.0,7.0,113.0,1.2,375.0,0.389333,146.0,0.072000,27.0,0.666667,18.0,108.0,31.0,77.0,1706.0,44.0,1.9,149.0,zygis-sestokas-1,430.0,24.0,0.861333,323.0,0.371517,120.0,157.0,48.0,52.0,0.500000,26.0,200.0,3.1,4.0,,0.138,,Guard,hofstra,colonial


<h2>Strength of Conference</h2><br>Another variable that we want to consider is the strength of each conference.  To get this information we will use beautiful soup to scrape teamrankings.com to get a list of the conferences ranked by <a href=https://en.wikipedia.org/wiki/Rating_percentage_index>RPI</a> (Rating percentage index) at the end of each season, from 2001 to 2020.  We will then average each conference's ranking over those twenty years to get an average conference ranking, which we will use to approximate conference strength.

In [7]:
conf_rnk = pd.DataFrame()
for i in range(2001,2020):
    url = 'https://www.teamrankings.com/ncaa-basketball/rpi-ranking/rpi-rating-by-conf?date={}-05-01.html'.format(i)
    html = urlopen(url)
    soup = BeautifulSoup(html)
    table = (soup.select_one('table'))
    column_headers = [th.getText() for th in table.findAll('th')]
    data_rows = table.findAll('tr')
    data = [[td.getText() for td in data_rows[i].findAll('td')] for i in range(len(data_rows))]
    df = pd.DataFrame(data, columns=column_headers)
    df = df.iloc[1:]
    df = df[['Rank','Conference']]
    df.Rank = df.Rank.astype('int')
    df.Conference = df.Conference.astype('str')
    conf_rnk = conf_rnk.append(df)

Extract the overall conference rank for each conference by taking the mean over 20 years of data.

In [8]:
conf_rnk = conf_rnk.groupby('Conference').mean().reset_index()

Change the format of Conference to match the format in player_career_stats so that the two dataframes can be mereged.

In [9]:
conf_rnk.Conference = conf_rnk.Conference.str.lower()
conf_rnk.Conference = conf_rnk.Conference.str.replace(' ','-')
conf_rnk.Conference = conf_rnk.Conference.str.replace('american','aac')
conf_rnk.Conference = conf_rnk.Conference.str.replace('caa','colonial')
conf_rnk.Conference = conf_rnk.Conference.str.replace('horizon-league','horizon')
conf_rnk.Conference = conf_rnk.Conference.str.replace('mountain-west','mwc')
conf_rnk.Conference = conf_rnk.Conference.str.replace('ohio-valley','ovc')
conf_rnk.Conference = conf_rnk.Conference.str.replace('aac-east','america-east')


<h3>The next several cells write conf_rnk into the database</h3>

In [10]:
query = """select * from ncaa_player_career_stats;"""

In [11]:
db_conf = pd.read_sql_query(query, con = conn)

In [12]:
print(len(db_conf))
db_conf.head()

91480


Unnamed: 0,assist_percentage,assists,block_percentage,blocks,box_plus_minus,conference,defensive_box_plus_minus,defensive_rebound_percentage,defensive_rebounds,defensive_win_shares,effective_field_goal_percentage,field_goal_attempts,field_goal_percentage,field_goals,free_throw_attempt_rate,free_throw_attempts,free_throw_percentage,free_throws,games_played,games_started,height,minutes_played,offensive_box_plus_minus,offensive_rebound_percentage,offensive_rebounds,offensive_win_shares,personal_fouls,player_efficiency_rating,player_id,player_name,points,points_produced,position,season,steal_percentage,steals,team_abbreviation,three_point_attempt_rate,three_point_attempts,three_point_percentage,three_pointers,total_rebound_percentage,total_rebounds,true_shooting_percentage,turnover_percentage,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,weight,win_shares,win_shares_per_40_minutes,player_season_number
0,0.049,20.0,0.006,7.0,,atlantic-sun,,,79.0,0.3,0.573,245.0,0.478,117.0,0.269,66.0,0.697,46.0,29.0,29.0,76.0,845.0,,,33.0,1.9,35.0,,antonio-cool-1,Antonio Cool,327.0,,Forward,2005.0,,24.0,jacksonville,0.473,116.0,0.405,47.0,0.073,112.0,0.592,0.173,58.0,129.0,0.543,70.0,0.19,190.0,2.2,0.104,1.0
1,,41.0,,5.0,,atlantic-sun,,,75.0,-0.1,0.537,349.0,0.458,160.0,0.155,54.0,0.741,40.0,27.0,27.0,76.0,832.0,,,23.0,1.6,35.0,,antonio-cool-1,Antonio Cool,415.0,,Forward,2006.0,,24.0,jacksonville,0.453,158.0,0.348,55.0,,98.0,0.554,0.09,37.0,191.0,0.55,105.0,,190.0,1.5,0.072,2.0
2,0.043,2.0,0.025,2.0,-5.3,atlantic-10,-3.5,0.078,6.0,0.0,0.404,26.0,0.269,7.0,0.154,4.0,0.75,3.0,15.0,0.0,78.0,87.0,-1.8,0.057,5.0,0.0,19.0,4.8,jake-fay-1,Jake Fay,24.0,23.0,Guard,2014.0,0.013,2.0,hartford,0.808,21.0,0.333,7.0,0.067,11.0,0.43,0.152,5.0,5.0,0.0,0.0,0.181,195.0,0.0,0.003,1.0
3,0.079,1.0,0.0,0.0,-24.5,america-east,-5.4,0.052,1.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,,0.0,10.0,0.0,78.0,22.0,-19.1,0.051,1.0,-0.2,0.0,-16.1,jake-fay-1,Jake Fay,0.0,1.0,Guard,2016.0,0.026,1.0,hartford,0.667,6.0,0.0,0.0,0.051,2.0,0.0,0.182,2.0,3.0,0.0,0.0,0.26,195.0,-0.2,-0.365,2.0
4,0.0,0.0,0.067,1.0,-20.2,america-east,-1.6,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,,0.0,8.0,0.0,78.0,15.0,-18.7,0.073,1.0,-0.1,0.0,-11.8,jake-fay-1,Jake Fay,0.0,1.0,Guard,2017.0,0.038,1.0,hartford,0.667,4.0,0.0,0.0,0.037,1.0,0.0,0.143,1.0,2.0,0.0,0.0,0.236,195.0,-0.1,-0.308,3.0


In [13]:
db_conf = pd.merge(db_conf,conf_rnk, left_on='conference',right_on='Conference',how='left')
print(len(db_conf))
db_conf.head()

91480


Unnamed: 0,assist_percentage,assists,block_percentage,blocks,box_plus_minus,conference,defensive_box_plus_minus,defensive_rebound_percentage,defensive_rebounds,defensive_win_shares,effective_field_goal_percentage,field_goal_attempts,field_goal_percentage,field_goals,free_throw_attempt_rate,free_throw_attempts,free_throw_percentage,free_throws,games_played,games_started,height,minutes_played,offensive_box_plus_minus,offensive_rebound_percentage,offensive_rebounds,offensive_win_shares,personal_fouls,player_efficiency_rating,player_id,player_name,points,points_produced,position,season,steal_percentage,steals,team_abbreviation,three_point_attempt_rate,three_point_attempts,three_point_percentage,three_pointers,total_rebound_percentage,total_rebounds,true_shooting_percentage,turnover_percentage,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,weight,win_shares,win_shares_per_40_minutes,player_season_number,Conference,Rank
0,0.049,20.0,0.006,7.0,,atlantic-sun,,,79.0,0.3,0.573,245.0,0.478,117.0,0.269,66.0,0.697,46.0,29.0,29.0,76.0,845.0,,,33.0,1.9,35.0,,antonio-cool-1,Antonio Cool,327.0,,Forward,2005.0,,24.0,jacksonville,0.473,116.0,0.405,47.0,0.073,112.0,0.592,0.173,58.0,129.0,0.543,70.0,0.19,190.0,2.2,0.104,1.0,atlantic-sun,25.0
1,,41.0,,5.0,,atlantic-sun,,,75.0,-0.1,0.537,349.0,0.458,160.0,0.155,54.0,0.741,40.0,27.0,27.0,76.0,832.0,,,23.0,1.6,35.0,,antonio-cool-1,Antonio Cool,415.0,,Forward,2006.0,,24.0,jacksonville,0.453,158.0,0.348,55.0,,98.0,0.554,0.09,37.0,191.0,0.55,105.0,,190.0,1.5,0.072,2.0,atlantic-sun,25.0
2,0.043,2.0,0.025,2.0,-5.3,atlantic-10,-3.5,0.078,6.0,0.0,0.404,26.0,0.269,7.0,0.154,4.0,0.75,3.0,15.0,0.0,78.0,87.0,-1.8,0.057,5.0,0.0,19.0,4.8,jake-fay-1,Jake Fay,24.0,23.0,Guard,2014.0,0.013,2.0,hartford,0.808,21.0,0.333,7.0,0.067,11.0,0.43,0.152,5.0,5.0,0.0,0.0,0.181,195.0,0.0,0.003,1.0,atlantic-10,8.0
3,0.079,1.0,0.0,0.0,-24.5,america-east,-5.4,0.052,1.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,,0.0,10.0,0.0,78.0,22.0,-19.1,0.051,1.0,-0.2,0.0,-16.1,jake-fay-1,Jake Fay,0.0,1.0,Guard,2016.0,0.026,1.0,hartford,0.667,6.0,0.0,0.0,0.051,2.0,0.0,0.182,2.0,3.0,0.0,0.0,0.26,195.0,-0.2,-0.365,2.0,america-east,21.0
4,0.0,0.0,0.067,1.0,-20.2,america-east,-1.6,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,,0.0,8.0,0.0,78.0,15.0,-18.7,0.073,1.0,-0.1,0.0,-11.8,jake-fay-1,Jake Fay,0.0,1.0,Guard,2017.0,0.038,1.0,hartford,0.667,4.0,0.0,0.0,0.037,1.0,0.0,0.143,1.0,2.0,0.0,0.0,0.236,195.0,-0.1,-0.308,3.0,america-east,21.0


In [14]:
db_conf = db_conf.drop(columns='Conference')
db_conf.rename(columns={"Rank" : "avg_conf_rank"},inplace=True)

In [15]:
print(len(db_conf))
db_conf.head()

91480


Unnamed: 0,assist_percentage,assists,block_percentage,blocks,box_plus_minus,conference,defensive_box_plus_minus,defensive_rebound_percentage,defensive_rebounds,defensive_win_shares,effective_field_goal_percentage,field_goal_attempts,field_goal_percentage,field_goals,free_throw_attempt_rate,free_throw_attempts,free_throw_percentage,free_throws,games_played,games_started,height,minutes_played,offensive_box_plus_minus,offensive_rebound_percentage,offensive_rebounds,offensive_win_shares,personal_fouls,player_efficiency_rating,player_id,player_name,points,points_produced,position,season,steal_percentage,steals,team_abbreviation,three_point_attempt_rate,three_point_attempts,three_point_percentage,three_pointers,total_rebound_percentage,total_rebounds,true_shooting_percentage,turnover_percentage,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,weight,win_shares,win_shares_per_40_minutes,player_season_number,avg_conf_rank
0,0.049,20.0,0.006,7.0,,atlantic-sun,,,79.0,0.3,0.573,245.0,0.478,117.0,0.269,66.0,0.697,46.0,29.0,29.0,76.0,845.0,,,33.0,1.9,35.0,,antonio-cool-1,Antonio Cool,327.0,,Forward,2005.0,,24.0,jacksonville,0.473,116.0,0.405,47.0,0.073,112.0,0.592,0.173,58.0,129.0,0.543,70.0,0.19,190.0,2.2,0.104,1.0,25.0
1,,41.0,,5.0,,atlantic-sun,,,75.0,-0.1,0.537,349.0,0.458,160.0,0.155,54.0,0.741,40.0,27.0,27.0,76.0,832.0,,,23.0,1.6,35.0,,antonio-cool-1,Antonio Cool,415.0,,Forward,2006.0,,24.0,jacksonville,0.453,158.0,0.348,55.0,,98.0,0.554,0.09,37.0,191.0,0.55,105.0,,190.0,1.5,0.072,2.0,25.0
2,0.043,2.0,0.025,2.0,-5.3,atlantic-10,-3.5,0.078,6.0,0.0,0.404,26.0,0.269,7.0,0.154,4.0,0.75,3.0,15.0,0.0,78.0,87.0,-1.8,0.057,5.0,0.0,19.0,4.8,jake-fay-1,Jake Fay,24.0,23.0,Guard,2014.0,0.013,2.0,hartford,0.808,21.0,0.333,7.0,0.067,11.0,0.43,0.152,5.0,5.0,0.0,0.0,0.181,195.0,0.0,0.003,1.0,8.0
3,0.079,1.0,0.0,0.0,-24.5,america-east,-5.4,0.052,1.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,,0.0,10.0,0.0,78.0,22.0,-19.1,0.051,1.0,-0.2,0.0,-16.1,jake-fay-1,Jake Fay,0.0,1.0,Guard,2016.0,0.026,1.0,hartford,0.667,6.0,0.0,0.0,0.051,2.0,0.0,0.182,2.0,3.0,0.0,0.0,0.26,195.0,-0.2,-0.365,2.0,21.0
4,0.0,0.0,0.067,1.0,-20.2,america-east,-1.6,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,,0.0,8.0,0.0,78.0,15.0,-18.7,0.073,1.0,-0.1,0.0,-11.8,jake-fay-1,Jake Fay,0.0,1.0,Guard,2017.0,0.038,1.0,hartford,0.667,4.0,0.0,0.0,0.037,1.0,0.0,0.143,1.0,2.0,0.0,0.0,0.236,195.0,-0.1,-0.308,3.0,21.0


These next cells use the sqlalchemy library to upload the data into a new table in our database.

In [16]:
import sqlalchemy
from sqlalchemy import create_engine
engine = create_engine('postgresql://mwkmr:Rapp1296@pgsql.dsa.lan/cs20_group4')#replace with pawrint 


In [18]:
db_conf.to_sql('ncaa_player_career_stats2', engine)

<h3>Combine conf_rnk with the output from the aggregation query</h3>

Merge the dataframes together

In [19]:
player_career_stats = pd.merge(player_career_stats,conf_rnk, left_on='conference',right_on='Conference',how='left')
player_career_stats.rename(columns={"Rank" : "avg_conf_rank"},inplace=True)
player_career_stats.drop(columns=['Conference'],inplace=True)

Write the player_career_stats to a CSV.

In [20]:
player_career_stats

Unnamed: 0,assists,blocks,defensive_rebounds,defensive_win_shares,field_goal_attempts,field_goal_percentage,field_goals,free_throw_attempt_rate,free_throw_attempts,free_throw_percentage,free_throws,games_played,games_started,height,minutes_played,offensive_rebounds,offensive_win_shares,personal_fouls,player_id,points,steals,three_point_attempt_rate,three_point_attempts,three_point_percentage,three_pointers,total_rebounds,turnovers,two_point_attempts,two_point_percentage,two_pointers,weight,win_shares,seasons_played,assist_percentage,turnover_percentage,usage_percentage,position,team_abbreviation,conference,avg_conf_rank
0,146.0,1.0,57.0,0.9,304.0,0.368421,112.0,0.546053,166.0,0.807229,134.0,31.0,25.0,72.0,906.0,12.0,1.2,104.0,aakim-saintil-1,392.0,46.0,0.427632,130.0,0.261538,34.0,69.0,90.0,174.0,0.448276,78.0,165.0,2.1,1.0,0.307,0.190,0.245,Guard,long-island-university,northeast,26.0
1,57.0,2.0,46.0,0.8,167.0,0.329341,55.0,0.526946,88.0,0.738636,65.0,31.0,10.0,72.0,705.0,16.0,0.2,102.0,aakim-saintill-1,194.0,32.0,0.419162,70.0,0.271429,19.0,62.0,45.0,97.0,0.371134,36.0,165.0,1.1,1.0,0.165,0.177,0.183,Guard,south-alabama,sun-belt,14.0
2,29.0,3.0,22.0,0.1,73.0,0.438356,32.0,0.041096,3.0,0.333333,1.0,65.0,2.0,75.0,359.0,3.0,0.1,40.0,aalim-moor-1,75.0,10.0,0.328767,24.0,0.416667,10.0,25.0,21.0,49.0,0.448980,22.0,194.0,0.3,4.0,0.216,0.259,0.138,Guard,san-jose-state,wac,27.0
3,2.0,1.0,8.0,0.1,21.0,0.380952,8.0,0.428571,9.0,0.555556,5.0,17.0,0.0,74.0,73.0,4.0,0.0,7.0,aamahd-walker-1,22.0,5.0,0.238095,5.0,0.200000,1.0,12.0,5.0,16.0,0.437500,7.0,190.0,0.1,2.0,0.063,0.210,0.207,Guard,california-santa-barbara,big-west,24.0
4,78.0,2.0,47.0,0.6,202.0,0.361386,73.0,0.143564,29.0,0.965517,28.0,32.0,31.0,70.0,915.0,10.0,0.5,43.0,aamahne-santos-1,217.0,20.0,0.628713,127.0,0.338583,43.0,57.0,49.0,75.0,0.400000,30.0,165.0,1.1,1.0,0.139,0.185,0.138,Guard,jacksonville,atlantic-sun,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34924,33.0,0.0,31.0,0.0,140.0,0.357143,50.0,0.178571,25.0,0.840000,21.0,30.0,15.0,74.0,671.0,8.0,0.3,44.0,zuri-james-1,156.0,19.0,0.671429,94.0,0.372340,35.0,39.0,35.0,46.0,0.326087,15.0,185.0,0.3,1.0,0.098,0.187,0.141,Guard,georgia-southern,southern,13.0
34925,121.0,16.0,381.0,2.7,596.0,0.476510,284.0,0.325503,194.0,0.711340,138.0,62.0,49.0,81.0,1721.0,129.0,3.0,180.0,zvonko-buljan-1,759.0,50.0,0.295302,176.0,0.301136,53.0,510.0,196.0,420.0,0.550000,231.0,235.0,5.7,2.0,0.172,0.227,0.276,Forward,texas-christian,mwc,10.0
34926,67.0,5.0,81.0,0.2,322.0,0.400621,129.0,0.173913,56.0,0.571429,32.0,51.0,31.0,77.0,,46.0,0.1,81.0,zydrunas-stankus-1,324.0,14.0,0.369565,119.0,0.285714,34.0,127.0,71.0,203.0,0.467980,95.0,195.0,0.1,2.0,,0.143,,Forward,campbell,atlantic-sun,25.0
34927,33.0,7.0,113.0,1.2,375.0,0.389333,146.0,0.072000,27.0,0.666667,18.0,108.0,31.0,77.0,1706.0,44.0,1.9,149.0,zygis-sestokas-1,430.0,24.0,0.861333,323.0,0.371517,120.0,157.0,48.0,52.0,0.500000,26.0,200.0,3.1,4.0,,0.138,,Guard,hofstra,colonial,15.0


Write our data out to csv in case we need it for local usage. Memory issues with pulling data from our database while in the jupyterhub container requires us have a csv backup.

In [21]:
player_career_stats.to_csv('../Data/player_career_stats.csv')

In [24]:
test = pd.read_sql_query("""
SELECT
    *
FROM ncaa_player_career_stats2 st
--WHERE st.conference IS NOT NULL
""", con = conn)
test

Unnamed: 0,index,assist_percentage,assists,block_percentage,blocks,box_plus_minus,conference,defensive_box_plus_minus,defensive_rebound_percentage,defensive_rebounds,defensive_win_shares,effective_field_goal_percentage,field_goal_attempts,field_goal_percentage,field_goals,free_throw_attempt_rate,free_throw_attempts,free_throw_percentage,free_throws,games_played,games_started,height,minutes_played,offensive_box_plus_minus,offensive_rebound_percentage,offensive_rebounds,offensive_win_shares,personal_fouls,player_efficiency_rating,player_id,player_name,points,points_produced,position,season,steal_percentage,steals,team_abbreviation,three_point_attempt_rate,three_point_attempts,three_point_percentage,three_pointers,total_rebound_percentage,total_rebounds,true_shooting_percentage,turnover_percentage,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,weight,win_shares,win_shares_per_40_minutes,player_season_number,avg_conf_rank
0,0,0.049,20.0,0.006,7.0,,atlantic-sun,,,79.0,0.3,0.573,245.0,0.478,117.0,0.269,66.0,0.697,46.0,29.0,29.0,76.0,845.0,,,33.0,1.9,35.0,,antonio-cool-1,Antonio Cool,327.0,,Forward,2005.0,,24.0,jacksonville,0.473,116.0,0.405,47.0,0.073,112.0,0.592,0.173,58.0,129.0,0.543,70.0,0.190,190.0,2.2,0.104,1.0,25.0
1,1,,41.0,,5.0,,atlantic-sun,,,75.0,-0.1,0.537,349.0,0.458,160.0,0.155,54.0,0.741,40.0,27.0,27.0,76.0,832.0,,,23.0,1.6,35.0,,antonio-cool-1,Antonio Cool,415.0,,Forward,2006.0,,24.0,jacksonville,0.453,158.0,0.348,55.0,,98.0,0.554,0.090,37.0,191.0,0.550,105.0,,190.0,1.5,0.072,2.0,25.0
2,2,0.043,2.0,0.025,2.0,-5.3,atlantic-10,-3.5,0.078,6.0,0.0,0.404,26.0,0.269,7.0,0.154,4.0,0.750,3.0,15.0,0.0,78.0,87.0,-1.8,0.057,5.0,0.0,19.0,4.8,jake-fay-1,Jake Fay,24.0,23.0,Guard,2014.0,0.013,2.0,hartford,0.808,21.0,0.333,7.0,0.067,11.0,0.430,0.152,5.0,5.0,0.000,0.0,0.181,195.0,0.0,0.003,1.0,8.0
3,3,0.079,1.0,0.000,0.0,-24.5,america-east,-5.4,0.052,1.0,0.0,0.000,9.0,0.000,0.0,0.000,0.0,,0.0,10.0,0.0,78.0,22.0,-19.1,0.051,1.0,-0.2,0.0,-16.1,jake-fay-1,Jake Fay,0.0,1.0,Guard,2016.0,0.026,1.0,hartford,0.667,6.0,0.000,0.0,0.051,2.0,0.000,0.182,2.0,3.0,0.000,0.0,0.260,195.0,-0.2,-0.365,2.0,21.0
4,4,0.000,0.0,0.067,1.0,-20.2,america-east,-1.6,0.000,0.0,0.0,0.000,6.0,0.000,0.0,0.000,0.0,,0.0,8.0,0.0,78.0,15.0,-18.7,0.073,1.0,-0.1,0.0,-11.8,jake-fay-1,Jake Fay,0.0,1.0,Guard,2017.0,0.038,1.0,hartford,0.667,4.0,0.000,0.0,0.037,1.0,0.000,0.143,1.0,2.0,0.000,0.0,0.236,195.0,-0.1,-0.308,3.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91475,91475,0.128,36.0,0.014,8.0,2.0,cusa,-1.1,0.168,108.0,0.9,0.555,274.0,0.453,124.0,0.489,134.0,0.769,103.0,28.0,27.0,76.0,780.0,3.1,0.033,21.0,2.0,64.0,22.3,kourtlin-jackson-1,Kourtlin Jackson,407.0,360.0,Guard,2016.0,0.025,32.0,southern-mississippi,0.471,129.0,0.434,56.0,0.100,129.0,0.603,0.124,48.0,145.0,0.469,68.0,0.273,190.0,2.9,0.151,2.0,16.0
91476,91476,0.094,31.0,0.005,5.0,,northeast,,,91.0,-0.2,0.581,167.0,0.581,97.0,0.246,41.0,0.659,27.0,26.0,22.0,80.0,612.0,,,53.0,1.1,71.0,,craig-taylor-1,Craig Taylor,221.0,,Center,2002.0,,8.0,long-island-university,0.000,0.0,,0.0,0.126,144.0,0.593,0.139,30.0,167.0,0.581,97.0,0.160,240.0,0.9,0.061,1.0,26.0
91477,91477,,11.0,,1.0,,northeast,,,13.0,0.1,0.512,41.0,0.512,21.0,0.098,4.0,0.250,1.0,20.0,0.0,80.0,125.0,,,11.0,0.1,30.0,,craig-taylor-1,Craig Taylor,43.0,,Center,2003.0,,2.0,long-island-university,0.024,1.0,0.000,0.0,,24.0,0.501,0.219,12.0,40.0,0.525,21.0,,240.0,0.2,0.052,2.0,26.0
91478,91478,,27.0,,4.0,,wac,,,52.0,1.1,0.427,219.0,0.352,77.0,0.311,68.0,0.676,46.0,33.0,24.0,75.0,638.0,,,32.0,0.0,67.0,,marcus-elliot-1,Marcus Elliot,233.0,,Guard,2006.0,,16.0,louisiana-tech,0.525,115.0,0.287,33.0,,84.0,0.464,0.143,42.0,104.0,0.423,44.0,,180.0,1.0,0.064,1.0,27.0


Let's look at the output of our aggregated NCAA player career data.

In [27]:
test.loc[test.avg_conf_rank.notnull()][['player_id', 'player_name', 'conference', 'season', 'avg_conf_rank']]

Unnamed: 0,player_id,player_name,conference,season,avg_conf_rank
0,antonio-cool-1,Antonio Cool,atlantic-sun,2005.0,25.0
1,antonio-cool-1,Antonio Cool,atlantic-sun,2006.0,25.0
2,jake-fay-1,Jake Fay,atlantic-10,2014.0,8.0
3,jake-fay-1,Jake Fay,america-east,2016.0,21.0
4,jake-fay-1,Jake Fay,america-east,2017.0,21.0
...,...,...,...,...,...
91475,kourtlin-jackson-1,Kourtlin Jackson,cusa,2016.0,16.0
91476,craig-taylor-1,Craig Taylor,northeast,2002.0,26.0
91477,craig-taylor-1,Craig Taylor,northeast,2003.0,26.0
91478,marcus-elliot-1,Marcus Elliot,wac,2006.0,27.0
