# Goal:
Create a data table listing the players who are currently active in the NCAA. This table will let us split players into historical (training) and current (test) datasets.

Import libraries to do our work

In [1]:
! pip install lifelines sportsreference
import pandas as pd
import numpy as np
import getpass
import psycopg2
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import QuantileTransformer
from sportsreference.ncaab.roster import Player
# Let pandas render up to 500 columns / rows before truncating the display.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
# sns.set() re-applies seaborn's theme (including its default style), so a
# separate sns.set_style('whitegrid') issued before it is overridden.
# Passing the style and the rc overrides in one call keeps both settings.
sns.set(style = 'whitegrid', rc = {'figure.figsize':(40, 24)})

Collecting lifelines
[?25l  Downloading https://files.pythonhosted.org/packages/e7/cb/31950ed02012b1bd06b63749ae4c6ca46b09f06e4eddb6758a61642d031b/lifelines-0.24.5-py3-none-any.whl (325kB)
[K     |████████████████████████████████| 327kB 1.8MB/s eta 0:00:01
[?25hCollecting sportsreference
[?25l  Downloading https://files.pythonhosted.org/packages/43/f8/58106455dcc98ef602a959881c42a4814bab1bd1211d7d5e341d9b44b914/sportsreference-0.5.2-py3-none-any.whl (472kB)
[K     |████████████████████████████████| 481kB 16.7MB/s eta 0:00:01
[?25hCollecting autograd-gamma>=0.3 (from lifelines)
  Downloading https://files.pythonhosted.org/packages/0a/07/d99339c9420b58b723a9189d1373e5c3889758b2202a1a7fe4a3b7a10c5a/autograd_gamma-0.4.2-py2.py3-none-any.whl
Collecting autograd>=1.3 (from lifelines)
  Downloading https://files.pythonhosted.org/packages/23/12/b58522dc2cbbd7ab939c7b8e5542c441c9a06a8eccb00b3ecac04a739896/autograd-1.3.tar.gz
Collecting pyquery>=1.4.0 (from sportsreference)
  Downloading h

Connect to our database

In [2]:
# Ask for the database password interactively so it never lands in the notebook.
mypasswd = getpass.getpass()

# Open the PostgreSQL connection shared by every later cell.
conn = psycopg2.connect(
    host = 'pgsql.dsa.lan',
    database = 'cs20_group4',
    user = 'fhfrf',  # replace with pawprint
    password = mypasswd,
)

# Cursor used for the DDL / DML statements further down.
cursor = conn.cursor()

········


Let's pull the distinct player IDs from our current NCAA rosters table

In [3]:
# Distinct player ids on the current rosters table, as a plain Python list.
p_ids = pd.read_sql_query(
    """
SELECT
    DISTINCT player_id
FROM ncaa_current_rosters rosters
""",
    con = conn,
)['player_id'].to_list()

Let's loop through the NCAA player data and count up the number of seasons they have played.

In [None]:
# Build one row per (player, season) for every id in p_ids.
#
# Per-player frames are collected in a list and concatenated once at the end;
# concatenating inside the loop re-copies the accumulated frame every iteration.
player_frames = []

for player_id in p_ids:
    try:
        # Construct Player once per id and reuse it for the name and the stats
        # (presumably each construction re-fetches the player's page - confirm).
        # It lives inside the try so a single bad id cannot abort the whole run.
        player = Player(player_id)
        print(player.name)

        seasons = player.dataframe
        seasons = seasons.reset_index().rename({'level_0' : 'season'}, axis = 1)
        # Drop the final row (presumably the career-total row - confirm) and
        # copy so the column assignments below do not write into a slice view.
        seasons = seasons.iloc[:-1, :].copy()
        # 1-based counter of the player's seasons in chronological order;
        # assignment aligns on the index, so the row order of `seasons` is kept.
        seasons['player_season_number'] = seasons.sort_values(by = 'season').groupby(['player_id']).cumcount() + 1
        seasons['player_name'] = player.name
        player_frames.append(seasons)
    except Exception as err:
        # Best effort: skip players that cannot be fetched or parsed, but say
        # so instead of silently swallowing the error. Catching Exception
        # (not a bare except) lets KeyboardInterrupt stop the long-running loop.
        print('Skipping {}: {!r}'.format(player_id, err))

# pd.concat raises on an empty list, so fall back to an empty frame.
active_players = pd.concat(player_frames) if player_frames else pd.DataFrame()

Connor Raines
Daniel Love
Donovan Ivory
Kenan Blackshear
Cossy Kowouto
Patrick Mwamba
Josh Nicholas
Flo Thamba
Isaac Elsasser
Jimbo Lull
Michael Barber
Josh Colon
Daniel Sanders
Kevin McKay
Craig Randall
Richmond Aririguzoh
Jerald Gillens-Butler
Hayden Koval
Jon Teske
Josh Carlton
Trevon Taylor
Coleman Sparling
Ian Steere
Jordan Lathon
Justin Turner
Brandon Williams
Denzel Tchougang
A.J. Labriola
Jordan Hairston
Christian Anigwe
Sean Rhea
Javonte Perkins
Luke Lawson
Ta'Lon Cooper
Trent Frazier
Jacoby Ross
Quinton Morton-Robertson
Joshua Hightower
Temi Aiyegbusi
Will Richardson
Hunter Janacek
Jacob McIntosh
Cameron Shelton
DaQuan Bracey
Chase Verdugo
Devin Butts
Solomon Young
Eric Jamison
Mychael Paulo
Sunday Okeke
Donovann Toatley
Taylor Schildroth
Jeremiah Bailey
Ladan Ricketts
Christian Wilson
Dominic Johnson
Terrance Banyard
Anthony Jones
Vin Baker Jr.
Curtis Cobb
Tyler Polley
C.J. Jackson
Tevin Brewer
Tim Fuller
Ty Gadsden
Brandon Kamga
Greg Eboigbodin
Webster Filmore
Alex Tabor Jr

James Manns
Shaquan Davis
Jordan Phillips
Jalen Windham
Tommy Funk
Keon Clergeot
Wali Parks
Xavier Pinson
Jahmir Young
Aleem Ford
Timmy Allen
Jaylan Gainey
Jason Crandall
Dimencio Vaughn
Chris Lykes
Kolton Mitchell
Marlon Hargis
William McNair
Amir Allen
Richard Harward
Phillip Jones
Tray Boyd III
Coty Jude
Jacob Orender
Rayj Dennis
Tyrese Maxey
C.J. Brim
Lazar Nekic
Nathanael Jack
Vonnie Patterson
Mykel Derring
Jordan Veening
Elijah Olaniyi
Daniel Lobach
Tyson Jolly
Hosana Kitenge
Mitchell Smith
Damian Dunn
Donel Cathcart III
Mackenzie McFatten
Brandon Martin
Tywhon Pickford
Alec Woodard
Antonio Reeves
Geo Baker
Elijah Morgan
Tobias Sjoberg
Ty Hands
Jett Briceno
C.J. Williamson
Ricky Clemons
Blake Lindenmeyer
E.J. Anosike
Justin Jaworski
Key Sullivan
Cade Crosland
Alex Hemenway
Eric Dixon
Mickey Pearson Jr.
Darrell Brown
Keyshawn Embery-Simpson
Filippos Gkogkos
Andrew Robinson
Zion Williams
Nathaniel Ogbu
Vado Morse
Mitch Doherty
Leaky Black
Nico Carvacho
Britton Johnson
Nana Opoku
Ch

Rob Phinisee
Anto Keshgegian
Kessler Edwards
Ryan Betley
Payton Youngblood
Joey Potts
Josh Endicott
MaCio Teague
Oscar Okeke
Cameron Krutwig
Cameron Oluyitan
Stanley Davis
Jake Forrester
Gerald Liddell
Johnny McCoy
John Caldwell
Jack Nunge
Matt Horton
Garrison Wade
Will Dillard
Brandon Battle
Hannes Polla
Tyler Hamilton
Sharone Wright Jr
Chase Coleman
Tim Perry Jr.
Devin Wade-Henderson
DeAngelo Epps
Adrian Chapman
Kashaun Hicks
Darryl Banks III
Arnaldo Toro
Evan Buckley
Fahim Jenneto
Andrew Taylor
Kobe Brown
Josh Thomas
DJ Mitchell
Grayson Murphy
Patrick Urey
Jay Estime'
Travis Light
Kaleb Bishop
Dajour Dickens
Trevin Knell
Jonathan Noel
Khalil Shabazz
Trey Wood
Donnell Gresham Jr.
Alvin Stredic Jr.
Brandon Kimble
Lewis Djonkam
Ajiri Ogemuno-Johnson
Caleb Homesley
Clevon Brown
Noah Saunders
Jaelen House
Isaiah Poor Bear-Chandler
Zoar Nedd
Taze Moore
Martynas Arlauskas
Dominick Pickett
Elijah Harkless
Jahvon Quinerly
Patrick Williams
Nathan Hoover
Tre Williams
Galen Alexander
Dre'Shawn 

David Duke
Jay Yoon
Grant Quinn
Cole Henry
Ashton Smith
Jacob Fleschman
Myles Cephas
Josh Parrish
Alterique Gilbert
Ethan Good
Ivan Hadzic
George Willborn III
Chance Garvin
Zavian Jackson
Cyriaque Foucart
Cordell Pemsl
Kareem Reid
Seneca Knight
Stevie Jordan
Chayce Smith
Khadim Sy
Justin Champagnie
Tray Maddox
Kyle Greeley
Antoine Davis
Jason Jitoboh
Cyrus Largie
Josh Price
Japannah Kellogg III
Zach Chappell
Reed Myers
Malik Miller
Lorenzo Jenkins
Jeron Artest
Ja'Darius Harris
Yannis Mendy
Randy Miller Jr.
Nick Crocker
Brennan Canada
Brady Danielson
Charlie Bagin
Chris Barnes
Jacob Toppin
Shaun Belbey
Yuri Collins
Mike Peake
Jaylen Cross
Brian Thomas
Uros Plavsic
Paul Mulcahy
Paul Scruggs
Myles Pierre
Dylan Arnette
Corbin Merritt
Reggie Parris
Paul Atkinson
Jayvon Graves
Garrett Golday
Mattia Da Campo
Cliff Thomas Jr.
Dericko Williams
Kevin McCullar
Derrick Woods
Adrease Jackson
Lony Francis Jr.
Boubacar Diakite
Travis Wagstaff
Jordan Bohannon
Greg Dolan
Kolton Kohl
Jason Preston
Isaia

Deandre Williams
Isaac Barnes
Dennis Ashley
Keli Leaupepe
Milija Cosic
Owen King
Ryan McAdoo
Danny Harris
Evan Yerkes
Joey Glynn
Ben Swett
Willy Isiani
Tasos Kamateros
Bolden Brace
James Ellis
Savion Lewis
Keon Sellers
Brandon Cyrus
Kobe Wilson
Harouna Sissoko
Kale Catchings
Koby McEwen
Devin Vassell
Christian Willis
Tulio Da Silva
Demilade Adelekun
Aleksandar Zecevic
Abdul Mohamed
Jomaru Brown
Raequan Battle
Nick Stampley
Chudier Bile
Andrew Gordon
Ja'Mare Redus
Osa Wilson
Hunter Seacat
Lorenzo McGhee
Julian Champagnie
Khy Kabellis
Ty Kessinger
Eric Steyer
Isaiah Palermo
Jahmar Young Jr.
Luke Tolliver
Amel Kuljuhovic
Jaylon Wilson
Christiaan Jones
Travis Valmon
Joe Petrakis
Jordan Ash
Jacob Grandison
Trent Forrest
Brandon Jackson
Will Johnson
Lucas Kraljevic
Yusuf Jihad
Mouhamadou Gueye
Dondre Duffus
Adili Kuerban
Avery Anderson III
Amorey Womack
Jacob Gilyard
Mason Forbes
Ameer Bennett
McKay Howell
Trent Stephney
Zan Payne
Isaiah Coleman-Lands
Brady Spence
N'Faly Dante
Rob Edwards
Jo

Darius Henderson
David Roelke
Eugene Omoruyi
David Azore
Mamadi Diakite
Ja'Mere Redus
Izaiah Brockington
Eli Abaev
Kyle Rose
Sean Mobley
Taylor Maughan
Myles Cherry
Jacob Young
Parker Hazen
Tyler Wahl
Bryan Antoine
Connor Gholson
Grant Kersey
Curtis Jones Jr.
Angel Smith
Tyson Ward
Kamaka Hepa
Bryce Aiken
Artur Konontsuk
Jordan Goodwin
Collin Gillespie
Brayden Parker
Dru Smith
Amir Gholizadeh
Kareem South
Grant Greabel
Jordan Whitfield
Leon Freeman-Daniels
Devin Gilmore
Trent Brown
Jack Hemphill
Chris Vogt
Ethan Henderson
Ebby Asamoah
Wheza Panzo
Qwanzi Samuels
Dexter Smith
Mike Adewunmi
Quade Green
Luke McDonnell
Xavier Mayo
Patrick McCaffery
Nigel Marshall
DeShaw Andrews
Malek Harwell
Jordan Hemphill
Preston Parks
Neal Quinn
Jordan Bowden
Marlon Ruffin
Donald Hicks
Abayomi Iyiola
Isaiah Bigelow
Kaleb Coleman
Matt Johns
Armani Dodson
Jalen Riley
Chuck Hannah
Ebrima Dibba
Darius Days
Tahj Eaddy
A.J. Lawson
Malik Battle
Tariq Simmons
Cameron Williams
James Towns
Jalen Hodge
Connor Withe

Jared Rice
Byron Breland III
Josh Peterson
Josh Ayeni
Tre Mann
Nate Louis
Luke Bumbalough
T.J. Weeks
Davonta Jordan
Jackson Gammons
Quentin Millora-Brown
Savion Flagg
Roydell Brown
Aramani Hill
Danya Kingsby
Darius Perry
Joniya Gadson
Terrell Allen
Silas Adheke
Tre Clark
Corey Tillery
Andrew Jones
Quirin Emanga
Trey Wertz
Allan Jeanne-Rose
Maxwell Evans
Decardo Day
Sean Bairstow
J.P. Robinson
Trace Arbuckle
Jarod Lucas
Artur Labinowicz
Robin Duncan
Josh Speidel
Durey Cadwell
J.P. Moorman
Maxime Boursiquot
Noah Baumann
Ramon Singh
Israel Barnes
Andrew Carroll
Zach Walton
Ben Harvey
Josh McNair
Timmy Falls
Tyler Morman
Khalid Moore
Courtney Ramey
Jamal Bieniemy
Brandon Newman
Flynn Cameron
Dre Marin
Jethro Tshisumpa
Khalil Turner
Michael Shipp
Joe Bryant Jr
Christian Scott
CJ Felder
Jules Erving
Kipper Nichols
Tommy Burton
Ben Jordan
Tremere Brown
Will Culliton
Marcus Foster
Precious Ikediashi
Samba Diallo
Devin Collins
Trey Doomes
Tre Jackson
Oscar Kao
Kelton Samore
Alex Reed
Aaron Thom

Save a copy of our data to a local CSV as well as insert into our database.

In [None]:
# Write the combined per-season player rows to a local CSV; index = False
# leaves the DataFrame index out of the file.
active_players.to_csv('../Data/active_players.csv', index = False)

Let's define our `ncaa_active_players_career_stats` data table schema

In [6]:
def _column_definition(name, column):
    """Return the '<name> <sql type>' fragment for one DataFrame column.

    Parameters
    ----------
    name : str
        Column name, used verbatim as the SQL identifier.
    column : pandas.Series
        The column's values; its dtype selects the SQL type.

    Returns
    -------
    str
        'numeric' for any non-boolean numeric dtype, 'TIMESTAMP' for naive
        datetime64 columns, otherwise 'varchar(n)' where n is the longest
        str() representation among the non-None values (at least 1).
    """
    dtype = column.dtype
    # Use pandas' dtype predicates so every integer/float width is recognised,
    # not just float64/int64. Booleans stay textual, as before.
    if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
        return '{} numeric'.format(name)
    if pd.api.types.is_datetime64_dtype(dtype):
        return '{} TIMESTAMP'.format(name)
    # default = 1 covers empty / all-None columns (max() of nothing raises),
    # and the outer max keeps the width >= 1 because varchar(0) is invalid.
    width = max((len(str(x)) for x in column if x is not None), default = 1)
    return '{} varchar({})'.format(name, max(width, 1))


# One column definition per DataFrame column, in DataFrame order.
c_table = [_column_definition(c, active_players[c]) for c in active_players.columns.tolist()]

# Recreate the table from scratch so the cell can be re-run safely.
create_table = 'DROP TABLE IF EXISTS ncaa_active_players_career_stats; '
create_table += 'CREATE TABLE ncaa_active_players_career_stats ({})'.format(', '.join(c_table))

cursor = conn.cursor()
try:
    cursor.execute(create_table)
    conn.commit()
except Exception:
    # Leave the connection usable for later cells instead of stuck in an
    # aborted transaction, then surface the original error.
    conn.rollback()
    raise

Insert our data into `ncaa_active_players_career_stats`

In [7]:
# Replace missing values with None so they are inserted as SQL NULL.
# The cast to object comes first: on float columns pandas coerces None back
# to NaN, which would be written as NaN rather than NULL.
df = active_players.astype(object).where(pd.notnull(active_players), None)

# INSERT statement with one %s placeholder per column; the values are bound by
# the driver at execute time rather than formatted into the SQL string.
INSERT_SQL = 'INSERT INTO ncaa_active_players_career_stats'
INSERT_SQL += '({}) VALUES'.format(', '.join(df.columns))
INSERT_SQL += '({})'.format(','.join(['%s'] * len(df.columns)))

# `with conn` commits when the block succeeds and rolls back if it raises;
# the cursor is closed when the block exits.
with conn, conn.cursor() as cursor:
    for row in df.itertuples(index=False, name=None):
        cursor.execute(INSERT_SQL, row)

Test our new data table with a query

In [8]:
# Sanity check: read the freshly loaded table back into a DataFrame for display.
pd.read_sql_query(
    sql = """
SELECT * FROM ncaa_active_players_career_stats
""",
    con = conn,
)

Unnamed: 0,season,assist_percentage,assists,block_percentage,blocks,box_plus_minus,conference,defensive_box_plus_minus,defensive_rebound_percentage,defensive_rebounds,defensive_win_shares,effective_field_goal_percentage,field_goal_attempts,field_goal_percentage,field_goals,free_throw_attempt_rate,free_throw_attempts,free_throw_percentage,free_throws,games_played,games_started,height,minutes_played,offensive_box_plus_minus,offensive_rebound_percentage,offensive_rebounds,offensive_win_shares,personal_fouls,player_efficiency_rating,player_id,points,points_produced,position,steal_percentage,steals,team_abbreviation,three_point_attempt_rate,three_point_attempts,three_point_percentage,three_pointers,total_rebound_percentage,total_rebounds,true_shooting_percentage,turnover_percentage,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,weight,win_shares,win_shares_per_40_minutes,player_season_number,player_name
0,2019-20,6.9,13.0,1.0,3.0,-0.1,wac,-0.9,4.4,12.0,0.4,0.553,76.0,0.434,33.0,0.368,28.0,0.857,24.0,28.0,6.0,6-6,346.0,0.8,3.2,10.0,0.7,36.0,14.2,connor-raines-1,108.0,98.0,Forward,3.4,21.0,texas-pan-american,0.579,44.0,0.409,18.0,3.8,22.0,0.605,12.7,13.0,32.0,0.469,15.0,14.6,190,1.1,0.122,1.0,Connor Raines
1,2018-19,29.4,34.0,0.6,1.0,-4.5,southern,1.6,16.3,31.0,0.3,0.319,69.0,0.304,21.0,0.217,15.0,0.467,7.0,25.0,0.0,6-3,216.0,-6.1,4.9,9.0,-0.4,23.0,8.1,daniel-love-1,51.0,69.0,Guard,4.0,15.0,mercer,0.246,17.0,0.118,2.0,10.7,40.0,0.335,23.2,23.0,52.0,0.365,19.0,23.2,170,0.0,-0.005,1.0,Daniel Love
2,2019-20,15.9,54.0,0.6,3.0,-2.6,southern,0.8,12.7,68.0,0.8,0.434,143.0,0.399,57.0,0.238,34.0,0.676,23.0,32.0,12.0,6-3,600.0,-3.4,5.8,29.0,0.1,72.0,10.0,daniel-love-1,147.0,168.0,Guard,2.7,29.0,mercer,0.21,30.0,0.333,10.0,9.4,97.0,0.462,22.8,47.0,113.0,0.416,47.0,17.0,170,0.9,0.063,2.0,Daniel Love
3,2019-20,10.0,1.0,0.0,0.0,-14.4,america-east,-3.2,34.1,5.0,0.0,0.333,3.0,0.333,1.0,0.0,0.0,,0.0,3.0,0.0,6-5,17.0,-11.2,0.0,0.0,0.0,0.0,0.6,donovan-ivory-1,2.0,2.0,Guard,0.0,0.0,massachusetts-lowell,0.667,2.0,0.0,0.0,17.2,5.0,0.333,40.0,2.0,1.0,1.0,1.0,14.7,190,0.0,-0.082,1.0,Donovan Ivory
4,2019-20,9.5,27.0,0.5,2.0,-3.3,cusa,0.8,14.1,60.0,0.7,0.353,92.0,0.337,31.0,0.609,56.0,0.661,37.0,31.0,10.0,6-6,495.0,-4.1,6.4,29.0,0.0,46.0,7.8,kenan-blackshear-1,102.0,119.0,Guard,1.6,14.0,florida-atlantic,0.261,24.0,0.125,3.0,10.2,89.0,0.43,21.8,33.0,68.0,0.412,28.0,15.2,215,0.6,0.052,1.0,Kenan Blackshear
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10713,2018-19,12.4,20.0,1.1,3.0,-0.3,aac,1.0,6.6,20.0,0.4,0.521,71.0,0.479,34.0,0.408,29.0,0.724,21.0,28.0,0.0,6-2,315.0,-1.3,1.9,5.0,0.1,20.0,10.1,ceasar-dejesus-1,95.0,92,Guard,1.3,7.0,central-florida,0.268,19.0,0.316,6.0,4.5,25.0,0.56,23.5,26.0,52.0,0.538,28.0,18.4,190,0.6,0.076,2.0,Ceasar DeJesus
10714,2019-20,19.2,71.0,0.1,1.0,-1.5,aac,-0.7,5.5,38.0,0.8,0.54,224.0,0.504,113.0,0.254,57.0,0.754,43.0,30.0,24.0,6-2,789.0,-0.9,0.6,4.0,0.6,48.0,12.1,ceasar-dejesus-1,285.0,270,Guard,1.7,23.0,central-florida,0.192,43.0,0.372,16.0,3.1,42.0,0.568,23.0,75.0,181.0,0.536,97.0,21.0,190,1.4,0.07,3.0,Ceasar DeJesus
10715,2019-20,21.5,69.0,0.9,4.0,-1.0,aac,2.5,8.8,49.0,1.1,0.387,133.0,0.338,45.0,0.301,40.0,0.7,28.0,31.0,7.0,6-0,634.0,-3.5,0.8,4.0,-0.6,42.0,7.1,isaiah-hill-1,131.0,151,Guard,2.9,31.0,tulsa,0.414,55.0,0.236,13.0,4.9,53.0,0.431,27.6,58.0,78.0,0.41,32.0,17.5,165,0.6,0.037,1.0,Isaiah Hill
10716,2018-19,30.6,124.0,0.5,4.0,-3.2,ovc,-1.4,10.1,86.0,1.1,0.469,338.0,0.411,139.0,0.411,139.0,0.719,100.0,29.0,19.0,6-0,916.0,-1.8,2.8,24.0,0.5,52.0,15.4,jr-clay-1,417.0,431,Guard,3.1,51.0,tennessee-tech,0.293,99.0,0.394,39.0,6.5,110.0,0.516,22.3,116.0,239.0,0.418,100.0,27.2,160,1.5,0.068,1.0,Jr. Clay
