## Goal:
Store NCAA player career stats in a SQL table.

Import the packages needed to read and transform the data and to load it into our database.

In [5]:
import pandas as pd
import numpy as np
import getpass
import psycopg2
from psycopg2.extensions import adapt, register_adapter, AsIs
pd.set_option('display.max_columns', 500)

Read in NCAA player career statistics dataset

In [6]:
# Location of the raw career-stats export, relative to this notebook.
career_stats_csv = '../Data/ncaa_player_career_stats.csv'
player_career_stats = pd.read_csv(career_stats_csv)

In [7]:
# (rows, columns) of the frame as loaded from the CSV.
player_career_stats.shape

(91480, 53)

In [8]:
# Preview the first rows to check how the columns were parsed.
player_career_stats.head()

Unnamed: 0,assist_percentage,assists,block_percentage,blocks,box_plus_minus,conference,defensive_box_plus_minus,defensive_rebound_percentage,defensive_rebounds,defensive_win_shares,effective_field_goal_percentage,field_goal_attempts,field_goal_percentage,field_goals,free_throw_attempt_rate,free_throw_attempts,free_throw_percentage,free_throws,games_played,games_started,height,minutes_played,offensive_box_plus_minus,offensive_rebound_percentage,offensive_rebounds,offensive_win_shares,personal_fouls,player_efficiency_rating,player_id,player_name,points,points_produced,position,season,steal_percentage,steals,team_abbreviation,three_point_attempt_rate,three_point_attempts,three_point_percentage,three_pointers,total_rebound_percentage,total_rebounds,true_shooting_percentage,turnover_percentage,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,weight,win_shares,win_shares_per_40_minutes
0,4.9,20.0,0.6,7.0,,atlantic-sun,,,79.0,0.3,0.573,245.0,0.478,117.0,0.269,66.0,0.697,46.0,29.0,29.0,6-4,845.0,,,33.0,1.9,35.0,,antonio-cool-1,Antonio Cool,327.0,,Forward,2005.0,,24.0,jacksonville,0.473,116.0,0.405,47.0,7.3,112.0,0.592,17.3,58.0,129.0,0.543,70.0,19.0,190.0,2.2,0.104
1,,41.0,,5.0,,atlantic-sun,,,75.0,-0.1,0.537,349.0,0.458,160.0,0.155,54.0,0.741,40.0,27.0,27.0,6-4,832.0,,,23.0,1.6,35.0,,antonio-cool-1,Antonio Cool,415.0,,Forward,2006.0,,24.0,jacksonville,0.453,158.0,0.348,55.0,,98.0,0.554,9.0,37.0,191.0,0.55,105.0,,190.0,1.5,0.072
2,4.3,2.0,2.5,2.0,-5.3,atlantic-10,-3.5,7.8,6.0,0.0,0.404,26.0,0.269,7.0,0.154,4.0,0.75,3.0,15.0,0.0,6-6,87.0,-1.8,5.7,5.0,0.0,19.0,4.8,jake-fay-1,Jake Fay,24.0,23.0,Guard,2014.0,1.3,2.0,hartford,0.808,21.0,0.333,7.0,6.7,11.0,0.43,15.2,5.0,5.0,0.0,0.0,18.1,195.0,0.0,0.003
3,7.9,1.0,0.0,0.0,-24.5,america-east,-5.4,5.2,1.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,,0.0,10.0,0.0,6-6,22.0,-19.1,5.1,1.0,-0.2,0.0,-16.1,jake-fay-1,Jake Fay,0.0,1.0,Guard,2016.0,2.6,1.0,hartford,0.667,6.0,0.0,0.0,5.1,2.0,0.0,18.2,2.0,3.0,0.0,0.0,26.0,195.0,-0.2,-0.365
4,0.0,0.0,6.7,1.0,-20.2,america-east,-1.6,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,,0.0,8.0,0.0,6-6,15.0,-18.7,7.3,1.0,-0.1,0.0,-11.8,jake-fay-1,Jake Fay,0.0,1.0,Guard,2017.0,3.8,1.0,hartford,0.667,4.0,0.0,0.0,3.7,1.0,0.0,14.3,1.0,2.0,0.0,0.0,23.6,195.0,-0.1,-0.308


Add a column that numbers each player's seasons in chronological order (1 = the player's first season in the data).

In [9]:
# Order the rows by season, then number each player's rows 0, 1, 2, ... in
# that order. Assigning the result back aligns on the frame's index, so the
# frame itself keeps its original row order.
rows_by_season = player_career_stats.sort_values(by='season')
season_position = rows_by_season.groupby('player_id').cumcount()
player_career_stats['player_season_number'] = season_position + 1

Fill missing heights with a placeholder value (`'0-0'`) so every height can be parsed. We will replace this placeholder later.

In [10]:
# Give missing heights the placeholder '0-0' so every value can be split on '-'.
# Assign the result back to the column explicitly: an in-place fillna on the
# Series returned by `player_career_stats.height` is a chained operation that
# pandas does not guarantee will modify the parent frame (and does not under
# Copy-on-Write).
player_career_stats['height'] = player_career_stats['height'].fillna('0-0')

Loop through player heights to transform from 'Feet-inches' format to the individual's height in just inches.

In [11]:
def _feet_inches_to_inches(height_text):
    """Convert a 'feet-inches' string such as '6-4' into total inches (76)."""
    parts = height_text.split('-')
    return int(parts[0]) * 12 + int(parts[1])


# One converted height per row, in the frame's row order.
player_heights = [_feet_inches_to_inches(raw) for raw in player_career_stats.height]

In [12]:
# Overwrite the 'feet-inches' strings with the heights in inches.
player_career_stats['height'] = player_heights

Replace the placeholder height (0 inches) with a missing value so that it can be stored as `NULL`.

In [13]:
# Turn the placeholder height (0 inches, produced from '0-0') back into a
# missing value.
# - np.nan is used instead of None: in older pandas, `replace(0, None)` means
#   "forward-fill", which would silently copy the previous row's height into
#   the players whose height is unknown.
# - The result is assigned back to the column rather than using inplace=True on
#   the attribute-accessed Series, which is not guaranteed to update the frame.
# NaN must still be converted to None before it is sent to the database if a
# SQL NULL is wanted.
player_career_stats['height'] = player_career_stats['height'].replace(0, np.nan)

Divide these percentage columns by 100 so the numbers are between 0 and 1.

In [14]:
# Columns that arrive on a 0-100 scale; rescale them all in one step.
# Note: re-running this cell divides by 100 again.
percentage_columns = [
    'assist_percentage',
    'block_percentage',
    'defensive_rebound_percentage',
    'offensive_rebound_percentage',
    'steal_percentage',
    'total_rebound_percentage',
    'turnover_percentage',
    'usage_percentage',
]
player_career_stats[percentage_columns] = player_career_stats[percentage_columns] / 100

In [15]:
# Preview the frame again to check the transformed height and percentage columns.
player_career_stats.head()

Unnamed: 0,assist_percentage,assists,block_percentage,blocks,box_plus_minus,conference,defensive_box_plus_minus,defensive_rebound_percentage,defensive_rebounds,defensive_win_shares,effective_field_goal_percentage,field_goal_attempts,field_goal_percentage,field_goals,free_throw_attempt_rate,free_throw_attempts,free_throw_percentage,free_throws,games_played,games_started,height,minutes_played,offensive_box_plus_minus,offensive_rebound_percentage,offensive_rebounds,offensive_win_shares,personal_fouls,player_efficiency_rating,player_id,player_name,points,points_produced,position,season,steal_percentage,steals,team_abbreviation,three_point_attempt_rate,three_point_attempts,three_point_percentage,three_pointers,total_rebound_percentage,total_rebounds,true_shooting_percentage,turnover_percentage,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,weight,win_shares,win_shares_per_40_minutes,player_season_number
0,0.049,20.0,0.006,7.0,,atlantic-sun,,,79.0,0.3,0.573,245.0,0.478,117.0,0.269,66.0,0.697,46.0,29.0,29.0,76,845.0,,,33.0,1.9,35.0,,antonio-cool-1,Antonio Cool,327.0,,Forward,2005.0,,24.0,jacksonville,0.473,116.0,0.405,47.0,0.073,112.0,0.592,0.173,58.0,129.0,0.543,70.0,0.19,190.0,2.2,0.104,1
1,,41.0,,5.0,,atlantic-sun,,,75.0,-0.1,0.537,349.0,0.458,160.0,0.155,54.0,0.741,40.0,27.0,27.0,76,832.0,,,23.0,1.6,35.0,,antonio-cool-1,Antonio Cool,415.0,,Forward,2006.0,,24.0,jacksonville,0.453,158.0,0.348,55.0,,98.0,0.554,0.09,37.0,191.0,0.55,105.0,,190.0,1.5,0.072,2
2,0.043,2.0,0.025,2.0,-5.3,atlantic-10,-3.5,0.078,6.0,0.0,0.404,26.0,0.269,7.0,0.154,4.0,0.75,3.0,15.0,0.0,78,87.0,-1.8,0.057,5.0,0.0,19.0,4.8,jake-fay-1,Jake Fay,24.0,23.0,Guard,2014.0,0.013,2.0,hartford,0.808,21.0,0.333,7.0,0.067,11.0,0.43,0.152,5.0,5.0,0.0,0.0,0.181,195.0,0.0,0.003,1
3,0.079,1.0,0.0,0.0,-24.5,america-east,-5.4,0.052,1.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,,0.0,10.0,0.0,78,22.0,-19.1,0.051,1.0,-0.2,0.0,-16.1,jake-fay-1,Jake Fay,0.0,1.0,Guard,2016.0,0.026,1.0,hartford,0.667,6.0,0.0,0.0,0.051,2.0,0.0,0.182,2.0,3.0,0.0,0.0,0.26,195.0,-0.2,-0.365,2
4,0.0,0.0,0.067,1.0,-20.2,america-east,-1.6,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,,0.0,8.0,0.0,78,15.0,-18.7,0.073,1.0,-0.1,0.0,-11.8,jake-fay-1,Jake Fay,0.0,1.0,Guard,2017.0,0.038,1.0,hartford,0.667,4.0,0.0,0.0,0.037,1.0,0.0,0.143,1.0,2.0,0.0,0.0,0.236,195.0,-0.1,-0.308,3


Connect to database

In [30]:
# Prompt for the password interactively so it is never stored in the notebook.
mypasswd = getpass.getpass()

connection_settings = {
    'database': 'cs20_group4',
    'user': 'mwkmr',
    'host': 'pgsql.dsa.lan',
    'password': mypasswd,
}
conn = psycopg2.connect(**connection_settings)

········


In [31]:
# Cursor on the open connection, used to run SQL statements.
cursor = conn.cursor()

Create table to which we will write our data

In [32]:
# SQL script that rebuilds the target table from scratch: any existing
# ncaa_player_career_stats table is dropped first, so re-running the notebook
# replaces the previous load. Text fields are varchar; every statistic
# (including height, season and player_season_number) is stored as numeric.
# The column order here is the order the insert step is expected to follow.
create_table = """
DROP TABLE IF EXISTS ncaa_player_career_stats;
CREATE TABLE ncaa_player_career_stats (
assist_percentage numeric
, assists numeric
, block_percentage numeric
, blocks numeric
, box_plus_minus numeric
, conference varchar(100)
, defensive_box_plus_minus numeric
, defensive_rebound_percentage numeric
, defensive_rebounds numeric
, defensive_win_shares numeric
, effective_field_goal_percentage numeric
, field_goal_attempts numeric
, field_goal_percentage numeric
, field_goals numeric
, free_throw_attempt_rate numeric
, free_throw_attempts numeric
, free_throw_percentage numeric
, free_throws numeric
, games_played numeric
, games_started numeric
, height numeric
, minutes_played numeric
, offensive_box_plus_minus numeric
, offensive_rebound_percentage numeric
, offensive_rebounds numeric
, offensive_win_shares numeric
, personal_fouls numeric
, player_efficiency_rating numeric
, player_id varchar(100)
, player_name varchar(100)
, points numeric
, points_produced numeric
, position varchar(50)
, season numeric
, steal_percentage numeric
, steals numeric
, team_abbreviation varchar(100)
, three_point_attempt_rate numeric
, three_point_attempts numeric
, three_point_percentage numeric
, three_pointers numeric
, total_rebound_percentage numeric
, total_rebounds numeric
, true_shooting_percentage numeric
, turnover_percentage numeric
, turnovers numeric
, two_point_attempts numeric
, two_point_percentage numeric
, two_pointers numeric
, usage_percentage numeric
, weight numeric
, win_shares numeric
, win_shares_per_40_minutes numeric
, player_season_number numeric
)
"""

Execute create table statement

In [33]:
# Send the DROP TABLE / CREATE TABLE script to the server.
cursor.execute(create_table)

In [34]:
# Commit the open transaction on this connection.
conn.commit()

Insert data into the table we just created

In [35]:
# Columns to load, in the order of the ncaa_player_career_stats table
# definition. The frame is selected by these names, so each value lines up with
# its database column regardless of the frame's own column order (and a missing
# column fails loudly with a KeyError instead of shifting values).
INSERT_COLUMNS = [
    'assist_percentage',
    'assists',
    'block_percentage',
    'blocks',
    'box_plus_minus',
    'conference',
    'defensive_box_plus_minus',
    'defensive_rebound_percentage',
    'defensive_rebounds',
    'defensive_win_shares',
    'effective_field_goal_percentage',
    'field_goal_attempts',
    'field_goal_percentage',
    'field_goals',
    'free_throw_attempt_rate',
    'free_throw_attempts',
    'free_throw_percentage',
    'free_throws',
    'games_played',
    'games_started',
    'height',
    'minutes_played',
    'offensive_box_plus_minus',
    'offensive_rebound_percentage',
    'offensive_rebounds',
    'offensive_win_shares',
    'personal_fouls',
    'player_efficiency_rating',
    'player_id',
    'player_name',
    'points',
    'points_produced',
    'position',
    'season',
    'steal_percentage',
    'steals',
    'team_abbreviation',
    'three_point_attempt_rate',
    'three_point_attempts',
    'three_point_percentage',
    'three_pointers',
    'total_rebound_percentage',
    'total_rebounds',
    'true_shooting_percentage',
    'turnover_percentage',
    'turnovers',
    'two_point_attempts',
    'two_point_percentage',
    'two_pointers',
    'usage_percentage',
    'weight',
    'win_shares',
    'win_shares_per_40_minutes',
    'player_season_number',
]

# psycopg2 sends a float NaN as the numeric value 'NaN', not as NULL, so every
# missing value has to become None before it is inserted. The frame is cast to
# object first because a float column cannot hold None (pandas would coerce it
# straight back to NaN).
stats_to_load = player_career_stats[INSERT_COLUMNS]
pstats = stats_to_load.astype(object).where(stats_to_load.notna(), None)

# One %s placeholder per column; psycopg2 fills them in and handles quoting.
INSERT_SQL = 'INSERT INTO ncaa_player_career_stats ({columns}) VALUES ({placeholders})'.format(
    columns=', '.join(INSERT_COLUMNS),
    placeholders=', '.join(['%s'] * len(INSERT_COLUMNS)),
)

# `with conn` commits when the block succeeds and rolls back if an insert
# raises, so the table is never left partially loaded.
with conn, conn.cursor() as cursor:
    # Iterate the NULL-ready frame (pstats), not the raw frame with NaNs.
    for row in pstats.itertuples(index=False, name=None):
        cursor.execute(INSERT_SQL, row)

Read back row count to verify that data was inserted to database properly.

In [36]:
# Count the rows now in the table; the number should match the row count of
# the frame that was loaded.
row_count_query = """
SELECT
    COUNT(*)
FROM ncaa_player_career_stats
"""
test = pd.read_sql_query(row_count_query, con=conn)

print(test.shape)
test.head()

(1, 1)


Unnamed: 0,count
0,91480
