In [1]:
import difflib
import os
import time
import uuid

import html5lib
import pandas as pd
import psycopg2
import requests
from bs4 import BeautifulSoup as soup
from sqlalchemy import create_engine
# Turn off pandas' chained-assignment check (no warning or error is emitted).
pd.set_option('mode.chained_assignment', None)

In [2]:
# Database URL. Read from the NBA_DB_URL environment variable when it is set so
# that real credentials do not have to be stored in the notebook; otherwise fall
# back to the local development database that was hard-coded here originally.
conn_string = os.environ.get(
    'NBA_DB_URL',
    'postgresql+psycopg2://postgres:password@localhost:5432/nba_disappointments',
)

# The first CSV column is discarded (presumably an index written by an earlier
# to_csv call — confirm against the file).
player_index = pd.read_csv('../updated_datasets/player_index.csv').iloc[:, 1:]

In [3]:
# Read the null-players CSV, discard its first column, and preview the top rows.
# Per the recorded output the remaining columns include New_name / Original_name.
null_players = pd.read_csv('../updated_datasets/null_players.csv').iloc[:, 1:]
null_players.head()

Unnamed: 0,New_name,Original_name
0,Darrell Armstrong,Darrel Armstrong
1,Predrag Stojakovic,Peja Stojakovic
2,PJ Brown,P.J. Brown
3,Jaren Jackson Jr,Jaren Jackson
4,AJ Guyton,A.J. Guyton


In [3]:
# Create the engine and a connection from conn_string; both stay bound to
# module-level names (db, conn) after this cell runs.
db = create_engine(conn_string)
conn = db.connect()

# Replace the player_index table with the frame's contents and report how long
# the upload took (wall-clock).
start_time = time.time()
player_index.to_sql(name='player_index', con=conn, if_exists='replace', index=False)
elapsed = time.time() - start_time
print("to_sql duration: {} seconds".format(elapsed))

to_sql duration: 0.07358002662658691 seconds


In [9]:
def load_to_sql(tables, directory, years=range(1996, 2022), db_url=None):
    """Load one CSV per year and upload each one to the database as its own table.

    For every year in ``years`` this reads
    ``../updated_datasets/{directory}/{tables}_{year}.csv``, drops the first
    column (presumably a saved index — confirm), and writes the remainder to a
    table named ``{tables}_{year}``, replacing any existing table of that name.

    Parameters
    ----------
    tables : str
        Prefix shared by the CSV file names and the SQL table names.
    directory : str
        Sub-directory of ``../updated_datasets`` that holds the CSV files.
    years : iterable of int, optional
        Years to load. Defaults to 1996 through 2021, the range that used to be
        hard-coded; pass a different range instead of editing the function.
    db_url : str, optional
        SQLAlchemy database URL. When omitted, the ``NBA_DB_URL`` environment
        variable is used, falling back to the local development database.

    Returns
    -------
    list of [str, str]
        One ``[table_name, 'to_sql duration: <seconds> seconds']`` pair per
        year, in iteration order.

    Notes
    -----
    Each loaded frame is also bound in this module's ``globals()`` under its
    table name, exactly as before, so existing cells that read those names keep
    working.
    """
    if db_url is None:
        db_url = os.environ.get(
            'NBA_DB_URL',
            'postgresql+psycopg2://postgres:password@localhost:5432/nba_disappointments',
        )

    table_list = []
    # One engine and one connection for the whole batch; previously a fresh
    # engine and connection were created per year and never closed.
    engine = create_engine(db_url)
    try:
        with engine.connect() as conn:
            for year in years:
                var_name = f'{tables}_{year}'

                # Load the CSV and drop its first column.
                frame = pd.read_csv(f'../updated_datasets/{directory}/{var_name}.csv')
                frame = frame.iloc[:, 1:]
                globals()[var_name] = frame

                # Upload and time only the to_sql call itself.
                start_time = time.time()
                frame.to_sql(var_name, con=conn, if_exists='replace', index=False)
                duration = time.time() - start_time

                table_list.append([var_name, f'to_sql duration: {duration} seconds'])
    finally:
        # Release pooled connections even if a read or upload fails part-way.
        engine.dispose()
    return table_list

In [14]:
# NOTE(review): the recorded output for this cell starts at draft_2000, whereas
# load_to_sql as written iterates 1996-2021 — the year range was presumably
# edited between runs; confirm before re-running.
load_to_sql(tables='draft', directory='draft_data')

[['draft_2000', 'to_sql duration: 0.02886223793029785 seconds'],
 ['draft_2001', 'to_sql duration: 0.02787303924560547 seconds'],
 ['draft_2002', 'to_sql duration: 0.026911020278930664 seconds'],
 ['draft_2003', 'to_sql duration: 0.028089046478271484 seconds'],
 ['draft_2004', 'to_sql duration: 0.02652573585510254 seconds'],
 ['draft_2005', 'to_sql duration: 0.030486106872558594 seconds'],
 ['draft_2006', 'to_sql duration: 0.027571916580200195 seconds'],
 ['draft_2007', 'to_sql duration: 0.027065515518188477 seconds'],
 ['draft_2008', 'to_sql duration: 0.027068614959716797 seconds'],
 ['draft_2009', 'to_sql duration: 0.02656841278076172 seconds'],
 ['draft_2010', 'to_sql duration: 0.030083656311035156 seconds'],
 ['draft_2011', 'to_sql duration: 0.026571273803710938 seconds'],
 ['draft_2012', 'to_sql duration: 0.02959132194519043 seconds'],
 ['draft_2013', 'to_sql duration: 0.02646350860595703 seconds'],
 ['draft_2014', 'to_sql duration: 0.02655816078186035 seconds'],
 ['draft_2015', '

In [11]:
# Upload the per-year stats CSVs found under ../updated_datasets/stats_data.
load_to_sql(tables='stats', directory='stats_data')

[['stats_1996', 'to_sql duration: 0.06541061401367188 seconds'],
 ['stats_1997', 'to_sql duration: 0.059632062911987305 seconds'],
 ['stats_1998', 'to_sql duration: 0.06091594696044922 seconds'],
 ['stats_1999', 'to_sql duration: 0.06942987442016602 seconds'],
 ['stats_2000', 'to_sql duration: 0.05966544151306152 seconds'],
 ['stats_2001', 'to_sql duration: 0.05402541160583496 seconds'],
 ['stats_2002', 'to_sql duration: 0.04854726791381836 seconds'],
 ['stats_2003', 'to_sql duration: 0.06349945068359375 seconds'],
 ['stats_2004', 'to_sql duration: 0.06050300598144531 seconds'],
 ['stats_2005', 'to_sql duration: 0.05754971504211426 seconds'],
 ['stats_2006', 'to_sql duration: 0.058439016342163086 seconds'],
 ['stats_2007', 'to_sql duration: 0.07002067565917969 seconds'],
 ['stats_2008', 'to_sql duration: 0.0717613697052002 seconds'],
 ['stats_2009', 'to_sql duration: 0.06267619132995605 seconds'],
 ['stats_2010', 'to_sql duration: 0.05638766288757324 seconds'],
 ['stats_2011', 'to_sql 

In [20]:
# NOTE(review): the recorded output for this cell starts at salaries_2000,
# whereas load_to_sql as written iterates 1996-2021 — the year range was
# presumably edited between runs; confirm before re-running.
load_to_sql(tables='salaries', directory='salaries_data/player_salaries')

[['salaries_2000', 'to_sql duration: 0.03471183776855469 seconds'],
 ['salaries_2001', 'to_sql duration: 0.03020477294921875 seconds'],
 ['salaries_2002', 'to_sql duration: 0.03020000457763672 seconds'],
 ['salaries_2003', 'to_sql duration: 0.03206348419189453 seconds'],
 ['salaries_2004', 'to_sql duration: 0.03282880783081055 seconds'],
 ['salaries_2005', 'to_sql duration: 0.031075477600097656 seconds'],
 ['salaries_2006', 'to_sql duration: 0.030283451080322266 seconds'],
 ['salaries_2007', 'to_sql duration: 0.03032231330871582 seconds'],
 ['salaries_2008', 'to_sql duration: 0.031914710998535156 seconds'],
 ['salaries_2009', 'to_sql duration: 0.04253888130187988 seconds'],
 ['salaries_2010', 'to_sql duration: 0.030081748962402344 seconds'],
 ['salaries_2011', 'to_sql duration: 0.030249595642089844 seconds'],
 ['salaries_2012', 'to_sql duration: 0.031432390213012695 seconds'],
 ['salaries_2013', 'to_sql duration: 0.029964685440063477 seconds'],
 ['salaries_2014', 'to_sql duration: 0.03

In [7]:
# NOTE(review): the recorded output for this cell starts at team_salaries_1990,
# whereas load_to_sql as written iterates 1996-2021 — the year range was
# presumably edited between runs; confirm before re-running.
load_to_sql(tables='team_salaries', directory='salaries_data/team_salaries')

[['team_salaries_1990', 'to_sql duration: 0.019031047821044922 seconds'],
 ['team_salaries_1991', 'to_sql duration: 0.01790595054626465 seconds'],
 ['team_salaries_1992', 'to_sql duration: 0.01920151710510254 seconds'],
 ['team_salaries_1993', 'to_sql duration: 0.01888298988342285 seconds'],
 ['team_salaries_1994', 'to_sql duration: 0.018036842346191406 seconds'],
 ['team_salaries_1995', 'to_sql duration: 0.0188748836517334 seconds'],
 ['team_salaries_1996', 'to_sql duration: 0.020041465759277344 seconds'],
 ['team_salaries_1997', 'to_sql duration: 0.017959117889404297 seconds'],
 ['team_salaries_1998', 'to_sql duration: 0.0201876163482666 seconds'],
 ['team_salaries_1999', 'to_sql duration: 0.024034976959228516 seconds'],
 ['team_salaries_2000', 'to_sql duration: 0.0393068790435791 seconds'],
 ['team_salaries_2001', 'to_sql duration: 0.03924274444580078 seconds'],
 ['team_salaries_2002', 'to_sql duration: 0.03959774971008301 seconds'],
 ['team_salaries_2003', 'to_sql duration: 0.03753

In [6]:
# NOTE(review): the recorded output for this cell starts at season_totals_1990,
# whereas load_to_sql as written iterates 1996-2021 — the year range was
# presumably edited between runs; confirm before re-running.
load_to_sql(tables='season_totals', directory='season_totals')

[['season_totals_1990', 'to_sql duration: 0.03609609603881836 seconds'],
 ['season_totals_1991', 'to_sql duration: 0.037308454513549805 seconds'],
 ['season_totals_1992', 'to_sql duration: 0.0381011962890625 seconds'],
 ['season_totals_1993', 'to_sql duration: 0.03850102424621582 seconds'],
 ['season_totals_1994', 'to_sql duration: 0.038559913635253906 seconds'],
 ['season_totals_1995', 'to_sql duration: 0.03890514373779297 seconds'],
 ['season_totals_1996', 'to_sql duration: 0.041047096252441406 seconds'],
 ['season_totals_1997', 'to_sql duration: 0.04162263870239258 seconds'],
 ['season_totals_1998', 'to_sql duration: 0.03949594497680664 seconds'],
 ['season_totals_1999', 'to_sql duration: 0.04056572914123535 seconds'],
 ['season_totals_2000', 'to_sql duration: 0.11652112007141113 seconds'],
 ['season_totals_2001', 'to_sql duration: 0.05563211441040039 seconds'],
 ['season_totals_2002', 'to_sql duration: 0.05555462837219238 seconds'],
 ['season_totals_2003', 'to_sql duration: 0.05530

In [5]:
def load_to_sql(tables, directory, db_url=None):
    """Load a single CSV and upload it to the database as one table.

    Reads ``../updated_datasets/{directory}_data/{tables}.csv``, drops the first
    column (presumably a saved index — confirm), and writes the remainder to a
    table named ``tables``, replacing any existing table of that name.

    NOTE: this definition has the same name as the year-based ``load_to_sql``
    defined earlier in the notebook and replaces it once this cell is run.

    Parameters
    ----------
    tables : str
        CSV file stem, also used as the SQL table name.
    directory : str
        Directory prefix; ``_data`` is appended to form the folder name.
    db_url : str, optional
        SQLAlchemy database URL. When omitted, the ``NBA_DB_URL`` environment
        variable is used, falling back to the local development database.

    Returns
    -------
    list of [str, str]
        A list holding one ``[table_name, 'to_sql duration: <seconds> seconds']``
        pair. The previous version repeated the identical read and upload 22
        times (its loop index was never used) and returned 22 such pairs.

    Notes
    -----
    The loaded frame is also bound in this module's ``globals()`` under the
    table name, as before.
    """
    if db_url is None:
        db_url = os.environ.get(
            'NBA_DB_URL',
            'postgresql+psycopg2://postgres:password@localhost:5432/nba_disappointments',
        )

    var_name = f'{tables}'

    # Load the CSV and drop its first column.
    frame = pd.read_csv(f'../updated_datasets/{directory}_data/{tables}.csv').iloc[:, 1:]
    globals()[var_name] = frame

    engine = create_engine(db_url)
    try:
        with engine.connect() as conn:
            # Time only the upload itself.
            start_time = time.time()
            frame.to_sql(var_name, con=conn, if_exists='replace', index=False)
            duration = time.time() - start_time
    finally:
        # Release pooled connections even if the upload fails.
        engine.dispose()

    return [[var_name, f'to_sql duration: {duration} seconds']]

In [16]:
def _award_table_name(words):
    """Turn the words of an award page header into a lower-case table name."""
    words = list(words)

    # Kept from the original: drop the first literal 'the' before the length
    # check, so short headers such as ['the', 'X'] still resolve to 'x'.
    if 'the' in words:
        words.remove('the')

    if len(words) <= 1:
        return ''.join(words).lower()

    # Keep only words whose first character is unchanged by upper-casing
    # (capitalised words, and words starting with a digit or symbol). Building
    # a new list avoids removing items from a list while iterating over it,
    # which silently skipped the word after each removal.
    kept = [word for word in words if word[0] == word[0].upper()]

    # Join at most the first two kept words; slicing means a header with fewer
    # than two such words no longer raises IndexError.
    return '_'.join(kept[:2]).lower().replace('-', '_')


def awards_data_names():
    """Scrape ESPN's NBA awards index and derive one table name per award page.

    Collects the ``href`` of every ``<a class="bi">`` on
    ``http://www.espn.com/nba/history/awards``, skips the ``/_/id/34`` page,
    fetches each remaining page, and takes the text after ``'- '`` in its first
    ``<h2>``. The first two capitalised words of that text, lower-cased and
    joined with ``_`` (hyphens also become ``_``), form the name; a one-word
    header is simply lower-cased.

    Returns
    -------
    list of str
        Table names in the order the links appear on the index page. An entry
        can be an empty string if a header yields no usable words.

    Raises
    ------
    IndexError
        If an award page contains no ``<h2>`` element.
    requests.exceptions.RequestException
        If a page cannot be fetched (including after the 30 second timeout).
    """
    # This page is deliberately skipped; the reason is not recorded in the notebook.
    excluded_link = '//www.espn.com/nba/history/awards/_/id/34'

    index_html = requests.get('http://www.espn.com/nba/history/awards', timeout=30).text
    index_page = soup(index_html, 'html.parser')
    links = [
        anchor['href']
        for anchor in index_page.find_all('a', class_='bi', href=True)
        if anchor['href'] != excluded_link
    ]

    table_names = []
    for link in links:
        # Links are protocol-relative ('//www.espn.com/...'), hence the 'http:' prefix.
        page = soup(requests.get(f'http:{link}', timeout=30).text, 'html.parser')

        # Text of the first <h2>, keeping only what follows the '- ' separator.
        header_text = page.select('h2')[0].get_text()
        words = header_text.partition('- ')[-1].split()

        table_names.append(_award_table_name(words))
    return table_names

In [17]:
# Upload every scraped award table. This relies on the single-file variant of
# load_to_sql being the one currently defined, which reads
# ../updated_datasets/awards_data/<table_name>.csv.
for table_name in awards_data_names():
    load_to_sql(table_name, 'awards')