In [1]:
import requests
from bs4 import BeautifulSoup as soup
import uuid
import difflib
import html5lib
import time
import pandas as pd
from sqlalchemy import create_engine
import psycopg2
pd.options.mode.chained_assignment = None 

In [2]:
# SQLAlchemy URL of the local PostgreSQL database that receives the tables.
conn_string = 'postgresql+psycopg2://postgres:password@localhost:5432/nba_disappointments'

# Read the player index and discard its first column
# (presumably an index column written when the CSV was saved -- confirm).
player_index = pd.read_csv('../updated_datasets/player_index.csv').iloc[:, 1:]

In [3]:
# Read the null_players CSV, discard its first column, and preview the result.
null_players = pd.read_csv('../updated_datasets/null_players.csv').iloc[:, 1:]
null_players.head()

Unnamed: 0,New_name,Original_name
0,Darrell Armstrong,Darrel Armstrong
1,Predrag Stojakovic,Peja Stojakovic
2,PJ Brown,P.J. Brown
3,Jaren Jackson Jr,Jaren Jackson
4,AJ Guyton,A.J. Guyton


In [4]:
# Open a connection to the database described by conn_string.
db = create_engine(conn_string)
conn = db.connect()

# Replace the player_index table with the DataFrame's contents (the DataFrame
# index is not written) and report how long the upload took.
start_time = time.time()
player_index.to_sql(name='player_index', con=conn, index=False, if_exists='replace')
elapsed = time.time() - start_time
print("to_sql duration: {} seconds".format(elapsed))

to_sql duration: 0.07399344444274902 seconds


In [5]:
def load_to_sql(tables, directory, start_year=2000, n_years=22):
    """Load one CSV per season and upload each as its own PostgreSQL table.

    For every year in ``start_year .. start_year + n_years - 1`` the file
    ``../updated_datasets/{directory}_data/{tables}_{year}.csv`` is read, its
    first column is dropped, and the remaining columns are written to a table
    named ``{tables}_{year}``, replacing any existing table of that name.

    Each DataFrame is also bound to a module-level global named after its
    table (e.g. ``draft_2000``), so it stays available to other cells.

    Parameters
    ----------
    tables : str
        File/table name prefix, e.g. ``'draft'``.
    directory : str
        Prefix of the ``*_data`` folder under ``../updated_datasets``.
    start_year : int, optional
        First season to load (default 2000).
    n_years : int, optional
        Number of consecutive seasons to load (default 22).

    Returns
    -------
    list of [str, str]
        One ``[table_name, 'to_sql duration: ... seconds']`` pair per table,
        in season order.
    """
    table_list = []
    # Build the engine once instead of once per season; the original created a
    # fresh engine and connection on every iteration and never released them.
    # NOTE(review): credentials are hard-coded in this URL.
    db = create_engine('postgresql+psycopg2://postgres:password@localhost:5432/nba_disappointments')
    try:
        for year in range(start_year, start_year + n_years):
            var_name = f'{tables}_{year}'

            # load in the csv and drop its first column
            frame = pd.read_csv(f'../updated_datasets/{directory}_data/{var_name}.csv')
            frame = frame[frame.columns[1:]]
            # keep the frame reachable as a global, as the original did
            globals()[var_name] = frame

            # upload the csv to sql; the connection is closed when the block exits
            with db.connect() as conn:
                start_time = time.time()
                frame.to_sql(var_name, con=conn, if_exists='replace', index=False)
                duration = time.time() - start_time

            # record the table name and its upload duration
            table_list.append([var_name, f'to_sql duration: {duration} seconds'])
    finally:
        # release pooled connections even if a file is missing or an upload fails
        db.dispose()
    return table_list

In [7]:
# Upload the per-season draft CSVs (draft_data/draft_2000.csv .. draft_2021.csv);
# the displayed result lists each table name with its upload duration.
load_to_sql('draft', 'draft')

[['draft_2000', 'to_sql duration: 0.02873516082763672 seconds'],
 ['draft_2001', 'to_sql duration: 0.029552459716796875 seconds'],
 ['draft_2002', 'to_sql duration: 0.027306795120239258 seconds'],
 ['draft_2003', 'to_sql duration: 0.02992391586303711 seconds'],
 ['draft_2004', 'to_sql duration: 0.02907538414001465 seconds'],
 ['draft_2005', 'to_sql duration: 0.02894878387451172 seconds'],
 ['draft_2006', 'to_sql duration: 0.03666496276855469 seconds'],
 ['draft_2007', 'to_sql duration: 0.028580904006958008 seconds'],
 ['draft_2008', 'to_sql duration: 0.03030705451965332 seconds'],
 ['draft_2009', 'to_sql duration: 0.03548717498779297 seconds'],
 ['draft_2010', 'to_sql duration: 0.028983116149902344 seconds'],
 ['draft_2011', 'to_sql duration: 0.03464508056640625 seconds'],
 ['draft_2012', 'to_sql duration: 0.02904510498046875 seconds'],
 ['draft_2013', 'to_sql duration: 0.029989242553710938 seconds'],
 ['draft_2014', 'to_sql duration: 0.030260801315307617 seconds'],
 ['draft_2015', 'to

In [8]:
# Upload the per-season stats CSVs (stats_data/stats_2000.csv .. stats_2021.csv);
# the displayed result lists each table name with its upload duration.
load_to_sql('stats', 'stats')

[['stats_2000', 'to_sql duration: 0.04096198081970215 seconds'],
 ['stats_2001', 'to_sql duration: 0.04121279716491699 seconds'],
 ['stats_2002', 'to_sql duration: 0.048497676849365234 seconds'],
 ['stats_2003', 'to_sql duration: 0.045926809310913086 seconds'],
 ['stats_2004', 'to_sql duration: 0.04211592674255371 seconds'],
 ['stats_2005', 'to_sql duration: 0.044976234436035156 seconds'],
 ['stats_2006', 'to_sql duration: 0.047498464584350586 seconds'],
 ['stats_2007', 'to_sql duration: 0.044419050216674805 seconds'],
 ['stats_2008', 'to_sql duration: 0.05034327507019043 seconds'],
 ['stats_2009', 'to_sql duration: 0.05192065238952637 seconds'],
 ['stats_2010', 'to_sql duration: 0.0501101016998291 seconds'],
 ['stats_2011', 'to_sql duration: 0.05045485496520996 seconds'],
 ['stats_2012', 'to_sql duration: 0.04442095756530762 seconds'],
 ['stats_2013', 'to_sql duration: 0.04623126983642578 seconds'],
 ['stats_2014', 'to_sql duration: 0.04549574851989746 seconds'],
 ['stats_2015', 'to_s

In [9]:
# Upload the per-season salary CSVs (salaries_data/salaries_2000.csv ..
# salaries_2021.csv); the displayed result lists each table name with its
# upload duration.
load_to_sql('salaries', 'salaries')

[['salaries_2000', 'to_sql duration: 0.03685331344604492 seconds'],
 ['salaries_2001', 'to_sql duration: 0.0352628231048584 seconds'],
 ['salaries_2002', 'to_sql duration: 0.0345001220703125 seconds'],
 ['salaries_2003', 'to_sql duration: 0.043257951736450195 seconds'],
 ['salaries_2004', 'to_sql duration: 0.03705310821533203 seconds'],
 ['salaries_2005', 'to_sql duration: 0.041600942611694336 seconds'],
 ['salaries_2006', 'to_sql duration: 0.04411196708679199 seconds'],
 ['salaries_2007', 'to_sql duration: 0.03722691535949707 seconds'],
 ['salaries_2008', 'to_sql duration: 0.046050310134887695 seconds'],
 ['salaries_2009', 'to_sql duration: 0.04205775260925293 seconds'],
 ['salaries_2010', 'to_sql duration: 0.04005551338195801 seconds'],
 ['salaries_2011', 'to_sql duration: 0.04189157485961914 seconds'],
 ['salaries_2012', 'to_sql duration: 0.03936505317687988 seconds'],
 ['salaries_2013', 'to_sql duration: 0.03506278991699219 seconds'],
 ['salaries_2014', 'to_sql duration: 0.03731727

In [15]:
def load_to_sql(tables, directory):
    """Load a single CSV and upload it as a PostgreSQL table of the same name.

    Reads ``../updated_datasets/{directory}_data/{tables}.csv``, drops its
    first column, and writes the remaining columns to a table named
    ``tables``, replacing any existing table of that name. The DataFrame is
    also bound to a module-level global named ``tables``.

    NOTE: this redefines the per-season ``load_to_sql`` declared earlier in
    the notebook; cells that need the per-season behaviour must run before
    this cell.

    Parameters
    ----------
    tables : str
        CSV file stem, also used as the SQL table name.
    directory : str
        Prefix of the ``*_data`` folder under ``../updated_datasets``.

    Returns
    -------
    list of [str, str]
        A one-element list holding ``[table_name, 'to_sql duration: ... seconds']``.
        The original looped 22 times over the same file (a leftover from the
        per-season loader), re-uploading the identical table and returning 22
        duplicate entries; the work is now done once.
    """
    var_name = str(tables)

    # load in the csv and drop its first column
    frame = pd.read_csv(f'../updated_datasets/{directory}_data/{tables}.csv')
    frame = frame[frame.columns[1:]]
    # keep the frame reachable as a global, as the original did
    globals()[var_name] = frame

    # NOTE(review): credentials are hard-coded in this URL.
    db = create_engine('postgresql+psycopg2://postgres:password@localhost:5432/nba_disappointments')
    try:
        # upload the csv to sql; the connection is closed when the block exits
        with db.connect() as conn:
            start_time = time.time()
            frame.to_sql(var_name, con=conn, if_exists='replace', index=False)
            duration = time.time() - start_time
    finally:
        # release pooled connections even if the upload fails
        db.dispose()

    # output the table name and upload duration
    return [[var_name, f'to_sql duration: {duration} seconds']]

In [16]:
def awards_data_names():
    """Scrape ESPN's NBA awards index and derive one table name per award page.

    Every ``<a class="bi">`` link on the index page is followed (except the
    one explicitly excluded below), the first ``<h2>`` of the linked page is
    read, and the text after ``'- '`` in that header is converted to a table
    name by ``_award_table_name``.

    Returns
    -------
    list of str
        Table names in the order the links appear on the index page.

    Raises
    ------
    requests.RequestException
        If a page cannot be fetched (network error, timeout, HTTP error).
    ValueError
        If an award page has no ``<h2>`` header.
    """
    index_url = 'http://www.espn.com/nba/history/awards'
    # Skipped by the original author; the reason is not recorded in this notebook.
    excluded_link = '//www.espn.com/nba/history/awards/_/id/34'

    main = soup(_fetch_html(index_url), 'html.parser')
    # Filtering (rather than list.remove) tolerates the excluded link being absent.
    links = [
        a['href']
        for a in main.find_all('a', class_='bi', href=True)
        if a['href'] != excluded_link
    ]

    table_names = []
    for link in links:
        # grab the header of the website page; the parser is named explicitly
        # so every page is parsed the same way as the index
        page = soup(_fetch_html(f'http:{link}'), 'html.parser')
        header = page.select_one('h2')
        if header is None:
            raise ValueError(f'no <h2> header found on http:{link}')
        table_names.append(_award_table_name(header.get_text()))
    return table_names


def _fetch_html(url, timeout=30):
    """Return the body of ``url`` as text, raising on timeouts and HTTP errors."""
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


def _award_table_name(header_text):
    """Build a lower-case table name from the part of a header after ``'- '``.

    With two or more words, words whose first character is lower-case are
    dropped and the first two survivors are joined with ``'_'`` (hyphens become
    underscores). With fewer than two words, before or after that filtering,
    whatever remains is concatenated and lower-cased.
    """
    words = header_text.partition('- ')[-1].split()
    if len(words) > 1:
        # Build a new list instead of removing while iterating: the original
        # skipped the word after each removal (hence its special case for
        # 'the') and could then index past the end of the shortened list.
        words = [word for word in words if word[0] == word[0].upper()]
        if len(words) > 1:
            return (words[0] + '_' + words[1]).lower().replace('-', '_')
    return ''.join(words).lower()

In [17]:
# Upload one table per scraped award name from the awards_data folder.
for table_name in awards_data_names():
    load_to_sql(table_name, 'awards')