In [None]:
import datetime
import time
import pytz
import pandas as pd

from itertools import chain
from datetime import timedelta, date, datetime, timezone
from basketball_reference_web_scraper import client

from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database

from secrets import *

In [None]:
# Name of the PostgreSQL database that receives the scraped tables.
dbname = 'cluj'

# Use the 'postgresql://' scheme: SQLAlchemy 1.4+ no longer accepts the legacy
# 'postgres://' alias and fails with NoSuchModuleError when given it.
# NOTE(review): the user/password pair is hard-coded here ('docker'/'docker');
# consider loading it from the environment or a local settings module instead.
engine = create_engine('postgresql://%s:%s@localhost/%s' % ('docker', 'docker', dbname))

# Create the database on first use, then print whether it now exists.
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))

In [None]:
#https://stackoverflow.com/questions/1060279/iterating-through-a-range-of-dates-in-python

def daterange(start_date, end_date):
    """Yield one value per whole day from ``start_date`` up to, but excluding, ``end_date``.

    The number of steps is the ``.days`` component of ``end_date - start_date``,
    so a trailing partial day is not counted. Nothing is yielded when the span
    is zero or negative. Each yielded value is ``start_date`` shifted by a whole
    number of days, so it keeps the type (and time of day, if any) of
    ``start_date``.
    """
    total_days = int((end_date - start_date).days)
    offset = 0
    while offset < total_days:
        yield start_date + timedelta(offset)
        offset += 1

### Get Schedules

In [None]:
# Fetch the schedule for the season ending in 2020 from the scraper client.
# Later cells index the result as a sequence of records with a 'start_time' key.
schedule = client.season_schedule(season_end_year=2020)

In [None]:
# Date window for the box-score download: from the first scheduled game's
# start time up to 24 hours before "now" in US/Central.
# NOTE(review): `end` is US/Central while `start` keeps whatever timezone the
# scraper attaches to start_time; if that is UTC, the calendar day of `start`
# may differ from the local game date — confirm.
start = schedule[0]['start_time']
end = datetime.now(tz=pytz.timezone('US/Central')) + timedelta(days=-1)
(start, end)

In [None]:
# Build a flat, SQL-friendly table from the 2020 schedule records.
schedule_df = (
    pd.DataFrame(schedule)
    # Replace the team objects with their `.name` so the columns hold plain values.
    .assign(
        away_team=lambda frame: frame.away_team.apply(lambda team: team.name),
        home_team=lambda frame: frame.home_team.apply(lambda team: team.name),
    )
    # Strip the timezone information from each start time.
    .assign(start_time=lambda frame: frame.start_time.apply(lambda ts: ts.tz_localize(None)))
    # Calendar date derived from the (now timezone-naive) start time.
    .assign(start_date=lambda frame: frame.start_time.apply(lambda ts: ts.date()))
    # Tag every row with the season it belongs to.
    .assign(season_end_year=2020)
)
schedule_df.head()

In [None]:
# Write the current schedule_df (the 2020 season when cells run in order) to the
# `nba_schedule` table. if_exists='replace' drops any existing table of that name
# first, so this cell resets the table before the per-year backfill appends to it.
schedule_df.to_sql('nba_schedule', con=engine, if_exists='replace', index=False)

In [None]:
# Backfill `nba_schedule` with the seasons ending 2001 through 2019, applying
# the same column clean-up as the 2020 cell and appending each season's rows.
# Note: this rebinds `schedule` and `schedule_df` on every iteration.
for year in range(2001, 2020):
    print(year)
    schedule = client.season_schedule(season_end_year=year)
    schedule_df = pd.DataFrame(schedule)
    # Replace the team objects with their `.name` so the columns hold plain values.
    schedule_df['away_team'] = schedule_df.away_team.apply(lambda x: x.name)
    schedule_df['home_team'] = schedule_df.home_team.apply(lambda x: x.name)
    # Strip timezone information, then derive the calendar date from the result.
    schedule_df['start_time'] = schedule_df.start_time.apply(lambda x: x.tz_localize(None))
    schedule_df['start_date'] = schedule_df.start_time.apply(lambda x: x.date())
    schedule_df['season_end_year'] = year
    schedule_df.to_sql('nba_schedule', con=engine, if_exists='append', index=False)
    # Pause between requests, matching the box-score download loop, so that
    # 19 back-to-back scrapes do not hit the site without any throttling.
    time.sleep(2)

### Get a bunch of boxscores

In [None]:
# I downloaded season_end_year 2019 and 2020. Does not include playoffs.

In [None]:
# Download player box scores one day at a time across [start, end).
# boxscores_list ends up as a list of per-day lists of box-score records.
boxscores_list = []
for day in daterange(start, end):
    print(day)
    boxscores = client.player_box_scores(year=day.year, month=day.month, day=day.day)
    # Stamp every record with the day it was requested for (YYYY-MM-DD).
    for item in boxscores:
        item["date"] = day.strftime('%Y-%m-%d')
    boxscores_list.append(boxscores)
    # Throttle: wait two seconds before the next request.
    time.sleep(2)

In [None]:
# Flatten the per-day lists into one table and reshape it for storage.
boxscores_df = (
    pd.DataFrame(list(chain.from_iterable(boxscores_list)))
    # Total rebounds = offensive + defensive.
    .assign(rebounds=lambda frame: frame.offensive_rebounds + frame.defensive_rebounds)
    # Shorten the shooting column names.
    .rename(columns={
        'attempted_field_goals': 'fga',
        'attempted_free_throws': 'fta',
        'made_three_point_field_goals': 'threes',
        'made_field_goals': 'fgm',
        'made_free_throws': 'ftm',
    })
    # Made two-pointers are the made field goals that were not threes.
    .assign(twos=lambda frame: frame.fgm - frame.threes)
    # Points are rebuilt from made shots: 3 per three, 2 per two, 1 per free throw.
    .assign(points=lambda frame: (frame.threes * 3) + (frame.twos * 2) + (frame.ftm * 1))
    # Drop the columns that are not stored, including the `twos` helper.
    .drop(columns=[
        'attempted_three_point_field_goals', 'defensive_rebounds', 'offensive_rebounds',
        'game_score', 'slug', 'turnovers', 'outcome', 'twos', 'personal_fouls', 'location',
    ])
    # Replace the team objects with their `.name` so the columns hold plain values.
    .assign(
        opponent=lambda frame: frame.opponent.apply(lambda club: club.name),
        team=lambda frame: frame.team.apply(lambda club: club.name),
    )
)
boxscores_df.columns.values

In [None]:
# Append the box-score rows to the `boxscores` table (created if it does not exist).
# Because if_exists='append' never clears the table, re-running this cell with the
# same boxscores_df inserts the same rows again.
boxscores_df.to_sql('boxscores', con=engine, if_exists='append', index=False)