# Schedule Games x Weeks

# Description

### Data

Games calendar for each week

### ETL

Content extracted from the site: https://www.nba.com/schedule?cal=all&pd=false
  
The HTML content was extracted manually and parsed in this notebook.  
The result is a JSON file.

### To Dos
- chart with top teams
- extract teams player data
- readable list-table of games

# Walk, don't run

In [1]:
import os
import sys
import pandas as pd
from lxml import html
from lxml import etree
import json 

In [2]:
# Identifier of this notebook; used as the prefix of the output file name
_notebookFolder = 'nb-001'
# Version tag embedded in the output file name
_version = '1.0'

### Load data

In [3]:
# Manually saved HTML body of https://www.nba.com/schedule?cal=all&pd=false
bodyHtmlPath = '../../data/universe/scrap-schedule/body.html'
with open(bodyHtmlPath, mode='r', encoding='utf-8') as bodyHtmlFile:
    htmlContent = bodyHtmlFile.read()

In [4]:
# Parse the raw HTML string into an lxml element tree
tree = html.fromstring(htmlContent)

In [5]:
# Weeks
# Direct children of the parsed root element
contentDiv = tree.getchildren()
# The children of the third top-level element are treated as the week elements.
# NOTE(review): index 2 is tied to the layout of the saved body.html — confirm if the page is re-scraped
weekEs = contentDiv[2].getchildren()
# Display how many week elements were found
len(weekEs)


12

---

### Utils

In [6]:
def extract_weekData( _weekE ):
    """Split a week element into its headline text and its day elements.

    Parameters
    ----------
    _weekE : element exposing ``cssselect`` and ``getchildren``
        Container element of one week.

    Returns
    -------
    dict
        ``'Es'``: the child elements of the week; when an ``h2`` headline is
        found the first child is skipped.
        ``'week_date'``: text of the first ``h2`` headline, or ``''`` when the
        week has no headline.
    """
    children = _weekE.getchildren()
    headlines = _weekE.cssselect('h2')

    # case: no headline text element -> every child is a day element
    if not headlines:
        return {
            'Es': children,
            'week_date': ''
        }

    # case: headline present -> the first child is skipped
    return {
        'Es': children[1:],
        'week_date': headlines[0].text_content()
    }


In [7]:
# Infos week day 
def extract_weekDayData( _weekDaysE ):
    """Extract the header texts and the content elements of one day element.

    Parameters
    ----------
    _weekDaysE : element exposing ``getchildren``
        Day element; its first child carries the ``h4``/``h6`` header texts
        and its second child wraps the content elements.

    Returns
    -------
    dict
        ``'calenderT'``: text of the first ``h4`` in the header child.
        ``'infoGamesT'``: text of the first ``h6`` in the header child.
        ``'contentEs'``: children of the second child, minus the first one.

    Raises
    ------
    IndexError
        If the day has fewer than two children or a header tag is missing.
    """
    dayChildren = _weekDaysE.getchildren()
    headerE = dayChildren[0]

    calendarText = headerE.cssselect('h4')[0].text_content()
    gamesInfoText = headerE.cssselect('h6')[0].text_content()

    # the first child of the content wrapper is skipped (hidden element)
    visibleContentEs = dayChildren[1].getchildren()[1:]

    return {
        'calenderT': calendarText,
        'infoGamesT': gamesInfoText,
        'contentEs': visibleContentEs,
    }

In [8]:
# from each weekDaysContentEs

def extract_gameTeamsData( _weekDaysContentE ):
    """Collect the team links of every game inside a day content element.

    Parameters
    ----------
    _weekDaysContentE : element exposing ``getchildren``
        Content element whose direct children are the game elements.

    Returns
    -------
    list[list[dict]]
        One inner list per game element, in document order. Each dict has
        ``'team_name'`` (link text) and ``'team_link_abs'`` (absolute URL).
        A game without team links yields an empty inner list.
    """
    gameData = []
    for gameE in _weekDaysContentE.getchildren():
        gameItems = []
        for link in gameE.cssselect('a.text-cerulean'):
            href = link.get('href')
            # Anchors without an href return None, which would make the
            # substring test raise TypeError; skip them like any non-team link.
            if not href or '/team/' not in href:
                continue
            gameItems.append({
                'team_name': link.text_content(),
                'team_link_abs': 'https://www.nba.com' + href,
            })
        gameData.append(gameItems)
    return gameData

---

### Main

In [15]:
## Usage summary example: walk week -> day -> content element -> first game
D = extract_weekData( weekEs[1] ) # Week 2
dE = D['Es'][0] # Day 1
E = extract_weekDayData( dE )
eE = E['contentEs'][0] # first content element of the day; its children are the games
F = extract_gameTeamsData( eE )
F[0] # teams of the first game

[{'team_name': 'Golden State Warriors',
  'team_link_abs': 'https://www.nba.com/team/1610612744/warriors/'},
 {'team_name': 'Brooklyn Nets',
  'team_link_abs': 'https://www.nba.com/team/1610612751/nets/'}]

In [10]:
# Build the full schedule: one entry per week, each holding its days,
# each day holding the list of games (a game is a list of team dicts).
mainData = []

for weekE in weekEs:

    weekData = extract_weekData( weekE )

    gamesData = []

    for dayE in weekData['Es']:

        dayData = extract_weekDayData( dayE )

        # Accumulate the games of every content element of the day.
        # Plain assignment here would keep only the last content element.
        gameTeamsData = []
        for contentE in dayData['contentEs']:
            gameTeamsData.extend( extract_gameTeamsData( contentE ) )

        gamesData.append({
            'day_date': dayData['calenderT'],
            'day_games': dayData['infoGamesT'],
            'day_teams': gameTeamsData
        })

    mainData.append({
        'week_date': weekData['week_date'],
        'week_games': gamesData
    })

mainData

[{'week_date': '',
  'week_games': [{'day_date': 'Friday, December 11',
    'day_games': ' 5 Games',
    'day_teams': [[{'team_name': 'Orlando Magic',
       'team_link_abs': 'https://www.nba.com/team/1610612753/magic/'},
      {'team_name': 'Atlanta Hawks',
       'team_link_abs': 'https://www.nba.com/team/1610612737/hawks/'}],
     [{'team_name': 'New York Knicks',
       'team_link_abs': 'https://www.nba.com/team/1610612752/knicks/'},
      {'team_name': 'Detroit Pistons',
       'team_link_abs': 'https://www.nba.com/team/1610612765/pistons/'}],
     [{'team_name': 'Houston Rockets',
       'team_link_abs': 'https://www.nba.com/team/1610612745/rockets/'},
      {'team_name': 'Chicago Bulls',
       'team_link_abs': 'https://www.nba.com/team/1610612741/bulls/'}],
     [{'team_name': 'LA Clippers',
       'team_link_abs': 'https://www.nba.com/team/1610612746/clippers/'},
      {'team_name': 'Los Angeles Lakers',
       'team_link_abs': 'https://www.nba.com/team/1610612747/lakers/'}],


### Save Data

In [11]:
# Output folder for the generated JSON file
productPath = '../../data/master/schedule/'

# exist_ok=True avoids the check-then-create race and keeps the cell re-runnable
os.makedirs( productPath, exist_ok=True )


In [12]:
# Output file name: <notebook folder>__games-calendar-<version>.json
productFileName = f'{_notebookFolder}__games-calendar-{_version}.json'

# os.path.join gives a valid path whether or not productPath ends with a separator
productFullPath = os.path.join( productPath, productFileName )

In [13]:
# Write the schedule as indented JSON; the encoding is explicit so the result
# does not depend on the platform default (the input file is read as utf-8 too)
with open( productFullPath, 'w', encoding='utf-8' ) as outfile:
    json.dump(mainData, outfile, indent=4)

# Docs

### LXML cssselect

- https://lxml.de/xpathxslt.html
- https://lxml.de/cssselect.html
- https://cssselect.readthedocs.io/en/latest/