<a href="https://colab.research.google.com/github/danielmakcy/Python-Project/blob/main/Python_Web_scraping_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

base_url = 'https://www.basketball-reference.com/'

# Retrieve all team in Eastern and Western Conference

In [2]:
def get_NBA_teams(url):

  response = requests.get(url)
  # Parse the HTML content with Beautiful Soup
  soup = BeautifulSoup(response.content, 'html.parser')

  def get_teams (conf):

    tablename = ''
    if conf == 'East':
      tablename = 'confs_standings_E'
    else:
      tablename = 'confs_standings_W'

    table = soup.find('table', {'id': tablename})

    header_data = ['Conference', 'Team Name', 'URL']

    data = []
    for bd in table.find_all('tbody'):
      for row in bd.find_all('th'):
        row_data=[]
        teamname = row.text
        urls = [node.get('href') for node in row.find_all("a")]
        row_data.append(conf)
        row_data.append(teamname)
        row_data.append(urls[0])
        data.append(row_data)

    df = pd.DataFrame(data, columns = header_data)
    return df

  df1 = get_teams("East")
  df2 = get_teams("West")
  df = pd.concat([df1, df2])
  return df.reset_index(drop=True)

teams = get_NBA_teams('https://www.basketball-reference.com/leagues/NBA_2024.html')

AttributeError: 'NoneType' object has no attribute 'find_all'

In [None]:
teams

# Retrieve all players in a team

In [None]:
header_data = ('No', 'Player', 'Pos', 'Ht', 'Wt', 'Birth Date', 'Country', 'Exp', 'College', 'URL')
players = pd.DataFrame((), columns = header_data)

def get_team_players (url):
  response = requests.get(url)

# Parse the HTML content with Beautiful Soup
  soup = BeautifulSoup(response.content, 'html.parser')
  table = soup.find('table', {'id': 'roster'})

  # header_data = []
  # for header in table.find_all('th'):
  #   if len(header.text) != 0:
  #     header_data.append(header.text)

  # header_data.append('URL')
  # print(header_data)

  data = []
  for row in table.find_all('tr'):
    row_data = []
    url=[]
    for cell in row.find_all('td'):
      row_data.append(cell.text)
      for link in cell.find_all('a'):
        url.append(link.get('href'))

    if len(row_data) > 0:
      row_data.insert(0, np.nan)
      row_data.append(url[0])
      data.append(row_data)

  
  df = pd.DataFrame(data, columns = header_data)
  return df

for index, team in teams.iterrows():
  url = base_url + team.URL
  print(url)
  player = get_team_players (url)
  print(player)
  players = pd.concat((players, player), ignore_index=True)


In [None]:
players

# Data Clean up - Player

In [None]:
from xml.etree.ElementTree import XMLPullParser

# Updated Exp
players["Exp"] = players["Exp"].apply(lambda x: 0 if x == 'R' else x)
players["Exp"] = players["Exp"].astype(int)

#Add Age column
from datetime import datetime

def calc_Age (x):

  date_format = "%B %d, %Y"
  birthdate = datetime.strptime(x, date_format)

  current_date = datetime.now()
  x = current_date.year - birthdate.year - ((current_date.month, current_date.day) < (birthdate.month, birthdate.day))
  return x

players["Age"] = players["Birth Date"].apply(calc_Age)

#players.rename(columns={"\xa0": "Country"}, inplace=True)

players["Country"] = players["Country"].apply(lambda x: x.upper() if True else x)

players


# Retrieve player's performance

In [None]:
def get_performance (plist):

  header_data = ['URL', 'Year', 'G', 'PTS', 'RTB', 'AST', 'FG', 'FG3', 'FT', 'eFG', 'PER', 'WS']

  df = pd.DataFrame ((), columns = header_data)
  pdata = []

  for index, player in plist.iterrows():
    url = base_url + player.URL
    #print(url)

    pdata_Last_Year = [player.URL, 'LY']
    pdata_Career = [player.URL, 'C']

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    try:
      p1 = soup.find('div', {'class': 'p1'})
      i=0
      for item in p1.find_all('p'):
        if (i%2==0):
          pdata_Last_Year.append(float(item.text))
        else:
          pdata_Career.append(float(item.text))
        i += 1
    except:
      print(player.URL, ' No Data')

    try:
      p2 = soup.find('div', {'class': 'p2'})
      i=0
      for item in p2.find_all('p'):
        if (i%2==0):
          pdata_Last_Year.append(float(item.text))
        else:
          pdata_Career.append(float(item.text))
        i += 1
    except:
      print(player.URL, ' No Data')

    try:
      p3 = soup.find('div', {'class': 'p3'})
      i=0
      for item in p3.find_all('p'):
        if (i%2==0):
          pdata_Last_Year.append(float(item.text))
        else:
          pdata_Career.append(float(item.text))
        i += 1
    except:
      print(player.URL, ' No Data')

    pdata.append(pdata_Last_Year)
    pdata.append(pdata_Career)
    # print(pdata)

  df = pd.DataFrame(data=pdata, columns = header_data)

  return df

performance = get_performance (players)

In [None]:
performance