In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import requests

In [None]:
#import csv data for seasons 2021-2022 to 2023-2024 (the three files are read with the same options)
csv_options = {'encoding': 'latin1', 'sep': ';'}
nba_data2023_2024 = pd.read_csv('nba2023-2024.csv', **csv_options)
nba_data2022_2023 = pd.read_csv('nba2022-2023.csv', **csv_options)
nba_data2021_2022 = pd.read_csv('nba2021-2022.csv', **csv_options)
#print a schema summary of each frame, newest season first
for season_frame in (nba_data2023_2024, nba_data2022_2023, nba_data2021_2022):
    print(season_frame.info())
#seasons 2020-2021 and 2019-2020 are handled separately below, as their data is in a different format

In [None]:
#the 2020-2021 file is read as a single text column (header=None), then split on commas;
#column names are assigned by hand afterwards
raw_2020_2021 = pd.read_csv('nba2020-2021.csv', encoding='latin1', sep=';', header=None)
nba_data2020_2021 = raw_2020_2021[0].str.split(',', expand=True)
column_names_2020_2021 = [
    'Player', 'Pos', 'Age', 'Tm',
    'G', 'GS', 'MP',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    '2P', '2PA', '2P%', 'eFG%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]
nba_data2020_2021.columns = column_names_2020_2021
#every column except the three text ones is converted to a numeric dtype
text_columns = ('Player', 'Pos', 'Tm')
numeric_columns = [name for name in column_names_2020_2021 if name not in text_columns]
#errors='coerce' turns values that cannot be parsed into NaN instead of raising
nba_data2020_2021[numeric_columns] = nba_data2020_2021[numeric_columns].apply(pd.to_numeric, errors='coerce')
#remove the first row and renumber the index
#NOTE(review): row 0 is presumably the file's own header line - confirm against the csv
nba_data2020_2021_cleaned = nba_data2020_2021.drop(index=0).reset_index(drop=True)
print(nba_data2020_2021_cleaned.head())
print(nba_data2020_2021_cleaned.info())

In [None]:
#data scraping season 2019-2020
def scrape_team_per_game(team_initial):
    """Scrape one team's 2020 per-game table from basketball-reference.

    Args:
        team_initial: Team abbreviation inserted into the page URL (e.g. "MIL").

    Returns:
        A DataFrame with one row per table-body row that contains data cells.
        Columns are the table's header texts without the first (rank) header,
        plus a 'Team' column holding ``team_initial``. All scraped values are
        strings. An empty DataFrame is returned when the page has no
        ``per_game`` table, so callers can skip the team via ``.empty``.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.RequestException: The request failed or timed out.
    """
    url = f"https://www.basketball-reference.com/teams/{team_initial}/2020.html#all_per_game-playoffs_per_game"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on error responses instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table', {'id': 'per_game'})
    if table is None:
        return pd.DataFrame()
    table_head = table.find('thead')
    table_body = table.find('tbody')
    if table_head is None or table_body is None:
        return pd.DataFrame()

    headers = [th.text for th in table_head.find_all('th')]
    data = []
    for row in table_body.find_all('tr'):
        cells = [cell.text.strip() for cell in row.find_all('td')]
        # Rows without any <td> carry no player data and would become all-None rows.
        if cells:
            data.append(cells)
    df = pd.DataFrame(data, columns=headers[1:])  #skipping the first header which is the rank column
    df['Team'] = team_initial

    return df
#listing the initials of each team to put in the url
teams = [
    "MIL", "ATL", "BOS", "BRK", "CHO", "CHI", "CLE", "DAL", "DEN", "DET",
    "GSW", "HOU", "IND", "LAC", "LAL", "MEM", "MIA", "MIN", "NOP", "NYK",
    "OKC", "ORL", "PHI", "PHO", "POR", "SAC", "SAS", "TOR", "UTA", "WAS"
]
#collect the per-team frames first and concatenate once at the end:
#concatenating inside the loop re-copies all earlier rows on every iteration
team_frames = []
for team in teams:
    team_data = scrape_team_per_game(team)
    #teams for which nothing was scraped are skipped
    if not team_data.empty:
        team_frames.append(team_data)
#pd.concat raises on an empty list, so fall back to an empty frame when nothing was scraped
nba_data2019_2020 = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()
print(nba_data2019_2020.info())
print(nba_data2019_2020.head())

In [None]:
#show the scraped column labels before they are replaced in the next cell
print(nba_data2019_2020.columns)

In [None]:
#naming the scraped columns and changing the numeric ones to numeric type
nba_data2019_2020.columns = [
    'Player', 'Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', 
    '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 
    'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Team'
]
numeric_columns = [
    'Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 
    'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'
]
#scraped values are text; errors='coerce' turns blanks and unparseable values into NaN
nba_data2019_2020[numeric_columns] = nba_data2019_2020[numeric_columns].apply(pd.to_numeric, errors='coerce')
#the scraped rows come from the table body only, so row 0 is a player and must be kept
#(dropping index 0 silently removed the first scraped player);
#only rows without a player name are discarded
nba_data2019_2020_cleaned = nba_data2019_2020.dropna(subset=['Player']).reset_index(drop=True)
print(nba_data2019_2020_cleaned.head())
print(nba_data2019_2020_cleaned.info())

In [None]:
#rows with at least one missing shooting percentage, to check on the original webpage why they are missing
checked_percentage_columns = ['3P%', 'eFG%', 'FT%', 'FG%']
has_missing_percentage = nba_data2019_2020_cleaned[checked_percentage_columns].isna().any(axis=1)
missing_data_rows = nba_data2019_2020_cleaned[has_missing_percentage]

print(missing_data_rows)
#data is missing because they didn't attempt any FT (e.g. if FT is 0, FT% is empty)

In [None]:
#filling empty percentage values with 0
#assign the result back instead of calling fillna(inplace=True) on a selected column:
#that form is chained assignment and does not update the frame under pandas Copy-on-Write
percentage_columns = ['FT%', '3P%', 'FG%', 'eFG%', '2P%']
nba_data2019_2020_cleaned[percentage_columns] = nba_data2019_2020_cleaned[percentage_columns].fillna(0)
print(nba_data2019_2020_cleaned.info())

In [None]:
#check data again before concatenating; findings per frame, in the order printed:
#  2019-2020: no rank column, only need to change team to tm, no Pos column
#  2020-2021: no rank column
#  2021-2022, 2022-2023, 2023-2024: need to drop rank column
frames_to_check = (
    nba_data2019_2020_cleaned,
    nba_data2020_2021_cleaned,
    nba_data2021_2022,
    nba_data2022_2023,
    nba_data2023_2024,
)
for season_frame in frames_to_check:
    print(season_frame.head())
    print(season_frame.info())

In [None]:
#drop rank column as it doesn't contribute anything for the project, basically the same as an index
def drop_rk_column(df):
    """Return ``df`` without its 'Rk' column.

    When there is no 'Rk' column the same object is returned unchanged;
    otherwise a new DataFrame is returned and ``df`` itself is not modified.
    """
    if 'Rk' not in df.columns:
        return df
    return df.drop(columns=['Rk'])
#remove the rank column from the three csv-based seasons
nba_data2021_2022, nba_data2022_2023, nba_data2023_2024 = [
    drop_rk_column(season_frame)
    for season_frame in (nba_data2021_2022, nba_data2022_2023, nba_data2023_2024)
]
#changing Team column to Tm for later concatenation
def standardize_column_names(df):
    """Return a copy of ``df`` whose 'Team' column, if present, is renamed to 'Tm'."""
    return df.rename(columns={'Team': 'Tm'})
#apply the Team -> Tm rename to the scraped 2019-2020 frame
nba_data2019_2020_cleaned = standardize_column_names(nba_data2019_2020_cleaned)

In [None]:
# check for NaN in each year's Pos column
def check_pos_nans(dataframe, year_label):
    """Print how many values are missing in the 'Pos' column of one season.

    Args:
        dataframe: Season DataFrame to inspect.
        year_label: Season label used in the printed message (e.g. '2020-2021').

    Returns:
        None. The result is reported via ``print`` only.
    """
    if 'Pos' in dataframe.columns:
        nan_count = dataframe['Pos'].isna().sum()
        # Report the count; computing it without printing made the check silent.
        print(f"NaN values in 'Pos' for the {year_label} dataset: {nan_count}")
    else:
        print(f"No 'Pos' column found in the {year_label} dataset.")
    
#run the Pos check on every season, oldest first
frames_with_labels = (
    (nba_data2019_2020_cleaned, '2019-2020'),
    (nba_data2020_2021_cleaned, '2020-2021'),
    (nba_data2021_2022, '2021-2022'),
    (nba_data2022_2023, '2022-2023'),
    (nba_data2023_2024, '2023-2024'),
)
for season_frame, season_label in frames_with_labels:
    check_pos_nans(season_frame, season_label)

In [None]:
#tag every frame with its season label (adds or overwrites the 'Season' column on the frame itself)
season_labels = (
    (nba_data2019_2020_cleaned, '2019-2020'),
    (nba_data2020_2021_cleaned, '2020-2021'),
    (nba_data2021_2022, '2021-2022'),
    (nba_data2022_2023, '2022-2023'),
    (nba_data2023_2024, '2023-2024'),
)
for season_frame, season_label in season_labels:
    season_frame['Season'] = season_label

In [None]:
#stack the four seasons that have a Pos column; used to build a position mapping for the year 2019-2020
datasets = [
    nba_data2020_2021_cleaned,
    nba_data2021_2022,
    nba_data2022_2023,
    nba_data2023_2024,
]
combined_datasets = pd.concat(objs=datasets)

In [None]:
#creating a player -> position map and filling positions in 2019-2020
#rows without a position are removed first, so every remaining player has at least one value
#and mode() is never empty; no NaN fallback is needed
#(the previous fallback referenced np, which this notebook never imports)
known_positions = combined_datasets.dropna(subset=['Pos'])
#each player gets their most frequent position; ties resolve to the first value mode() returns
position_mapping = known_positions.groupby('Player')['Pos'].agg(lambda pos: pos.mode().iloc[0])
#players missing from the mapping get NaN
nba_data2019_2020_cleaned['Pos'] = nba_data2019_2020_cleaned['Player'].map(position_mapping)
#check how many positions are filled
filled_positions_count = nba_data2019_2020_cleaned['Pos'].notna().sum()
print(f"Filled positions in 2019-2020 dataset: {filled_positions_count}")

In [None]:
#change NaNs to Unknown and check if data is now complete for concatenation
#assign the result back: fillna(inplace=True) on a selected column is chained assignment
#and does not update the frame under pandas Copy-on-Write
nba_data2019_2020_cleaned['Pos'] = nba_data2019_2020_cleaned['Pos'].fillna('Unknown')
print(nba_data2019_2020_cleaned.info())

In [None]:
#stacking the five season frames, oldest season first (the original row indexes are kept)
season_frames = [
    nba_data2019_2020_cleaned,
    nba_data2020_2021_cleaned,
    nba_data2021_2022,
    nba_data2022_2023,
    nba_data2023_2024,
]
nba_data_all_years = pd.concat(season_frames)
print(nba_data_all_years.head())
print(nba_data_all_years.info())

In [None]:
# Count the distinct seasons in which each player appears
seasons_per_player = nba_data_all_years.groupby('Player')['Season'].nunique()

# Keep only players who have appeared in exactly 5 unique seasons
players_in_all_seasons = seasons_per_player[seasons_per_player == 5].index

# Filter the original DataFrame to keep only these players
nba_filtered = nba_data_all_years[nba_data_all_years['Player'].isin(players_in_all_seasons)]

# Display the filtered DataFrame; a bare .head() in the middle of a cell is never shown, so print it
print(nba_filtered.head())
nba_filtered.info()

In [None]:
#export the filtered data for EDA (Excel file, row index not written)
nba_filtered.to_excel('EDA project1.xlsx', index=False)

In [None]:
# Collapse the data to one row per (Player, Season).
# Stats listed here are averaged over all rows of the same player and season.
mean_columns = [
    'G', 'GS', 'MP',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    '2P', '2PA', '2P%', 'eFG%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]

# Insertion order of this dict is the column order of the result.
aggregations = {'Age': 'first'}  # first value seen for the season
aggregations.update({column: 'mean' for column in mean_columns})
aggregations['Tm'] = lambda season_teams: ','.join(season_teams.unique())  # comma-joined unique team initials
aggregations['Pos'] = 'first'  # first value seen for the season

df_grouped = nba_filtered.groupby(['Player', 'Season']).agg(aggregations).reset_index()

# Display the result
df_grouped.head()


In [None]:
# The three seasons kept in the final result
target_seasons = ['2023-2024', '2022-2023', '2021-2022']

# Rows of df_grouped restricted to those seasons (not used further in this cell)
filtered_df = df_grouped[df_grouped['Season'].isin(target_seasons)]

# Work on a copy ordered by Player and Season so that shifting within a player moves across seasons
sorted_df = df_grouped.sort_values(['Player', 'Season']).copy()

# Columns that get lagged versions (everything except 'Player', 'Season', 'Tm', 'Pos')
columns_to_shift = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

# Compute all shifted columns from a single per-player grouping
by_player = sorted_df.groupby('Player')
shifted_columns = {}
for col in columns_to_shift:
    # previous row of the same player
    shifted_columns[f'{col}_last_year'] = by_player[col].shift(1)
    # two rows back for the same player
    shifted_columns[f'{col}_year_before_last'] = by_player[col].shift(2)

# Target column: PTS from the player's following row
shifted_columns['Points_scored_next_season'] = by_player['PTS'].shift(-1)

# Attach the new columns in the order they were created
sorted_df = sorted_df.assign(**shifted_columns)

# Keep only the three target seasons
final_df = sorted_df[sorted_df['Season'].isin(target_seasons)]

# Display the final DataFrame with the new columns
final_df.head(10)

In [None]:
#final column layout: every stat is followed by its two lagged versions,
#the target column comes right after the PTS group, identifying columns come last
column_order = ['Player']
for stat in columns_to_shift:
    column_order += [stat, f'{stat}_last_year', f'{stat}_year_before_last']
column_order += ['Points_scored_next_season', 'Tm', 'Season', 'Pos']
final_df = final_df[column_order]

final_df = final_df.round(2)

#remove the 2023-2024 rows
#.copy() makes the result an independent frame, so later cells can add columns to it
#without writing into a slice of final_df (SettingWithCopyWarning)
final_df_without_2023 = final_df[final_df['Season'] != '2023-2024'].copy()

In [None]:
# One-hot encode Pos: a value such as 'SG-SF' sets 1 in the column of every listed position.
# get_dummies returns the columns in sorted order, so the layout is the same on every run,
# and a missing Pos yields a row of zeros instead of an error.
position_flags = final_df_without_2023['Pos'].str.get_dummies(sep='-')
positions = set(position_flags.columns)

# Prefix the new columns: a position labelled 'PF' would otherwise overwrite the existing 'PF' stat column
position_flags = position_flags.add_prefix('Pos_')

# Work on an independent copy so the new columns are not written into a slice of another frame
final_df_without_2023 = final_df_without_2023.copy()
for position in position_flags.columns:
    final_df_without_2023[position] = position_flags[position]

# Display the first few rows of the modified dataframe
final_df_without_2023.head()

In [None]:
# Remove the Pos column. drop() returns a new frame, so the result has to be assigned;
# errors='ignore' keeps the cell re-runnable once Pos is already gone.
final_df_without_2023 = final_df_without_2023.drop(columns=['Pos'], errors='ignore')

#move PF to the back of the DF
# NOTE(review): 'PF' is a stat column and may also be a position label - confirm which one this column holds here
cols = [column for column in final_df_without_2023.columns if column != 'PF']
cols.append('PF')

# Reorder the dataframe columns
final_df_without_2023 = final_df_without_2023[cols]

# Display the first few rows of the modified dataframe
final_df_without_2023.head()

In [None]:
#export the wrangled data (Excel file, row index not written)
final_df_without_2023.to_excel('DataWrangling2.0.xlsx', index=False)