In [594]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from imblearn.over_sampling import SMOTE

In [595]:
def detect_missing_values(df):
    """Count missing entries per column, keeping only columns that have any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Number of null values per column, restricted to columns with at least
        one null and ordered from most to fewest missing values. Empty when
        the frame has no nulls.
    """
    null_counts = df.isna().sum()
    has_missing = null_counts > 0
    return null_counts.loc[has_missing].sort_values(ascending=False)

def detect_outliers(df, columns, iqr_multiplier=1.5):
    """Find outlying values in the given columns using the IQR (Tukey fence) rule.

    A value is an outlier when it lies strictly below
    ``Q1 - iqr_multiplier * IQR`` or strictly above ``Q3 + iqr_multiplier * IQR``,
    where Q1/Q3 are the column's 25th/75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    columns : iterable of str
        Names of the (numeric) columns to check.
    iqr_multiplier : float, default 1.5
        Width of the fences in units of IQR. The default reproduces the
        conventional 1.5 * IQR rule.

    Returns
    -------
    dict
        Maps each column name to a single-column DataFrame containing the
        outlying rows of that column (original index preserved). The frame is
        empty when the column has no outliers.
    """
    outliers = {}
    for column in columns:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - iqr_multiplier * iqr
        upper_bound = q3 + iqr_multiplier * iqr
        is_outlier = (values < lower_bound) | (values > upper_bound)
        # Keep a DataFrame (not a Series) so callers get the same shape as before
        outliers[column] = df.loc[is_outlier, [column]]

    return outliers

def detect_duplicates(df):
    """Return the rows of ``df`` that repeat an earlier row in every column.

    The first occurrence of each repeated row is not included; only the later
    copies are returned, with their original index labels.
    """
    is_repeat = df.duplicated()
    return df.loc[is_repeat]

In [596]:
# Load the regular-season teams data
teams = pd.read_csv('./data/teams.csv')

# Convert 'playoff' column to binary (1 for 'Y', 0 for 'N').
# Any other value becomes NaN and is reported by the missing-value check further down.
teams['playoff'] = teams['playoff'].map({'Y': 1, 'N': 0})

zero_cols = ["tmORB", "tmDRB", "tmTRB", "opptmORB", "opptmDRB", "opptmTRB"]

# Since "tmORB", "tmDRB", "tmTRB", "opptmORB", "opptmDRB", and "opptmTRB" contain only zero values,
# and are redundant with "o_oreb", "o_dreb", "o_reb" for team stats, and "d_oreb", "d_dreb", "d_reb" for opponent stats,
# we drop the redundant columns.

teams = teams.drop(columns=zero_cols)
# Nothing is renamed here, so the message only reports the drop.
print("Dropped redundant rebound columns.")

# Drop the 'divID' column as it contains only empty strings and does not add useful information
teams = teams.drop(columns=['divID'])
print("Dropped 'divID' column as it contains no information.")

# Drop the 'seeded' column as it contains only zero values
teams = teams.drop(columns=['seeded'])
print("Dropped 'seeded' column as it contains only zero values.")

def calculate_playoff_score(row):
    """Map a team's playoff round results to an ordinal progression score.

    ``row`` must support lookup by the keys 'finals', 'semis' and 'firstRound'.
    The deepest round is checked first:

    4 - finals == 'W' (won the championship)
    3 - finals == 'L' (lost in the finals)
    2 - semis == 'L' (lost in the semifinals)
    1 - firstRound == 'L' (lost in the first round)
    0 - none of the above (did not make the playoffs)
    """
    finals_result = row['finals']
    if finals_result == 'W':
        return 4
    if finals_result == 'L':
        return 3
    if row['semis'] == 'L':
        return 2
    if row['firstRound'] == 'L':
        return 1
    return 0

# Collapse the three playoff-round columns into one ordinal score per row
teams['playoff_progression_score'] = teams.apply(calculate_playoff_score, axis=1)

# Remove, in a single pass, the columns that are no longer needed:
# 'firstRound', 'semis', 'finals': now summarised by 'playoff_progression_score'.
# 'lgID': Contains only "WNBA" for every row, so it provides no additional information.
# 'franchID': Redundant identifier, as 'tmID' already identifies each team uniquely.
# 'confID': Lacks value without conference-specific qualification/matchup data.
# 'name': Purely descriptive and irrelevant to playoff predictions.
# 'arena': Also descriptive and does not impact playoff qualification.
teams = teams.drop(columns=[
    'firstRound', 'semis', 'finals',
    'lgID', 'franchID', 'confID', 'name', 'arena',
])
print("Dropped 'lgID', 'franchID', 'confID', 'name', and 'arena' as they are irrelevant for predictive modeling.")

# Data-quality report: missing values
missing_values = detect_missing_values(teams)
print("\nMissing Values", missing_values, sep="\n")

# Data-quality report: fully duplicated rows
duplicate_rows = detect_duplicates(teams)
print("\nDuplicates", duplicate_rows, sep="\n")

# Outlier analysis only applies to numeric data, so collect the
# float64/int64 column names for it
numeric_columns = teams.select_dtypes(include=['float64', 'int64']).columns

#for column in numeric_columns:
#    plt.figure(figsize=(8, 4))
#    sns.boxplot(x=teams[column])
#    plt.title(f'Box Plot of {column}')
#    plt.show()
#
#outliers = detect_outliers(teams, numeric_columns)
#
#for col, outlier_data in outliers.items():
#    print(f"Outliers in {col}:\n{outlier_data}\n")

## Plot for 'next_season_playoff' column
#plt.figure(figsize=(6, 4))
#plt.bar(next_playoff_counts.index, next_playoff_counts.values)
#plt.title('Data Balance in Next Season Playoff')
#plt.xlabel('Next Season Playoff (0 = No, 1 = Yes)')
#plt.ylabel('Count')
#plt.xticks([0, 1])
#plt.show()

Dropped redundant rebound columns and renamed others for clarity.
Dropped 'divID' column as it contains no information.
Dropped 'seeded' column as it contains only zero values.
Dropped 'lgID', 'franchID', 'confID', 'name', and 'arena' as they are irrelevant for predictive modeling.

Missing Values
Series([], dtype: int64)

Duplicates
Empty DataFrame
Columns: [year, tmID, rank, playoff, o_fgm, o_fga, o_ftm, o_fta, o_3pm, o_3pa, o_oreb, o_dreb, o_reb, o_asts, o_pf, o_stl, o_to, o_blk, o_pts, d_fgm, d_fga, d_ftm, d_fta, d_3pm, d_3pa, d_oreb, d_dreb, d_reb, d_asts, d_pf, d_stl, d_to, d_blk, d_pts, won, lost, GP, homeW, homeL, awayW, awayL, confW, confL, min, attend, playoff_progression_score]
Index: []

[0 rows x 46 columns]


In [597]:
## Correlation between offensive statistics
## -------------------------
#
#o_stats = teams.filter(regex='^(o_)')
#
#corr_matrix = o_stats.corr()
#
#plt.figure(figsize=(14, 12))
#
#cmap = sns.color_palette("mako", as_cmap=True)
#
#sns.heatmap(corr_matrix, cmap=cmap, vmax=1.0, vmin=-1.0, center=0,
#            square=True, linewidths=.5, annot=True, fmt=".2f", annot_kws={"size":8})
#
#plt.title('Correlation Heatmap of Performance Statistics', fontsize=18)
#plt.xticks(rotation=45, ha='right', fontsize=10)
#plt.yticks(fontsize=10)
#plt.tight_layout()
#
## Display the heatmap
#plt.show()

In [598]:
## Correlation between defensive statistics
## -------------------------
#
#d_stats = teams.filter(regex='^(d_)')
#
#corr_matrix = d_stats.corr()
#
#plt.figure(figsize=(14, 12))
#
#cmap = sns.color_palette("coolwarm", as_cmap=True)
#
#sns.heatmap(corr_matrix, cmap=cmap, vmax=1.0, vmin=-1.0, center=0,
#            square=True, linewidths=.5, annot=True, fmt=".2f", annot_kws={"size":8})
#
#plt.title('Correlation Heatmap of Performance Statistics', fontsize=18)
#plt.xticks(rotation=45, ha='right', fontsize=10)
#plt.yticks(fontsize=10)
#plt.tight_layout()
#
## Display the heatmap
#plt.show()

In [599]:
## Compute correlation between offensive and defensive stats
#o_stats = teams.filter(regex='^(o_)')
#d_stats = teams.filter(regex='^(d_)')
#
#combined_stats = pd.concat([o_stats, d_stats], axis=1)
#
#corr_matrix = combined_stats.corr()
#
#o_d_corr_matrix = corr_matrix.loc[o_stats.columns, d_stats.columns]
#
#plt.figure(figsize=(12, 10))
#sns.heatmap(o_d_corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", center=0,
#            linewidths=.5, square=True, cbar_kws={"shrink": .75})
#
#plt.title("Correlation Between Offensive and Defensive Statistics")
#plt.xticks(rotation=45, ha='right', fontsize=10)
#plt.yticks(fontsize=10)
#plt.tight_layout()
#
#plt.show()

In [600]:
# Calculate statistics for the new dataframe.
# Columns are taken row-for-row from `teams`, so `dataset` shares its index.
dataset = pd.DataFrame()

# Adding team statistics
#dataset['Playoff'] = teams['playoff']
dataset['Rank'] = teams['rank']
dataset['PlayoffProgScore'] = teams['playoff_progression_score']
dataset['GP'] = teams['GP']
dataset['W'] = teams['won']
dataset['L'] = teams['lost']
dataset['WIN%'] = 100 * (teams['won'] / teams['GP'])
dataset['MIN'] = teams['min']
dataset['PTS'] = teams['o_pts']
dataset['FGM'] = teams['o_fgm']
dataset['FGA'] = teams['o_fga']
dataset['FG%'] = 100 * (teams['o_fgm'] / teams['o_fga'])
dataset['3PM'] = teams['o_3pm']
dataset['3PA'] = teams['o_3pa']
dataset['3P%'] = 100 * (teams['o_3pm'] / teams['o_3pa'])
dataset['FTM'] = teams['o_ftm']
dataset['FTA'] = teams['o_fta']
dataset['FT%'] = 100 * (teams['o_ftm'] / teams['o_fta'])
dataset['OREB'] = teams['o_oreb']
dataset['DREB'] = teams['o_dreb']
dataset['REB'] = teams['o_reb']
dataset['AST'] = teams['o_asts']
dataset['TOV'] = teams['o_to']
dataset['STL'] = teams['o_stl']
dataset['BLK'] = teams['o_blk']
dataset['BLKA'] = teams['d_blk']  # opponent blocks, i.e. this team's shots that were blocked
dataset['PF'] = teams['o_pf']
dataset['PFD'] = teams['d_pf']    # opponent fouls, i.e. fouls drawn by this team


def _possession_side(fga, fta, oreb, opp_dreb, fgm, tov):
    """One side's term of the possession estimate.

    FGA + 0.4 * FTA - 1.07 * (OREB / (OREB + opponent DREB)) * (FGA - FGM) + TOV
    """
    return fga + 0.4 * fta - 1.07 * (oreb / (oreb + opp_dreb)) * (fga - fgm) + tov


# Advanced
_team_side = _possession_side(
    teams['o_fga'], teams['o_fta'], teams['o_oreb'], teams['d_dreb'],
    teams['o_fgm'], teams['o_to'],
)
_opp_side = _possession_side(
    teams['d_fga'], teams['d_fta'], teams['d_oreb'], teams['o_dreb'],
    teams['d_fgm'], teams['d_to'],
)
# Possessions are estimated as the average of the team's and the opponent's term
dataset['POSS'] = 0.5 * (_team_side + _opp_side)
dataset['OFFRTG'] = 100 * (teams['o_pts'] / dataset['POSS'])
dataset['DEFRTG'] = 100 * (teams['d_pts'] / dataset['POSS'])
dataset['NETRTG'] = dataset['OFFRTG'] - dataset['DEFRTG']
dataset['AST/TO'] = teams['o_asts'] / teams['o_to']
dataset['AST RATIO'] = (teams['o_asts'] * 100) / dataset['POSS']

# Team rebound percentages: share of the available rebounds the team collected.
# The previous version used the player-level formula
#   100 * (REB * (team MIN / 5)) / (MIN * (REB + opponent REB))
# with MIN equal to the team's own minutes, so the minutes cancelled to a constant
# 1/5 and every percentage was five times too small.
dataset['OREB%'] = 100 * teams['o_oreb'] / (teams['o_oreb'] + teams['d_dreb'])
dataset['DREB%'] = 100 * teams['o_dreb'] / (teams['o_dreb'] + teams['d_oreb'])
dataset['REB%'] = 100 * teams['o_reb'] / (teams['o_reb'] + teams['d_reb'])

dataset['TOV%'] = 100 * teams['o_to'] / (
    teams['o_fga'] + 0.44 * teams['o_fta'] + teams['o_to']
)
dataset['EFG%'] = 100 * ((teams['o_fgm'] + (0.5 * teams['o_3pm'])) / teams['o_fga'])
dataset['TS%'] = 100 * (teams['o_pts'] / (2 * (teams['o_fga'] + 0.44 * teams['o_fta'])))

# The opponent estimate is the same two terms summed in the other order, so it
# equals POSS; it is kept as a separate name because PACE is written in terms of both.
OPPPOSS = 0.5 * (_opp_side + _team_side)
# 40 = minutes per regulation game; MIN / 5 converts summed player minutes to team minutes
dataset['PACE'] = 40 * ((dataset['POSS'] + OPPPOSS) / (2 * (dataset['MIN'] / 5)))

In [601]:
# Label
# Order seasons chronologically within each team so that shift(-1) reads the
# team's following row. The original row labels are deliberately kept (no
# reset_index): `dataset` was built from `teams` before this sort, and pandas
# aligns the column assignment below on index labels. Resetting the index first
# would attach each label to whichever feature row shares the new positional
# label, which is wrong whenever the CSV is not already sorted by tmID/year.
teams = teams.sort_values(by=['tmID', 'year'])
# NOTE(review): this treats the next row of a team as its next season; it
# assumes each team has no gaps in 'year' - confirm against the data.
dataset['PlayoffNextSeason'] = teams.groupby('tmID')['playoff'].shift(-1)
# A team's last recorded season has no following row, so its label is NaN
dataset = dataset.dropna(subset=['PlayoffNextSeason']).copy()
dataset['PlayoffNextSeason'] = dataset['PlayoffNextSeason'].astype(int)

In [602]:
# Split the frame into the label vector and the feature matrix
y = dataset['PlayoffNextSeason']
X = dataset.drop(columns=['PlayoffNextSeason'])

# Class counts before oversampling
display(y.value_counts())

# Oversample with SMOTE (fixed seed for reproducibility).
# NOTE(review): the resampling is applied to the whole dataset, which is later
# written to CSV; if a train/test split happens downstream, synthetic rows can
# end up in the evaluation set - confirm this is intended.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Class counts after oversampling
display(y_resampled.value_counts())

# Reassemble features and label into one frame and make it the working dataset
balanced_dataset = pd.DataFrame(X_resampled, columns=X.columns)
balanced_dataset['PlayoffNextSeason'] = y_resampled
dataset = balanced_dataset

PlayoffNextSeason
1    71
0    51
Name: count, dtype: int64

PlayoffNextSeason
1    71
0    71
Name: count, dtype: int64

The cells below are to be used when cleaning, transforming, and analyzing the remaining datasets. For now, we are only studying teams.csv.

In [603]:
'''
# Load the teams_post data
teams_post = pd.read_csv("./data/teams_post.csv")

# Drop 'lgID' column as it contains only "WNBA" for every row
teams_post = teams_post.drop(columns=['lgID'])

# Detect and drop duplicates
duplicates = teams_post[teams_post.duplicated()]
if not duplicates.empty:
    print("Duplicates detected. Removing duplicate rows.")
    teams_post = teams_post.drop_duplicates()
else:
    print("No duplicates found.")

# Display a sample of the dataframe to verify changes
display(teams_post.head())

# Store cleaned csv
teams_post.to_csv('./cleaned_data/teams_post.csv', index=False)
'''

'\n# Load the teams_post data\nteams_post = pd.read_csv("./data/teams_post.csv")\n\n# Drop \'lgID\' column as it contains only "WNBA" for every row\nteams_post = teams_post.drop(columns=[\'lgID\'])\n\n# Detect and drop duplicates\nduplicates = teams_post[teams_post.duplicated()]\nif not duplicates.empty:\n    print("Duplicates detected. Removing duplicate rows.")\n    teams_post = teams_post.drop_duplicates()\nelse:\n    print("No duplicates found.")\n\n# Display a sample of the dataframe to verify changes\ndisplay(teams_post.head())\n\n# Store cleaned csv\nteams_post.to_csv(\'./cleaned_data/teams_post.csv\', index=False)\n'

In [604]:
'''
# Load the series_post data
series_post = pd.read_csv("./data/series_post.csv")

# Drop 'lgIDWinner' and 'lgIDLoser' columns as they contain only "WNBA" and add no value
series_post = series_post.drop(columns=['lgIDWinner', 'lgIDLoser'])

# Detect and drop duplicates
duplicates = series_post[series_post.duplicated()]
if not duplicates.empty:
    print("Duplicates detected. Removing duplicate rows.")
    series_post = series_post.drop_duplicates()
else:
    print("No duplicates found.")

# Display a sample of the dataframe to verify changes
display(series_post.head())

# Store cleaned csv
series_post.to_csv('./cleaned_data/series_post.csv', index=False)
'''

'\n# Load the series_post data\nseries_post = pd.read_csv("./data/series_post.csv")\n\n# Drop \'lgIDWinner\' and \'lgIDLoser\' columns as they contain only "WNBA" and add no value\nseries_post = series_post.drop(columns=[\'lgIDWinner\', \'lgIDLoser\'])\n\n# Detect and drop duplicates\nduplicates = series_post[series_post.duplicated()]\nif not duplicates.empty:\n    print("Duplicates detected. Removing duplicate rows.")\n    series_post = series_post.drop_duplicates()\nelse:\n    print("No duplicates found.")\n\n# Display a sample of the dataframe to verify changes\ndisplay(series_post.head())\n\n# Store cleaned csv\nseries_post.to_csv(\'./cleaned_data/series_post.csv\', index=False)\n'

In [605]:
# Load the awards_players data
awards_players = pd.read_csv("./data/awards_players.csv")

# Drop 'lgID' column as it provides no unique value
awards_players = awards_players.drop(columns=['lgID'])

# Separate dataframes for player awards and coach awards.
# The mask is computed once; na=False makes a missing award name count as
# "not a coach award" instead of yielding NaN, which cannot be negated or used
# for boolean indexing.
is_coach_award = awards_players['award'].str.contains("Coach", na=False)
player_awards = awards_players[~is_coach_award].copy()
coach_awards = awards_players[is_coach_award].copy()

# Standardize award names
award_name_mapping = {
    "Kim Perrot Sportsmanship": "Kim Perrot Sportsmanship Award",
    "Kim Perrot Sportsmanship Award": "Kim Perrot Sportsmanship Award",
    "All-Star Game Most Valuable Player": "All-Star Game MVP",
    "Most Valuable Player": "MVP",
    "WNBA Finals Most Valuable Player": "Finals MVP",
    "Sixth Woman of the Year": "6th Woman of the Year",
    "WNBA All-Decade Team": "All-Decade Team",
    "WNBA All Decade Team Honorable Mention": "All-Decade Team Honorable Mention"
}

# Names without an entry in the mapping map to NaN and fall back to the original name
player_awards.loc[:, 'award'] = player_awards['award'].map(award_name_mapping).fillna(player_awards['award'])
coach_awards.loc[:, 'award'] = coach_awards['award'].map(award_name_mapping).fillna(coach_awards['award'])

# Detect and drop duplicates
duplicates_player = player_awards[player_awards.duplicated()]
if not duplicates_player.empty:
    print("Duplicates detected in player awards. Removing duplicate rows.")
    player_awards = player_awards.drop_duplicates()
else:
    print("No duplicates found in player awards.")

duplicates_coach = coach_awards[coach_awards.duplicated()]
if not duplicates_coach.empty:
    print("Duplicates detected in coach awards. Removing duplicate rows.")
    coach_awards = coach_awards.drop_duplicates()
else:
    print("No duplicates found in coach awards.")

# Swap 'playerID' for 'coachID' in coach awards
coach_awards = coach_awards.rename(columns={'playerID': 'coachID'})

# Display samples of both dataframes to verify transformations
display(player_awards.head())
display(coach_awards.head())

# Store cleaned csvs
player_awards.to_csv('./cleaned_data/player_awards.csv', index=False)
coach_awards.to_csv('./cleaned_data/coach_awards.csv', index=False)

No duplicates found in player awards.
No duplicates found in coach awards.


Unnamed: 0,playerID,award,year
0,thompti01w,All-Star Game MVP,1
1,leslili01w,All-Star Game MVP,2
2,leslili01w,All-Star Game MVP,3
3,teaslni01w,All-Star Game MVP,4
4,swoopsh01w,All-Star Game MVP,6


Unnamed: 0,coachID,award,year
8,coopemi01w,Coach of the Year,1
9,hugheda99w,Coach of the Year,2
10,stanlma99w,Coach of the Year,3
11,laimbbi01w,Coach of the Year,4
12,mcconsu01w,Coach of the Year,5


In [606]:
# Read the coaches table and discard 'lgID' (only "WNBA", so it carries no information)
coaches = pd.read_csv("./data/coaches.csv").drop(columns=['lgID'])

# Collect rows that repeat an earlier row in every column, and remove them if any exist
duplicates = coaches.loc[coaches.duplicated()]
if duplicates.empty:
    print("No duplicates found.")
else:
    print("Duplicates detected. Removing duplicate rows.")
    coaches = coaches.drop_duplicates()

# Show the first rows to verify the changes
display(coaches.head())

No duplicates found.


Unnamed: 0,coachID,year,tmID,stint,won,lost,post_wins,post_losses
0,adamsmi01w,5,WAS,0,17,17,1,2
1,adubari99w,1,NYL,0,20,12,4,3
2,adubari99w,2,NYL,0,21,11,3,3
3,adubari99w,3,NYL,0,18,14,4,4
4,adubari99w,4,NYL,0,16,18,0,0


## Adding Coach to the dataset

In [607]:
# One row per coach and season: add up regular-season and playoff records
# across all of a coach's stints/teams within the same year
season_totals = ['won', 'lost', 'post_wins', 'post_losses']
coaches_agg = (
    coaches
    .groupby(['coachID', 'year'])[season_totals]
    .sum()
    .reset_index()
)

# Preview the aggregated data
print(coaches_agg.head())


      coachID  year  won  lost  post_wins  post_losses
0  adamsmi01w     5   17    17          1            2
1  adubari99w     1   20    12          4            3
2  adubari99w     2   21    11          3            3
3  adubari99w     3   18    14          4            4
4  adubari99w     4   16    18          0            0


In [608]:
# Find the coach with the maximum post_wins for each year
# (idxmax returns the first row label on ties)
top_post_wins_rows = coaches_agg.groupby('year')['post_wins'].idxmax()
max_post_wins_per_year = (
    coaches_agg.loc[top_post_wins_rows, ['year', 'coachID', 'post_wins']]
    .rename(columns={'post_wins': 'max_post_wins'})
)

# Set the championship indicator: the season's playoff-wins leader, provided they
# won at least 6 playoff games. coaches_agg has one row per (coachID, year), so
# testing the row label is equivalent to testing the (year, coachID) pair; the
# earlier merge of 'max_post_wins' was dropped unused and is no longer performed.
is_year_leader = coaches_agg.index.isin(top_post_wins_rows)
coaches_agg['championship'] = (is_year_leader & (coaches_agg['post_wins'] >= 6)).astype('int64')

# Recalculate metrics
coaches_agg['win_ratio'] = (coaches_agg['won'] / (coaches_agg['won'] + coaches_agg['lost'])).round(4)
coaches_agg['post_win_ratio'] = (coaches_agg['post_wins'] / (coaches_agg['post_wins'] + coaches_agg['post_losses'])).round(4)

# If a coach has not reached playoffs, post_win% will be NaN. Fill these with 0.
coaches_agg['post_win_ratio'] = coaches_agg['post_win_ratio'].fillna(0)

# Merge with Awards Data for Coach of the Year.
# Only the distinct (coachID, year) pairs holding that award are joined, so a
# coach with several award rows in one season cannot duplicate rows here.
coty_keys = (
    coach_awards.loc[coach_awards['award'] == 'Coach of the Year', ['coachID', 'year']]
    .drop_duplicates()
    .assign(COTY=1)
)
coaches_agg = coaches_agg.merge(coty_keys, on=['coachID', 'year'], how='left')
coaches_agg['COTY'] = coaches_agg['COTY'].fillna(0).astype('int64')

#Normalize features if needed
coaches_agg['win_ratio'] = coaches_agg['win_ratio'].clip(0, 1)  # Ensure the ratio is between 0 and 1
coaches_agg['post_win_ratio'] = coaches_agg['post_win_ratio'].clip(0, 1)  # Ensure the ratio is between 0 and 1

# Drop the raw counts now that the ratios are computed.
# NOTE: this makes the cell single-run; re-run the aggregation cell first to execute it again.
coaches_agg = coaches_agg.drop(columns=['won', 'lost', 'post_wins', 'post_losses'])

# Preview updated data
print(coaches_agg.head())

# Store cleaned csv
coaches_agg.to_csv('./cleaned_data/coaches.csv', index=False)

      coachID  year  championship  win_ratio  post_win_ratio  COTY
0  adamsmi01w     5             0     0.5000          0.3333     0
1  adubari99w     1             0     0.6250          0.5714     0
2  adubari99w     2             0     0.6562          0.5000     0
3  adubari99w     3             0     0.5625          0.5000     0
4  adubari99w     4             0     0.4706          0.0000     0


In [609]:
'''
# Load the players_teams data
players_teams = pd.read_csv("./data/players_teams.csv")

# Drop 'lgID' as it contains only "WNBA" and provides no unique value
players_teams = players_teams.drop(columns=['lgID'])

# Detect and drop duplicates
duplicates = players_teams[players_teams.duplicated()]
if not duplicates.empty:
    print("Duplicates detected. Removing duplicate rows.")
    players_teams = players_teams.drop_duplicates()
else:
    print("No duplicates found.")

# Display a sample of the dataframe to verify changes
display(players_teams.head())

# Store cleaned csv
players_teams.to_csv('./cleaned_data/players_teams.csv', index=False)
'''

'\n# Load the players_teams data\nplayers_teams = pd.read_csv("./data/players_teams.csv")\n\n# Drop \'lgID\' as it contains only "WNBA" and provides no unique value\nplayers_teams = players_teams.drop(columns=[\'lgID\'])\n\n# Detect and drop duplicates\nduplicates = players_teams[players_teams.duplicated()]\nif not duplicates.empty:\n    print("Duplicates detected. Removing duplicate rows.")\n    players_teams = players_teams.drop_duplicates()\nelse:\n    print("No duplicates found.")\n\n# Display a sample of the dataframe to verify changes\ndisplay(players_teams.head())\n\n# Store cleaned csv\nplayers_teams.to_csv(\'./cleaned_data/players_teams.csv\', index=False)\n'

In [610]:
'''
# Load the players data
players = pd.read_csv("./data/players.csv")

# Filter out rows in players that do not have corresponding playerIDs in players_teams
valid_player_ids = players_teams['playerID'].unique()
players = players[players['bioID'].isin(valid_player_ids)]

# Show that all values in firstseason and lastseason are '0'
firstseason_all_zero = (players['firstseason'] == 0).all()
lastseason_all_zero = (players['lastseason'] == 0).all()

print("All values in 'firstseason' are 0:", firstseason_all_zero)
print("All values in 'lastseason' are 0:", lastseason_all_zero)

# Show the only valida player with a registered Death Date
non_zero_death_dates = players[players['deathDate'] != "0000-00-00"]
display(non_zero_death_dates.head())

# Even though there is 1 registered Death Date, it really doesn't add anything. Birth Date is kept, for potential aging information.
players = players.drop(columns=['firstseason', 'lastseason', 'deathDate'])
print("Dropped 'firstseason', 'lastseason', and 'deathDate' columns as they contain only irrelevant values.")

# Detect and drop duplicates
duplicates = players[players.duplicated()]
if not duplicates.empty:
    print("Duplicates detected. Removing duplicate rows.")
    players = players.drop_duplicates()
else:
    print("No duplicates found.")

# Display a sample of the dataframe to verify changes
display(players.head())

# Store cleaned csv
players.to_csv('./cleaned_data/players.csv', index=False)
'''

'\n# Load the players data\nplayers = pd.read_csv("./data/players.csv")\n\n# Filter out rows in players that do not have corresponding playerIDs in players_teams\nvalid_player_ids = players_teams[\'playerID\'].unique()\nplayers = players[players[\'bioID\'].isin(valid_player_ids)]\n\n# Show that all values in firstseason and lastseason are \'0\'\nfirstseason_all_zero = (players[\'firstseason\'] == 0).all()\nlastseason_all_zero = (players[\'lastseason\'] == 0).all()\n\nprint("All values in \'firstseason\' are 0:", firstseason_all_zero)\nprint("All values in \'lastseason\' are 0:", lastseason_all_zero)\n\n# Show the only valida player with a registered Death Date\nnon_zero_death_dates = players[players[\'deathDate\'] != "0000-00-00"]\ndisplay(non_zero_death_dates.head())\n\n# Even though there is 1 registered Death Date, it really doesn\'t add anything. Birth Date is kept, for potential aging information.\nplayers = players.drop(columns=[\'firstseason\', \'lastseason\', \'deathDate\'])\np

In [611]:
# Write the final dataset to disk as CSV, leaving out the index column
output_file_path = './cleaned_data/dataset.csv'
dataset.to_csv(path_or_buf=output_file_path, index=False)

# Confirm where the file was written
print("Processed dataset saved to " + output_file_path)

Processed dataset saved to ./cleaned_data/dataset.csv
