In [1]:
import os

import pandas as pd

In [2]:
# Load the regular-season teams data (teams.csv, not teams_post.csv)
teams = pd.read_csv('./data/teams.csv')

# "tmORB", "tmDRB", "tmTRB", "opptmORB", "opptmDRB", and "opptmTRB" contain only zero values,
# and are redundant with "o_oreb", "o_dreb", "o_reb" for team stats, and "d_oreb", "d_dreb", "d_reb"
# for opponent stats, so we drop the redundant columns.
zero_cols = ["tmORB", "tmDRB", "tmTRB", "opptmORB", "opptmDRB", "opptmTRB"]
teams = teams.drop(columns=zero_cols)
# Nothing is renamed in this cell, so the message reports only the drop.
print("Dropped redundant rebound columns.")

# Drop the 'divID' column as it contains only empty strings and does not add useful information
teams = teams.drop(columns=['divID'])
print("Dropped 'divID' column as it contains no information.")

# Drop the 'seeded' column as it contains only zero values
teams = teams.drop(columns=['seeded'])
print("Dropped 'seeded' column as it contains only zero values.")

def calculate_playoff_score(row):
    """Map a team's playoff results to a single ordinal progression score.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by the keys 'finals', 'semis' and 'firstRound'
        (for example a DataFrame row passed by ``DataFrame.apply(axis=1)``).

    Returns
    -------
    int
        4 - won the championship (finals == 'W')
        3 - lost in the finals (finals == 'L')
        2 - lost in the semifinals (semis == 'L')
        1 - lost in the first round (firstRound == 'L')
        0 - none of the above (did not make the playoffs)
    """
    # Checked from the deepest round to the earliest; the first match wins.
    # Keys are read lazily, in this order, and only until a match is found.
    ordered_outcomes = (
        ('finals', 'W', 4),
        ('finals', 'L', 3),
        ('semis', 'L', 2),
        ('firstRound', 'L', 1),
    )
    for round_column, result_code, score in ordered_outcomes:
        if row[round_column] == result_code:
            return score
    return 0

# Apply the function to each row to create the playoff_progression_score
teams['playoff_progression_score'] = teams.apply(calculate_playoff_score, axis=1)

# Drop the original 'firstRound', 'semis', and 'finals' columns as they are now redundant
teams = teams.drop(columns=['firstRound', 'semis', 'finals'])

# Drop columns that don't add predictive value
# 'lgID': Contains only "WNBA" for every row, so it provides no additional information.
# 'franchID': Redundant identifier, as 'tmID' already identifies each team uniquely.
# 'confID': Lacks value without conference-specific qualification/matchup data.
# 'name': Purely descriptive and irrelevant to playoff predictions.
# 'arena': Also descriptive and does not impact playoff qualification.
teams = teams.drop(columns=['lgID', 'franchID', 'confID', 'name', 'arena'])
print("Dropped 'lgID', 'franchID', 'confID', 'name', and 'arena' as they are irrelevant for predictive modeling.")

# Detect and drop duplicates (fully identical rows only)
duplicates = teams[teams.duplicated()]
if not duplicates.empty:
    print("Duplicates detected. Removing duplicate rows.")
    teams = teams.drop_duplicates()
else:
    print("No duplicates found.")

# Display a sample of the dataframe to verify changes
display(teams.head())

# Store cleaned csv. to_csv does not create missing parent directories, so make sure
# the output folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs('./cleaned_data', exist_ok=True)
teams.to_csv('./cleaned_data/teams.csv', index=False)

Dropped redundant rebound columns and renamed others for clarity.
Dropped 'divID' column as it contains no information.
Dropped 'seeded' column as it contains only zero values.
Dropped 'lgID', 'franchID', 'confID', 'name', and 'arena' as they are irrelevant for predictive modeling.
No duplicates found.


Unnamed: 0,year,tmID,rank,playoff,o_fgm,o_fga,o_ftm,o_fta,o_3pm,o_3pa,...,GP,homeW,homeL,awayW,awayL,confW,confL,min,attend,playoff_progression_score
0,9,ATL,7,N,895,2258,542,725,202,598,...,34,1,16,3,14,2,18,6825,141379,0
1,10,ATL,2,Y,1089,2428,569,755,114,374,...,34,12,5,6,11,10,12,6950,120737,1
2,1,CHA,8,N,812,1903,431,577,131,386,...,32,5,11,3,13,5,16,6475,90963,0
3,2,CHA,4,Y,746,1780,410,528,153,428,...,32,11,5,7,9,15,6,6500,105525,3
4,3,CHA,2,Y,770,1790,490,663,211,527,...,32,11,5,7,9,12,9,6450,106670,1


In [3]:
# Load the teams_post data
teams_post = pd.read_csv("./data/teams_post.csv")

# Drop 'lgID' column as it contains only "WNBA" for every row
teams_post = teams_post.drop(columns=['lgID'])

# Detect and drop duplicates (fully identical rows only)
duplicates = teams_post[teams_post.duplicated()]
if not duplicates.empty:
    print("Duplicates detected. Removing duplicate rows.")
    teams_post = teams_post.drop_duplicates()
else:
    print("No duplicates found.")

# Display a sample of the dataframe to verify changes
display(teams_post.head())

# Store cleaned csv. to_csv does not create missing parent directories, so make sure
# the output folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs('./cleaned_data', exist_ok=True)
teams_post.to_csv('./cleaned_data/teams_post.csv', index=False)

No duplicates found.


Unnamed: 0,year,tmID,W,L
0,1,HOU,6,0
1,1,ORL,1,2
2,1,CLE,3,3
3,1,WAS,0,2
4,1,NYL,4,3


In [4]:
# Load the series_post data
series_post = pd.read_csv("./data/series_post.csv")

# Drop 'lgIDWinner' and 'lgIDLoser' columns as they contain only "WNBA" and add no value
series_post = series_post.drop(columns=['lgIDWinner', 'lgIDLoser'])

# Detect and drop duplicates (fully identical rows only)
duplicates = series_post[series_post.duplicated()]
if not duplicates.empty:
    print("Duplicates detected. Removing duplicate rows.")
    series_post = series_post.drop_duplicates()
else:
    print("No duplicates found.")

# Display a sample of the dataframe to verify changes
display(series_post.head())

# Store cleaned csv. to_csv does not create missing parent directories, so make sure
# the output folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs('./cleaned_data', exist_ok=True)
series_post.to_csv('./cleaned_data/series_post.csv', index=False)

No duplicates found.


Unnamed: 0,year,round,series,tmIDWinner,tmIDLoser,W,L
0,1,FR,A,CLE,ORL,2,1
1,1,FR,B,NYL,WAS,2,0
2,1,FR,C,LAS,PHO,2,0
3,1,FR,D,HOU,SAC,2,0
4,1,CF,E,HOU,LAS,2,0


In [5]:
# Load the coaches data
coaches = pd.read_csv("./data/coaches.csv")

# Drop 'lgID' as it is only "WNBA" and provides no unique value
coaches = coaches.drop(columns=['lgID'])

# Detect and drop duplicates (fully identical rows only)
duplicates = coaches[coaches.duplicated()]
if not duplicates.empty:
    print("Duplicates detected. Removing duplicate rows.")
    coaches = coaches.drop_duplicates()
else:
    print("No duplicates found.")

# Display a sample of the dataframe to verify changes
display(coaches.head())

# Store cleaned csv. to_csv does not create missing parent directories, so make sure
# the output folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs('./cleaned_data', exist_ok=True)
coaches.to_csv('./cleaned_data/coaches.csv', index=False)

No duplicates found.


Unnamed: 0,coachID,year,tmID,stint,won,lost,post_wins,post_losses
0,adamsmi01w,5,WAS,0,17,17,1,2
1,adubari99w,1,NYL,0,20,12,4,3
2,adubari99w,2,NYL,0,21,11,3,3
3,adubari99w,3,NYL,0,18,14,4,4
4,adubari99w,4,NYL,0,16,18,0,0


In [6]:
# Load the players_teams data
players_teams = pd.read_csv("./data/players_teams.csv")

# Drop 'lgID' as it contains only "WNBA" and provides no unique value
players_teams = players_teams.drop(columns=['lgID'])

# Detect and drop duplicates (fully identical rows only)
duplicates = players_teams[players_teams.duplicated()]
if not duplicates.empty:
    print("Duplicates detected. Removing duplicate rows.")
    players_teams = players_teams.drop_duplicates()
else:
    print("No duplicates found.")

# Display a sample of the dataframe to verify changes
display(players_teams.head())

# Store cleaned csv. to_csv does not create missing parent directories, so make sure
# the output folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs('./cleaned_data', exist_ok=True)
players_teams.to_csv('./cleaned_data/players_teams.csv', index=False)

No duplicates found.


Unnamed: 0,playerID,year,stint,tmID,GP,GS,minutes,points,oRebounds,dRebounds,...,PostBlocks,PostTurnovers,PostPF,PostfgAttempted,PostfgMade,PostftAttempted,PostftMade,PostthreeAttempted,PostthreeMade,PostDQ
0,abrossv01w,2,0,MIN,26,23,846,343,43,131,...,0,0,0,0,0,0,0,0,0,0
1,abrossv01w,3,0,MIN,27,27,805,314,45,101,...,0,0,0,0,0,0,0,0,0,0
2,abrossv01w,4,0,MIN,30,25,792,318,44,97,...,1,8,8,22,6,8,8,7,3,0
3,abrossv01w,5,0,MIN,22,11,462,146,17,57,...,2,3,7,23,8,4,2,8,2,0
4,abrossv01w,6,0,MIN,31,31,777,304,29,78,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Load the players data
players = pd.read_csv("./data/players.csv")

# Filter out rows in players that do not have corresponding playerIDs in players_teams.
# NOTE: relies on the `players_teams` dataframe built in the previous cell.
valid_player_ids = players_teams['playerID'].unique()
players = players[players['bioID'].isin(valid_player_ids)]

# Show that all values in firstseason and lastseason are 0
firstseason_all_zero = (players['firstseason'] == 0).all()
lastseason_all_zero = (players['lastseason'] == 0).all()

print("All values in 'firstseason' are 0:", firstseason_all_zero)
print("All values in 'lastseason' are 0:", lastseason_all_zero)

# Show the valid players with a registered death date
# ("0000-00-00" is the placeholder used when no date is recorded)
non_zero_death_dates = players[players['deathDate'] != "0000-00-00"]
display(non_zero_death_dates.head())

# Even though there is 1 registered Death Date, it really doesn't add anything. Birth Date is kept, for potential aging information.
players = players.drop(columns=['firstseason', 'lastseason', 'deathDate'])
print("Dropped 'firstseason', 'lastseason', and 'deathDate' columns as they contain only irrelevant values.")

# Detect and drop duplicates (fully identical rows only)
duplicates = players[players.duplicated()]
if not duplicates.empty:
    print("Duplicates detected. Removing duplicate rows.")
    players = players.drop_duplicates()
else:
    print("No duplicates found.")

# Display a sample of the dataframe to verify changes
display(players.head())

# Store cleaned csv. to_csv does not create missing parent directories, so make sure
# the output folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs('./cleaned_data', exist_ok=True)
players.to_csv('./cleaned_data/players.csv', index=False)

All values in 'firstseason' are 0: True
All values in 'lastseason' are 0: True


Unnamed: 0,bioID,pos,firstseason,lastseason,height,weight,college,collegeOther,birthDate,deathDate
225,dydekma01w,C,0,0,9.0,223,,,1974-04-28,2011-05-27


Dropped 'firstseason', 'lastseason', and 'deathDate' columns as they contain only irrelevant values.
No duplicates found.


Unnamed: 0,bioID,pos,height,weight,college,collegeOther,birthDate
1,abrossv01w,F,74.0,169,Connecticut,,1980-07-09
4,adamsjo01w,C,75.0,180,New Mexico,,1981-05-24
8,aguilel01w,G,67.0,165,George Washington,,1976-10-15
9,ajavoma01w,G,68.0,160,Rutgers,,1986-05-07
11,aldrima01w,G,71.0,153,UNC Charlotte,,1973-09-15


In [8]:
# Load the awards_players data
awards_players = pd.read_csv("./data/awards_players.csv")

# Drop 'lgID' column as it provides no unique value
awards_players = awards_players.drop(columns=['lgID'])

# Separate dataframes for player awards and coach awards.
# The mask is computed once. na=False makes a missing award name count as
# "not a coach award" instead of producing a NaN in the mask, which would break
# both the `~` negation and the boolean indexing below.
is_coach_award = awards_players['award'].str.contains("Coach", na=False)
player_awards = awards_players[~is_coach_award].copy()
coach_awards = awards_players[is_coach_award].copy()

# Standardize award names (names not listed here are kept unchanged)
award_name_mapping = {
    "Kim Perrot Sportsmanship": "Kim Perrot Sportsmanship Award",
    "Kim Perrot Sportsmanship Award": "Kim Perrot Sportsmanship Award",
    "All-Star Game Most Valuable Player": "All-Star Game MVP",
    "Most Valuable Player": "MVP",
    "WNBA Finals Most Valuable Player": "Finals MVP",
    "Sixth Woman of the Year": "6th Woman of the Year",
    "WNBA All-Decade Team": "All-Decade Team",
    "WNBA All Decade Team Honorable Mention": "All-Decade Team Honorable Mention"
}

# map() yields NaN for names missing from the mapping; fillna() restores the original name.
player_awards['award'] = player_awards['award'].map(award_name_mapping).fillna(player_awards['award'])
coach_awards['award'] = coach_awards['award'].map(award_name_mapping).fillna(coach_awards['award'])

# Detect and drop duplicates (fully identical rows only)
duplicates_player = player_awards[player_awards.duplicated()]
if not duplicates_player.empty:
    print("Duplicates detected in player awards. Removing duplicate rows.")
    player_awards = player_awards.drop_duplicates()
else:
    print("No duplicates found in player awards.")

duplicates_coach = coach_awards[coach_awards.duplicated()]
if not duplicates_coach.empty:
    print("Duplicates detected in coach awards. Removing duplicate rows.")
    coach_awards = coach_awards.drop_duplicates()
else:
    print("No duplicates found in coach awards.")

# Display samples of both dataframes to verify transformations
display(player_awards.head())
display(coach_awards.head())

# Store cleaned csvs. to_csv does not create missing parent directories, so make sure
# the output folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs('./cleaned_data', exist_ok=True)
player_awards.to_csv('./cleaned_data/player_awards.csv', index=False)
coach_awards.to_csv('./cleaned_data/coach_awards.csv', index=False)

No duplicates found in player awards.
No duplicates found in coach awards.


Unnamed: 0,playerID,award,year
0,thompti01w,All-Star Game MVP,1
1,leslili01w,All-Star Game MVP,2
2,leslili01w,All-Star Game MVP,3
3,teaslni01w,All-Star Game MVP,4
4,swoopsh01w,All-Star Game MVP,6


Unnamed: 0,playerID,award,year
8,coopemi01w,Coach of the Year,1
9,hugheda99w,Coach of the Year,2
10,stanlma99w,Coach of the Year,3
11,laimbbi01w,Coach of the Year,4
12,mcconsu01w,Coach of the Year,5
