# Cleaning Roster Dumps

Cleansing and concatenating current roster dumps

In [71]:
import pandas as pd
import numpy as np
import glob
import os
pd.set_option('display.max_rows', 500)


In [72]:
path = "../data/cbsDump"
all_files = glob.glob(os.path.join(path, "*.csv")) #make list of paths
team_names = [os.path.splitext(os.path.basename(file))[0] for file in all_files]

In [73]:
team_names

['tampaBayBadgers',
 'goBigOrGoHome',
 'kickersAndQBs',
 'clevelandSteamers',
 'trustTheProcess',
 'essendonBombers',
 'atkinsonRules',
 'beatsByRay',
 'gTechNick',
 'acworthEagles']

In [74]:
team_dfs = []

for n in range(0, len(team_names)):
    # Get full filepath
    file = all_files[n]
    team_name = team_names[n]
    # Getting the file name without extension
    file_name = os.path.splitext(os.path.basename(file))[0]
    
    # Reading the file content to create a DataFrame
    df = pd.read_csv(file, header=2)
    
    # Define columns to keep
    keepcols = ['Pos', 'Players', 'Salary']
    df = df[keepcols]
    
    # Storing team name
    df['roster'] = team_name
    
    # Dropping na rows
    df = df.dropna()
    
    # Storing dataframe to df dict
    team_dfs.append(df)

In [75]:
league = pd.concat(team_dfs, ignore_index=True)

In [76]:
league['player_split'] = league['Players'].apply(lambda x: x.split(' | '))

In [77]:
league['team'] = league['player_split'].apply(lambda x: x[-1].strip())

In [78]:
league['position'] = league['player_split'].apply(lambda x: x[0].split(' ')[-1].strip())

In [79]:
league['player'] = league['player_split'].apply(lambda x: (' ').join(x[0].split(' ')[:-1]).strip())
league['player'] = league['player'].apply(lambda x: x.replace(' III', ''))
league['player'] = league['player'].applye(lambda x: x.replace(' Jr', ''))

AttributeError: 'Series' object has no attribute 'applye'

In [80]:
league['salary'] = league['Salary']

In [81]:
keepcols = ['position', 'player', 'team', 'salary', 'roster']
league = league[keepcols]

In [82]:
league.head(200)

Unnamed: 0,position,player,team,salary,roster
0,QB,Kyler Murray,ARI,14.0,tampaBayBadgers
1,RB,Miles Sanders,PHI,14.0,tampaBayBadgers
2,RB,Jonathan Taylor,IND,12.0,tampaBayBadgers
3,WR,Chris Godwin,TB,7.0,tampaBayBadgers
4,WR,Diontae Johnson,PIT,7.0,tampaBayBadgers
5,TE,Noah Fant,DEN,7.0,tampaBayBadgers
6,RB,Travis Etienne,JAC,10.0,tampaBayBadgers
7,WR,Kenny Golladay,NYG,10.0,tampaBayBadgers
8,QB,Davis Mills,HOU,2.0,tampaBayBadgers
9,QB,Tua Tagovailoa,MIA,12.0,tampaBayBadgers


Join with player data to get player id and standardize team names

In [83]:
players = pd.read_csv('../data/cleaned/allPlayers2021.csv')

In [84]:
players.head()

Unnamed: 0.1,Unnamed: 0,player_id,player,position,team
0,4,5870,Daniel Jones,QB,NYG
1,6,1347,Chris Gragg,TE,
2,13,3396,Charone Peake,WR,
3,15,6343,Manny Wilkins,QB,
4,16,3199,Michael Thomas,WR,NO


In [85]:
new_df = pd.merge(league, players,  how='outer', left_on=['player', 'position'], right_on = ['player', 'position'])

In [86]:
new_df.head()

Unnamed: 0.1,position,player,team_x,salary,roster,Unnamed: 0,player_id,team_y
0,QB,Kyler Murray,ARI,14.0,tampaBayBadgers,3312.0,5849,ARI
1,RB,Miles Sanders,PHI,14.0,tampaBayBadgers,5088.0,6151,PHI
2,RB,Jonathan Taylor,IND,12.0,tampaBayBadgers,2836.0,6813,IND
3,WR,Chris Godwin,TB,7.0,tampaBayBadgers,2329.0,4037,TB
4,WR,Diontae Johnson,PIT,7.0,tampaBayBadgers,4078.0,5937,PIT


In [87]:
new_df['salary'] = new_df['salary'].astype(pd.Int32Dtype())
new_df['team'] = new_df['team_y']
new_df.head()

Unnamed: 0.1,position,player,team_x,salary,roster,Unnamed: 0,player_id,team_y,team
0,QB,Kyler Murray,ARI,14,tampaBayBadgers,3312.0,5849,ARI,ARI
1,RB,Miles Sanders,PHI,14,tampaBayBadgers,5088.0,6151,PHI,PHI
2,RB,Jonathan Taylor,IND,12,tampaBayBadgers,2836.0,6813,IND,IND
3,WR,Chris Godwin,TB,7,tampaBayBadgers,2329.0,4037,TB,TB
4,WR,Diontae Johnson,PIT,7,tampaBayBadgers,4078.0,5937,PIT,PIT


In [88]:
new_df['roster'].value_counts()

tampaBayBadgers      25
goBigOrGoHome        25
clevelandSteamers    24
essendonBombers      24
trustTheProcess      23
gTechNick            23
acworthEagles        22
atkinsonRules        20
beatsByRay           20
kickersAndQBs        19
Name: roster, dtype: int64

In [89]:
rosterDict = {
    'acworthEagles': 0,
    'atkinsonRules': 1,
    'beatsByRay': 2,
    'clevelandSteamers': 3,
    'essendonBombers': 4,
    'goBigOrGoHome': 5,
    'gTechNick': 6,
    'kickersAndQBs': 7,
    'tampaBayBadgers': 8,
    'trustTheProcess': 9,
    'none': 999
}

In [90]:
new_df['team'] = new_df.team.fillna(new_df.team_x)
new_df['roster'] = new_df['roster'].fillna('none')
new_df['roster_id'] = new_df['roster'].apply(lambda x: rosterDict[x])
keep_cols = ['player_id', 'player', 'position', 'team', 'salary', 'roster']
new_df = new_df[keep_cols]

In [91]:
new_df['salary'] = new_df['salary'].fillna(0)

In [92]:
len(new_df)

2807

In [93]:
new_df.tail(200)

Unnamed: 0,player_id,player,position,team,salary,roster
2607,1456,Tavarres King,WR,,0,none
2608,2922,Devin Mahina,TE,,0,none
2609,389,Matt Cassel,QB,,0,none
2610,4651,Corey Clement,RB,NYG,0,none
2611,6016,Dax Raymond,TE,PIT,0,none
2612,5278,Jarvion Franklin,RB,,0,none
2613,4641,Keeon Johnson,WR,,0,none
2614,2038,Jake Murphy,TE,,0,none
2615,6359,CJ Worton,WR,,0,none
2616,2048,Marion Grice,RB,,0,none


In [94]:
new_df.columns

Index(['player_id', 'player', 'position', 'team', 'salary', 'roster'], dtype='object')

In [95]:
new_df.to_csv('../data/cleaned/playersCurrentExcel.csv', index=False)