In [142]:
import pandas as pd
import re

In [143]:
# footballdb.com pages scraped by this notebook.
# 'fantasy' is the page read for the main fantasy table; the remaining keys
# are the position labels that add_stats() looks up here.
# NOTE(review): the 'QB' URL carries no yr/type/limit parameters, unlike the
# 'RB' and 'WR/TE' URLs -- presumably it serves the site's default season;
# confirm it still lines up with the 2018 data requested by the other URLs.
url_dict = {
    'fantasy': 'https://www.footballdb.com/fantasy-football/index.html?pos=QB%2CRB%2CWR%2CTE&yr=2018&wk=all&rules=1',
    'QB': 'https://www.footballdb.com/stats/stats.html?mode=P&lg=NFL',
    'RB': 'https://www.footballdb.com/stats/stats.html?lg=NFL&yr=2018&type=reg&mode=R&conf=&limit=all',
    'WR/TE': 'https://www.footballdb.com/stats/stats.html?lg=NFL&yr=2018&type=reg&mode=C&conf=&limit=all',
}

In [234]:
def split_name(name):
    """Cut a raw player cell down to the leading full name.

    The patterns look for the point where the full name runs straight into
    an initial followed by a period (presumably the abbreviated name the
    site appends, e.g. 'Todd GurleyT. Gurley' -- confirm against the page).
    Everything from that initial onward is dropped. A 'Jr.' or
    roman-numeral suffix sitting directly before the initial is dropped too.

    Args:
    name -- (str) raw cell text

    Returns:
    (str) the leading full name; trailing whitespace may remain.
    When no pattern matches, the input is returned prefixed with a run of
    dashes so that unparsed names stand out.
    """
    # Raw strings: '\.' inside a plain literal is an invalid escape sequence.
    # The checks run in this order and the first pattern that matches wins.
    jr_match = re.search(r'(Jr\.)[A-Z]\.', name)
    if jr_match:
        return name[:jr_match.start()]

    lower_match = re.search(r'([a-z])[A-Z]\.', name)
    if lower_match:
        # Keep the lowercase letter (group 1): it is the end of the full name.
        return name[:lower_match.end(1)]

    roman_match = re.search(r'(IV?I?I?)[A-Z]\.', name)
    if roman_match:
        return name[:roman_match.start()]

    return '------------------' + name

def split_names(df, col_name):
    """Run split_name over every value of one column.

    Args:
    df -- (pd.DataFrame) frame holding the raw name column
    col_name -- (str) label of that column

    Returns:
    (pd.Series) the cleaned names with surrounding whitespace stripped,
    carrying the same index as ``df`` so it can be assigned back as a column.
    """
    # Series.map keeps df's index. Building a fresh Series from a list would
    # reset it to 0..n-1 and misalign on assignment for any other index.
    return df[col_name].map(split_name).str.strip()

def add_stats(main_df, position):
    """Adds more player stats to the main fantasy stats dataframe.

    Reads the first HTML table at ``url_dict[position]``, tags each row with
    ``position``, cleans the 'Player' column into 'player', and outer-joins
    the table onto ``main_df`` by 'player'. Columns that exist on both sides
    are coalesced back into a single column.

    NOTE: coalescing uses ``combine_first`` with the ``main_df`` side first,
    so for a player who is already populated, values from earlier calls
    (including the 'position' tag) win over the table added here.

    Args:
    main_df -- (pd.DataFrame) the dataframe you want to add stats to;
               needs a 'player' column to join on
    position -- (str) one of ['QB', 'RB', 'WR/TE']

    Returns:
    (pd.DataFrame) the merged frame; ``main_df`` itself is left untouched.

    Raises:
    KeyError -- if ``position`` is not a key of ``url_dict``
    """
    url = url_dict[position]
    df = pd.read_html(url, header=0)[0]
    df['position'] = position

    df['player'] = split_names(df, 'Player')
    del df['Player']

    # Suffixes are spelled out because the coalescing step below relies on them.
    merged = pd.merge(main_df, df, on='player', how='outer',
                      suffixes=('_x', '_y'))

    # Only true '<name>_x' / '<name>_y' pairs are coalesced. A substring test
    # for '_x' would also pick up columns that merely contain it and then
    # fail when looking up a '_y' partner that does not exist.
    all_columns = set(merged.columns)
    paired = []
    for left in [c for c in merged.columns if c.endswith('_x')]:
        base = left[:-2]
        right = base + '_y'
        if right not in all_columns:
            continue
        merged[base] = merged[left].combine_first(merged[right])
        paired.extend([left, right])

    return merged.drop(columns=paired)

def clean_dk_name(name):
    """Drop a trailing 'Jr.' from a name.

    When the last three characters are 'Jr.', the last four characters are
    removed (the suffix plus the single character before it); any other
    name is returned unchanged.

    Args:
    name -- (str) player name

    Returns:
    (str) the name without the trailing suffix
    """
    has_jr_suffix = name[-3:] == 'Jr.'
    return name[:-4] if has_jr_suffix else name
    
def clean_dk_names(df):
    """Run clean_dk_name over the 'Name' column.

    Args:
    df -- (pd.DataFrame) frame with a 'Name' column

    Returns:
    (pd.Series) the cleaned names, carrying the same index as ``df`` so the
    result can be assigned back as a column.
    """
    # Series.map keeps df's index. Building a fresh Series from a list would
    # reset it to 0..n-1 and misalign on assignment for any other index.
    return df['Name'].map(clean_dk_name)

In [235]:
## Read in fantasy data
url = url_dict['fantasy']
# header=1 takes the column labels from the table's second row; the final
# column of the table is dropped.
stats = pd.read_html(url, header=1)[0].iloc[:, :-1]

# Labels for every remaining column, in table order.
all_cols = ['player', 'bye_wk', 'fantasy_pts', 'pass_att', 'pass_cmp', 'pass_yds', 'pass_td', 'pass_int', 'pass_2pt',
            'rush_att', 'rush_yds', 'rush_td', 'rush_2pt', 'rec_rcp', 'rec_yds', 'rec_td', 'rec_2pt', 'fumbles_lost']
stats.columns = all_cols

# Columns kept from the fantasy table.
cols = ['player', 'bye_wk', 'fantasy_pts','pass_2pt', 'rush_2pt','rec_2pt', 'fumbles_lost']
# .copy() makes the subset an independent frame, so the column assignment
# below does not raise SettingWithCopyWarning.
stats = stats[cols].copy()

stats['player'] = split_names(stats, 'player')

In [236]:
## Add stats
# Fold in the per-position tables one at a time. The order is significant:
# add_stats keeps values that are already present over newly merged ones.
for position in ('QB', 'RB', 'WR/TE'):
    stats = add_stats(stats, position)

In [237]:
# Sanity check: (rows, columns) of the stats table after the add_stats calls.
stats.shape

(422, 29)

***

In [251]:
salaries = pd.read_csv('DKSalaries.csv')

# Remove a trailing roman-numeral suffix (" I", " II", " III", " IV") as a
# whole token. str.rstrip('I ') treats its argument as a set of characters:
# it cannot remove "IV" and would also eat a capital I that is part of the
# name itself.
salaries['Name'] = (salaries['Name']
                    .str.replace(r'\s+(?:I{1,3}|IV)\s*$', '', regex=True)
                    .str.rstrip())
salaries['Name'] = clean_dk_names(salaries)

# Inner join: only rows whose cleaned 'Name' equals a 'player' value in
# stats are kept.
df = pd.merge(salaries[['Name', 'Salary', 'Position', 'TeamAbbrev', 'AvgPointsPerGame']],
              stats,
              left_on='Name', right_on='player', how='inner')

In [252]:
# Preview the first rows only instead of rendering the whole merged frame.
df.head(10)

Unnamed: 0,Name,Salary,Position,TeamAbbrev,AvgPointsPerGame,player,bye_wk,fantasy_pts,pass_2pt,rush_2pt,...,YAC,Gms,Avg,YPG,FD,Team,Yds,TD,Lg,position
0,Todd Gurley,9400,RB,LAR,28.30,Todd Gurley,12.0,89.0,0.0,2.0,...,216.0,4.0,4.28,84.5,21.0,LA,338.0,4.0,23,RB
1,Antonio Brown,9100,WR,PIT,18.55,Antonio Brown,7.0,44.0,0.0,0.0,...,143.0,4.0,9.38,68.0,14.0,PIT,272.0,3.0,27t,WR/TE
2,Melvin Gordon,8600,RB,LAC,27.88,Melvin Gordon,8.0,78.0,0.0,1.0,...,203.0,4.0,5.11,69.0,13.0,LAC,276.0,2.0,34,RB
3,Julio Jones,8500,WR,ATL,21.58,Julio Jones,8.0,49.0,0.0,0.0,...,113.0,4.0,11.00,2.8,1.0,ATL,11.0,0.0,11,RB
4,Christian McCaffrey,8000,RB,CAR,23.27,Christian McCaffrey,4.0,40.0,0.0,0.0,...,151.0,3.0,5.89,90.3,11.0,CAR,271.0,0.0,45,RB
5,Odell Beckham,8000,WR,NYG,17.75,Odell Beckham,9.0,33.0,0.0,0.0,...,115.0,4.0,3.00,2.2,1.0,NYG,9.0,0.0,10,RB
6,Keenan Allen,7800,WR,LAC,15.05,Keenan Allen,,,,,...,107.0,4.0,11.75,70.5,13.0,LAC,282.0,1.0,25,WR/TE
7,Saquon Barkley,7700,RB,NYG,23.32,Saquon Barkley,9.0,60.0,0.0,0.0,...,230.0,4.0,4.64,65.0,13.0,NYG,260.0,3.0,68t,RB
8,Adam Thielen,7700,WR,MIN,27.82,Adam Thielen,10.0,58.0,0.0,0.0,...,149.0,4.0,11.82,118.2,25.0,MIN,473.0,2.0,45t,WR/TE
9,Davante Adams,7600,WR,GB,18.62,Davante Adams,7.0,45.0,0.0,0.0,...,124.0,4.0,10.18,71.2,13.0,GB,285.0,3.0,51,WR/TE


***

### Features to explore:
- total fantasy points
- average fantasy points per game
- TD
- Yds
- YPG
- QB:
    - Cmp
- RB: 
    - YPA
- WR/TE:
    - Rec
    - Tar

***

In [240]:
## Find what names in the DK data are not found in the stats data
# Build the lookup once: testing membership against stats.player.values
# inside the loop rescans the whole array for every DK name.
known_players = set(stats.player.dropna())
dk_names = salaries.Name
found = [dk for dk in dk_names if dk in known_players]
not_found = [dk for dk in dk_names if dk not in known_players]
len(found), len(not_found)

(297, 137)