In [2]:
import os
import pandas as pd
import warnings
import numpy as np
from sklearn import metrics
warnings.filterwarnings('ignore')
from statsmodels.formula.api import ols
# for getting rid of outliers
from scipy.stats.mstats import winsorize
from sklearn.linear_model import LinearRegression
from scipy.stats import t
from scipy import stats
# duplicate of the winsorize import above (winsorize clips extreme values; it does not subset the data)
from scipy.stats.mstats import winsorize
# used to get train test split for model testing
from sklearn.model_selection import train_test_split
# for feature selection
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression
# for feature selection
from sklearn.feature_selection import RFECV

# Setup Data Frame

Each player's stats from the current week are the predictors, and that player's PPR fantasy points in the following week are the prediction target.

In [17]:
# helpers that load each week's CSV and pair it with the following week's PPR fantasy points

def make_week_df(df_curr,df_next):
    """Attach next week's PPR points to the current week's player rows.

    Parameters
    ----------
    df_curr : pandas.DataFrame
        Current-week stats; must contain a 'Player' column.
    df_next : pandas.DataFrame
        Following-week stats; must contain 'Player' and 'PPRFantasyPoints'.

    Returns
    -------
    pandas.DataFrame
        ``df_curr`` left-joined on 'Player' with an extra 'next_week_points'
        column. Players missing from ``df_next`` get NaN in that column.
        Because this is a plain left join, a 'Player' value that appears more
        than once in ``df_next`` yields one output row per match.
    """
    # Build the lookup as a new frame rather than renaming a column slice in
    # place, which is the pattern that triggers SettingWithCopyWarning.
    next_points = df_next[['Player','PPRFantasyPoints']].rename(
        columns={'PPRFantasyPoints':'next_week_points'}
    )
    return df_curr.merge(next_points,how='left',on='Player')

def get_week_df(year,this_week):
    """Load one week of stats and tag it with the following week's PPR points.

    Reads ``./data_v2/weekly/<year>/week<this_week>.csv`` together with the
    file for ``this_week + 1`` from the same season folder, then combines the
    two frames with make_week_df.
    """
    season_dir = os.path.join('./data_v2/weekly/',str(year))
    # Same file-name pattern for both weeks; the second one is simply week + 1.
    current_csv, following_csv = [
        os.path.join(season_dir,'week'+str(week)+'.csv')
        for week in (this_week,this_week+1)
    ]
    week_stats = pd.read_csv(current_csv)
    following_stats = pd.read_csv(following_csv)
    return make_week_df(week_stats,following_stats)

def make_final_df(start_year,end_year,start_week,end_week):
    """Stack the per-week frames for a span of seasons and weeks.

    Parameters
    ----------
    start_year, end_year : int
        Seasons to load; ``end_year`` is exclusive (``range`` semantics).
    start_week, end_week : int
        Weeks to load within each season; ``end_week`` is exclusive. Note that
        get_week_df also reads the file for ``week + 1``, so the CSV for
        ``end_week`` itself must exist.

    Returns
    -------
    pandas.DataFrame
        All weekly frames stacked in (year, week) order, each keeping its own
        row index. An empty DataFrame is returned when either range is empty.
    """
    # Collect first and concatenate once: DataFrame.append was removed in
    # pandas 2.0, and appending inside the loop re-copied the data each time.
    weekly_frames = [
        get_week_df(year_num,week_num)
        for year_num in range(start_year,end_year)
        for week_num in range(start_week,end_week)
    ]
    if not weekly_frames:
        # pd.concat raises on an empty list; keep the old empty-frame result.
        return pd.DataFrame()
    return pd.concat(weekly_frames)

def make_seperate_df(df):
    """Split the combined weekly frame into one frame per fantasy position.

    The player name, team, the non-PPR scoring columns and 'FL' are removed,
    every missing value is replaced with 0, and the rows are then separated by
    the 'Pos' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Combined frame containing 'Player', 'Tm', 'StandardFantasyPoints',
        'HalfPPRFantasyPoints', 'FL' and 'Pos'. It is not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_qb, df_rb, df_wr, df_te)`` - independent copies, so callers can
        add or drop columns on them without SettingWithCopyWarning.
    """
    # Work on a new frame: dropping with inplace=True altered the caller's
    # DataFrame and made the calling cell fail when run a second time.
    trimmed = df.drop(
        columns=['Player','Tm','StandardFantasyPoints','HalfPPRFantasyPoints','FL']
    )
    # A player with no row the following week has no next_week_points after
    # the left join; treat that (and any other gap) as 0.
    trimmed = trimmed.fillna(0)
    # One frame for each of the four main fantasy positions.
    df_qb = trimmed[trimmed.Pos == 'QB'].copy()
    df_rb = trimmed[trimmed.Pos == 'RB'].copy()
    df_wr = trimmed[trimmed.Pos == 'WR'].copy()
    df_te = trimmed[trimmed.Pos == 'TE'].copy()
    return df_qb,df_rb,df_wr,df_te

In [19]:
# Season/week window for the modelling frame. make_final_df iterates with
# range(), so both end values are exclusive: seasons 2010-2017, weeks 1-16
# (each week is paired with the week after it).
start_year = 2010
end_year = 2018
start_week = 1
end_week = 17
df = make_final_df(start_year,end_year,start_week,end_week)

# Split the combined frame into one frame per position: QB, RB, WR, TE.
df_qb,df_rb,df_wr,df_te = make_seperate_df(df)

# ----------------------------------------------------------------------------------------

# DFA QB

In [9]:
# Preview up to the first 50 quarterback rows before modelling.
df_qb.head(50)

Unnamed: 0,Pos,PassingYds,PassingTD,Int,PassingAtt,Cmp,RushingAtt,RushingYds,RushingTD,Rec,Tgt,ReceivingYds,ReceivingTD,PPRFantasyPoints,next_week_points
0,QB,154.0,2.0,0.0,17.0,13.0,7.0,30.0,0.0,0.0,0.0,0.0,0.0,15.16,-2.16
1,QB,170.0,3.0,0.0,21.0,16.0,7.0,10.0,0.0,0.0,0.0,0.0,0.0,19.8,2.82
2,QB,258.0,3.0,0.0,35.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.32,11.92
3,QB,433.0,3.0,0.0,57.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.32,21.9
4,QB,372.0,2.0,1.0,35.0,23.0,5.0,22.0,0.0,0.0,0.0,0.0,0.0,21.08,23.38
5,QB,170.0,2.0,1.0,23.0,18.0,2.0,4.0,1.0,0.0,0.0,0.0,0.0,19.2,15.42
7,QB,175.0,1.0,0.0,24.0,16.0,11.0,103.0,0.0,0.0,0.0,0.0,0.0,21.3,23.06
8,QB,237.0,1.0,0.0,36.0,27.0,4.0,-6.0,0.0,0.0,0.0,0.0,0.0,12.88,19.86
9,QB,298.0,2.0,0.0,39.0,22.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0,20.42,21.36
10,QB,345.0,2.0,1.0,50.0,34.0,4.0,9.0,0.0,0.0,0.0,0.0,0.0,20.7,6.58


In [10]:
# Remove the position label and the receiving columns from the QB frame.
# Rebinding df_qb (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run when the columns are already gone.
df_qb = df_qb.drop(columns = ['Pos','Rec','Tgt','ReceivingYds','ReceivingTD'],errors='ignore')

In [11]:
# Target: the points scored the following week. Features: every other column
# still present in the QB frame.
target = df_qb.loc[:, 'next_week_points']
features = df_qb.drop('next_week_points', axis=1)

### Making train_test_split

In [12]:
# Hold out 20% of the QB rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=9,
)



In [13]:
# Baseline model: ordinary least squares on all QB feature columns.
lr = LinearRegression().fit(X_train,y_train)
# NOTE(review): predictions and RMSE below are computed on the training rows;
# X_test / y_test are not scored in this cell.
pred = lr.predict(X_train)
rmse = np.sqrt(metrics.mean_squared_error(y_true=y_train,y_pred=pred))

In [14]:
# Display the baseline RMSE (computed on the training split).
rmse

8.463103269716074

# ----------------------------------------------------------------------------------------

# QB Feature Engineering
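Features that can be derived from the current data
* pass attempts / interceptions ratio
* passing touchdowns / interceptions ratio
* completion percentage
* whether the quarterback rushes
* whether last week's points were over a threshold x
* whether attempts were over a threshold x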

Features that need additional data
* opposing defense rank or yards allowed
* last year's points

### TD / Int Ratio

In [35]:
# Passing-TD to interception ratio, with the zero cases handled explicitly:
#   no interceptions  -> the passing-TD count itself
#   no passing TDs    -> minus the interception count
#   anything else     -> PassingTD / Int
# np.select uses the first condition that matches, so a row with neither
# touchdowns nor interceptions falls under the first rule.
conditions = [df_qb['Int'].eq(0), df_qb['PassingTD'].eq(0)]
actions = [df_qb['PassingTD'], -df_qb['Int']]
df_qb['td_int_ratio'] = np.select(
    conditions,
    actions,
    default=df_qb['PassingTD'].div(df_qb['Int']),
)

In [36]:
# Preview the QB frame with the engineered ratio column.
# NOTE(review): the saved output below still lists Pos/Rec/Tgt/ReceivingYds/
# ReceivingTD and an att_int_ratio column that no visible cell creates, so the
# notebook looks like it was run out of order - confirm with Restart & Run All.
df_qb.head(50)

Unnamed: 0,Pos,PassingYds,PassingTD,Int,PassingAtt,Cmp,RushingAtt,RushingYds,RushingTD,Rec,Tgt,ReceivingYds,ReceivingTD,PPRFantasyPoints,next_week_points,att_int_ratio,td_int_ratio
0,QB,154.0,2.0,0.0,17.0,13.0,7.0,30.0,0.0,0.0,0.0,0.0,0.0,15.16,-2.16,17.0,2.0
1,QB,170.0,3.0,0.0,21.0,16.0,7.0,10.0,0.0,0.0,0.0,0.0,0.0,19.8,2.82,21.0,3.0
2,QB,258.0,3.0,0.0,35.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.32,11.92,35.0,3.0
3,QB,433.0,3.0,0.0,57.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.32,21.9,57.0,3.0
4,QB,372.0,2.0,1.0,35.0,23.0,5.0,22.0,0.0,0.0,0.0,0.0,0.0,21.08,23.38,35.0,2.0
5,QB,170.0,2.0,1.0,23.0,18.0,2.0,4.0,1.0,0.0,0.0,0.0,0.0,19.2,15.42,23.0,2.0
7,QB,175.0,1.0,0.0,24.0,16.0,11.0,103.0,0.0,0.0,0.0,0.0,0.0,21.3,23.06,24.0,1.0
8,QB,237.0,1.0,0.0,36.0,27.0,4.0,-6.0,0.0,0.0,0.0,0.0,0.0,12.88,19.86,36.0,1.0
9,QB,298.0,2.0,0.0,39.0,22.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0,20.42,21.36,39.0,2.0
10,QB,345.0,2.0,1.0,50.0,34.0,4.0,9.0,0.0,0.0,0.0,0.0,0.0,20.7,6.58,50.0,2.0


# ----------------------------------------------------------------------------------------