# Packages

In [1]:
import pandas as pd
import numpy as np

# Data Exploration


# Add Features

### Features:
<p> Yards per play</p>
<p>Average Starting Yardline </p>
<p> Past Opponent ELO(from previous season)</p>
<p> % of average starting line the opponent was held to. </p>

### Features to explore

<em> Essentially, see how a specific team's defense has performed by checking whether the teams it played typically start at a higher or lower yard line than their average starting yard line.
If it is typically higher, that implies the team's defense is better at preventing yard gain. </em>
<p>% of starters present</p>
<p> Current win streak </p>

In [1]:
def _yards_per_play_by_game(play_df, side):
    """Return a Series, indexed by game_id, of yards per play for one offense.

    `side` is "home" or "away": the name of the play_df column that is
    compared against `offense` to select that team's offensive plays.
    """
    plays = play_df.loc[play_df["offense"] == play_df[side]]
    yards = plays.groupby("game_id")["yards_gained"]
    # sum() skips missing yardage while size() counts every play, which is the
    # same ratio as Series.sum() / len(plays) evaluated one game at a time.
    return yards.sum() / yards.size()


def _mean_start_by_game(drive_df, home_offense):
    """Return a Series, indexed by game_id, of the mean `start_yards_to_goal`.

    Only drives whose `is_home_offense` equals `home_offense` are included.
    """
    drives = drive_df.loc[drive_df["is_home_offense"] == home_offense]
    return drives.groupby("game_id")["start_yards_to_goal"].mean()


def _past_average(feature_df, team_col, as_home_col, as_away_col,
                  missing_value, skip_null=False):
    """Average a per-game value over each team's earlier games in the season.

    For every row, the team named in `team_col` is looked up in all rows of
    the same season with a strictly smaller `week`. `as_home_col` is read
    from the games where that team was the home team and `as_away_col` from
    the games where it was the away team.

    Returns a list (one entry per row of `feature_df`, in row order) holding
    the plain mean of the collected values, or `missing_value` when there is
    nothing to average. With `skip_null=True` null values are dropped first;
    otherwise a null among the values makes the mean NaN.
    """
    seasons = feature_df["season"]
    weeks = feature_df["week"]
    home_teams = feature_df["home_team"]
    away_teams = feature_df["away_team"]

    averages = []
    for team, season, week in zip(feature_df[team_col], seasons, weeks):
        # Restrict to the same season: comparing `week` alone would mix in
        # games from other seasons (including later ones).
        earlier = (seasons == season) & (weeks < week)
        values = (
            feature_df.loc[earlier & (home_teams == team), as_home_col].to_list()
            + feature_df.loc[earlier & (away_teams == team), as_away_col].to_list()
        )
        if skip_null:
            values = [value for value in values if not pd.isnull(value)]
        averages.append(sum(values) / len(values) if values else missing_value)
    return averages


def get_feature_df(play_df, ranking_df, game_df, drive_df, *,
                   betting_path="betting_df2017.csv", missing_value="NA"):
    """
    Creates the features to be modeled based on raw data from CFBD
    Args:
        play_df: one row per play; reads `game_id`, `offense`, `home`, `away`
            and `yards_gained`.
        ranking_df: one row per team rating; reads `team` and `elo`. The first
            non-null `elo` row for a team (in ranking_df order) is used.
        game_df: one row per game; must contain the id, week, season, team,
            points and line-score columns selected below.
        drive_df: one row per drive; reads `game_id`, `is_home_offense` and
            `start_yards_to_goal`.
        betting_path: CSV with an `id` column that is inner-joined onto the
            features (first row per `id` wins). Defaults to the original
            hard-coded file, resolved against the current working directory.
        missing_value: placeholder stored in the "past ..." and schedule
            strength columns when a team has no earlier game in the season.
            Defaults to the string "NA" (which leaves those columns with
            object dtype); pass `float("nan")` to keep them numeric.
    Returns: a Dataframe where each row is a game with new features based on past performance
        Only games that also appear in the betting file are returned. The
        input frames are not modified.
    """
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of game_df (avoids SettingWithCopyWarning).
    feature_df = game_df[["id", "week", "season", "season_type", "neutral_site",
                          "conference_game", "start_date", "home_id", "home_team",
                          "home_points", "home_line_scores", "away_id", "away_team",
                          "away_points", "away_line_scores"]].copy()
    game_ids = feature_df["id"]

    # Yards per play in this game; NaN when a game has no matching plays.
    feature_df["home_yards/play"] = game_ids.map(_yards_per_play_by_game(play_df, "home"))
    feature_df["away_yards/play"] = game_ids.map(_yards_per_play_by_game(play_df, "away"))

    # Game margin: positive means the home team won, negative the away team.
    feature_df["margin"] = feature_df["home_points"] - feature_df["away_points"]

    # NOTE(review): "earlier" games are ordered by `week` within a season. If
    # postseason rows reuse low week numbers they would be treated as early
    # season games -- confirm against the data (`start_date` may be safer).

    # Each team's average yards per play over its earlier games this season.
    feature_df["home_past_average_yards/play"] = _past_average(
        feature_df, "home_team", "home_yards/play", "away_yards/play", missing_value)
    feature_df["away_past_average_yards/play"] = _past_average(
        feature_df, "away_team", "home_yards/play", "away_yards/play", missing_value)

    # Average drive starting position (yards to goal) in this game.
    feature_df["away_avg_starting_yardline"] = game_ids.map(_mean_start_by_game(drive_df, False))
    feature_df["home_avg_starting_yardline"] = game_ids.map(_mean_start_by_game(drive_df, True))

    # Each team's average starting position over its earlier games this season.
    feature_df["away_past_starting_yard_line"] = _past_average(
        feature_df, "away_team", "home_avg_starting_yardline",
        "away_avg_starting_yardline", missing_value)
    feature_df["home_past_starting_yard_line"] = _past_average(
        feature_df, "home_team", "home_avg_starting_yardline",
        "away_avg_starting_yardline", missing_value)

    # ELO per team: first non-null rating row for that team in ranking_df.
    elo_by_team = (
        ranking_df.dropna(subset=["team", "elo"])
        .drop_duplicates(subset="team")
        .set_index("team")["elo"]
    )
    feature_df["home_elo"] = feature_df["home_team"].map(elo_by_team)
    feature_df["away_elo"] = feature_df["away_team"].map(elo_by_team)

    # Schedule strength: mean ELO of the opponents a team has already faced,
    # so the opponent's column is read (away_elo for the team's home games,
    # home_elo for its away games). Opponents without a rating are skipped.
    feature_df["away_schedule_strength"] = _past_average(
        feature_df, "away_team", "away_elo", "home_elo", missing_value, skip_null=True)
    feature_df["home_schedule_strength"] = _past_average(
        feature_df, "home_team", "away_elo", "home_elo", missing_value, skip_null=True)

    # Attach betting data; keep one betting row per game id.
    betting_df = pd.read_csv(betting_path)
    betting_df = betting_df.drop_duplicates(subset="id").reset_index(drop=True)
    return pd.merge(feature_df, betting_df, on="id", how="inner")
