In [31]:
import pandas as pd
import numpy as np
pd.options.display.float_format = None

<h1>Feature Engineering Notebook</h1>

The main purpose of this notebook is to create extra columns of data in our dataset to be trained on. Our plan is to create a column(s) for the current distribution of pitches thrown by that pitcher for that game.

In [75]:
# Read in the data
pitches = pd.read_parquet('catboost_data.parquet')
pitches.g_id = pitches.g_id.astype('int32')
print(f"Done reading in pitches, has size {pitches.shape}")

Done reading in pitches, has size (3514339, 17)


In [71]:
pitches.pitch_type.unique()
base_pitch_dict = {value: 0 for value in pitches.pitch_type.unique()}
print(base_pitch_dict)

{'FF': 0, 'CU': 0, 'FC': 0, 'SI': 0, 'CH': 0, 'FT': 0, 'IN': 0, 'SL': 0, 'FS': 0, 'KC': 0, None: 0, 'PO': 0, 'FO': 0, 'EP': 0, 'UN': 0, 'KN': 0, 'FA': 0, 'SC': 0, 'AB': 0}


In [73]:
# Get a specific pitcher to test the data on.
selected = pitches[pitches.full_name == pitches.full_name.sample(1).values[0]]

# Initialize a variable to keep track of the current game
current_game = None

# Function to update pitch frequencies and create the dictionary
def update_pitch_frequencies(row):
    global current_game
    
    # Check if the game has changed
    if row['g_id'] != current_game:
        for pitch in base_pitch_dict:
            base_pitch_dict[pitch] = 0  # Reset pitch counts for the new game
        current_game = row['g_id']
    
    pitch_type = row['pitch_type']
    if pitch_type in base_pitch_dict:
        base_pitch_dict[pitch_type] += 1
    else:
        base_pitch_dict[pitch_type] = 1
    return dict(base_pitch_dict)  # Return a copy to avoid reference issues

# Apply the function to create the dictionary column
selected['pitch_frequencies'] = selected.apply(update_pitch_frequencies, axis=1)


# Display the DataFrame
selected

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,full_name,b_score,b_count,s_count,current_outs,pitch_num,on_1b,on_2b,on_3b,batter_id,inning,p_score,p_throws,batter_stance,top,g_id,pitch_type,pitch_frequencies
143484,Ervin Santana,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,596019,1.0,0.0,Right,Left,True,201801586,FF,"{'FF': 1, 'CU': 0, 'FC': 0, 'SI': 0, 'CH': 0, ..."
143485,Ervin Santana,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,596019,1.0,0.0,Right,Left,True,201801586,CH,"{'FF': 1, 'CU': 0, 'FC': 0, 'SI': 0, 'CH': 1, ..."
143486,Ervin Santana,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,488726,1.0,0.0,Right,Left,True,201801586,FF,"{'FF': 2, 'CU': 0, 'FC': 0, 'SI': 0, 'CH': 1, ..."
143487,Ervin Santana,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,488726,1.0,0.0,Right,Left,True,201801586,SL,"{'FF': 2, 'CU': 0, 'FC': 0, 'SI': 0, 'CH': 1, ..."
143488,Ervin Santana,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,608070,1.0,0.0,Right,Left,True,201801586,SL,"{'FF': 2, 'CU': 0, 'FC': 0, 'SI': 0, 'CH': 1, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3502319,Ervin Santana,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,592518,9.0,2.0,Right,Right,False,201700667,FF,"{'FF': 32, 'CU': 0, 'FC': 0, 'SI': 0, 'CH': 18..."
3502320,Ervin Santana,0.0,0.0,2.0,1.0,3.0,0.0,0.0,0.0,592518,9.0,2.0,Right,Right,False,201700667,FF,"{'FF': 33, 'CU': 0, 'FC': 0, 'SI': 0, 'CH': 18..."
3502321,Ervin Santana,0.0,0.0,2.0,1.0,4.0,0.0,0.0,0.0,592518,9.0,2.0,Right,Right,False,201700667,SL,"{'FF': 33, 'CU': 0, 'FC': 0, 'SI': 0, 'CH': 18..."
3502322,Ervin Santana,0.0,0.0,2.0,1.0,5.0,0.0,0.0,0.0,592518,9.0,2.0,Right,Right,False,201700667,SL,"{'FF': 33, 'CU': 0, 'FC': 0, 'SI': 0, 'CH': 18..."


In [40]:
selected.pitch_type.unique()

array(['FF', 'FT', 'SL', 'FS', 'CU', None, 'CH', 'PO', 'IN'], dtype=object)

In [20]:
pitches.pitch_type.value_counts().index

Index(['FF', 'SL', 'FT', 'CH', 'SI', 'CU', 'FC', 'KC', 'FS', 'KN', 'IN', 'EP',
       'FO', 'PO', 'SC', 'UN', 'FA', 'AB'],
      dtype='object')

In [81]:
import pandas as pd

# Initialize a dictionary to store pitch frequencies for each pitcher
pitch_frequencies_by_pitcher = {}

def update_pitch_frequencies(group):
    pitcher_name = group['full_name'].iloc[0]
    if pitcher_name not in pitch_frequencies_by_pitcher:
        pitch_frequencies_by_pitcher[pitcher_name] = base_pitch_dict
    
    pitcher_pitch_dict = pitch_frequencies_by_pitcher[pitcher_name]
    
    # Check if the game has changed
    if group['g_id'].nunique() > 1:
        for pitch in pitcher_pitch_dict:
            pitcher_pitch_dict[pitch] = 0  # Reset pitch counts for the new game
    
    for _, row in group.iterrows():
        pitch_type = row['pitch_type']
        if pitch_type in pitcher_pitch_dict:
            pitcher_pitch_dict[pitch_type] += 1
        else:
            pitcher_pitch_dict[pitch_type] = 1
    
    return group

# Apply the function within each pitcher group to create the dictionary column
result = pitches.groupby('full_name').apply(update_pitch_frequencies).reset_index(drop=True)

# Display the resulting DataFrame
print(result)

# Display the DataFrame
print(pitches)


          full_name  b_score  b_count  s_count  current_outs  pitch_num  \
0        Jon Lester      0.0      0.0      0.0           0.0        1.0   
1        Jon Lester      0.0      0.0      1.0           0.0        2.0   
2        Jon Lester      0.0      0.0      2.0           0.0        3.0   
3        Jon Lester      0.0      0.0      2.0           0.0        4.0   
4        Jon Lester      0.0      1.0      2.0           0.0        5.0   
...             ...      ...      ...      ...           ...        ...   
3514334  Yoan Lopez      0.0      0.0      0.0           2.0        1.0   
3514335  Yoan Lopez      0.0      1.0      0.0           2.0        2.0   
3514336  Yoan Lopez      0.0      1.0      0.0           2.0        3.0   
3514337  Yoan Lopez      0.0      1.0      0.0           2.0        4.0   
3514338  Yoan Lopez      0.0      1.0      0.0           2.0        5.0   

         on_1b  on_2b  on_3b  batter_id  inning  p_score p_throws  \
0          0.0    0.0    0.0  

In [84]:
result.columns

Index(['full_name', 'b_score', 'b_count', 's_count', 'current_outs',
       'pitch_num', 'on_1b', 'on_2b', 'on_3b', 'batter_id', 'inning',
       'p_score', 'p_throws', 'batter_stance', 'top', 'g_id', 'pitch_type'],
      dtype='object')