In [1]:
import pandas as pd
import numpy as np
# Clear any custom float formatter so pandas renders floats with its built-in formatting
pd.set_option("display.float_format", None)

<h1>Feature Engineering Notebook</h1>

The main purpose of this notebook is to add extra feature columns to our dataset for model training. Our plan is to create one column per pitch type that holds the running count of pitches of that type thrown by the pitcher in the current game, i.e. the pitcher's current pitch distribution for that game.

In [3]:
# Load the pitch-level dataset from parquet
pitches = pd.read_parquet('catboost_data.parquet')

# Store the game identifier as a 32-bit integer
pitches['g_id'] = pitches['g_id'].astype('int32')

# Report the (rows, columns) shape as a quick sanity check
print(f"Done reading in pitches, has size {pitches.shape}")

Done reading in pitches, has size (3514339, 17)


In [25]:
# Build, for every pitch, a dictionary of how many pitches of each type the
# pitcher has thrown in the current game up to and including that pitch.
#
# NOTE(review): the count is incremented BEFORE the snapshot is taken, so each
# row's dictionary already contains that row's own pitch_type. If pitch_type is
# the prediction target, this feature leaks the label -- confirm whether the
# counts should be taken before the increment instead.

# Zeroed counter template with one key per pitch type present in the data
possible_pitch_values = {value: 0 for value in pitches.pitch_type.unique()}


def running_pitch_counts(game_ids, pitch_types, pitch_template):
    """Return one running pitch-count dictionary per pitch.

    Parameters
    ----------
    game_ids : iterable
        Game identifier of each pitch, in the order the pitches were thrown.
    pitch_types : iterable
        Pitch type of each pitch, aligned with ``game_ids``.
    pitch_template : iterable
        Pitch types that must appear as keys (starting at 0) in every
        returned dictionary; its iteration order fixes the key order.

    Returns
    -------
    list of dict
        For each pitch, an independent ``{pitch_type: count}`` snapshot of the
        counts accumulated since the game id last changed, including the
        pitch itself.
    """
    counts = dict.fromkeys(pitch_template, 0)
    current_game = None
    snapshots = []
    for game_id, pitch_type in zip(game_ids, pitch_types):
        if game_id != current_game:
            # New game: zero every counter but keep all keys seen so far
            counts = dict.fromkeys(counts, 0)
            current_game = game_id
        counts[pitch_type] = counts.get(pitch_type, 0) + 1
        # Copy so later increments do not alter earlier rows' dictionaries
        snapshots.append(dict(counts))
    return snapshots


# Process each pitcher separately. sort=False keeps pitchers in order of first
# appearance and rows in their original order within each pitcher.
pitcher_frames = []
for pitcher_name, selected in pitches.groupby('full_name', sort=False):
    frequencies = running_pitch_counts(
        selected['g_id'].tolist(),
        selected['pitch_type'].tolist(),
        possible_pitch_values,
    )
    pitcher_frames.append(
        selected.assign(
            pitch_frequencies=pd.Series(frequencies, index=selected.index, dtype=object)
        )
    )

# Concatenate once (DataFrame.append was removed in pandas 2.0 and re-copied
# the accumulated frame on every iteration) and renumber the rows from 0.
if pitcher_frames:
    result_df = pd.concat(pitcher_frames, ignore_index=True)
else:
    # No pitchers at all: keep the input schema plus an empty feature column
    result_df = pitches.iloc[0:0].assign(pitch_frequencies=pd.Series(dtype=object))

# Display the resulting DataFrame
display(result_df)


Unnamed: 0,full_name,b_score,b_count,s_count,current_outs,pitch_num,on_1b,on_2b,on_3b,batter_id,inning,p_score,p_throws,batter_stance,top,g_id,pitch_type,pitch_frequencies
0,Jon Lester,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,572761,1.0,0.0,Left,Left,True,201500001,FF,"{'FF': 1, 'CU': 0, 'FC': 0, 'SI': 0, 'CH': 0, ..."
1,Jon Lester,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,572761,1.0,0.0,Left,Left,True,201500001,FF,"{'FF': 2, 'CU': 0, 'FC': 0, 'SI': 0, 'CH': 0, ..."
2,Jon Lester,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,572761,1.0,0.0,Left,Left,True,201500001,FF,"{'FF': 3, 'CU': 0, 'FC': 0, 'SI': 0, 'CH': 0, ..."
3,Jon Lester,0.0,0.0,2.0,0.0,4.0,0.0,0.0,0.0,572761,1.0,0.0,Left,Left,True,201500001,FF,"{'FF': 4, 'CU': 0, 'FC': 0, 'SI': 0, 'CH': 0, ..."
4,Jon Lester,0.0,1.0,2.0,0.0,5.0,0.0,0.0,0.0,572761,1.0,0.0,Left,Left,True,201500001,CU,"{'FF': 4, 'CU': 1, 'FC': 0, 'SI': 0, 'CH': 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3514334,Chase d'Arnaud,11.0,1.0,0.0,1.0,2.0,1.0,0.0,0.0,605512,8.0,4.0,Right,Left,False,201801865,CU,"{'FF': 0, 'CU': 11, 'FC': 0, 'SI': 0, 'CH': 1,..."
3514335,Chase d'Arnaud,11.0,1.0,1.0,1.0,3.0,1.0,0.0,0.0,605512,8.0,4.0,Right,Left,False,201801865,CU,"{'FF': 0, 'CU': 12, 'FC': 0, 'SI': 0, 'CH': 1,..."
3514336,Chase d'Arnaud,11.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,599096,8.0,4.0,Right,Right,False,201801865,CU,"{'FF': 0, 'CU': 13, 'FC': 0, 'SI': 0, 'CH': 1,..."
3514337,Chase d'Arnaud,11.0,0.0,1.0,2.0,2.0,1.0,0.0,0.0,599096,8.0,4.0,Right,Right,False,201801865,CU,"{'FF': 0, 'CU': 14, 'FC': 0, 'SI': 0, 'CH': 1,..."


In [26]:
# Expand the per-row pitch-count dictionaries into one column per dictionary key.
# Constructing a DataFrame from the list of dicts produces a column for every key.
new_df = pd.DataFrame(list(result_df['pitch_frequencies']))

# Place the expanded count columns beside the original columns,
# leaving out the dictionary column they were derived from
result_df_test = pd.concat(
    [result_df.drop(columns='pitch_frequencies'), new_df],
    axis='columns',
)

# Show the widened frame
print(result_df_test)

              full_name  b_score  b_count  s_count  current_outs  pitch_num  \
0            Jon Lester      0.0      0.0      0.0           0.0        1.0   
1            Jon Lester      0.0      0.0      1.0           0.0        2.0   
2            Jon Lester      0.0      0.0      2.0           0.0        3.0   
3            Jon Lester      0.0      0.0      2.0           0.0        4.0   
4            Jon Lester      0.0      1.0      2.0           0.0        5.0   
...                 ...      ...      ...      ...           ...        ...   
3514334  Chase d'Arnaud     11.0      1.0      0.0           1.0        2.0   
3514335  Chase d'Arnaud     11.0      1.0      1.0           1.0        3.0   
3514336  Chase d'Arnaud     11.0      0.0      0.0           2.0        1.0   
3514337  Chase d'Arnaud     11.0      0.0      1.0           2.0        2.0   
3514338  Chase d'Arnaud     11.0      1.0      1.0           2.0        3.0   

         on_1b  on_2b  on_3b  batter_id  ...  KC  N

In [41]:
# Save result_df_test as a parquet file.
# The column labelled None is first renamed to 'Unknown'
# (presumably the counts for rows with a missing pitch_type -- confirm).
result_df_test.rename({None: 'Unknown'}, axis='columns', inplace=True)
result_df_test.to_parquet('pitch_dist_by_game.parquet')