In [2]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.simplefilter("ignore", RuntimeWarning)

In [3]:
pitch_types = ['Changeup',
 'Four-Seam Fastball',
 'Slider',
 'Sinker',
 'Knuckle Curve',
 'Cutter',
 'Curveball',
 'Intentional Ball',
 'Sweeper',
 'Splitter',
 'Pitchout',
 'Automatic Ball',
 'Forkball',
 'Fastball',
 'Slurve',
 'Knuckle Ball',
 'Eephus',
 'Slow Curve',
 'Screwball',]

In [4]:
pitch_categories = {
    "True Fastballs": ["Four-Seam Fastball", "Fastball"],
    "Movement Fastballs": ["Sinker", "Cutter"],
    "Breaking Balls": ["Slider", "Curveball", "Knuckle Curve", "Sweeper", "Slurve", "Slow Curve"],
    "Offspeed": ["Changeup", "Splitter", "Forkball", "Screwball"],
    "Non-Competitive Pitches": ["Intentional Ball", "Pitchout", "Automatic Ball", "Eephus"]
}

In [5]:
csv_files = ['2010.csv', '2011.csv','2012.csv', '2013.csv','2014.csv', '2015.csv',
 '2016.csv',
 '2017.csv',
 '2018.csv','2019.csv','2020.csv',
 '2021.csv', '2022.csv', '2023.csv', '2024.csv']

In [6]:
# Replace 'your_directory_path' with the path to your folder
folder_path = 'Players/pitchers'

# List all CSV files in the folder
players = [file for file in os.listdir(folder_path)]

In [7]:
players.remove('.DS_Store')

In [15]:
dict_final = {'Pitches_faced': [], 'Atbats_faced':[], 'Pitch_count': [], 'Other': [], 'Fastball': [], 'Movement Fastball': [], 'Breaking': [], 'Offspeed': [],'Non-Competitive':[], 'Actual_pitch': [], 'Outs': [], 'location': [], 'spinRate_rolling':[], 'breakLength_rolling':[], 'Start_speed_rolling':[], 'breakVerticalInduced_rolling':[], 'breakHorizontal_rolling':[], 'breakVertical_rolling': [],'Outcome': [],'Balls': [], 'Strikes': [],'Event': [], 'Pitch': [], 'Previous_Pitches': [], 'Start_speed': [],
              'breakAngle': [], 'breakLength': [], 'breakVertical': [], 'breakHorizontal': [],
              'breakVerticalInduced': [], 'spinRate': [], 'batter_name': [], 'batter_side': [],
              'batter_split': [], 'pitcher_id': [], 'pitcher_name': [], 'pitcher_hand': [],
              'pitcher_split': [], 'Date': [], 'X': [], 'Y': [], 'isTopInning': [], 'Outs': [], 'Inning': [], 'awayScore': [], 'homeScore': [], 'strikeZoneTop': [], 'strikeZoneBottom': [], 'IsOut': [], 'AVG': [], 'OBP': [], 'SLG': []}
pitch_categories_reversed = {
    "Four-Seam Fastball": "True Fastballs",
    "Fastball": "True Fastballs",
    "Sinker": "Movement Fastballs",
    "Cutter": "Movement Fastballs",
    "Slider": "Breaking Balls",
    "Curveball": "Breaking Balls",
    "Knuckle Curve": "Breaking Balls",
    "Sweeper": "Breaking Balls",
    "Slurve": "Breaking Balls",
    "Slow Curve": "Breaking Balls",
    "Changeup": "Offspeed",
    "Splitter": "Offspeed",
    "Forkball": "Offspeed",
    "Screwball": "Offspeed",
    "Intentional Ball": "Non-Competitive Pitches",
    "Pitchout": "Non-Competitive Pitches",
    "Automatic Ball": "Non-Competitive Pitches",
    "Eephus": "Non-Competitive Pitches"
}

dict_batters = {}


dict_final_full = dict_final.copy()  # Copy structure

roll_pitch_list = ['Fastball', 'Movement Fastball', 'Breaking', 'Offspeed', 'Non-Competitive']

roll_pitch_rolling = []

for pitcher in players:
    final_df = pd.DataFrame()
    
    # Replace 'your_directory_path' with the path to your folder
    folder_path = f'Players/pitchers/{pitcher}'

    # List all CSV files in the folder
    csv_files = [file for file in os.listdir(folder_path)]

    for csv in csv_files:
        year = pd.read_csv(f'{folder_path}/{csv}')
        final_df = pd.concat([final_df, year])
        
    # final_df['Pitch_Cat'] = final_df['Pitch'].apply(lambda x: pitch_categories_reversed[x])
        
    try:
        final_df['X']
    except:
        print(pitcher)
        continue

    # Handle NaN values
    final_df.fillna(method='ffill', inplace=True)
    
    def bin_pitch_types(df):
        """Bins pitch types into categories."""
        pitch_map = {
            'Fastball': ['Four-Seam Fastball', 'Fastball'],
            'Movement Fastball': ['Sinker', 'Cutter'],
            'Breaking': ['Slider', 'Curveball', 'Sweeper', 'Slurve', 'Knuckle Curve'],
            'Offspeed': ['Changeup', 'Split-Finger', 'Forkball', 'Screwball'],
            'Non-Competitive': ['Eephus', 'Pitch Out', 'Intent Ball']
        }
        reverse_map = {pitch: category for category, pitches in pitch_map.items() for pitch in pitches}
        df['Pitch_Category'] = df['Pitch'].map(reverse_map).fillna('Other')
        return df

    # Compute rolling means
    def calculate_rolling_averages(df, roll_pitch_rolling, roll_pitch_list, cols, dict_final, windows=[50, 100, 200, 300]):
        """Computes rolling averages for specified columns."""

        for col in cols:
            df[f'{col}_rolling'] = (
                df.groupby('Pitch_Category')[col]
                .transform(lambda x: x.rolling(window=50, min_periods=30).mean().shift(1))
            )

        # Reset the index to ensure no duplicate index issues
        df = df.reset_index(drop=True)

        unique_pitches = df['Pitch_Category'].unique()

        for col in cols:
            for pitch in roll_pitch_list:
                col_name = f'rolling_{col}_{pitch}'

                if col_name not in dict_final:
                    dict_final[col_name] = []
                    roll_pitch_rolling.append(col_name)

                if pitch not in unique_pitches:
                    df[col_name] = 0
                    continue

                mask = df['Pitch_Category'] == pitch
                df[col_name] = np.nan
                df.loc[mask, col_name] = (
                    df.loc[mask]
                    .groupby('pitcher_id')[col]
                    .rolling(window=50, min_periods=30)
                    .mean()
                    .reset_index(level=0, drop=True)
                    .shift(1)
                )

        # ✅ Forward fill all rolling stats *after* creation
        for col in roll_pitch_rolling:
            df[col] = df.groupby(['pitcher_id', 'Pitch_Category'])[col].ffill()

        return df, dict_final, roll_pitch_rolling

    
    final_df = bin_pitch_types(final_df)
    final_df, dict_final, roll_pitch_rolling = calculate_rolling_averages(final_df, roll_pitch_rolling, roll_pitch_list, ['spinRate', 'breakLength', 'Start_speed', 'breakVerticalInduced', 'breakHorizontal', 'breakVertical'], dict_final)

    final_df['Pre_Outs'] = final_df.apply(lambda x: x['Outs'] - 1 if x['IsOut'] == True else x['Outs'], axis=1)

    count_strike, count_ball = 0, 0
    pitches = ""
    
    pitch_count = 0
    
    track_matchups = {}
    
    track_pitches_matchup = {}
    
    matchup_true = {}
    
    date = ''
    
    total_pitches = 0

    track_pitches = {'Fastball': 0, 'Movement Fastball': 0, 'Breaking': 0, 'Offspeed': 0,'Non-Competitive':0, 'Other':0}

    for _, i in final_df.iterrows():
        
        for pitch_roll in roll_pitch_rolling:
            
            dict_final[pitch_roll].append(i[pitch_roll])
        
        if i['Date'] != date:
        
            pitch_count = 0
            
            date = i['Date']
        
        if i['pitcher_id'] not in matchup_true:
        
            matchup_true[i['pitcher_id']] = []
        
        if i['batter_name'] not in matchup_true[i['pitcher_id']]:
        
            matchup_true[i['pitcher_id']].append(i['batter_name'])

            track_pitches_matchup[f"{i['pitcher_id']}_vs_{i['batter_name']}"] = 1
            
            track_matchups[f"{i['pitcher_id']}_vs_{i['batter_name']}"] = 0

        else:

            track_pitches_matchup[f"{i['pitcher_id']}_vs_{i['batter_name']}"] += 1
        
        
        if i['batter_name'] not in dict_batters:
            
            dict_batters[i['batter_name']] = {'Home Run': [], 'Triple': [], 'Double': [],'Single':[], 'Walk': [], 'Hit By Pitch': [], 'HIT': [], 'AB': [], 'Sac Fly': [], 'Sac Bunt': []}
    
            
        if ('Foul' in i['Description']) or ('Missed Bunt' in i['Description']) or ('Pitchout' in i['Description']) or ('Strike' in i['Description']) or ('Swinging' in i['Description']) or ('Ball' in i['Description']):
            
            pitch_count+=1
            
            dict_final['Outcome'].append(i['Description'])
            dict_final['Balls'].append(count_ball)
            dict_final['Strikes'].append(count_strike)
            dict_final['Pitch'].append(i['Pitch_Category'])
            dict_final['Start_speed'].append(i['Start_speed'])
            dict_final['breakAngle'].append(i['breakAngle'])
            dict_final['breakLength'].append(i['breakLength'])
            dict_final['breakVertical'].append(i['breakVertical'])
            dict_final['breakHorizontal'].append(i['breakHorizontal'])
            dict_final['breakVerticalInduced'].append(i['breakVerticalInduced'])
            dict_final['spinRate'].append(i['spinRate'])
            dict_final['batter_name'].append(i['batter_name'])
            dict_final['batter_side'].append(i['batter_side'])
            dict_final['batter_split'].append(i['batter_split'])
            dict_final['pitcher_id'].append(i['pitcher_id'])
            dict_final['pitcher_name'].append(i['pitcher_name'])
            dict_final['pitcher_hand'].append(i['pitcher_hand'])
            dict_final['pitcher_split'].append(i['pitcher_split'])
            dict_final['Date'].append(i['Date'])
            dict_final['X'].append(i['X'])
            dict_final['Y'].append(i['Y'])
            dict_final['isTopInning'].append(i['isTopInning'])
            dict_final['Inning'].append(i['Inning'])
            dict_final['awayScore'].append(i['awayScore'])
            dict_final['homeScore'].append(i['homeScore'])
            dict_final['strikeZoneTop'].append(i['strikeZoneTop'])
            dict_final['strikeZoneBottom'].append(i['strikeZoneBottom'])
            dict_final['IsOut'].append(i['IsOut'])
            dict_final['Previous_Pitches'].append(pitches)  # Fixed append to dict_final
            dict_final['spinRate_rolling'].append(i['spinRate_rolling'])
            dict_final['breakLength_rolling'].append(i['breakLength_rolling'])
            dict_final['Start_speed_rolling'].append(i['Start_speed_rolling'])
            dict_final['breakVerticalInduced_rolling'].append(i['breakVerticalInduced_rolling'])
            dict_final['breakHorizontal_rolling'].append(i['breakHorizontal_rolling'])
            dict_final['breakVertical_rolling'].append(i['breakVertical_rolling'])
            dict_final['location'].append(i['location'])
            dict_final['Outs'].append(i['Pre_Outs'])
            dict_final['Actual_pitch'].append(i['Pitch'])
            dict_final['Pitch_count'].append(pitch_count)
            dict_final['Pitches_faced'].append(track_pitches_matchup[f"{i['pitcher_id']}_vs_{i['batter_name']}"])
            dict_final['Atbats_faced'].append(track_matchups[f"{i['pitcher_id']}_vs_{i['batter_name']}"])
            
            pitch_cats = ['Fastball', 'Movement Fastball', 'Breaking', 'Offspeed', 'Non-Competitive', 'Other']
            
            pit = i['Pitch_Category']
            
            total_pitches += 1

                
            if i['Description'] == 'Pitchout':
                dict_final['AVG'].append(avg)
                dict_final['OBP'].append(obp)
                dict_final['SLG'].append(slg)
                dict_final['Event'].append('Nothing')
                
                track_pitches[pit] += 1
            
                for pitch in pitch_cats:

                    dict_final[pitch].append(track_pitches[pitch])
                continue

            if 'Foul' in i['Description'] and count_strike <= 1:
                count_strike += 1
                pitches += f"{i['Pitch_Category']},"
                dict_final['Event'].append('Nothing')
                
                dict_final['AVG'].append(avg)
                dict_final['OBP'].append(obp)
                dict_final['SLG'].append(slg)
                
                track_pitches[pit] += 1
                

            
                for pitch in pitch_cats:

                    dict_final[pitch].append(track_pitches[pitch])

            elif ('Strike' in i['Description']) or ('Swinging' in i['Description']) or ('Missed Bunt' in i['Description']):

                count_strike += 1
                pitches += f"{i['Pitch_Category']},"

                
                if count_strike == 3:
                    
                    dict_final['Event'].append(i['Event'])
                    
                    count_strike, count_ball, pitches = 0, 0, ""
                    
                    track_pitches[pit] += 1
            
                    for pitch in pitch_cats:

                        dict_final[pitch].append(track_pitches[pitch])
                        
                        track_pitches[pit] = 0
                    
                    if len(dict_batters[i['batter_name']]['HIT']) < 20:
            
                        for j in ['Home Run', 'Triple','Walk', 'Double', 'Single', 'Hit By Pitch', 'HIT', 'Sac Fly', 'Sac Bunt']:

                            dict_batters[i['batter_name']][j].append(0)
                        
                        dict_batters[i['batter_name']]['AB'].append(1)

                        dict_final['AVG'].append(np.nan)
                        dict_final['OBP'].append(np.nan)
                        dict_final['SLG'].append(np.nan)


                    else:

                        avg = np.sum(dict_batters[i['batter_name']]['HIT']) / np.sum(dict_batters[i['batter_name']]['AB'])
                        obp = (
                            np.sum(dict_batters[i['batter_name']]['HIT'] +
                                   dict_batters[i['batter_name']]['Walk'] +
                                   dict_batters[i['batter_name']]['Hit By Pitch'])
                            / np.sum(dict_batters[i['batter_name']]['AB'] +
                                     dict_batters[i['batter_name']]['Walk'] +
                                     dict_batters[i['batter_name']]['Hit By Pitch'] +
                                     dict_batters[i['batter_name']]['Sac Fly'])
                        )
                        slg = (
                            np.sum(dict_batters[i['batter_name']]['Single'] +
                                   dict_batters[i['batter_name']]['Double'] * 2 +
                                   dict_batters[i['batter_name']]['Triple'] * 3 +
                                   dict_batters[i['batter_name']]['Home Run'] * 4)
                            / np.sum(dict_batters[i['batter_name']]['AB'])
                        )

                        dict_final['AVG'].append(avg)
                        dict_final['OBP'].append(obp)
                        dict_final['SLG'].append(slg)


                        for j in ['Home Run', 'Triple','Walk', 'Double', 'Single', 'Hit By Pitch', 'HIT', 'Sac Fly', 'Sac Bunt']:
                            dict_batters[i['batter_name']][j].append(0)
                            dict_batters[i['batter_name']][j].pop(0)

                        dict_batters[i['batter_name']]['AB'].append(1)
                        dict_batters[i['batter_name']]['AB'].pop(0)

                    
                else:
                    
                    avg = np.sum(dict_batters[i['batter_name']]['HIT']) / np.sum(dict_batters[i['batter_name']]['AB'])
                    
                    obp = (
                            np.sum(dict_batters[i['batter_name']]['HIT'] +
                                   dict_batters[i['batter_name']]['Walk'] +
                                   dict_batters[i['batter_name']]['Hit By Pitch'])
                            / np.sum(dict_batters[i['batter_name']]['AB'] +
                                     dict_batters[i['batter_name']]['Walk'] +
                                     dict_batters[i['batter_name']]['Hit By Pitch'] +
                                     dict_batters[i['batter_name']]['Sac Fly'])
                        )
                    slg = (
                        np.sum(dict_batters[i['batter_name']]['Single'] +
                               dict_batters[i['batter_name']]['Double'] * 2 +
                               dict_batters[i['batter_name']]['Triple'] * 3 +
                               dict_batters[i['batter_name']]['Home Run'] * 4)
                        / np.sum(dict_batters[i['batter_name']]['AB'])
                    )
                    
                    dict_final['Event'].append('Nothing')
                    dict_final['AVG'].append(avg)
                    dict_final['OBP'].append(obp)
                    dict_final['SLG'].append(slg)
                    
                    track_pitches[pit] += 1
            
                    for pitch in pitch_cats:

                        dict_final[pitch].append(track_pitches[pitch])


            elif ('Ball' in i['Description']) or ('Pitchout' == i['Description']):
                count_ball += 1
                pitches += f"{i['Pitch_Category']},"
                
                if count_ball == 4:
                    
                    dict_final['Event'].append(i['Event'])
                    
                    count_strike, count_ball, pitches = 0, 0, ""
                    
                    track_pitches[pit] += 1
            
                    for pitch in pitch_cats:

                        dict_final[pitch].append(track_pitches[pitch])
                        
                        track_pitches[pit] = 0
                    
                    if len(dict_batters[i['batter_name']]['HIT']) < 20:
            
                        for j in ['Home Run', 'Triple','AB', 'Double', 'Single', 'Hit By Pitch', 'HIT', 'Sac Fly', 'Sac Bunt']:

                            dict_batters[i['batter_name']][j].append(0)
                        
                        dict_batters[i['batter_name']]['Walk'].append(1)

                        dict_final['AVG'].append(np.nan)
                        dict_final['OBP'].append(np.nan)
                        dict_final['SLG'].append(np.nan)


                    else:

                        avg = np.sum(dict_batters[i['batter_name']]['HIT']) / np.sum(dict_batters[i['batter_name']]['AB'])
                        obp = (
                            np.sum(dict_batters[i['batter_name']]['HIT'] +
                                   dict_batters[i['batter_name']]['Walk'] +
                                   dict_batters[i['batter_name']]['Hit By Pitch'])
                            / np.sum(dict_batters[i['batter_name']]['AB'] +
                                     dict_batters[i['batter_name']]['Walk'] +
                                     dict_batters[i['batter_name']]['Hit By Pitch'] +
                                     dict_batters[i['batter_name']]['Sac Fly'])
                        )
                        slg = (
                            np.sum(dict_batters[i['batter_name']]['Single'] +
                                   dict_batters[i['batter_name']]['Double'] * 2 +
                                   dict_batters[i['batter_name']]['Triple'] * 3 +
                                   dict_batters[i['batter_name']]['Home Run'] * 4)
                            / np.sum(dict_batters[i['batter_name']]['AB'])
                        )

                        dict_final['AVG'].append(avg)
                        dict_final['OBP'].append(obp)
                        dict_final['SLG'].append(slg)


                        for j in ['Home Run', 'Triple','AB', 'Double', 'Single', 'Hit By Pitch', 'HIT', 'Sac Fly', 'Sac Bunt']:
                            dict_batters[i['batter_name']][j].append(0)
                            dict_batters[i['batter_name']][j].pop(0)

                        dict_batters[i['batter_name']]['Walk'].append(1)
                        dict_batters[i['batter_name']]['Walk'].pop(0)
                    
                else:
                    
                    track_pitches[pit] += 1
            
                    for pitch in pitch_cats:

                        dict_final[pitch].append(track_pitches[pitch])
                    
                    avg = np.sum(dict_batters[i['batter_name']]['HIT']) / np.sum(dict_batters[i['batter_name']]['AB'])
                    
                    obp = (
                            np.sum(dict_batters[i['batter_name']]['HIT'] +
                                   dict_batters[i['batter_name']]['Walk'] +
                                   dict_batters[i['batter_name']]['Hit By Pitch'])
                            / np.sum(dict_batters[i['batter_name']]['AB'] +
                                     dict_batters[i['batter_name']]['Walk'] +
                                     dict_batters[i['batter_name']]['Hit By Pitch'] +
                                     dict_batters[i['batter_name']]['Sac Fly'])
                        )
                    slg = (
                        np.sum(dict_batters[i['batter_name']]['Single'] +
                               dict_batters[i['batter_name']]['Double'] * 2 +
                               dict_batters[i['batter_name']]['Triple'] * 3 +
                               dict_batters[i['batter_name']]['Home Run'] * 4)
                        / np.sum(dict_batters[i['batter_name']]['AB'])
                    )
                    
                    dict_final['Event'].append('Nothing')
                    dict_final['AVG'].append(avg)
                    dict_final['OBP'].append(obp)
                    dict_final['SLG'].append(slg)

            else:
                track_pitches[pit] += 1

                for pitch in pitch_cats:

                    dict_final[pitch].append(track_pitches[pitch])
                        
                pitches += f"{i['Pitch_Category']},"
                
                avg = np.sum(dict_batters[i['batter_name']]['HIT']) / np.sum(dict_batters[i['batter_name']]['AB'])
                    
                obp = (
                        np.sum(dict_batters[i['batter_name']]['HIT'] +
                               dict_batters[i['batter_name']]['Walk'] +
                               dict_batters[i['batter_name']]['Hit By Pitch'])
                        / np.sum(dict_batters[i['batter_name']]['AB'] +
                                 dict_batters[i['batter_name']]['Walk'] +
                                 dict_batters[i['batter_name']]['Hit By Pitch'] +
                                 dict_batters[i['batter_name']]['Sac Fly'])
                    )
                slg = (
                    np.sum(dict_batters[i['batter_name']]['Single'] +
                           dict_batters[i['batter_name']]['Double'] * 2 +
                           dict_batters[i['batter_name']]['Triple'] * 3 +
                           dict_batters[i['batter_name']]['Home Run'] * 4)
                    / np.sum(dict_batters[i['batter_name']]['AB'])
                )
                
                dict_final['AVG'].append(avg)
                dict_final['OBP'].append(obp)
                dict_final['SLG'].append(slg)
                dict_final['Event'].append('Nothing')

            

        elif i['Description'] in ['In play, out(s)', 'In play, no out', 'In play, run(s)', 'Hit By Pitch']:
            
            pitch_count += 1
            
            dict_final['Outcome'].append(i['Description'])
            dict_final['Balls'].append(count_ball)
            dict_final['Strikes'].append(count_strike)
            dict_final['Event'].append(i['Event'])
            dict_final['Pitch'].append(i['Pitch_Category'])
            dict_final['Start_speed'].append(i['Start_speed'])
            dict_final['breakAngle'].append(i['breakAngle'])
            dict_final['breakLength'].append(i['breakLength'])
            dict_final['breakVertical'].append(i['breakVertical'])
            dict_final['breakHorizontal'].append(i['breakHorizontal'])
            dict_final['breakVerticalInduced'].append(i['breakVerticalInduced'])
            dict_final['spinRate'].append(i['spinRate'])
            dict_final['batter_name'].append(i['batter_name'])
            dict_final['batter_side'].append(i['batter_side'])
            dict_final['batter_split'].append(i['batter_split'])
            dict_final['pitcher_id'].append(i['pitcher_id'])
            dict_final['pitcher_name'].append(i['pitcher_name'])
            dict_final['pitcher_hand'].append(i['pitcher_hand'])
            dict_final['pitcher_split'].append(i['pitcher_split'])
            dict_final['Date'].append(i['Date'])
            dict_final['X'].append(i['X'])
            dict_final['Y'].append(i['Y'])
            dict_final['isTopInning'].append(i['isTopInning'])
            dict_final['Inning'].append(i['Inning'])
            dict_final['awayScore'].append(i['awayScore'])
            dict_final['homeScore'].append(i['homeScore'])
            dict_final['strikeZoneTop'].append(i['strikeZoneTop'])
            dict_final['strikeZoneBottom'].append(i['strikeZoneBottom'])
            dict_final['IsOut'].append(i['IsOut'])
            dict_final['Previous_Pitches'].append(pitches)  # Fixed append to dict_final
            dict_final['spinRate_rolling'].append(i['spinRate_rolling'])
            dict_final['breakLength_rolling'].append(i['breakLength_rolling'])
            dict_final['Start_speed_rolling'].append(i['Start_speed_rolling'])
            dict_final['breakVerticalInduced_rolling'].append(i['breakVerticalInduced_rolling'])
            dict_final['breakHorizontal_rolling'].append(i['breakHorizontal_rolling'])
            dict_final['breakVertical_rolling'].append(i['breakVertical_rolling'])
            dict_final['location'].append(i['location'])
            dict_final['Outs'].append(i['Pre_Outs'])
            dict_final['Actual_pitch'].append(i['Pitch'])
            dict_final['Pitch_count'].append(pitch_count)
            dict_final['Pitches_faced'].append(track_pitches_matchup[f"{i['pitcher_id']}_vs_{i['batter_name']}"])
            dict_final['Atbats_faced'].append(track_matchups[f"{i['pitcher_id']}_vs_{i['batter_name']}"])
            
            hits = ['Home Run', 'Triple', 'Double', 'Single']
            extra = ['HIT', 'AB']
            other = ['Hit By Pitch', 'Sac Fly', 'Sac Bunt']

            if len(dict_batters[i['batter_name']]['HIT']) < 20:

                    
                if i['Event'] in other:
                    other_copy = other.copy()
                    other_copy.remove(i['Event'])

                    for j in hits + other_copy + extra:
                        dict_batters[i['batter_name']][j].append(0)

                    dict_batters[i['batter_name']][i['Event']].append(1)

                elif i['Event'] in hits:
                    hits_copy = hits.copy()
                    hits_copy.remove(i['Event'])

                    dict_batters[i['batter_name']][i['Event']].append(1)
                    dict_batters[i['batter_name']]['AB'].append(1)
                    dict_batters[i['batter_name']]['HIT'].append(1)

                    for j in hits_copy + other:
                        dict_batters[i['batter_name']][j].append(0)

                dict_final['AVG'].append(np.nan)
                dict_final['OBP'].append(np.nan)
                dict_final['SLG'].append(np.nan)

            else:
                avg = np.sum(dict_batters[i['batter_name']]['HIT']) / np.sum(dict_batters[i['batter_name']]['AB'])
                obp = (
                    np.sum(dict_batters[i['batter_name']]['HIT'] +
                           dict_batters[i['batter_name']]['Walk'] +
                           dict_batters[i['batter_name']]['Hit By Pitch'])
                    / np.sum(dict_batters[i['batter_name']]['AB'] +
                             dict_batters[i['batter_name']]['Walk'] +
                             dict_batters[i['batter_name']]['Hit By Pitch'] +
                             dict_batters[i['batter_name']]['Sac Fly'])
                )
                slg = (
                    np.sum(dict_batters[i['batter_name']]['Single'] +
                           dict_batters[i['batter_name']]['Double'] * 2 +
                           dict_batters[i['batter_name']]['Triple'] * 3 +
                           dict_batters[i['batter_name']]['Home Run'] * 4)
                    / np.sum(dict_batters[i['batter_name']]['AB'])
                )

                dict_final['AVG'].append(avg)
                dict_final['OBP'].append(obp)
                dict_final['SLG'].append(slg)

                if i['Event'] in other:
                    other_copy = other.copy()
                    other_copy.remove(i['Event'])

                    for j in hits + other_copy + extra:
                        dict_batters[i['batter_name']][j].append(0)
                        dict_batters[i['batter_name']][j].pop(0)

                    dict_batters[i['batter_name']][i['Event']].append(1)
                    dict_batters[i['batter_name']][i['Event']].pop(0)

                elif i['Event'] in hits:
                    hits_copy = hits.copy()
                    hits_copy.remove(i['Event'])

                    dict_batters[i['batter_name']][i['Event']].append(1)
                    dict_batters[i['batter_name']]['AB'].append(1)
                    dict_batters[i['batter_name']]['HIT'].append(1)

                    dict_batters[i['batter_name']][i['Event']].pop(0)
                    dict_batters[i['batter_name']]['AB'].pop(0)
                    dict_batters[i['batter_name']]['HIT'].pop(0)

                    for j in hits_copy + other:
                        dict_batters[i['batter_name']][j].append(0)
                        dict_batters[i['batter_name']][j].pop(0)


            
            count_strike, count_ball, pitches = 0, 0, ""
            
            pitch_cats = ['Fastball', 'Movement Fastball', 'Breaking', 'Offspeed', 'Non-Competitive', 'Other']
            
            pit = i['Pitch_Category']
            
            track_pitches[pit] += 1
            
            for pitch in pitch_cats:
                
                dict_final[pitch].append(track_pitches[pitch])
                
                track_pitches[pitch] = 0


            track_matchups[f"{i['pitcher_id']}_vs_{i['batter_name']}"] += 1
            
        else:
            
            print('Error')


Adam Mazur
Vidal Bruján
Cole Tucker
Dedniel Núñez
DJ Herz
Luis Contreras
Shota Imanaga
Ben Brown
Lake Bachar
David Festa
Patrick Wisdom
Brant Hurter
Connor Gillispie
Stephen Kolek
Hunter Bigge
Seth Johnson
Marc Church
Randy Rodríguez
David Hensley
Michael Mercado
Edgardo Henriquez
River Ryan
Spencer Arrighetti
Clayton Beeter
Aaron Hicks
Owen Miller
Will Klein
Mitch Spence
Michael Petersen
Chayce McDermott
Yilber Diaz
Tanner Gordon
Roddery Muñoz
Bailey Horn
Jaden Hill
Anthony Maldonado
Chase Shugart
Cade Povich
Kai-Wei Teng
Landen Roupp
James Meeker
Valente Bellozo
Matt Wallner
Miguel Sanó
Jairo Iriarte
Josh Maciejewski
Trevor McDonald
Tyler Phillips
Eddy Alvarez
Mason Montgomery
Fraser Ellard
Tyler Jay
Weston Wilson
Mitchell Parker
Ricky Vanasco
Paul Skenes
Tyler Fitzgerald
Mason Black
Michael Stefanic
Emmanuel Rivera
Jared Jones
Dominic Smith
Orlando Ribalta
Blair Henley
Joey Cantillo
Cole Winn
Chris Roycroft
Hayden Birdsong
Michel Otañez
Bryan King
Mike Burrows
Bryan Sammons
B

In [None]:
final_df['Fastball' ]

In [21]:
total_df['Event'].unique()

array(['Single', 'Triple', 'Groundout', 'Nothing', 'Walk', 'Flyout',
       'Strikeout', 'Double', 'Sac Bunt', 'Sac Fly', 'Grounded Into DP',
       'Home Run', 'Pop Out', 'Lineout', 'Intent Walk', 'Forceout',
       'Bunt Pop Out', 'Bunt Groundout', 'Fielders Choice',
       'Fielders Choice Out', 'Field Error', 'Hit By Pitch',
       'Double Play', 'Strikeout Double Play', 'Sac Fly Double Play',
       'Bunt Lineout', 'Triple Play', 'Catcher Interference',
       'Caught Stealing 2B', 'Caught Stealing Home', 'Game Advisory',
       'Ejection', 'Pickoff 1B', 'Runner Out', 'Sac Bunt Double Play',
       'Caught Stealing 3B', 'Field Out', 'Pickoff Caught Stealing 2B',
       'Pickoff 3B', 'Batter Out', 'Pickoff 2B'], dtype=object)

In [34]:
total_df['Outcome'].unique()

array(['In play, no out', 'In play, run(s)', 'In play, out(s)', 'Ball',
       'Called Strike', 'Swinging Strike', 'Foul', 'Ball In Dirt',
       'Foul Bunt', 'Swinging Strike (Blocked)', 'Swinging Pitchout',
       'Foul Tip', 'Pitchout', 'Intent Ball', 'Missed Bunt',
       'Hit By Pitch', 'Foul Pitchout'], dtype=object)

In [13]:
final_df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Inning', 'awayScore', 'homeScore',
       'isTopInning', 'Outs', 'Balls', 'Strikes', 'X', 'Y', 'Zone',
       'strikeZoneTop', 'strikeZoneBottom', 'IsOut', 'Outcome', 'Inplay',
       'Event', 'Pitch', 'Description', 'Start_speed', 'End_speed',
       'breakAngle', 'breakLength', 'breakY', 'breakVertical',
       'breakHorizontal', 'breakVerticalInduced', 'spinRate', 'location',
       'batter_name', 'batter_side', 'batter_split', 'pitcher_id',
       'pitcher_name', 'pitcher_hand', 'pitcher_split', 'Date',
       'Pitch_Category', 'spinRate_rolling', 'breakLength_rolling',
       'Start_speed_rolling', 'breakVerticalInduced_rolling',
       'breakHorizontal_rolling', 'breakVertical_rolling',
       'rolling_spinRate_Fastball', 'rolling_spinRate_Breaking',
       'rolling_spinRate_Offspeed', 'rolling_spinRate_Movement Fastball',
       'rolling_breakLength_Fastball', 'rolling_breakLength_Breaking',
       'rolling_breakLength_Offspeed', 'rolling_br

In [70]:
track_matchups

{}

In [66]:
final_df['batter_name']

0         Avisaíl García
1       Christian Yelich
2       Christian Yelich
3       Christian Yelich
4       Christian Yelich
              ...       
8822           Jake Cave
8823           Jake Cave
8824           Jake Cave
8825           Jake Cave
8826         Johan Rojas
Name: batter_name, Length: 8827, dtype: object

In [16]:
final_df.isna().sum().tolist()

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 8827,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 9,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 120,
 120,
 120,
 120,
 120,
 120,
 51,
 68,
 919,
 2852,
 51,
 68,
 919,
 2852,
 51,
 68,
 919,
 2852,
 51,
 68,
 919,
 2852,
 51,
 68,
 919,
 2852,
 51,
 68,
 919,
 2852,
 0]

In [59]:
# Reset the index to ensure no duplicate index issues
final_df = final_df.reset_index(drop=True)

# Loop over each unique pitch type
for pitch in final_df['Pitch_Category'].unique():
    col_name = f'rolling_spinRate_{pitch}'

    # Filter for the specific pitch type
    rolling_col = (
        final_df
        .loc[final_df['Pitch_Category'] == pitch]  # Filter for the specific pitch type
        ['spinRate']
        .rolling(window=50, min_periods=30)
        .mean()
        .shift(1)  # Shift the values to avoid look-ahead bias
        .ffill()  # Forward fill missing values
    )

    # Reassign the computed rolling mean back to the final DataFrame
    final_df[col_name] = rolling_col.reindex(final_df.index, method='ffill')  # Ensure it aligns with final_df


In [60]:
final_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Inning,awayScore,homeScore,isTopInning,Outs,Balls,Strikes,X,...,breakLength_rolling,Start_speed_rolling,breakVerticalInduced_rolling,breakHorizontal_rolling,breakVertical_rolling,Pre_Outs,rolling_spinRate_Fastball,rolling_spinRate_Breaking,rolling_spinRate_Offspeed,rolling_spinRate_Movement Fastball
0,42106,42106,1,0,0,False,0,1,0,-2.362889,...,,,,,,0,,,,
1,42107,42107,1,0,0,False,0,0,1,-0.445384,...,,,,,,0,,,,
2,42108,42108,1,0,0,False,0,0,2,-0.088557,...,,,,,,0,,,,
3,42109,42109,1,0,0,False,0,1,2,-0.723198,...,,,,,,0,,,,
4,42110,42110,1,0,0,False,0,2,2,0.683757,...,,,,,,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8822,705744,705744,5,6,6,False,2,1,2,-0.409862,...,4.896,94.378,11.220,9.738,-19.520,1,2217.2,2311.70,1840.14,2253.76
8823,705745,705745,5,6,6,False,2,1,2,-0.611827,...,7.848,85.252,3.286,-6.406,-34.300,1,2217.2,2317.68,1840.14,2253.76
8824,705746,705746,5,6,6,False,2,1,2,-0.805357,...,4.920,94.304,11.162,9.626,-19.628,1,2214.7,2317.68,1840.14,2253.76
8825,705747,705747,5,6,6,False,2,1,3,0.114166,...,7.896,85.174,3.116,-6.470,-34.536,1,2214.7,2321.30,1840.14,2253.76


In [55]:
final_df.groupby('Pitch_Category')['spinRate'].transform(lambda x: x.rolling(window=50, min_periods=30).mean().shift(1))
                                                  
                                                  

0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
         ...   
2975    2217.20
2976    2317.68
2977    2214.70
2978    2321.30
2979    2320.08
Name: spinRate, Length: 8827, dtype: float64

In [35]:
final_df['Pitch'].unique()

array(['Four-Seam Fastball', 'Slider', 'Curveball', 'Changeup', 'Sinker'],
      dtype=object)

In [46]:
final_df.loc[final_df['Pitch_Category'] == 'Fastball']

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Inning,awayScore,homeScore,isTopInning,Outs,Balls,Strikes,X,...,pitcher_split,Date,Pitch_Category,spinRate_rolling,breakLength_rolling,Start_speed_rolling,breakVerticalInduced_rolling,breakHorizontal_rolling,breakVertical_rolling,Pre_Outs
0,42106,42106,1,0,0,False,0,1,0,-2.362889,...,vs_RHB,2020-07-31,Fastball,,,,,,,0
1,42107,42107,1,0,0,False,0,0,1,-0.445384,...,vs_LHB,2020-07-31,Fastball,,,,,,,0
2,42108,42108,1,0,0,False,0,0,2,-0.088557,...,vs_LHB,2020-07-31,Fastball,,,,,,,0
5,42111,42111,1,0,0,False,0,3,2,-0.454930,...,vs_LHB,2020-07-31,Fastball,,,,,,,0
6,42112,42112,1,0,0,False,0,3,2,0.442578,...,vs_LHB,2020-07-31,Fastball,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2965,705734,705734,5,6,5,False,1,0,1,-0.119325,...,vs_LHB,2023-09-27,Fastball,2222.80,4.896,94.482,11.228,9.876,-19.450,0
2966,705735,705735,5,6,5,False,1,0,2,0.582823,...,vs_LHB,2023-09-27,Fastball,2220.50,4.896,94.456,11.216,9.858,-19.474,0
2973,705742,705742,5,6,6,False,2,0,1,-1.001574,...,vs_LHB,2023-09-27,Fastball,2217.64,4.896,94.410,11.198,9.848,-19.520,1
2975,705744,705744,5,6,6,False,2,1,2,-0.409862,...,vs_LHB,2023-09-27,Fastball,2217.20,4.896,94.378,11.220,9.738,-19.520,1


In [44]:
final_df['Pitch_Category'].unique()

array(['Other', 'Breaking', 'Offspeed', 'Movement Fastball'], dtype=object)

In [41]:
final_df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Inning', 'awayScore', 'homeScore',
       'isTopInning', 'Outs', 'Balls', 'Strikes', 'X', 'Y', 'Zone',
       'strikeZoneTop', 'strikeZoneBottom', 'IsOut', 'Outcome', 'Inplay',
       'Event', 'Pitch', 'Description', 'Start_speed', 'End_speed',
       'breakAngle', 'breakLength', 'breakY', 'breakVertical',
       'breakHorizontal', 'breakVerticalInduced', 'spinRate', 'location',
       'batter_name', 'batter_side', 'batter_split', 'pitcher_id',
       'pitcher_name', 'pitcher_hand', 'pitcher_split', 'Date',
       'Pitch_Category', 'spinRate_rolling', 'breakLength_rolling',
       'Start_speed_rolling', 'breakVerticalInduced_rolling',
       'breakHorizontal_rolling', 'breakVertical_rolling', 'Pre_Outs'],
      dtype='object')

In [51]:
total_df['Actual_pitch'].unique()

array([nan, 'Four-Seam Fastball', 'Changeup', 'Slider', 'Sinker',
       'Pitchout', 'Intentional Ball', 'Fastball', 'Curveball', 'Cutter',
       'Knuckle Curve', 'Eephus', 'Sweeper', 'Slow Curve', 'Splitter',
       'Knuckle Ball', 'Screwball', 'Slurve', 'Forkball',
       'Automatic Ball'], dtype=object)

In [38]:
final_df.loc[final_df['Pitch'] == 'Four-Seam Fastball']

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Inning,awayScore,homeScore,isTopInning,Outs,Balls,Strikes,X,...,pitcher_split,Date,Pitch_Category,spinRate_rolling,breakLength_rolling,Start_speed_rolling,breakVerticalInduced_rolling,breakHorizontal_rolling,breakVertical_rolling,Pre_Outs
0,42106,42106,1,0,0,False,0,1,0,-2.362889,...,vs_RHB,2020-07-31,Other,,,,,,,0
1,42107,42107,1,0,0,False,0,0,1,-0.445384,...,vs_LHB,2020-07-31,Other,,,,,,,0
2,42108,42108,1,0,0,False,0,0,2,-0.088557,...,vs_LHB,2020-07-31,Other,,,,,,,0
5,42111,42111,1,0,0,False,0,3,2,-0.454930,...,vs_LHB,2020-07-31,Other,,,,,,,0
6,42112,42112,1,0,0,False,0,3,2,0.442578,...,vs_LHB,2020-07-31,Other,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2965,705734,705734,5,6,5,False,1,0,1,-0.119325,...,vs_LHB,2023-09-27,Other,2222.80,4.896,94.482,11.228,9.876,-19.450,0
2966,705735,705735,5,6,5,False,1,0,2,0.582823,...,vs_LHB,2023-09-27,Other,2220.50,4.896,94.456,11.216,9.858,-19.474,0
2973,705742,705742,5,6,6,False,2,0,1,-1.001574,...,vs_LHB,2023-09-27,Other,2217.64,4.896,94.410,11.198,9.848,-19.520,1
2975,705744,705744,5,6,6,False,2,1,2,-0.409862,...,vs_LHB,2023-09-27,Other,2217.20,4.896,94.378,11.220,9.738,-19.520,1


In [17]:
final_df['Event'].unique()

array(['Hit By Pitch', 'Walk', 'Home Run', 'Groundout', 'Sac Fly',
       'Strikeout', 'Flyout', 'Double', 'Lineout', 'Grounded Into DP',
       'Single', 'Forceout', 'Triple', 'Fielders Choice Out', 'Pop Out',
       'Field Error', 'Fielders Choice', 'Sac Bunt',
       'Strikeout Double Play', 'Intent Walk', 'Catcher Interference',
       'Bunt Pop Out', 'Double Play', 'Caught Stealing 2B'], dtype=object)

In [18]:
final_df['Description'].unique()

array(['Hit By Pitch', 'Called Strike', 'Foul', 'Ball', 'Ball In Dirt',
       'In play, run(s)', 'In play, out(s)', 'Swinging Strike',
       'In play, no out', 'Swinging Strike (Blocked)', 'Foul Tip',
       'Foul Bunt'], dtype=object)

In [88]:
final_df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Inning', 'awayScore', 'homeScore',
       'isTopInning', 'Outs', 'Balls', 'Strikes', 'X', 'Y', 'Zone',
       'strikeZoneTop', 'strikeZoneBottom', 'IsOut', 'Outcome', 'Inplay',
       'Event', 'Pitch', 'Description', 'Start_speed', 'End_speed',
       'breakAngle', 'breakLength', 'breakY', 'breakVertical',
       'breakHorizontal', 'breakVerticalInduced', 'spinRate', 'location',
       'batter_name', 'batter_side', 'batter_split', 'pitcher_id',
       'pitcher_name', 'pitcher_hand', 'pitcher_split', 'Date',
       'Pitch_Category', 'spinRate_rolling', 'breakLength_rolling',
       'Start_speed_rolling', 'breakVerticalInduced_rolling',
       'breakHorizontal_rolling', 'breakVertical_rolling', 'Pre_Outs'],
      dtype='object')

In [10]:
for i in dict_final:
    
    print(i)
    
    print(len(dict_final[i]))

Pitches_faced
10404427
Atbats_faced
10404427
Pitch_count
10404427
Other
10404427
Fastball
10404427
Movement Fastball
10404427
Breaking
10404427
Offspeed
10404427
Non-Competitive
10404427
Actual_pitch
10404427
Outs
10404427
location
10404427
spinRate_rolling
10404427
breakLength_rolling
10404427
Start_speed_rolling
10404427
breakVerticalInduced_rolling
10404427
breakHorizontal_rolling
10404427
breakVertical_rolling
10404427
Outcome
10404427
Balls
10404427
Strikes
10404427
Event
10404427
Pitch
10404427
Previous_Pitches
10404427
Start_speed
10404427
breakAngle
10404427
breakLength
10404427
breakVertical
10404427
breakHorizontal
10404427
breakVerticalInduced
10404427
spinRate
10404427
batter_name
10404427
batter_side
10404427
batter_split
10404427
pitcher_id
10404427
pitcher_name
10404427
pitcher_hand
10404427
pitcher_split
10404427
Date
10404427
X
10404427
Y
10404427
isTopInning
10404427
Inning
10404427
awayScore
10404427
homeScore
10404427
strikeZoneTop
10404427
strikeZoneBottom
10404427

In [13]:
final_df['rolling_breakVertical_Breaking'].head(80)

0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
        ...    
75   -39.934286
76   -39.802778
77   -39.802778
78   -39.802778
79   -39.824324
Name: rolling_breakVertical_Breaking, Length: 80, dtype: float64

In [17]:
final_df[['rolling_breakVertical_Breaking', 'breakVertical']]

Unnamed: 0,rolling_breakVertical_Breaking,breakVertical
0,,-19.9
1,,-18.4
2,,-16.2
3,,-34.0
4,,-32.6
...,...,...
8822,-34.290,-24.1
8823,-34.300,-38.0
8824,-34.300,-21.8
8825,-34.536,-37.8


In [76]:
final_df['Pre_Outs'] = final_df.apply(lambda x: x['Outs'] - 1 if x['IsOut'] == True else x['Outs'], axis=1)

In [78]:
final_df.loc[final_df['Outs'] == 3][['IsOut', 'Outs']]

Unnamed: 0,IsOut,Outs
23,True,3
24,True,3
25,True,3
26,True,3
27,True,3
...,...,...
2937,True,3
2938,True,3
2960,True,3
2961,True,3


In [63]:
df['Break_Length_Rolling_Avg'] = (
    df.groupby('Pitch_Category')['breakLength']
    .transform(lambda x: x.rolling(window=50, min_periods=30).mean().shift(1))
)

In [64]:
df[['breakLength', 'Break_Length_Rolling_Avg', 'Pitch_Category']].head(10)

Unnamed: 0,breakLength,Break_Length_Rolling_Avg,Pitch_Category
0,4.8,,Other
1,4.8,,Other
2,3.6,,Other
3,7.2,,Breaking
4,7.2,,Breaking
5,3.6,,Other
6,4.8,,Other
7,12.0,,Breaking
8,7.2,,Breaking
9,4.8,,Other


In [35]:
np.sum(dict_batters[i['batter_name']]['HIT'] +
   dict_batters[i['batter_name']]['Walk'] +
   dict_batters[i['batter_name']]['Hit By Pitch']) / (np.sum(dict_batters[i['batter_name']]['AB'] + dict_batters[i['batter_name']]['Walk'] +
     dict_batters[i['batter_name']]['Hit By Pitch'] +
     dict_batters[i['batter_name']]['Sac Fly']))

0.7272727272727273

In [33]:
np.sum(dict_batters[i['batter_name']]['AB'] + dict_batters[i['batter_name']]['Walk'] +
     dict_batters[i['batter_name']]['Hit By Pitch'] +
     dict_batters[i['batter_name']]['Sac Fly'])

22

In [None]:
pd.read_csv(f'Players/pitchers/Matt Cain/2010.csv')['Event'].unique()

In [19]:
total_df

NameError: name 'total_df' is not defined

In [16]:
total_df = pd.DataFrame(dict_final)

In [1]:
len(total_df)

NameError: name 'total_df' is not defined

In [18]:
dict(total_df.isna().sum())

{'Pitches_faced': 0,
 'Atbats_faced': 0,
 'Pitch_count': 0,
 'Other': 0,
 'Fastball': 0,
 'Movement Fastball': 0,
 'Breaking': 0,
 'Offspeed': 0,
 'Non-Competitive': 0,
 'Actual_pitch': 7113,
 'Outs': 2336,
 'location': 13725,
 'spinRate_rolling': 305156,
 'breakLength_rolling': 305095,
 'Start_speed_rolling': 305095,
 'breakVerticalInduced_rolling': 2001039,
 'breakHorizontal_rolling': 2001039,
 'breakVertical_rolling': 2001039,
 'Outcome': 0,
 'Balls': 0,
 'Strikes': 0,
 'Event': 0,
 'Pitch': 0,
 'Previous_Pitches': 0,
 'Start_speed': 7113,
 'breakAngle': 7113,
 'breakLength': 7113,
 'breakVertical': 1795792,
 'breakHorizontal': 1795792,
 'breakVerticalInduced': 1795792,
 'spinRate': 7177,
 'batter_name': 0,
 'batter_side': 0,
 'batter_split': 0,
 'pitcher_id': 0,
 'pitcher_name': 0,
 'pitcher_hand': 0,
 'pitcher_split': 0,
 'Date': 0,
 'X': 10682,
 'Y': 10682,
 'isTopInning': 2336,
 'Inning': 2336,
 'awayScore': 2336,
 'homeScore': 2336,
 'strikeZoneTop': 2336,
 'strikeZoneBottom': 

In [13]:
total_df.to_csv("Play_by_play_batters.csv")

In [15]:
total_df.to_csv("Play_by_play.csv")

In [14]:
total_df.columns

Index(['Pitches_faced', 'Atbats_faced', 'Pitch_count', 'Other', 'Fastball',
       'Movement Fastball', 'Breaking', 'Offspeed', 'Non-Competitive',
       'Actual_pitch', 'Outs', 'location', 'spinRate_rolling',
       'breakLength_rolling', 'Start_speed_rolling',
       'breakVerticalInduced_rolling', 'breakHorizontal_rolling',
       'breakVertical_rolling', 'Outcome', 'Balls', 'Strikes', 'Event',
       'Pitch', 'Previous_Pitches', 'Start_speed', 'breakAngle', 'breakLength',
       'breakVertical', 'breakHorizontal', 'breakVerticalInduced', 'spinRate',
       'batter_name', 'batter_side', 'batter_split', 'pitcher_id',
       'pitcher_name', 'pitcher_hand', 'pitcher_split', 'Date', 'X', 'Y',
       'isTopInning', 'Inning', 'awayScore', 'homeScore', 'strikeZoneTop',
       'strikeZoneBottom', 'IsOut', 'AVG', 'OBP', 'SLG',
       'rolling_spinRate_Fastball', 'rolling_spinRate_Movement Fastball',
       'rolling_spinRate_Breaking', 'rolling_spinRate_Offspeed',
       'rolling_spinRate_Non

In [15]:
total_df['Fastball'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20])

In [53]:
final_df.loc[final_df['Description'] == 'Hit By Pitch'][['Event']]

Unnamed: 0,Event
118,Hit By Pitch
364,Hit By Pitch
409,Hit By Pitch
1285,Hit By Pitch
1536,Hit By Pitch
...,...
1248,Hit By Pitch
1932,Hit By Pitch
2319,Hit By Pitch
2346,Hit By Pitch


In [17]:
for i in dict_final:
    print(i)
    print(len(dict_final[i]))

Outcome
10401336
Balls
10401336
Strikes
10401336
Event
10401336
Pitch
10401336
Previous_Pitches
10401336
Start_speed
10401336
breakAngle
10401336
breakLength
10401336
breakVertical
10401336
breakHorizontal
10401336
breakVerticalInduced
10401336
spinRate
10401336
batter_name
10401336
batter_side
10401336
batter_split
10401336
pitcher_id
10401336
pitcher_name
10401336
pitcher_hand
10401336
pitcher_split
10401336
Date
10401336
X
10401336
Y
10401336
isTopInning
10401336
Outs
10401336
Inning
10401336
awayScore
10401336
homeScore
10401336
strikeZoneTop
10401336
strikeZoneBottom
10401336
IsOut
10401336
AVG
2680987
OBP
2680987
SLG
2680987


In [37]:
final_df[['breakAngle', 'breakLength',
       'breakY', 'breakVertical', 'breakHorizontal', 'breakVerticalInduced']]

Unnamed: 0,breakAngle,breakLength,breakY,breakVertical,breakHorizontal,breakVerticalInduced
0,,,,,,
1,,,,,,
2,,,,,,
3,,,,,,
4,,,,,,
5,,,,,,
6,,,,,,
7,,,,,,
8,,,,,,
9,,,,,,


In [38]:
final_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Inning,awayScore,homeScore,isTopInning,Outs,Balls,Strikes,X,...,location,batter_name,batter_side,batter_split,pitcher_id,pitcher_name,pitcher_hand,pitcher_split,Date,Pitch_Category
0,7834,7834,6,9,0,True,2,0,0,97.0,...,8.0,Jordan Pacheco,R,vs_RHP,446453,Levale Speigner,R,vs_RHB,2010-04-03,Other
1,7835,7835,6,10,0,True,2,0,0,102.15,...,8.0,Jay Payton,R,vs_RHP,446453,Levale Speigner,R,vs_RHB,2010-04-03,Other
2,7836,7836,6,10,0,True,3,0,0,86.7,...,6.0,Melvin Mora,R,vs_RHP,446453,Levale Speigner,R,vs_RHB,2010-04-03,Other
3,7846,7846,7,10,3,True,1,0,0,88.41,...,6.0,Paul Phillips,R,vs_RHP,446453,Levale Speigner,R,vs_RHB,2010-04-03,Other
4,7847,7847,7,10,3,True,1,0,0,103.0,...,8.0,Héctor Gómez,R,vs_RHP,446453,Levale Speigner,R,vs_RHB,2010-04-03,Other
5,7848,7848,7,10,3,True,2,0,0,97.85,...,4.0,Jonathan Herrera,L,vs_RHP,446453,Levale Speigner,R,vs_LHB,2010-04-03,Other
6,7849,7849,7,10,3,True,2,1,0,130.47,...,4.0,Eric Young Jr.,L,vs_RHP,446453,Levale Speigner,R,vs_LHB,2010-04-03,Other
7,7850,7850,7,10,3,True,2,2,0,137.34,...,4.0,Eric Young Jr.,L,vs_RHP,446453,Levale Speigner,R,vs_LHB,2010-04-03,Other
8,7851,7851,7,10,3,True,2,3,0,132.19,...,4.0,Eric Young Jr.,L,vs_RHP,446453,Levale Speigner,R,vs_LHB,2010-04-03,Other
9,7852,7852,7,10,3,True,2,4,0,127.9,...,4.0,Eric Young Jr.,L,vs_RHP,446453,Levale Speigner,R,vs_LHB,2010-04-03,Other


In [23]:
final_df['X']

0       207.07
1       133.98
2       120.38
3       144.57
4        90.94
         ...  
2975    132.62
2976    140.32
2977    147.70
2978    112.65
2979    143.01
Name: X, Length: 8827, dtype: float64

In [12]:
players.index('.DS_Store')

489

In [17]:
players.remove('.DS_Store')

In [30]:
final_df[['IsOut', 'Description']].head(10)

Unnamed: 0,IsOut,Description
0,False,Hit By Pitch
1,False,Called Strike
2,False,Foul
3,False,Ball
4,False,Ball
5,False,Ball In Dirt
6,False,Foul
7,False,Ball
8,False,Called Strike
9,False,"In play, run(s)"


In [32]:
final_df.head(10)[['IsOut', 'Description', 'Event']]

Unnamed: 0,IsOut,Description,Event
0,False,Hit By Pitch,Hit By Pitch
1,False,Called Strike,Walk
2,False,Foul,Walk
3,False,Ball,Walk
4,False,Ball,Walk
5,False,Ball In Dirt,Walk
6,False,Foul,Walk
7,False,Ball,Walk
8,False,Called Strike,Home Run
9,False,"In play, run(s)",Home Run


In [33]:
final_df['Description'].unique()

array(['Hit By Pitch', 'Called Strike', 'Foul', 'Ball', 'Ball In Dirt',
       'In play, run(s)', 'In play, out(s)', 'Swinging Strike',
       'In play, no out', 'Swinging Strike (Blocked)', 'Foul Tip',
       'Foul Bunt'], dtype=object)

In [35]:
final_df['Event'].unique()

array(['Hit By Pitch', 'Walk', 'Home Run', 'Groundout', 'Sac Fly',
       'Strikeout', 'Flyout', 'Double', 'Lineout', 'Grounded Into DP',
       'Single', 'Forceout', 'Triple', 'Fielders Choice Out', 'Pop Out',
       'Field Error', 'Fielders Choice', 'Sac Bunt',
       'Strikeout Double Play', 'Intent Walk', 'Catcher Interference',
       'Bunt Pop Out', 'Double Play', 'Caught Stealing 2B'], dtype=object)

In [25]:
final_df.head(10).columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Inning', 'awayScore', 'homeScore',
       'isTopInning', 'Outs', 'Balls', 'Strikes', 'X', 'Y', 'strikeZoneTop',
       'strikeZoneBottom', 'IsOut', 'Outcome', 'Inplay', 'Event', 'Pitch',
       'Description', 'Start_speed', 'End_speed', 'breakAngle', 'breakLength',
       'breakY', 'breakVertical', 'breakHorizontal', 'breakVerticalInduced',
       'spinRate', 'location', 'batter_name', 'batter_side', 'batter_split',
       'pitcher_id', 'pitcher_name', 'pitcher_hand', 'pitcher_split', 'Date',
       'Avg_spinRate_200', 'Avg_breakVertical_200', 'Avg_breakHorizontal_200',
       'Avg_Start_speed_200', 'Avg_breakAngle_200', 'Avg_spinRate_300',
       'Avg_breakVertical_300', 'Avg_breakHorizontal_300',
       'Avg_Start_speed_300', 'Avg_breakAngle_300'],
      dtype='object')

In [None]:
import pandas as pd
import os

# Folder path containing CSV files
folder_path = 'Data'
csv_files = os.listdir(folder_path)

def load_data(folder_path, csv_files):
    """Loads all CSV files and concatenates them into a single DataFrame."""
    return pd.concat(
        [pd.read_csv(f'{folder_path}/{csv}') for csv in csv_files], ignore_index=True
    )

def bin_pitch_types(df):
    """Bins pitch types into categories."""
    pitch_map = {
        'Fastball': ['4-Seam Fastball', 'Fastball'],
        'Movement Fastball': ['Sinker', 'Cutter'],
        'Breaking': ['Slider', 'Curveball', 'Sweeper', 'Slurve', 'Knuckle Curve'],
        'Offspeed': ['Changeup', 'Split-Finger', 'Forkball', 'Screwball'],
        'Non-Competitive': ['Eephus', 'Pitch Out', 'Intent Ball']
    }
    reverse_map = {pitch: category for category, pitches in pitch_map.items() for pitch in pitches}
    df['Pitch_Category'] = df['Pitch'].map(reverse_map).fillna('Other')
    return df

def calculate_rolling_averages(df, cols, windows=[100, 200, 300]):
    """Computes rolling averages for specified columns."""
    
    for window in windows:
        for col in cols:
            df[f'{col}_Rolling'] = df.groupby('Pitcher')[col].transform(lambda x: x.rolling(window, min_periods=1).mean())
    return df

def process_data(df):
    """Processes the dataset to compute pitch sequences, strike counts, and other metrics."""
    df = bin_pitch_types(df)
    df = calculate_rolling_averages(df, ['Spin Rate', 'Break Length', 'Velo'])
    
    results = []
    for (pitcher, game), group in df.groupby(['Pitcher', 'Date']):
        count_strike, count_ball, pitches = 0, 0, ""
        for _, row in group.iterrows():
            # Update strike and ball count
            if 'Foul' in row['Description'] and count_strike <= 1:
            if 'Strike' in row['Description']:
                count_strike += 1
            elif 'Ball' in row['Description']:
                count_ball += 1
            
            # Store processed data
            results.append({
                'Date': row['Date'],
                'Pitcher': row['Pitcher'],
                'Batter': row['Batter'],
                'Pitch': row['Pitch_Category'],
                'Strike_Count': count_strike,
                'Ball_Count': count_ball,
                'Previous_Pitches': pitches,
                'IsOut': row['IsOut']
            })
            
            # Append pitch history
            pitches += f"{row['Pitch_Category']},"
    
    return pd.DataFrame(results)

# Execute data processing
df = load_data(folder_path, csv_files)
processed_df = process_data(df)

# Save the processed DataFrame
processed_df.to_csv('processed_data.csv', index=False)