In [1]:
import pandas as pd
import numpy as np

# --- Configuration ---------------------------------------------------------
# Directory that holds the input CSV files; the result CSV is written here too.
DATA_DIR = "/Users/lizvitai/Documents/Pythonsports"
BATTING_CSV = DATA_DIR + "/Batting.csv"
FIELDING_CSV = DATA_DIR + "/Fielding.csv"
# NOTE: the output file name and the printed headings below spell out the
# default season window / at-bat threshold; update them if the constants change.
OUTPUT_CSV = DATA_DIR + "/Slugging_Percentage_2010s_with_1000AB.csv"

FIRST_SEASON = 2010   # first yearID included (inclusive)
LAST_SEASON = 2019    # last yearID included (inclusive)
MIN_AT_BATS = 1000    # minimum summed AB for a player to be ranked


def exclude_pitchers(batting, fielding):
    """Return the batting rows of players who fielded a non-pitcher position.

    A player is kept when `fielding` contains at least one row for that
    playerID whose POS is not 'P'. Players whose only fielding rows are 'P'
    (or who have no fielding rows at all) are dropped; players who pitched
    but also appear at another position are kept.

    Parameters:
        batting: DataFrame with a 'playerID' column.
        fielding: DataFrame with 'playerID' and 'POS' columns.

    Returns:
        A filtered view/copy of `batting`; the inputs are not modified.
    """
    non_pitcher_ids = fielding.loc[fielding['POS'] != 'P', 'playerID'].unique()
    return batting[batting['playerID'].isin(non_pitcher_ids)]


def aggregate_slugging(batting, first_season=FIRST_SEASON, last_season=LAST_SEASON):
    """Sum batting stats per player over a season window and derive slugging.

    Rows are used only when yearID lies in [first_season, last_season]
    (both ends inclusive) and AB is greater than zero; the comparison also
    discards rows whose AB is missing.

    Parameters:
        batting: DataFrame with 'playerID', 'yearID', 'H', '2B', '3B', 'HR'
            and 'AB' columns.
        first_season: first yearID to include.
        last_season: last yearID to include.

    Returns:
        DataFrame with one row per playerID (sorted by playerID) and the
        columns playerID, H, 2B, 3B, HR, AB, 1B, Total_Bases,
        Slugging_Percentage.
    """
    in_window = batting['yearID'].between(first_season, last_season)
    has_at_bats = batting['AB'] > 0

    totals = (
        batting.loc[in_window & has_at_bats]
        .groupby('playerID')[['H', '2B', '3B', 'HR', 'AB']]
        .sum()
        .reset_index()
    )

    # Singles are the hits that were not extra-base hits.
    totals['1B'] = totals['H'] - (totals['2B'] + totals['3B'] + totals['HR'])

    # Total bases weight each hit by the number of bases it is worth.
    totals['Total_Bases'] = (
        totals['1B']
        + 2 * totals['2B']
        + 3 * totals['3B']
        + 4 * totals['HR']
    )
    # Every contributing row has AB > 0, so the summed AB is never zero here.
    totals['Slugging_Percentage'] = totals['Total_Bases'] / totals['AB']
    return totals


def rank_by_slugging(aggregated, min_at_bats=MIN_AT_BATS):
    """Keep players with enough at-bats and order them by slugging, best first.

    Parameters:
        aggregated: DataFrame with 'AB' and 'Slugging_Percentage' columns,
            e.g. the result of `aggregate_slugging`.
        min_at_bats: minimum summed AB (inclusive) required to be ranked.

    Returns:
        A new DataFrame sorted by Slugging_Percentage in descending order
        with a fresh 0..n-1 index; `aggregated` is not modified.
    """
    qualified = aggregated[aggregated['AB'] >= min_at_bats]
    return (
        qualified
        .sort_values(by='Slugging_Percentage', ascending=False)
        .reset_index(drop=True)
    )


# `__name__` is "__main__" inside a Jupyter kernel and when run as a script, so
# the analysis still executes here; the guard only keeps the file I/O from
# running when the functions above are imported elsewhere.
if __name__ == "__main__":
    # Load the Batting and Fielding data
    fielding = pd.read_csv(FIELDING_CSV)

    # Exclude pitchers using the Fielding data
    batting = exclude_pitchers(pd.read_csv(BATTING_CSV), fielding)

    # Aggregate 2010s stats by player ID and compute slugging percentage
    aggregated = aggregate_slugging(batting)

    # Debug intermediate calculations (before filtering by at-bats)
    print("Intermediate Aggregated Data (First 25 Players):")
    print(aggregated[['playerID', 'H', '1B', '2B', '3B', 'HR', 'AB', 'Total_Bases', 'Slugging_Percentage']].head(25))

    # Keep players with at least 1000 at-bats, sorted high to low
    sorted_aggregated = rank_by_slugging(aggregated)

    # Display the top 25 players by slugging percentage
    print("\nTop 25 Players by Slugging Percentage with 1000 or More At Bats (High to Low):")
    print(sorted_aggregated[['playerID', 'Slugging_Percentage']].head(25))

    # Save the filtered and aggregated DataFrame to a CSV file
    sorted_aggregated.to_csv(OUTPUT_CSV, index=False)


Intermediate Aggregated Data (First 25 Players):
     playerID     H   1B   2B  3B   HR    AB  Total_Bases  Slugging_Percentage
0   abreubo01   359  233   91   3   32  1427          552             0.386826
1   abreujo02  1038  627  218  14  179  3547         1821             0.513392
2   abreuto01   100   66   25   5    4   405          147             0.362963
3   ackledu01   512  354   94  18   46  2125          780             0.367059
4   acunaro01   302  181   48   6   67  1059          563             0.531634
5   adamecr01    70   55    9   4    2   328           93             0.283537
6   adamewi01   215  152   32   1   30   819          339             0.413919
7   adamsda02    27   19    5   1    2   140           40             0.285714
8   adamsla01    36   23    5   1    7   137           64             0.467153
9   adamsma01   609  360  127   6  116  2336         1096             0.469178
10  adamsry01    25   21    4   0    0    89           29             0.325843
11 