In [None]:
#Import libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

In [None]:
#loading dataset
# NOTE(review): absolute, machine-specific prefix; consider making this configurable.
path = 'C:/Users/kenne/mvp_analysis/mvp_voting_'

# Round-trip each answer through int() so non-numeric input is rejected
# before it is used to build a file name.
try:
    start_year = str(int(input('Enter start year:')))
    end_year = str(int(input('Enter end year:')))
except ValueError:
    print("Invalid input. Please enter a valid integer.")
    # Stop the cell here: continuing would fail a few lines later with a
    # confusing NameError because start_year / end_year were never assigned.
    raise

# The voting file is read from "<path><start_year>_<end_year>.csv".
file_path = path + start_year + "_" + end_year + ".csv"
df = pd.read_csv(file_path)
df.head()

In [None]:
#Labeling more features
# Each placeholder column is created empty and then moved to the front of the
# frame. Because every column is inserted at position 0, the last name listed
# here ends up as the left-most column.
placeholder_columns = (
    'Lg',         # League (NL or AL)
    'Pos_Int',    # Position (Pitcher = 0, Batter = 1)
    'Pos',        # Position (String)
    'LgYear',     # League + Year
    'IsMvp_Int',  # Is MVP (Yes = 1, No = 0)
    'IsMvp',      # Is MVP (String)
    'Rank',       # Rank amongst competition
)

for column_name in placeholder_columns:
    df[column_name] = None
    df.insert(0, column_name, df.pop(column_name))

df.head()

In [None]:
#Classifying of said features
# Rows are processed in file order. A row with a missing Name acts as a
# separator: it restarts the rank counter and flips the league, so the rows
# that follow it are numbered 1, 2, 3, ... within the other league.
rnk = 0
lg = "NL"

for row in df.itertuples(index=True):
    index = row.Index
    name = row.Name
    year = row.Year
    era = row.ERA

    if pd.isna(name):
        rnk = 0
        lg = "AL" if lg == "NL" else "NL"

    # Only the row ranked 1 within its block is flagged as the MVP.
    is_winner = (rnk == 1)
    # A missing ERA marks the row as a batter; any ERA value marks a pitcher.
    is_batter = pd.isna(era)

    df.at[index, 'Rank'] = rnk
    df.at[index, 'IsMvp'] = "Yes" if is_winner else "No"
    df.at[index, 'IsMvp_Int'] = 1 if is_winner else 0
    df.at[index, 'Pos'] = "Batter" if is_batter else "Pitcher"
    df.at[index, 'Pos_Int'] = 1 if is_batter else 0
    df.at[index, 'Lg'] = lg
    # Group key: league followed by the year's string form minus its first two characters.
    df.at[index, 'LgYear'] = lg + str(year)[2:]

    rnk += 1

In [None]:
#Drop rows without a name
# Rows with a missing Name are discarded; every other row is kept.
df = df.dropna(subset=['Name'])
df.head()

In [None]:
#Write to new .csv file
# Same location and year range as the input file, with a "_cleaned" suffix.
file_path_2 = f"{path}{start_year}_{end_year}_cleaned.csv"
df.to_csv(file_path_2, index=False)

In [None]:
# Reload the cleaned file from disk so the rest of the notebook works from the saved CSV,
# then show its (rows, columns) shape.
df = pd.read_csv(file_path_2)
df.shape

In [None]:
# Boolean masks shared by the four league / outcome subsets below
won_mvp = df["IsMvp"] == "Yes"
lost_mvp = df["IsMvp"] == "No"
in_al = df["Lg"] == "AL"
in_nl = df["Lg"] == "NL"

al_mvp_winners = df[won_mvp & in_al]
nl_mvp_winners = df[won_mvp & in_nl]

not_al_mvp_winners = df[lost_mvp & in_al]
not_nl_mvp_winners = df[lost_mvp & in_nl]

# Number of non-winning vote-getters per year, per league
not_al_mvp_winners_cnt = not_al_mvp_winners.groupby("Year").size()
not_nl_mvp_winners_cnt = not_nl_mvp_winners.groupby("Year").size()

# Report the yearly average for each league, rounded to two decimals
for league_code, yearly_counts in (("AL", not_al_mvp_winners_cnt), ("NL", not_nl_mvp_winners_cnt)):
    print(f"Avg number of {league_code} non-winners in race from {start_year} to {end_year}: {round(yearly_counts.mean(), 2)!s}")

In [None]:
# Split the frame by position
pitchers = df[df["Pos"] == "Pitcher"]
batters = df[df["Pos"] == "Batter"]

# Yearly head-count for each position
pitcher_cnt = pitchers.groupby("Year").size()
batter_cnt = batters.groupby("Year").size()

# Draw both yearly series on a single set of axes
fig, ax = plt.subplots(figsize=(10, 5))
for yearly_counts, line_color, line_label in (
    (pitcher_cnt, 'blue', 'Pitchers'),
    (batter_cnt, 'red', 'Batters'),
):
    ax.plot(yearly_counts.index, yearly_counts.values, marker='o', linestyle='-',
            color=line_color, label=line_label)
ax.set_xlabel('Year')
ax.set_ylabel('Count')
ax.set_title(f'Count of Pitchers vs Batters to get MVP Votes by Year ({start_year} to {end_year})')
ax.legend()
ax.grid(True)
plt.show()

# Relationship between the two yearly series (pandas aligns them on Year)
correlation = pitcher_cnt.corr(batter_cnt)
print("Correlation between pitcher count and batter count:", correlation)
covariance = pitcher_cnt.cov(batter_cnt)
print("Covariance between pitcher count and batter count:", covariance, "\n")

In [None]:

# Filter the DataFrame for AL and NL pitchers and batters
al_pitchers = df[(df["Pos"] == "Pitcher") & (df["Lg"] == "AL")]
nl_pitchers = df[(df["Pos"] == "Pitcher") & (df["Lg"] == "NL")]

al_batters = df[(df["Pos"] == "Batter") & (df["Lg"] == "AL")]
nl_batters = df[(df["Pos"] == "Batter") & (df["Lg"] == "NL")]

# Group by year and count the number of pitchers and batters for AL and NL
# NOTE(review): a year in which a league has no rows for a position is absent
# from these counts rather than recorded as 0 - confirm that is intended for
# the correlations below and for any averages computed from these series.
al_pitcher_cnt = al_pitchers.groupby("Year").size()
nl_pitcher_cnt = nl_pitchers.groupby("Year").size()

al_batter_cnt = al_batters.groupby("Year").size()
nl_batter_cnt = nl_batters.groupby("Year").size()

# Two panels side by side. A 1x2 grid lets each panel use the full figure
# height; a 3x2 grid would squeeze both plots into the top third of the figure.
fig, (pitcher_ax, batter_ax) = plt.subplots(1, 2, figsize=(15, 6))

# Plotting AL vs NL pitchers
pitcher_ax.plot(al_pitcher_cnt.index, al_pitcher_cnt.values, marker='o', linestyle='-', color='blue', label='AL Pitchers')
pitcher_ax.plot(nl_pitcher_cnt.index, nl_pitcher_cnt.values, marker='o', linestyle='-', color='red', label='NL Pitchers')
pitcher_ax.set_xlabel('Year')
pitcher_ax.set_ylabel('Count')
pitcher_ax.set_title('Count of Pitchers to get MVP Votes by Year (AL vs NL)')
pitcher_ax.legend()
pitcher_ax.grid(True)

# Plotting AL vs NL batters
batter_ax.plot(al_batter_cnt.index, al_batter_cnt.values, marker='o', linestyle='-', color='blue', label='AL Batters')
batter_ax.plot(nl_batter_cnt.index, nl_batter_cnt.values, marker='o', linestyle='-', color='red', label='NL Batters')
batter_ax.set_xlabel('Year')
batter_ax.set_ylabel('Count')
batter_ax.set_title('Count of Batters to get MVP Votes by Year (AL vs NL)')
batter_ax.legend()
batter_ax.grid(True)

# Keep titles and axis labels of the two panels from overlapping
fig.tight_layout()

# Compare the leagues' yearly counts (pandas aligns the two series on Year)
correlation = al_pitcher_cnt.corr(nl_pitcher_cnt)
print("Correlation between pitcher counts between leagues:", correlation)
covariance = al_pitcher_cnt.cov(nl_pitcher_cnt)
print("Covariance between pitcher counts between leagues:", covariance, "\n")

correlation = al_batter_cnt.corr(nl_batter_cnt)
print("Correlation between batter counts between leagues:", correlation)
covariance = al_batter_cnt.cov(nl_batter_cnt)
print("Covariance between batter counts between leagues:", covariance,"\n")

plt.show()

In [None]:
# Mean yearly head-count for every league / position pairing
al_batter_avg, nl_batter_avg = al_batter_cnt.mean(), nl_batter_cnt.mean()
al_pitcher_avg, nl_pitcher_avg = al_pitcher_cnt.mean(), nl_pitcher_cnt.mean()

# Bar label -> average, listed in the left-to-right order the bars are drawn
average_by_label = {
    'AL Pitchers': al_pitcher_avg,
    'NL Pitchers': nl_pitcher_avg,
    'AL Batters': al_batter_avg,
    'NL Batters': nl_batter_avg,
}
leagues = list(average_by_label)
averages = list(average_by_label.values())

# One bar per pairing
plt.bar(leagues, averages, color=['blue', 'red', 'green', 'orange'])
plt.xlabel('League and Position')
plt.ylabel('Average Number')
plt.title('Average Number of Pitchers and Batters to get MVP Votes by League')

# Write each rounded average just above its bar
for bar_position in range(len(averages)):
    bar_height = averages[bar_position]
    plt.text(bar_position, bar_height, round(bar_height, 2), ha='center', va='bottom')
plt.show()

In [None]:
df['IsMvp_Int'] = df['IsMvp_Int'].astype(int)

# Take the first row of every LgYear group.
# groupby().head(1) selects those rows directly, which avoids calling a Python
# lambda per group through groupby().apply() and the deprecation warning newer
# pandas versions emit for apply() on frames that include the grouping column.
# NOTE(review): this assumes the first row of each group is the rank 1 player - confirm row order.
group_leaders = df.groupby('LgYear').head(1)

# Boolean Series indexed by LgYear: True where the group's first row is a pitcher
pitcher_groups = group_leaders.set_index('LgYear')['Pos'].eq('Pitcher').sort_index()

# LgYear labels of the groups to remove
indices_to_remove = pitcher_groups[pitcher_groups].index

# Drop every row belonging to one of those groups
df_filtered = df[~df['LgYear'].isin(indices_to_remove)]

# From the remaining groups, filter out all pitchers
df_filtered = df_filtered[df_filtered['Pos'] != 'Pitcher']

In [None]:
# Preview the first rows that remain after the pitcher filtering above
df_filtered.head()

In [None]:
# Columns removed before modelling, grouped for readability.
# NOTE(review): the group names are a reading of the column names - confirm against the data dictionary.
pitching_columns = ['ERA', 'WHIP', 'G.1', 'GS', 'SV', 'IP', 'H.1', 'HR.1', 'BB.1', 'SO', 'W', 'L']
identifier_columns = ['Name', 'Year', 'Lg', 'Tm', 'Pos', 'Pos_Int']
voting_columns = ['1st Place', 'Vote Pts', 'Rank', 'IsMvp', 'IsMvp_Int']

columns_to_drop = pitching_columns + identifier_columns + voting_columns
df = df_filtered.drop(columns=columns_to_drop)
df.head()

In [None]:
# Convert percentage strings into integers:
# strip the trailing '%' first, then cast what is left to int
share_without_percent = df['Share'].str.rstrip('%')
df['Share'] = share_without_percent.astype(int)
df.head()

In [None]:
# Define the columns to normalize
columns_to_normalize = ['Share','WAR', 'G', 'AB', 'R', 'H', 'HR', 'RBI', 'SB', 'BB', 'BA', 'OBP', 'SLG', 'OPS']

# Define a function for normalization
def normalize(group):
    """Min-max scale ``group`` so its smallest value maps to 0 and its largest to 1.

    Parameters
    ----------
    group : pandas.Series or pandas.DataFrame
        Numeric values of one group. A DataFrame is scaled column by column.

    Returns
    -------
    Same type as ``group``
        ``(group - min) / (max - min)``. Where every value is identical
        (max == min) the result is 0.0 rather than the NaN that 0/0 would give,
        so a constant column cannot inject NaNs into later steps.
        Missing values in the input stay missing.
    """
    lowest = group.min()
    spread = group.max() - lowest

    # Swap a zero range for 1 so constant data scales to 0.0 instead of 0/0.
    if isinstance(spread, pd.Series):
        # DataFrame input: one range per column
        spread = spread.replace(0, 1)
    elif spread == 0:
        # Series input: a single scalar range
        spread = 1

    return (group - lowest) / spread

# Scale the selected columns separately inside every LgYear group
scaled_columns = df.groupby('LgYear')[columns_to_normalize].transform(normalize)
df[columns_to_normalize] = scaled_columns

# LgYear was only needed as the grouping key
df = df.drop(columns=['LgYear'])
df.head()

In [None]:
# Define the features (X) and target variable (y)
X = df.drop(columns=['Share'])
y = df['Share']

# Seed NumPy and TensorFlow so weight initialisation and training are
# repeatable between runs (the train/test split is already seeded below).
np.random.seed(42)
tf.random.set_seed(42)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features using Min-Max scaling.
# The scaler is fitted on the training split only and then reused on the test split.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the TensorFlow model: two 64-unit ReLU layers and a single linear output
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model; 20% of the training split is held out for validation
history = model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2)

# Evaluate the model on the untouched test split
loss, mae = model.evaluate(X_test_scaled, y_test)

# Get the training history
train_loss = history.history['loss']
val_loss = history.history['val_loss']
train_mae = history.history['mae']
val_mae = history.history['val_mae']
epochs = range(1, len(train_loss) + 1)

# Plot the training and validation loss
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(epochs, train_loss, 'b', label='Training Loss')
plt.plot(epochs, val_loss, 'r', label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

# Plot the training and validation MAE
plt.subplot(1, 2, 2)
plt.plot(epochs, train_mae, 'b', label='Training MAE')
plt.plot(epochs, val_mae, 'r', label='Validation MAE')
plt.title('Training and Validation MAE')
plt.xlabel('Epochs')
plt.ylabel('MAE')
plt.legend()

# Lay out and show the two-panel figure now, while it is still the current
# figure; calling tight_layout() after the later figures would not affect it.
plt.tight_layout()
plt.show()

# Predict on the test set
y_pred = model.predict(X_test_scaled).flatten()

# Calculate the residuals
residuals = y_test - y_pred

# Plot the predicted values against the actual values
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred)
plt.plot([0, 1], [0, 1], color='red', linestyle='--')  # Diagonal line for reference
plt.xlabel('Actual Share')
plt.ylabel('Predicted Share')
plt.title('Actual vs. Predicted Share')
plt.grid(True)
plt.show()

# Plot the distribution of residuals
plt.figure(figsize=(8, 6))
plt.hist(residuals, bins=20)
plt.xlabel('Residuals (Actual - Predicted)')
plt.ylabel('Frequency')
plt.title('Distribution of Residuals')
plt.grid(True)
plt.show()

print("Mean Absolute Error:", mae)