Description: This program compares the accuracy obtained with formatted data against formatted-and-normalized data in a predictive neural network called Long Short-Term Memory (LSTM), which predicts the result of a baseball game from input statistics.
Sources: https://www.youtube.com/watch?v=QIUxPv5PJOY

In [None]:
import pandas as pd
from sklearn import metrics
import matplotlib.pyplot as plt
import math
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM
import matplotlib.pyplot as plt
from random import sample
from sklearn.metrics import r2_score


In [None]:
# Normalized dataset: the only frame the model functions in the visible cells read.
df_normalized = pd.read_csv(filepath_or_buffer='../csv_files/mlb_normalized.csv')
# Formatted (un-normalized) dataset; not referenced by the visible cells.
df_base = pd.read_csv(filepath_or_buffer='../csv_files/mlb_formatted.csv')

In [None]:
def get_model_accuracy_year(year):
    """Train two LSTM score regressors on one season and return win/loss accuracy.

    Rows of the module-level ``df_normalized`` whose ``season`` equals ``year``
    are selected. Columns 1-12 (by position) are the model inputs, column 14
    and column 15 are the regression targets of the first and second model,
    and column 16 holds the label the derived prediction is compared against.
    Column meanings are inferred from the variable names only
    (score_1 / score_2 / results) -- TODO confirm against the CSV header.

    A game is classified as 1 when the first model's prediction is lower than
    the second model's prediction, otherwise 0.

    Note that the models are fitted and evaluated on the same rows, so the
    returned value is an in-sample accuracy.

    Args:
        year: Season to select from ``df_normalized['season']``.

    Returns:
        Accuracy as a percentage (``accuracy_score * 100``).

    Raises:
        ValueError: If ``df_normalized`` contains no rows for ``year``.
    """
    season_rows = df_normalized[df_normalized['season'] == year]
    if season_rows.empty:
        raise ValueError('no games found for season {}'.format(year))

    features = season_rows.iloc[:, 1:13]
    target_score_1 = season_rows.iloc[:, 14:15]
    target_score_2 = season_rows.iloc[:, 15:16]
    # Labels must come from the same season rows as the features; slicing the
    # head of the full dataset would score this season against other games.
    actual_results = season_rows.iloc[:, 16:17].values.ravel()

    n_features = features.shape[1]

    def build_score_model():
        # Each feature column is fed to the LSTM as one timestep of width 1.
        # NOTE(review): the frame passed to fit/predict is 2-D (rows, features);
        # this relies on Keras expanding it to (rows, features, 1) -- confirm
        # for the installed Keras version.
        model = Sequential()
        model.add(LSTM(50, return_sequences=True, input_shape=(n_features, 1)))
        model.add(LSTM(50, return_sequences=False))
        model.add(Dense(25))
        model.add(Dense(1))
        model.compile(optimizer='adam', loss='mse')
        return model

    model_score_1 = build_score_model()
    model_score_2 = build_score_model()

    model_score_1.fit(features, target_score_1, batch_size=10000, epochs=1)
    model_score_2.fit(features, target_score_2, batch_size=10000, epochs=1)

    predictions_score_1 = model_score_1.predict(features)
    predictions_score_2 = model_score_2.predict(features)

    # Vectorized form of "1 if score_1 < score_2 else 0" for every game.
    generated_results = (
        np.ravel(predictions_score_1) < np.ravel(predictions_score_2)
    ).astype(int)

    # accuracy_score expects (y_true, y_pred).
    accuracy = metrics.accuracy_score(actual_results, generated_results)
    return accuracy * 100


In [None]:
def get_model_accuracy_all_years():
    """Train two LSTM score regressors on all seasons and score each season.

    The whole module-level ``df_normalized`` frame is used for training:
    columns 1-12 (by position) are the inputs, column 14 and column 15 are the
    regression targets of the first and second model, and column 16 holds the
    label the derived prediction is compared against. Column meanings are
    inferred from the variable names only -- TODO confirm against the CSV
    header.

    After training, every season from 1913 through 2020 is evaluated
    separately: a game is classified as 1 when the first model's prediction is
    lower than the second model's prediction, otherwise 0. Evaluation rows are
    part of the training data, so the accuracies are in-sample.

    Returns:
        dict mapping season -> accuracy in percent. Seasons without any rows
        in ``df_normalized`` are omitted.
    """
    all_rows = df_normalized
    features = all_rows.iloc[:, 1:13]
    target_score_1 = all_rows.iloc[:, 14:15]
    target_score_2 = all_rows.iloc[:, 15:16]
    # Take the season column from the full frame so the per-season filter works
    # whether or not 'season' is one of the feature columns.
    seasons = all_rows['season']

    n_features = features.shape[1]

    def build_score_model():
        # Each feature column is fed to the LSTM as one timestep of width 1.
        # NOTE(review): the frame passed to fit/predict is 2-D (rows, features);
        # this relies on Keras expanding it to (rows, features, 1) -- confirm
        # for the installed Keras version.
        model = Sequential()
        model.add(LSTM(50, return_sequences=True, input_shape=(n_features, 1)))
        model.add(LSTM(50, return_sequences=False))
        model.add(Dense(25))
        model.add(Dense(1))
        model.compile(optimizer='adam', loss='mse')
        return model

    model_score_1 = build_score_model()
    model_score_2 = build_score_model()

    model_score_1.fit(features, target_score_1, batch_size=1000, epochs=15)
    model_score_2.fit(features, target_score_2, batch_size=1000, epochs=15)

    accuracy_by_year = {}

    for season in range(1913, 2021):
        in_season = seasons == season
        season_features = features[in_season]
        if season_features.empty:
            # Nothing to predict or score for this season.
            continue

        predicted_score_1 = model_score_1.predict(season_features)
        # The second score must come from the second model; predicting both
        # with model_score_1 makes the comparison below always False.
        predicted_score_2 = model_score_2.predict(season_features)

        actual_results = all_rows[in_season].iloc[:, 16:17].values.ravel()

        # Vectorized form of "1 if score_1 < score_2 else 0" for every game.
        generated_results = (
            np.ravel(predicted_score_1) < np.ravel(predicted_score_2)
        ).astype(int)

        # accuracy_score expects (y_true, y_pred).
        accuracy = metrics.accuracy_score(actual_results, generated_results)
        accuracy_by_year[season] = accuracy * 100

    return accuracy_by_year

In [None]:
# Per-season accuracy (percent) keyed by season, using a model pair trained
# on that season alone.
data = {}
for season in range(1913, 2021):
    # Call exactly once per season: every call trains two fresh models, so a
    # discarded extra call would double the run time for no benefit.
    data[season] = get_model_accuracy_year(season)

In [None]:
# Per-season accuracy (percent) keyed by season, using one model pair trained
# on all seasons together.
data_all = get_model_accuracy_all_years()

In [None]:
def model_year(accuracy_by_year=None):
    """Plot accuracy per season with a cubic trend line and print its R^2.

    Draws a scatter plot of accuracy against season (each point annotated with
    its season), a degree-3 polynomial fit, a dotted red vertical line at 1967
    and a dotted green horizontal line midway between the lowest and highest
    accuracy. Finally prints the R^2 score of the polynomial fit.

    Args:
        accuracy_by_year: Mapping of season -> accuracy. Defaults to the
            module-level ``data`` dict when omitted.

    Raises:
        ValueError: If the mapping is empty.
    """
    if accuracy_by_year is None:
        accuracy_by_year = data
    if not accuracy_by_year:
        raise ValueError('no accuracy data to plot')

    # Point order does not affect the scatter, the fit or the R^2 score, so a
    # deterministic sorted order is used.
    x, y = zip(*sorted(accuracy_by_year.items()))

    plt.figure(dpi=500)

    plt.ylabel('accuracy')
    plt.xlabel('season')

    plt.ylim(min(y) - 2, max(y) + 2)
    plt.xlim(1910, 2025)

    mymodel = np.poly1d(np.polyfit(x, y, 3))
    myline = np.linspace(1913, 2021, 100)

    for season, accuracy in zip(x, y):
        plt.annotate(season, (season, accuracy), size=4,
                     textcoords='offset points', xytext=(0, 5), ha='center')

    plt.scatter(x, y)
    plt.plot(myline, mymodel(myline))

    # Center of years: (2021 + 1913) / 2 = 1967
    plt.plot([1967, 1967], [0, 100], color='red', linestyle='dotted')

    # Horizontal center between the lowest and highest accuracy.
    horizontal_center = (min(y) + max(y)) / 2
    plt.plot([1900, 2100], [horizontal_center, horizontal_center],
             color='green', linestyle='dotted')

    # plt.show must be called; a bare `plt.show` is a no-op expression.
    plt.show()
    print(r2_score(y, mymodel(x)))
# Plot the accuracies stored in the module-level `data` dict.
model_year()

In [None]:
def model_year_all(accuracy_by_year=None):
    """Plot accuracy per season with a cubic trend line and print its R^2.

    Draws a scatter plot of accuracy against season (each point annotated with
    its season), a degree-3 polynomial fit, a dotted red vertical line at 1967
    and a dotted green horizontal line midway between the lowest and highest
    accuracy. Finally prints the R^2 score of the polynomial fit.

    Args:
        accuracy_by_year: Mapping of season -> accuracy. Defaults to the
            module-level ``data_all`` dict when omitted.

    Raises:
        ValueError: If the mapping is empty.
    """
    if accuracy_by_year is None:
        accuracy_by_year = data_all
    if not accuracy_by_year:
        raise ValueError('no accuracy data to plot')

    # Point order does not affect the scatter, the fit or the R^2 score, so a
    # deterministic sorted order is used.
    x, y = zip(*sorted(accuracy_by_year.items()))

    plt.figure(dpi=500)

    plt.ylabel('accuracy')
    plt.xlabel('season')

    plt.ylim(min(y) - 2, max(y) + 2)
    plt.xlim(1910, 2025)

    mymodel = np.poly1d(np.polyfit(x, y, 3))
    myline = np.linspace(1913, 2021, 100)

    for season, accuracy in zip(x, y):
        plt.annotate(season, (season, accuracy), size=4,
                     textcoords='offset points', xytext=(0, 5), ha='center')

    plt.scatter(x, y)
    plt.plot(myline, mymodel(myline))

    # Center of years: (2021 + 1913) / 2 = 1967
    plt.plot([1967, 1967], [0, 100], color='red', linestyle='dotted')

    # Horizontal center between the lowest and highest accuracy.
    horizontal_center = (min(y) + max(y)) / 2
    plt.plot([1900, 2100], [horizontal_center, horizontal_center],
             color='green', linestyle='dotted')

    # plt.show must be called; a bare `plt.show` is a no-op expression.
    plt.show()
    print(r2_score(y, mymodel(x)))
# Plot the accuracies stored in the module-level `data_all` dict.
model_year_all()