In [42]:
import requests
import re
from bs4 import BeautifulSoup

def get_player_info(root_url, page_amount):
    """Scrape player batting rows from the paginated stats table at ``root_url``.

    Pages ``1 .. page_amount`` are fetched as ``root_url + "?page=<n>"``. For
    every ``<tr>`` of the first ``<table>`` on each page, a dict is built with
    the player's name (taken from the 1st and 3rd ``<span>`` inside the row's
    first ``<a>``) and the text of selected ``<td>`` cells.

    Args:
        root_url: Base URL of the stats listing (without the ``page`` query).
        page_amount: Number of pages to fetch, starting at page 1.

    Returns:
        A list of dicts with string values. Complete rows carry the keys
        ``player_name, games, ab, h, hr, bb, so, avg, obp, slg``. Rows that
        yield no data at all (e.g. header rows) are omitted; rows that yield
        only part of the data are kept as partial dicts.

    Raises:
        requests.HTTPError: if a page responds with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    # Result key -> index of the <td> cell that holds the value.
    stat_columns = (
        ('games', 1),
        ('ab', 2),
        ('h', 4),
        ('hr', 7),
        ('bb', 9),
        ('so', 10),
        ('avg', 13),
        ('obp', 14),
        ('slg', 15),
    )
    highest_index = max(index for _, index in stat_columns)

    players = []
    # Look through each page
    for page in range(1, page_amount + 1):
        # requests has no default timeout; without one a stalled server hangs the cell.
        r = requests.get(root_url + "?page={}".format(page), timeout=30)
        # Fail loudly instead of parsing an error page as if it were data.
        r.raise_for_status()
        # Name the parser explicitly so results do not depend on which optional
        # parsers happen to be installed (and to avoid bs4's parser warning).
        soup = BeautifulSoup(r.text, 'html.parser')
        table = soup.find('table')
        if table is None:
            # Page without a stats table: nothing to collect.
            continue

        for info in table.find_all('tr'):
            player = {}

            # Get Player Name
            link = info.find('a')
            if link is not None:
                spans = link.find_all('span')
                # The name is split across the 1st and 3rd span of the link.
                if len(spans) > 2:
                    player['player_name'] = spans[0].get_text() + ' ' + spans[2].get_text()

            # Gets stats for player
            row_info = info.find_all('td')
            # Only read the cells when the row is wide enough for every index.
            if len(row_info) > highest_index:
                for key, index in stat_columns:
                    player[key] = row_info[index].get_text()

            # Header/spacer rows produce nothing; do not record empty entries.
            if player:
                players.append(player)

    return players


# Training seasons: 2010 through 2017 (2010 is read over 7 pages, the rest over 6)
train_set = [str(year) for year in range(2010, 2018)]
train_dict = {}
for each in train_set:
    pages = 7 if each == '2010' else 6
    train_dict[each] = get_player_info('https://www.mlb.com/stats/' + each, pages)

# Test seasons: 2018 and 2019, 6 pages each
test_set = ['2018', '2019']
test_dict = {}
for each in test_set:
    test_dict[each] = get_player_info('https://www.mlb.com/stats/' + each, 6)

# Season to predict: the bare stats URL (no year suffix), stored under '2020'
run_dict = {'2020': get_player_info('https://www.mlb.com/stats/', 6)}

In [43]:
import numpy as np
import pandas as pd

# Transforms our data into a dataframe
def transform_to_df(dict_to_map):
    """Combine per-key record lists into a single DataFrame without missing values.

    Args:
        dict_to_map: Mapping whose values are each accepted by ``pd.DataFrame``
            (here: lists of per-player dicts). The keys are not stored in the
            result.

    Returns:
        One DataFrame holding the rows of every value, stacked in the
        mapping's iteration order and renumbered with a fresh integer index,
        after dropping every row that contains a missing value. An empty
        mapping yields an empty DataFrame.
    """
    frames = [pd.DataFrame(records) for records in dict_to_map.values()]
    if not frames:
        # pd.concat raises on an empty list; keep the empty-frame result instead.
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; a single concat is the
    # supported equivalent and avoids re-copying the frame on every iteration.
    return pd.concat(frames, ignore_index=True).dropna()


# Build one cleaned DataFrame per dataset, in this order:
#   train_df - all training values
#   test_df  - all test values
#   run_df   - values used to predict HomeRuns
train_df, test_df, run_df = (
    transform_to_df(season_records)
    for season_records in (train_dict, test_dict, run_dict)
)

Unnamed: 0,player_name,games,ab,h,hr,bb,so,avg,obp,slg
1,Josh Hamilton,133,518,186,32,43,95,.359,.411,.633
2,Miguel Cabrera,150,548,180,38,89,95,.328,.420,.622
3,Joey Votto,150,547,177,37,91,125,.324,.424,.600
4,Albert Pujols,159,587,183,42,103,76,.312,.414,.596
5,Jose Bautista,161,569,148,54,100,116,.260,.378,.617
...,...,...,...,...,...,...,...,...,...,...
1203,Dansby Swanson,144,488,113,6,59,120,.232,.312,.324
1204,Billy Hamilton,139,582,144,4,44,133,.247,.299,.335
1205,Alcides Escobar,162,599,150,6,15,102,.250,.272,.357
1206,Jose Peraza,143,487,126,5,20,70,.259,.297,.324


In [64]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Predictor columns and target column, shared by all three datasets so the
# model is always fitted and applied on the same features in the same order.
feature_columns = ['games', 'ab', 'h', 'bb', 'so', 'avg', 'obp', 'slg']
target_columns = ['hr']

# Training set
x_train = train_df[feature_columns]
y_train = train_df[target_columns]

# Testing Set
x_test = test_df[feature_columns]
y_test = test_df[target_columns]

# Prediction Set may need to scale for full season
x_run = run_df[feature_columns]
y_run = run_df[target_columns]

model = LinearRegression()
model.fit(x_train, y_train)
# The target was passed as a one-column frame, so each prediction is a
# one-element row; its first element is the predicted value.
predictions = model.predict(x_run)
# Pair each player with their prediction directly instead of keeping a
# separate position counter in step with the loop.
for each, predicted in zip(run_df['player_name'], predictions):
    print(str(each) + " Predicted HR: " + str(predicted[0]))

Juan Soto Predicted HR: 31.476805024286136
Freddie Freeman Predicted HR: 26.34898380824457
Marcell Ozuna Predicted HR: 26.98076209698877
DJ LeMahieu Predicted HR: 12.476515382103202
Jose Ramirez Predicted HR: 31.296455241794277
Mike Trout Predicted HR: 32.106866950087266
Dominic Smith Predicted HR: 26.546629897269867
Nelson Cruz Predicted HR: 25.69010396038484
Ronald Acuna Predicted HR: 33.00448289308504
Jose Abreu Predicted HR: 28.750912952614456
Trea Turner Predicted HR: 19.62087814217766
Mike Yastrzemski Predicted HR: 22.45049475676162
Bryce Harper Predicted HR: 23.719064212890636
Wil Myers Predicted HR: 31.48086572517854
Manny Machado Predicted HR: 24.522238406593473
Luke Voit Predicted HR: 35.02205428536483
Corey Seager Predicted HR: 24.320193395451398
Fernando Tatis Jr. Predicted HR: 28.420625906790256
Mookie Betts Predicted HR: 23.57972528892092
Michael Conforto Predicted HR: 8.330290606842603
Teoscar Hernandez Predicted HR: 26.367133758895022
Brandon Lowe Predicted HR: 26.08592