## Importing libraries

In [1]:
!pip install webdriver_manager
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import pandas as pd



## Defining functions and initializing web driver 

In [2]:
# Function for Web Scraping using Selenium
def scrape_table_data(driver, xpath): ##passing two elements - driver and xpath
    table = driver.find_element(By.XPATH, xpath) ##using x_path with By.Xpath and getting data
    return [row.text.split('\n') for row in table.find_elements(By.TAG_NAME, "tr")] ##Structuring all data from tags(trs)

# Function to click a button if present and clickable - important for cookies buttons and switching pages when on the same url
def click_button_if_present(driver, xpath): 
    try:
        button = driver.find_element(By.XPATH, xpath)
        if button.is_enabled():
            button.click()
            return True
    except:
        return False

# Set up the Chrome driver
s = Service(ChromeDriverManager().install())


# Scrape data for the year 2023 only
year = 2023
driver = webdriver.Chrome(service=s)
driver.get(f"https://www.mlssoccer.com/stats/players/#season={year}&competition=mls-regular-season&club=all&statType=goalkeeping&position=goalkeeper")
###Going to the url
driver.maximize_window()

# Accept cookies button
click_button_if_present(driver, '//*[@id="onetrust-accept-btn-handler"]')
time.sleep(3)

# Get initial table data
xpath = "//*[@id='main-content']/section/div/div[2]/div[1]/table/tbody"
first_page = scrape_table_data(driver, xpath) ###first_page separeately so it won't be deleted 
table_data = scrape_table_data(driver, xpath) ###getting all tables in this list
time.sleep(3)

new_data = []
last_page = []
max_iterations = 4  # Maximum number of page iterations (knowing that there are 4 pages on URL)

# Loop through pages
button_number = 2
iteration = 1
while iteration < max_iterations and click_button_if_present(driver, f"//div[@class='mls-o-pagination']/button[{button_number}]/div"):
    ### iterating through pages
    time.sleep(5)
    new_data += scrape_table_data(driver, xpath) ###Adding the table from each page

    # Check if the new data is the same as the previous data
    if new_data == table_data:
        print("No new data. Exiting the loop.")
        break ###Breaking the loop if it keeps getting the same data 

    iteration += 1
    

# Quit the driver
driver.quit()

## Getting first page table together with tables from other pages

In [3]:
df_list = first_page + new_data
df_list

[['M. Anchor', 'VAN 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'],
 ['CJ dos Santos', 'MIA 1 1 90 1 1 3 75 0 0 68 25 2 0 0 1 0 0'],
 ['L. Barraza',
  'NYC 24 24 2160 31 1.29 62 66.7 2 2 79.6 37.5 16 4 5 9 10 20.8'],
 ['G. Beavers', 'RSL 7 6 585 14 2.15 13 48.1 2 2 67.1 42.4 5 0 2 3 2 33.3'],
 ['J. Bendik', 'PHI 8 7 687 13 1.7 15 53.6 1 1 58.2 42 3 1 3 4 1 42.9'],
 ['M. Bersano', 'ATX 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'],
 ['R. Bilichuk', 'POR 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'],
 ['D. Bingham',
  'POR 19 19 1709 29 1.53 49 62.8 2 2 63.9 41.1 28 5 8 6 5 42.1'],
 ['A. Blake', 'PHI 27 27 2373 28 1.06 73 72.3 5 5 64.7 40.6 17 8 13 5 9 48.1'],
 ['I. Boehmer', 'VAN 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'],
 ['J. Bond', 'LA 24 24 2049 46 2.02 80 63.5 2 2 80.3 44 31 4 6 8 10 25'],
 ['A. Bono', 'DC 10 9 855 11 1.16 29 72.5 2 2 67.2 43.4 6 5 3 3 4 33.3'],
 ['C. Brady',
  'CHI 30 30 2640 44 1.5 94 68.1 10 9 60.5 36.1 33 8 10 11 9 33.3'],
 ['R. Bürki',
  'STL 33 33 2970 42 1.27 123 74.5 7 7 57.3 34.3 42 8 17 11 5

## Structuring data

In [4]:
split_teams = [item[1].split() for item in df_list]

# Define column names for the DataFrame
column_names = ['Club','GP','GS','Mins','GA','GAA','SV','SV%','PKA','PKC',
                'Pass%','LB%','CLR','CS','W','L','T','W%']
###Giveing Column Names by MLS official Website
# Create the DataFrame
df = pd.DataFrame(split_teams, columns=column_names)

# Add the "Player" column to the DataFrame
df['Player'] = [item[0] for item in df_list]
df['Season'] = 'Regular'

# Display the DataFrame
print(df)

    Club  GP  GS  Mins  GA   GAA  SV   SV% PKA PKC Pass%   LB% CLR  CS   W  \
0    VAN   0   0     0   0     0   0     0   0   0     0     0   0   0   0   
1    MIA   1   1    90   1     1   3    75   0   0    68    25   2   0   0   
2    NYC  24  24  2160  31  1.29  62  66.7   2   2  79.6  37.5  16   4   5   
3    RSL   7   6   585  14  2.15  13  48.1   2   2  67.1  42.4   5   0   2   
4    PHI   8   7   687  13   1.7  15  53.6   1   1  58.2    42   3   1   3   
..   ...  ..  ..   ...  ..   ...  ..   ...  ..  ..   ...   ...  ..  ..  ..   
110   NE   0   0     0   0     0   0     0   0   0     0     0   0   0   0   
111  ATL   6   6   495  10  1.82  10    50   2   2  86.9  55.1   5   1   2   
112  NSH  32  32  2880  31  0.97  99  76.2   5   5  60.5  40.3  11  10  13   
113  COL  20  20  1800  25  1.25  67  72.8   2   1  69.8  37.1  22   7   3   
114   DC   0   0     0   0     0   0     0   0   0     0     0   0   0   0   

      L   T    W%         Player   Season  
0     0   0     0  

## Exporting data to excel

In [None]:
df.to_excel('2023_data.xlsx', index=False)