In [24]:
import pandas as pd
import re

def _squash(value):
    """Return ``value`` as text with all whitespace removed.

    Missing cells (NaN/None) become an empty string, so no placeholder value
    ever has to be stripped back out of the text afterwards.
    """
    if pd.isna(value):
        return ""
    return "".join(str(value).split())


def _clean_batsman_cell(batsman, how_out_parts):
    """Remove the dismissal text from a batsman cell and re-space the name.

    :param batsman: Raw content of the batsman cell (name followed by clutter)
    :param how_out_parts: The cells of the "how out" columns for the same row
    :return: The batsman's name with a space before each capital letter
    """
    name = _squash(batsman)
    how_out = "".join(_squash(part) for part in how_out_parts)
    if how_out:
        name = name.replace(how_out, "")

    # Zero-width lookarounds (instead of consuming groups) so that runs of
    # capitals such as initials are split at every boundary, not every other.
    return re.sub(r"(?<=\w)(?=[A-Z])", " ", name)


def batting_to_df(match_url, table_index=4):
    """Function that reads a batting scorecard from PlayCricket.com and converts it to a Pandas dataframe

    The batsman column of the scraped table also contains the dismissal text
    that is repeated in the two following (unnamed) columns. That clutter is
    removed from the batsman column, the two redundant columns are dropped and
    the runs/balls columns are renamed.

    :param match_url: The URL of the scorecard on PlayCricket.com (anything
        accepted by ``pandas.read_html``, e.g. a file-like object, also works)
    :param int table_index: Position of the batting table among the tables on
        the page. Per the original notes, the first team's batting scorecard
        has index 1 and the second team's has index 4 (the default).
    :return: The cleaned batting scorecard, with missing values replaced by 0
    :raises IndexError: If the page has no table at ``table_index``
    """

    # Fetch all the tables on the page and store them in a list of dataframes
    tables = pd.read_html(match_url)
    if not -len(tables) <= table_index < len(tables):
        raise IndexError(
            f"table_index {table_index} is out of range: "
            f"the page contains {len(tables)} table(s)"
        )

    # Get the batting scorecard
    raw = tables[table_index]

    # Clean the cells in the batsman column. This is done on the raw table,
    # before NaNs are replaced, so that empty dismissal cells are recognised
    # as missing rather than as the text "0"/"0.0". Every row is processed,
    # however many batsmen the scorecard lists.
    clean_cells = [
        _clean_batsman_cell(row[0], row[1:3])
        for row in raw.iloc[:, :3].itertuples(index=False, name=None)
    ]

    # Debug output retained from the original implementation
    print(clean_cells)

    # Replace any NaN values with 0s and update the batsman column
    df = raw.fillna(0).assign(BATSMAN=clean_cells)

    # Remove the two redundant columns (the unnamed columns)
    df = df.drop(df.columns[[1, 2]], axis=1)

    # Change the names of the runs and balls columns
    return df.rename(columns={'RUNSR': 'RUNS', 'BALLSB': 'BALLS'})

# Alternative example scorecards, currently disabled:
# batting_to_df("https://uniofwarwick.play-cricket.com/website/results/4055612")
# batting_to_df("https://englandseniors.play-cricket.com/website/results/4529883")
# Active example: fetches this scorecard over the network and returns the
# cleaned dataframe. NOTE(review): this looks like a notebook cell, where the
# value of the final bare expression is what gets displayed -- keep the call last.
batting_to_df("https://mudeford.play-cricket.com/website/results/4515077")



['Dave Vines', 'Rich Wakelin', 'I Mayhew', 'Simon Mapstone', 'Sam Alban', 'Ryan Attree', 'Liam Pettengell', 'Jonny Worsfold', 'Dave Allen', 'Unsure', 'Unsure']


Unnamed: 0,BATSMAN,RUNS,BALLS,4s,6s,SR
0,Dave Vines,3.0,0.0,0.0,0.0,0.0
1,Rich Wakelin,22.0,0.0,0.0,0.0,0.0
2,I Mayhew,16.0,0.0,0.0,0.0,0.0
3,Simon Mapstone,50.0,0.0,0.0,0.0,0.0
4,Sam Alban,22.0,0.0,0.0,0.0,0.0
5,Ryan Attree,72.0,0.0,0.0,0.0,0.0
6,Liam Pettengell,19.0,0.0,0.0,0.0,0.0
7,Jonny Worsfold,0.0,0.0,0.0,0.0,0.0
8,Dave Allen,0.0,0.0,0.0,0.0,0.0
9,Unsure,0.0,0.0,0.0,0.0,0.0
