In [14]:
import pandas as pd
import re

# Zero-width position between a word character and the capital letter after it.
# Lookarounds are used so that runs of capitals ("AJDann") are split at every
# boundary instead of only at every other one.
_NAME_BOUNDARY = re.compile(r"(?<=\w)(?=[A-Z])")


def _cell_text(value):
    """Return a scorecard cell as text, mapping missing values (NaN/None) to ''."""
    return "" if pd.isna(value) else str(value)


def _strip_whitespace(text):
    """Return ``text`` with every whitespace character removed."""
    return "".join(text.split())


def _clean_batsman_name(batsman, *how_out_parts):
    """Return the batsman cell without its how-out text, with the names re-spaced."""
    how_out = "".join(_strip_whitespace(_cell_text(part)) for part in how_out_parts)
    name = _strip_whitespace(_cell_text(batsman))
    if how_out:
        name = name.replace(how_out, "")
    # Put a space before any capital letter to separate the names again
    return _NAME_BOUNDARY.sub(" ", name)


def batting_to_df(match_url, table_index=4):
    """Read a batting scorecard from PlayCricket.com and convert it to a Pandas dataframe.

    The batsman cell of the scraped table also contains the how-out text that
    is repeated in the two columns that follow it. That text is removed from
    the batsman cell, the two redundant columns are dropped and any remaining
    missing values are replaced with 0.

    The cleaned names are printed before the dataframe is returned.

    :param match_url: The URL of the scorecard on PlayCricket.com (anything accepted
        by ``pandas.read_html`` works, e.g. a file-like object)
    :param int table_index: Position of the batting scorecard among the tables on the
        page. The first team's scorecard has index 1, the second team's has index 4
        (the default).
    :return: The scorecard with a cleaned ``BATSMAN`` column and without the two
        how-out columns
    :rtype: pandas.DataFrame
    :raises IndexError: If the page has no table at ``table_index`` or the table has
        fewer than three columns
    """
    # Fetch all the tables on the page and pick the requested batting scorecard
    tables = pd.read_html(match_url)
    scorecard = tables[table_index]

    # Clean every row of the table (not a fixed 11) using the raw cell values, so that
    # missing how-out cells count as empty text rather than as the digit 0
    names = [
        _clean_batsman_name(*cells)
        for cells in scorecard.iloc[:, :3].itertuples(index=False, name=None)
    ]

    print(names)

    # Drop the two redundant how-out columns first so that only the remaining
    # columns have their missing values replaced with 0
    return (
        scorecard.drop(scorecard.columns[[1, 2]], axis=1)
        .assign(BATSMAN=names)
        .fillna(0)
    )

# Example run: build the batting scorecard for a single match. This looks like the last
# expression of a notebook cell, so the returned dataframe is presumably what is shown
# in the cell output.
batting_to_df("https://uniofwarwick.play-cricket.com/website/results/4055612")
# Scorecard URLs kept for reference (presumably alternative inputs to try):
# "https://englandseniors.play-cricket.com/website/results/4529883"
# "https://uniofwarwick.play-cricket.com/website/results/4055612"

['Sohayl Ujoodia', 'Greg Dann', 'Joe Randall', 'Sacha Abbasi', 'Charlie Royle', 'Dan Lewis', 'Parth Mannikar', 'Krishan Sachdeva', 'Kieron Patel', 'Jabez Weale', 'Pruthvish Anandpura']


Unnamed: 0,BATSMAN,RUNSR,BALLSB,4s,6s,SR
0,Sohayl Ujoodia,65.0,55.0,12.0,0.0,118.18
1,Greg Dann,4.0,5.0,1.0,0.0,80.0
2,Joe Randall,35.0,30.0,6.0,0.0,116.67
3,Sacha Abbasi,6.0,10.0,1.0,0.0,60.0
4,Charlie Royle,0.0,0.0,0.0,0.0,0.0
5,Dan Lewis,0.0,0.0,0.0,0.0,0.0
6,Parth Mannikar,0.0,0.0,0.0,0.0,0.0
7,Krishan Sachdeva,0.0,0.0,0.0,0.0,0.0
8,Kieron Patel,0.0,0.0,0.0,0.0,0.0
9,Jabez Weale,0.0,0.0,0.0,0.0,0.0
