In [44]:
import pandas as pd
import re

def _squash_whitespace(cell):
    """Return the text of a scraped cell with all whitespace removed ('' if the cell is missing)."""
    if pd.isna(cell):
        return ''
    return ''.join(str(cell).split())


def batting_to_df(match_url, table_index=4):
    """Function that reads the batting scorecard from PlayCricket.com and converts it to a Pandas dataframe

    In the scraped table the batsman cell has the dismissal text fused onto the player's name, and that
    same text is repeated in the two columns that follow it. This function strips the dismissal text from
    the batsman cell, re-inserts the spaces between the parts of the name, drops the two dismissal
    columns, replaces missing values with 0 and renames the runs and balls columns.

    :param str match_url: The URL of the scorecard on PlayCricket.com
    :param int table_index: Position of the batting scorecard in the list of tables found on the page.
        The first team's batting scorecard has index 1, the second team's has index 4 (the default).
    :return: The cleaned batting scorecard
    :rtype: pandas.DataFrame
    :raises IndexError: If the page has no table at ``table_index``
    """

    # Fetch all the tables on the page and store them in a list of dataframes
    tables = pd.read_html(match_url)

    if not -len(tables) <= table_index < len(tables):
        raise IndexError(
            "No table at index {} (the page only has {} tables)".format(table_index, len(tables))
        )

    # Get the batting scorecard
    scorecard = tables[table_index]

    # Clean the cells in the batsman column, one per row actually present in the table
    clean_cells = []
    for row in range(len(scorecard)):

        # Identify the clutter: the dismissal text held in columns 1 and 2. Missing cells count as
        # empty text, so genuine '0' characters in the dismissal are never stripped by accident.
        how_out = ''.join(_squash_whitespace(scorecard.iat[row, col]) for col in (1, 2))

        # Remove the clutter. Both strings are normalised the same way (all whitespace removed) so the
        # dismissal text can be matched inside the batsman cell regardless of spacing.
        cleaned_cell = _squash_whitespace(scorecard.iat[row, 0])
        if how_out:
            cleaned_cell = cleaned_cell.replace(how_out, '')

        # Put space before any capital letter to separate the person's first name and last name.
        # The lookahead leaves the capital unconsumed, so runs of initials ("JPDuminy") are split too.
        cleaned_cell = re.sub(r"(\w)(?=[A-Z])", r"\1 ", cleaned_cell)

        # Append the cleaned element
        clean_cells.append(cleaned_cell)

    return (
        scorecard
        # Update the batsman column with the cleaned cells
        .assign(BATSMAN=clean_cells)
        # Remove the two redundant columns (the unnamed columns)
        .drop(columns=scorecard.columns[[1, 2]])
        # Replace any NaN values with 0s (done after the text columns are gone, so only the
        # remaining columns are filled)
        .fillna(0)
        # Change the names of the runs and balls columns
        .rename(columns={'RUNSR': 'RUNS', 'BALLSB': 'BALLS'})
    )

# Example usage on a single match's scorecard page; the returned dataframe is the cell's displayed output.
batting_to_df("https://uniofwarwick.play-cricket.com/website/results/4055612")

Unnamed: 0,BATSMAN,RUNS,BALLS,4s,6s,SR
0,Sohayl Ujoodia,65.0,55.0,12.0,0.0,118.18
1,Greg Dann,4.0,5.0,1.0,0.0,80.0
2,Joe Randall,35.0,30.0,6.0,0.0,116.67
3,Sacha Abbasi,6.0,10.0,1.0,0.0,60.0
4,Charlie Royle,0.0,0.0,0.0,0.0,0.0
5,Dan Lewis,0.0,0.0,0.0,0.0,0.0
6,Parth Mannikar,0.0,0.0,0.0,0.0,0.0
7,Krishan Sachdeva,0.0,0.0,0.0,0.0,0.0
8,Kieron Patel,0.0,0.0,0.0,0.0,0.0
9,Jabez Weale,0.0,0.0,0.0,0.0,0.0


In [47]:
import pandas as pd
import re

def bowling_to_df(match_url, table_index=3):
    """Function that reads the bowling scorecard from PlayCricket.com and converts it to a Pandas dataframe

    Missing values are replaced with 0 and the scraped column headers (which have their abbreviation
    fused on, e.g. ``OVERSO``) are renamed to plain names.

    :param str match_url: The URL of the scorecard on PlayCricket.com
    :param int table_index: Position of the bowling scorecard in the list of tables found on the page.
        The first team's bowling scorecard has index 3 (the default), the second team's has index 6.
    :return: The cleaned bowling scorecard
    :rtype: pandas.DataFrame
    :raises IndexError: If the page has no table at ``table_index``
    """
    # Fetch all the tables on the page and store them in a list of dataframes
    tables = pd.read_html(match_url)

    if not -len(tables) <= table_index < len(tables):
        raise IndexError(
            "No table at index {} (the page only has {} tables)".format(table_index, len(tables))
        )

    return (
        # Get the bowling scorecard
        tables[table_index]
        # Replace any NaN values with 0s
        .fillna(0)
        # Change the names of the columns
        .rename(columns={'OVERSO': 'OVERS', 'MAIDENSM': 'MAIDENS', 'RUNSR': 'RUNS',
                         'WICKETSW': 'WICKETS', 'WIDESWD': 'WIDES', 'NO BALLSNB': 'NO BALLS'})
    )

# Example usage on a single match's scorecard page; the returned dataframe is the cell's displayed output.
bowling_to_df("https://uniofwarwick.play-cricket.com/website/results/4055612")


Unnamed: 0,BOWLER,OVERS,MAIDENS,RUNS,WICKETS,WIDES,NO BALLS,ECON
0,Pruthvish Anandpura,8.0,0,35,2,2,4,4.38
1,Kieron Patel,7.1,0,27,2,3,0,3.77
2,Joe Randall,10.0,4,24,5,2,0,2.4
3,Jabez Weale,10.0,4,24,1,1,0,2.4
4,Krishan Sachdeva,1.0,0,6,0,0,0,6.0


In [None]:
import pandas as pd
import re

def get_table_indices(match_url):
     """Function that reads the bowling scorecard from PlayCricket.com and converts it to a Pandas dataframe

    :param str match_url: The URL of the scorecard on PlayCricket.com
    """