In [19]:
import os
import yaml
import pandas as pd
import pickle
from tqdm import tqdm

# Folder that holds the per-match YAML files
T20S_FOLDER = "t20s"

# Only rows whose batting AND bowling side are in this set are kept
VALID_TEAMS = {
    'Afghanistan',
    'Australia',
    'Bangladesh',
    'England',
    'India',
    'New Zealand',
    'Pakistan',
    'South Africa',
    'Sri Lanka',
    'West Indies',
}

# Accumulator for the per-delivery records gathered from every match
all_matches = list()

# Function to extract bowling team
def get_bowling_team(row):
    """Return the first entry of ``row["teams"]`` that is not the batting side.

    ``row`` is any mapping-like object that supports ``.get`` and item access.
    If ``"teams"`` is missing, a two-element ``"Unknown"`` placeholder list is
    used instead. If no entry differs from ``row["batting_team"]`` the result
    is ``"Unknown"``.
    """
    candidates = row.get("teams", ["Unknown", "Unknown"])
    # row["batting_team"] is looked up lazily, i.e. only while iterating.
    return next(
        (side for side in candidates if side != row["batting_team"]),
        "Unknown",
    )

# Process each YAML file in the folder.
#
# For every men's T20 match, walk the first innings ball by ball and emit one
# record per delivery describing the innings state *after* that delivery
# (score so far, balls/wickets left, run rate, runs in the trailing window).
MAX_BALLS = 120        # 20 overs * 6 balls
LAST_FIVE_WINDOW = 30  # 5 overs * 6 balls

for filename in tqdm(os.listdir(T20S_FOLDER)):
    if not filename.endswith(".yaml"):
        continue

    filepath = os.path.join(T20S_FOLDER, filename)
    with open(filepath, "r", encoding="utf-8") as file:
        # An empty YAML document loads as None; treat it as "no data" so the
        # match is skipped by the gender/match_type check below.
        match_data = yaml.safe_load(file) or {}

    # Extract match-level info safely
    info = match_data.get("info", {})
    match_id = filename.split(".")[0]  # Assign a unique match_id from filename
    teams = info.get("teams", ["Unknown", "Unknown"])
    city = info.get("city", "Unknown")
    venue = info.get("venue", "Unknown")
    gender = info.get("gender", "Unknown")
    match_type = info.get("match_type", "Unknown")

    # Skip if not a male T20 match
    if gender != "male" or match_type != "T20":
        continue

    # Process first innings deliveries safely
    innings = match_data.get("innings", [])
    if not innings or "1st innings" not in innings[0]:
        continue  # Skip if no innings data available

    first_innings = innings[0]["1st innings"]
    batting_team = first_innings.get("team", "Unknown")
    deliveries = first_innings.get("deliveries", [])

    # The bowling side is fixed for the whole innings, so resolve it once
    # instead of once per delivery.
    bowling_team = get_bowling_team({"teams": teams, "batting_team": batting_team})

    ball_data = []
    current_score = 0
    player_dismissed_count = 0
    runs_so_far = []  # total runs of every delivery processed so far, in order

    for ball in deliveries:
        for ball_number, ball_details in ball.items():
            runs = ball_details.get("runs", {}).get("total", 0)
            current_score += runs
            runs_so_far.append(runs)

            # A wicket entry is normally a single mapping.
            # NOTE(review): some data versions appear to allow a list of
            # mappings for multiple dismissals on one ball - both shapes are
            # accepted here; confirm against the data format in use.
            wicket = ball_details.get("wicket") or []
            if isinstance(wicket, dict):
                wicket = [wicket]
            player_dismissed_count += sum(1 for w in wicket if w.get("player_out"))

            # Compute balls bowled from the "<over>.<ball>" key.
            # NOTE(review): the ball index also counts wides/no-balls, and a
            # key written as x.10 would load as the float x.1 - verify that
            # this approximation is acceptable for the dataset.
            overs, balls = map(int, str(ball_number).split('.'))
            balls_bowled = overs * 6 + balls
            balls_left = max(MAX_BALLS - balls_bowled, 0)

            # Calculate Current Run Rate (CRR) in runs per over
            crr = (current_score * 6) / balls_bowled if balls_bowled else 0

            # Runs scored in the trailing window ending at THIS delivery.
            # The previous code summed the final 30 deliveries of the whole
            # innings for every row, which made the feature constant per
            # match and leaked information from the future. Early in the
            # innings the window simply contains fewer than 30 deliveries.
            last_five_runs = sum(runs_so_far[-LAST_FIVE_WINDOW:])

            ball_data.append({
                "match_id": match_id,
                "batting_team": batting_team,
                "bowling_team": bowling_team,
                "city": city,
                "venue": venue,
                "current_score": current_score,
                "balls_left": balls_left,
                "wickets_left": 10 - player_dismissed_count,
                "crr": crr,
                "last_five": last_five_runs,
                "runs": runs
            })

    all_matches.extend(ball_data)

# Convert to DataFrame
df = pd.DataFrame(all_matches)

# Keep only rows where both sides are valid teams, then drop missing values.
# Chained (not inplace) so no operation mutates a filtered slice.
both_valid = df["batting_team"].isin(VALID_TEAMS) & df["bowling_team"].isin(VALID_TEAMS)
df = df[both_valid].dropna()

# Save processed dataset (context manager guarantees the file is closed)
with open("t20_score_prediction.pkl", "wb") as out_file:
    pickle.dump(df, out_file)

print(f"Processed {len(df)} deliveries and saved dataset to 't20_score_prediction.pkl'")


100%|██████████| 1433/1433 [04:23<00:00,  5.43it/s]


Processed 64993 deliveries and saved dataset to 't20_score_prediction.pkl'


100%|██████████| 1433/1433 [04:08<00:00,  5.77it/s]


Final DataFrame shape: (110040, 5)
                                                meta  \
0  {'data_version': 0.9, 'created': 2017-02-18, '...   
1  {'data_version': 0.9, 'created': 2017-02-19, '...   
2  {'data_version': 0.9, 'created': 2017-02-23, '...   
3  {'data_version': 0.9, 'created': 2016-09-12, '...   
4  {'data_version': 0.9, 'created': 2016-06-19, '...   

                                                info  \
0  {'dates': [2017-02-17], 'gender': 'male', 'mat...   
1  {'city': 'Victoria', 'dates': [2017-02-19], 'g...   
2  {'dates': [2017-02-22], 'gender': 'male', 'mat...   
3  {'city': 'Londonderry', 'dates': [2016-09-05],...   
4  {'dates': [2016-06-18], 'gender': 'male', 'mat...   

                                             innings  match_id    0  
0  [{'1st innings': {'team': 'Australia', 'delive...         1  NaN  
1  [{'1st innings': {'team': 'Australia', 'delive...         2  NaN  
2  [{'1st innings': {'team': 'Australia', 'delive...         3  NaN  
3  [{'1st i

In [7]:
import pickle
# Persist the match-level frame and read it back as `matches`.
# `final_df` is not created in this cell; it must already exist in the kernel.
# Context managers ensure both file handles are closed (the write is flushed
# before the file is re-opened for reading).
with open('dataset_level1.pkl', 'wb') as out_file:
    pickle.dump(final_df, out_file)

# NOTE: unpickling can execute arbitrary code - only load files you produced.
with open('dataset_level1.pkl', 'rb') as in_file:
    matches = pickle.load(in_file)

# Data processing for deliveries
delivery_df = pd.DataFrame()

In [11]:
import pandas as pd

# Check available columns
print("Available columns in matches:", matches.columns)

# Flatten the first innings of every match into one row per delivery.
#
# Records are gathered in a plain list and turned into a DataFrame once at the
# end. Concatenating inside the loop is quadratic, and appending to an already
# populated `delivery_df` made re-running this cell duplicate every row.
delivery_records = []

for index, row in matches.iterrows():
    innings_data = row.get('innings', [])  # Avoid NaN issue

    if not isinstance(innings_data, list) or len(innings_data) == 0:
        continue  # Skip if innings_data is not a valid list

    match_id = index + 1
    first_innings = innings_data[0].get('1st innings', {})  # Ensure valid dictionary
    batting_team = first_innings.get('team', 'Unknown')

    # `info` may be missing or NaN for malformed rows; fall back to defaults.
    info = row.get('info', {})
    if not isinstance(info, dict):
        info = {}
    city = info.get('city', 'Unknown')
    venue = info.get('venue', 'Unknown')

    for ball in first_innings.get('deliveries', []):  # Avoid TypeError
        # Each delivery is a one-entry mapping: {ball_number: details}
        for key, value in ball.items():
            delivery_records.append({
                'match_id': match_id,
                'batting_team': batting_team,
                'ball': key,
                'batsman': value['batsman'],
                'bowler': value['bowler'],
                'runs': value['runs']['total'],
                # '0' marks "no dismissal on this ball"
                'player_dismissed': value.get('wicket', {}).get('player_out', '0'),
                'city': city,
                'venue': venue
            })

delivery_df = pd.DataFrame(delivery_records)

# Print final delivery DataFrame structure
print("Final DataFrame shape:", delivery_df.shape)
print(delivery_df.head())


Available columns in matches: Index(['meta', 'info', 'innings', 'match_id', 0], dtype='object')
Final DataFrame shape: (347436, 9)
   match_id batting_team  ball    batsman      bowler  runs player_dismissed  \
0         1    Australia   0.1   AJ Finch  SL Malinga     0                0   
1         1    Australia   0.2   AJ Finch  SL Malinga     0                0   
2         1    Australia   0.3   AJ Finch  SL Malinga     1                0   
3         1    Australia   0.4  M Klinger  SL Malinga     2                0   
4         1    Australia   0.5  M Klinger  SL Malinga     0                0   

      city                     venue  
0  Unknown  Melbourne Cricket Ground  
1  Unknown  Melbourne Cricket Ground  
2  Unknown  Melbourne Cricket Ground  
3  Unknown  Melbourne Cricket Ground  
4  Unknown  Melbourne Cricket Ground  


In [14]:
def bowl(row):
    """Return the first team listed in ``row['teams']`` that is not batting.

    Falls back to a placeholder ``['Unknown', 'Unknown']`` list when ``row``
    has no ``'teams'`` entry, and returns ``'Unknown'`` when every listed team
    equals ``row['batting_team']``.
    """
    sides = row.get('teams', ['Unknown', 'Unknown'])  # Default if 'teams' is missing
    others = (side for side in sides if side != row['batting_team'])
    return next(others, 'Unknown')


In [17]:
import os
import yaml
import pandas as pd
import pickle
from tqdm import tqdm

# Folder that holds the per-match YAML files
T20S_FOLDER = "t20s"

# Only rows whose batting AND bowling side are in this set are kept
VALID_TEAMS = {
    'Afghanistan',
    'Australia',
    'Bangladesh',
    'England',
    'India',
    'New Zealand',
    'Pakistan',
    'South Africa',
    'Sri Lanka',
    'West Indies',
}

# Accumulator for the per-delivery records gathered from every match
all_matches = list()

# Function to extract bowling team
def get_bowling_team(row):
    """Return the first entry of ``row["teams"]`` that is not the batting side.

    ``row`` is any mapping-like object that supports ``.get`` and item access.
    If ``"teams"`` is missing, a two-element ``"Unknown"`` placeholder list is
    used instead. If no entry differs from ``row["batting_team"]`` the result
    is ``"Unknown"``.
    """
    candidates = row.get("teams", ["Unknown", "Unknown"])
    # row["batting_team"] is looked up lazily, i.e. only while iterating.
    return next(
        (side for side in candidates if side != row["batting_team"]),
        "Unknown",
    )

# Process each YAML file in the folder.
#
# For every men's T20 match, walk the first innings ball by ball and emit one
# record per delivery describing the innings state *after* that delivery
# (score so far, balls/wickets left, run rate, runs in the trailing window).
MAX_BALLS = 120        # 20 overs * 6 balls
LAST_FIVE_WINDOW = 30  # 5 overs * 6 balls

for filename in tqdm(os.listdir(T20S_FOLDER)):
    if not filename.endswith(".yaml"):
        continue

    filepath = os.path.join(T20S_FOLDER, filename)
    with open(filepath, "r", encoding="utf-8") as file:
        # An empty YAML document loads as None; treat it as "no data" so the
        # match is skipped by the gender/match_type check below.
        match_data = yaml.safe_load(file) or {}

    # Extract match-level info safely
    info = match_data.get("info", {})
    match_id = filename.split(".")[0]  # Assign a unique match_id from filename
    teams = info.get("teams", ["Unknown", "Unknown"])
    city = info.get("city", "Unknown")
    venue = info.get("venue", "Unknown")
    gender = info.get("gender", "Unknown")
    match_type = info.get("match_type", "Unknown")

    # Skip if not a male T20 match
    if gender != "male" or match_type != "T20":
        continue

    # Process first innings deliveries safely
    innings = match_data.get("innings", [])
    if not innings or "1st innings" not in innings[0]:
        continue  # Skip if no innings data available

    first_innings = innings[0]["1st innings"]
    batting_team = first_innings.get("team", "Unknown")
    deliveries = first_innings.get("deliveries", [])

    # The bowling side is fixed for the whole innings, so resolve it once
    # instead of once per delivery.
    bowling_team = get_bowling_team({"teams": teams, "batting_team": batting_team})

    ball_data = []
    current_score = 0
    player_dismissed_count = 0
    runs_so_far = []  # total runs of every delivery processed so far, in order

    for ball in deliveries:
        for ball_number, ball_details in ball.items():
            runs = ball_details.get("runs", {}).get("total", 0)
            current_score += runs
            runs_so_far.append(runs)

            # A wicket entry is normally a single mapping.
            # NOTE(review): some data versions appear to allow a list of
            # mappings for multiple dismissals on one ball - both shapes are
            # accepted here; confirm against the data format in use.
            wicket = ball_details.get("wicket") or []
            if isinstance(wicket, dict):
                wicket = [wicket]
            player_dismissed_count += sum(1 for w in wicket if w.get("player_out"))

            # Compute balls bowled from the "<over>.<ball>" key.
            # NOTE(review): the ball index also counts wides/no-balls, and a
            # key written as x.10 would load as the float x.1 - verify that
            # this approximation is acceptable for the dataset.
            overs, balls = map(int, str(ball_number).split('.'))
            balls_bowled = overs * 6 + balls
            balls_left = max(MAX_BALLS - balls_bowled, 0)

            # Calculate Current Run Rate (CRR) in runs per over
            crr = (current_score * 6) / balls_bowled if balls_bowled else 0

            # Runs scored in the trailing window ending at THIS delivery.
            # The previous code summed the final 30 deliveries of the whole
            # innings for every row, which made the feature constant per
            # match and leaked information from the future. Early in the
            # innings the window simply contains fewer than 30 deliveries.
            last_five_runs = sum(runs_so_far[-LAST_FIVE_WINDOW:])

            ball_data.append({
                "match_id": match_id,
                "batting_team": batting_team,
                "bowling_team": bowling_team,
                "city": city,
                "venue": venue,
                "current_score": current_score,
                "balls_left": balls_left,
                "wickets_left": 10 - player_dismissed_count,
                "crr": crr,
                "last_five": last_five_runs,
                "runs": runs
            })

    all_matches.extend(ball_data)

# Convert to DataFrame
df = pd.DataFrame(all_matches)

# Keep only rows where both sides are valid teams, then drop missing values.
# Chained (not inplace) so no operation mutates a filtered slice.
both_valid = df["batting_team"].isin(VALID_TEAMS) & df["bowling_team"].isin(VALID_TEAMS)
df = df[both_valid].dropna()

# Save processed dataset (context manager guarantees the file is closed)
with open("t20_score_prediction.pkl", "wb") as out_file:
    pickle.dump(df, out_file)

print(f"Processed {len(df)} deliveries and saved dataset to 't20_score_prediction.pkl'")


KeyError: "['runs'] not in index"

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error


In [None]:
# Train a regressor that predicts the 'runs' column from all other columns.
# `final_df` is not created in this cell; it must already exist in the kernel.
X = final_df.drop(columns=['runs'])
Y = final_df['runs']

# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

# One-hot encode the categorical columns; every other column is passed
# through unchanged, so it must already be numeric for the scaler below.
trf = ColumnTransformer([
    ('onehot', OneHotEncoder(sparse_output=False, drop='first'), ['batting_team', 'bowling_team', 'city'])
], remainder='passthrough')

# Encoding -> scaling -> gradient-boosted trees, bundled so the exact same
# preprocessing is applied at prediction time.
pipe = Pipeline([
    ('transform', trf),
    ('scale', StandardScaler()),
    ('model', XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=12, random_state=1))
])

pipe.fit(X_train, Y_train)
Y_pred = pipe.predict(X_test)

# Hold-out metrics: R^2 first, then mean absolute error.
print(r2_score(Y_test, Y_pred))
print(mean_absolute_error(Y_test, Y_pred))

# Persist the fitted pipeline; the context manager closes the file handle.
with open('pipe2.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)