In [2]:
import pandas as pandas
import numpy as np
from yaml import safe_load
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib


In [3]:
import os
import yaml
import pandas as pd

folder_path = "t20s_male (1)"  # path to your folder
all_matches = []

# Build one summary record per match file in `folder_path`.
# os.listdir() returns names in arbitrary (filesystem-dependent) order, so the
# listing is sorted to keep the row order of `matches_df` reproducible.
for file in sorted(os.listdir(folder_path)):
    if not file.endswith(".yaml"):
        continue  # ignore anything that is not a YAML file

    with open(os.path.join(folder_path, file), "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    info = data["info"]
    # "outcome" and its "by" sub-mapping are optional; defaulting to empty
    # dicts lets the .get() lookups below resolve to None when they are absent.
    outcome = info.get("outcome", {})
    margin = outcome.get("by", {})

    match_info = {
        "match_id": file.replace(".yaml", ""),
        "team1": info["teams"][0],
        "team2": info["teams"][1],
        "venue": info.get("venue"),
        "city": info.get("city"),
        "dates": info.get("dates"),
        "winner": outcome.get("winner"),
        "by_runs": margin.get("runs"),
        "by_wickets": margin.get("wickets"),
        "player_of_match": info.get("player_of_match")
    }

    all_matches.append(match_info)

# A single DataFrame construction from the list of records (one row per file).
matches_df = pd.DataFrame(all_matches)


In [4]:
# Quick sanity check: display the (rows, columns) shape of matches_df.
matches_df.shape

(3046, 10)

In [9]:
import os
import yaml
import pandas as pd

ball_data = []
folder_path = "t20s_male (1)" # Update this path


def _delivery_record(match_id, inning_name, batting_team, delivery):
    """Flatten one delivery mapping ``{ball_no: details}`` into a flat row dict.

    Parameters
    ----------
    match_id
        Match identifier (the file name with ".yaml" removed).
    inning_name
        Key of the innings mapping this delivery belongs to.
    batting_team
        Innings-level "team" value; used only when the delivery itself has no
        "team" entry. May be None.
    delivery : dict
        Single-key mapping from the ball number to that ball's details.

    Returns
    -------
    dict
        Row with keys match_id, innings, ball, batting_team, batter,
        non_striker, bowler, runs_batter, runs_extras, runs_total,
        extras_type, wicket_type and player_dismissed. Missing values are
        None, except the three run counts, which default to 0.
    """
    ball_no, ball = next(iter(delivery.items()))
    runs = ball.get("runs", {})

    # Only the first extras key is recorded; None when "extras" is absent or empty.
    extras_type = next(iter(ball.get("extras") or {}), None)

    # The dismissal may be stored under "wickets" or "wicket", and as either a
    # list of dicts or a single dict. Only the first dismissal is recorded.
    w_content = ball.get("wickets", ball.get("wicket"))
    if isinstance(w_content, list):
        # Guard against an empty list, which would otherwise raise IndexError.
        wicket_data = w_content[0] if w_content else None
    else:
        wicket_data = w_content

    wicket_type = None
    player_dismissed = None
    if wicket_data:
        wicket_type = wicket_data.get("kind")
        # Fallback between 'player' and 'player_out' keys
        player_dismissed = wicket_data.get("player", wicket_data.get("player_out"))

    return {
        "match_id": match_id,
        "innings": inning_name,
        "ball": ball_no,
        "batting_team": ball.get("team", batting_team),
        # The striker is looked up under "batter" first, then "batsman".
        "batter": ball.get("batter", ball.get("batsman")),
        "non_striker": ball.get("non_striker"),
        "bowler": ball.get("bowler"),
        "runs_batter": runs.get("batter", runs.get("batsman", 0)),
        "runs_extras": runs.get("extras", 0),
        "runs_total": runs.get("total", 0),
        "extras_type": extras_type,
        "wicket_type": wicket_type,
        "player_dismissed": player_dismissed
    }


# os.listdir() returns names in arbitrary (filesystem-dependent) order, so the
# listing is sorted to keep the row order of `ball_df` reproducible.
for file in sorted(os.listdir(folder_path)):
    if not file.endswith(".yaml"):
        continue  # ignore anything that is not a YAML file

    with open(os.path.join(folder_path, file), "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    match_id = file.replace(".yaml", "")

    for innings in data["innings"]:
        # Each innings entry is a single-key mapping: {innings name: innings body}.
        inning_name, inning_body = next(iter(innings.items()))
        # Some YAMLs have the team name at the innings level
        batting_team = inning_body.get("team")

        for delivery in inning_body["deliveries"]:
            ball_data.append(
                _delivery_record(match_id, inning_name, batting_team, delivery)
            )

# A single DataFrame construction from the list of records (one row per delivery).
ball_df = pd.DataFrame(ball_data)



In [10]:
import pickle
# Persist both tables to disk. The `with` blocks guarantee each file handle is
# flushed and closed even if pickling raises; a bare open(...) passed straight
# to pickle.dump() is never explicitly closed.
with open("ball_data.pkl", "wb") as ball_file:
    pickle.dump(ball_df, ball_file)
with open("match_data.pkl", "wb") as match_file:
    pickle.dump(matches_df, match_file)