In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate


In [11]:
data = pd.read_csv('../data/saved_data/standardized_per_game_data.csv')
data.head()

Unnamed: 0,game_id,innings,total_batter_runs_conceded_mean,total_runs_from_relevant_extras_mean,total_total_runs_conceded_mean,total_taken_from_relevant_wickets_mean,powerplay_batter_runs_conceded_mean,powerplay_runs_from_relevant_extras_mean,powerplay_total_runs_conceded_mean,powerplay_taken_from_relevant_wickets_mean,...,powerplay_runs_batter_mean,powerplay_high_scoring_hit_mean,powerplay_total_mean,powerplay_is_wicket_mean,non_powerplay_runs_batter_mean,non_powerplay_high_scoring_hit_mean,non_powerplay_total_mean,non_powerplay_is_wicket_mean,final_runs,final_wickets
0,002241a4-33c2-4760-98f5-2e948c8ba72d,1,0.822497,-0.152112,0.750288,0.725801,1.714255,0.786943,1.806211,-0.443284,...,-0.067017,0.048304,-0.176614,0.319407,0.636517,0.408743,0.603931,-0.444931,199,5
1,002241a4-33c2-4760-98f5-2e948c8ba72d,2,0.057194,-0.49448,0.025546,-0.414094,0.393297,-0.696873,0.311807,-0.866606,...,0.083423,-0.139888,-0.095311,-1.294053,0.839044,0.709896,0.757129,-0.090809,97,7
2,005ad76d-5118-42b3-b201-7a66ddd993ff,1,0.295511,-0.183686,0.251142,0.452743,0.20904,-0.159175,0.170795,2.076613,...,0.71371,0.908928,0.697545,-0.376023,0.303475,0.466266,0.345178,-0.174936,173,6
3,005ad76d-5118-42b3-b201-7a66ddd993ff,2,0.216071,-0.570194,0.194874,0.444315,0.202152,-0.105897,0.207373,0.036535,...,1.073143,1.151856,1.065811,0.062678,0.660881,0.726828,0.675743,0.055117,167,7
4,0061255e-76ea-4415-aa89-fb6e87c61186,1,1.757912,-0.207368,1.764689,0.451444,1.364354,-0.226842,1.38927,-0.220704,...,0.956534,1.018972,0.949863,-0.251526,1.408916,1.64957,1.4667,-0.347873,234,6


In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7533 entries, 0 to 7532
Data columns (total 28 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   game_id                                         7533 non-null   object 
 1   innings                                         7533 non-null   int64  
 2   total_batter_runs_conceded_mean                 7533 non-null   float64
 3   total_runs_from_relevant_extras_mean            7533 non-null   float64
 4   total_total_runs_conceded_mean                  7533 non-null   float64
 5   total_taken_from_relevant_wickets_mean          7533 non-null   float64
 6   powerplay_batter_runs_conceded_mean             7533 non-null   float64
 7   powerplay_runs_from_relevant_extras_mean        7533 non-null   float64
 8   powerplay_total_runs_conceded_mean              7533 non-null   float64
 9   powerplay_taken_from_relevant_wickets_mea

In [19]:
# There's on game missing non-powerplay stats, so I will drop that game.
data = data.dropna()

In [20]:
run_estimation_df = data.drop(columns=['game_id', 'innings', 'final_wickets'])
wicket_estimation_df = data.drop(columns=['game_id', 'innings', 'final_runs'])


# Baseline Model

For my baseline I am going to use linear regression. I will use RFE for feature selection.

I belive innings score is given as runs and wickets, so I will train two models, one to predict runs and one to predict wickets. I will do these seperately.

# Run Estimation

In [49]:
train_data, test_data = train_test_split(run_estimation_df, test_size=0.2, random_state=42)

In [50]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

def initial_feature_selection(df, n_features_to_select=10, target_column='final_runs'):
    # Prepare features (exclude target and non-numeric columns)
    X = df.drop(columns=[target_column])
    y = df[target_column]
    
    # Initialize estimator
    estimator = LinearRegression()
    
    # Create RFE selector
    selector = RFE(estimator=estimator, 
                  n_features_to_select=n_features_to_select,
                  step=1)  # Remove 1 feature at a time
    
    # Fit selector
    selector = selector.fit(X, y)
    
    # Get selected feature names
    selected_features = X.columns[selector.support_].tolist()
    
    # Print rankings
    feature_ranking = pd.DataFrame({
        'Feature': X.columns,
        'Ranking': selector.ranking_
    }).sort_values('Ranking')
    print("Feature Rankings:")
    print(feature_ranking)
    
    return selected_features

important_features = initial_feature_selection(train_data)


Feature Rankings:
                                           Feature  Ranking
0                  total_batter_runs_conceded_mean        1
20                  non_powerplay_runs_batter_mean        1
12                          total_runs_batter_mean        1
22                        non_powerplay_total_mean        1
10          non_powerplay_total_runs_conceded_mean        1
8          non_powerplay_batter_runs_conceded_mean        1
9     non_powerplay_runs_from_relevant_extras_mean        1
4              powerplay_batter_runs_conceded_mean        1
6               powerplay_total_runs_conceded_mean        1
18                            powerplay_total_mean        1
5         powerplay_runs_from_relevant_extras_mean        2
13                     total_high_scoring_hit_mean        3
17                 powerplay_high_scoring_hit_mean        4
21             non_powerplay_high_scoring_hit_mean        5
2                   total_total_runs_conceded_mean        6
1             total_ru

In [51]:
def check_correlations(df, features):
    # Create correlation matrix
    corr_matrix = df[features].corr()
    
    # Find highly correlated pairs
    high_corr = []
    for i in range(len(features)):
        for j in range(i+1, len(features)):
            if abs(corr_matrix.iloc[i,j]) > 0.7:  # Threshold
                high_corr.append({
                    'feature1': features[i],
                    'feature2': features[j],
                    'correlation': corr_matrix.iloc[i,j]
                })
    
    return pd.DataFrame(high_corr)

# Use it
correlations = check_correlations(train_data, important_features)
print("Highly correlated features:")
correlations

Highly correlated features:


Unnamed: 0,feature1,feature2,correlation
0,total_batter_runs_conceded_mean,powerplay_batter_runs_conceded_mean,0.880071
1,total_batter_runs_conceded_mean,powerplay_total_runs_conceded_mean,0.866095
2,total_batter_runs_conceded_mean,non_powerplay_batter_runs_conceded_mean,0.974354
3,total_batter_runs_conceded_mean,non_powerplay_total_runs_conceded_mean,0.961049
4,total_batter_runs_conceded_mean,total_runs_batter_mean,0.767335
5,total_batter_runs_conceded_mean,non_powerplay_runs_batter_mean,0.738026
6,total_batter_runs_conceded_mean,non_powerplay_total_mean,0.729522
7,powerplay_batter_runs_conceded_mean,powerplay_total_runs_conceded_mean,0.981187
8,powerplay_batter_runs_conceded_mean,non_powerplay_batter_runs_conceded_mean,0.804075
9,powerplay_batter_runs_conceded_mean,non_powerplay_total_runs_conceded_mean,0.789061


It is evnident that there is significant correlation between several of the best ranked features, likely stemming from the fact that total stats encompass powerplay and non-powerplay stats. I am going to start by only keeping the total stats for now.

Since I would be eliminating a large chunk of the 'important_features' I will start again from scratch with only the total stats.

In [52]:
cols_to_keep = [col for col in run_estimation_df.columns if 'total' in col[:5]] + ['final_runs']
totals_run_estimation_df = run_estimation_df[cols_to_keep]

train_data, test_data = train_test_split(totals_run_estimation_df, test_size=0.2, random_state=42)

In [53]:
important_features = initial_feature_selection(train_data, target_column='final_runs')

Feature Rankings:
                                  Feature  Ranking
0         total_batter_runs_conceded_mean        1
1    total_runs_from_relevant_extras_mean        1
2          total_total_runs_conceded_mean        1
3  total_taken_from_relevant_wickets_mean        1
4                  total_runs_batter_mean        1
5             total_high_scoring_hit_mean        1
6                        total_total_mean        1
7                    total_is_wicket_mean        1




RFE does not work with <10 features. That being said barring any significant correlations I don't think I have an amount that should negatively impact the model so I will keep all features.

In [54]:
correlations = check_correlations(train_data, important_features)
correlations

Unnamed: 0,feature1,feature2,correlation
0,total_batter_runs_conceded_mean,total_total_runs_conceded_mean,0.985675
1,total_batter_runs_conceded_mean,total_runs_batter_mean,0.767335
2,total_batter_runs_conceded_mean,total_high_scoring_hit_mean,0.726575
3,total_batter_runs_conceded_mean,total_total_mean,0.761869
4,total_total_runs_conceded_mean,total_runs_batter_mean,0.733073
5,total_total_runs_conceded_mean,total_total_mean,0.73853
6,total_runs_batter_mean,total_high_scoring_hit_mean,0.964654
7,total_runs_batter_mean,total_total_mean,0.994221
8,total_high_scoring_hit_mean,total_total_mean,0.965277


so... there are a lot of high correlations, but a couple very high ones.
- The mean runs conceded and the mean batter only runs conceded are highly very highly correlated, likely due to the fact that most of the runs conceded are batter runs and not extras.
- the mean runs conceded scored by the batter is very highly correlated with:
    - the mean no. of high scoring balls hit
    - the mean total runs made by the batting side including batter runs and extras.
- Finally the mean total runs scored is highly correlated with the mean no. of high scoring balls hit.

Based on this, I will remove:
- The mean total runs conceded per game
- The mean number of high scoring balls hit per game
- The mean runs scored by the batting side per game

For the sake of clarity I will also rename the existing features.

In [55]:
train_data = train_data.drop(columns=['total_total_runs_conceded_mean', 'total_total_mean', 'total_high_scoring_hit_mean'])
test_data = test_data.drop(columns=['total_total_runs_conceded_mean', 'total_total_mean', 'total_high_scoring_hit_mean'])

train_data.columns

Index(['total_batter_runs_conceded_mean',
       'total_runs_from_relevant_extras_mean',
       'total_taken_from_relevant_wickets_mean', 'total_runs_batter_mean',
       'total_is_wicket_mean', 'final_runs'],
      dtype='object')

In [56]:
cols_to_rename = {
    'total_batter_runs_conceded_mean': 'mean_batter_runs_conceded',
    'total_runs_from_relevant_extras_mean': 'mean_extras_runs_conceded',
    'total_taken_from_relevant_wickets_mean': 'mean_wickets_taken',
    'total_runs_batter_mean': 'mean_batter_runs_scored',
    'total_is_wicket_mean': 'mean_wickets_conceded'
}

train_data.rename(columns=cols_to_rename, inplace=True)
test_data.rename(columns=cols_to_rename, inplace=True)
train_data.columns

Index(['mean_batter_runs_conceded', 'mean_extras_runs_conceded',
       'mean_wickets_taken', 'mean_batter_runs_scored',
       'mean_wickets_conceded', 'final_runs'],
      dtype='object')

In [64]:
def train_and_evaluate_baseline_model(train_data, test_data, target_column):
    X_train = train_data.drop(columns=[target_column])
    y_train = train_data[target_column]
    X_test = test_data.drop(columns=[target_column])
    y_test = test_data[target_column]

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Calculate metrics
    metrics = {
        'train': {
            'r2': r2_score(y_train, y_pred_train),
            'rmse': np.sqrt(mean_squared_error(y_train, y_pred_train)),
            'mape': mean_absolute_percentage_error(y_train, y_pred_train)
        },
        'test': {
            'r2': r2_score(y_test, y_pred_test),
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred_test)),
            'mape': mean_absolute_percentage_error(y_test, y_pred_test)
        }
    }
    
    # Print feature coefficients
    coefficients = pd.DataFrame({
        'Feature': X_train.columns,
        'Coefficient': model.coef_
    })
    
    print("Model Performance:")
    print(f"Train R²: {metrics['train']['r2']:.3f}")
    print(f"Test R²: {metrics['train']['r2']:.3f}")
    print(f"Train RMSE: {metrics['train']['rmse']:.3f}")
    print(f"Test RMSE: {metrics['test']['rmse']:.3f}")
    print(f"Train MAPE: {metrics['train']['mape']:.3f}")
    print(f"Test MAPE: {metrics['test']['mape']:.3f}")
    print("\nFeature Coefficients:")
    print(coefficients)
    
    return model, metrics, coefficients

# Use the function
model, metrics, coefficients = train_and_evaluate_baseline_model(train_data, test_data, 'final_runs')



Model Performance:
Train R²: 0.446
Test R²: 0.446
Train RMSE: 33.140
Test RMSE: 33.414
Train MAPE: 0.334
Test MAPE: 0.301

Feature Coefficients:
                     Feature  Coefficient
0  mean_batter_runs_conceded    11.028226
1  mean_extras_runs_conceded     7.054916
2         mean_wickets_taken    -0.711567
3    mean_batter_runs_scored    20.738551
4      mean_wickets_conceded    -4.659263


So... The model is not great - but that's fine for a baseline.

The similar test and train results suggest no overfitting, though considering the number of features and the model itself this was unlikely anyway.

The R^2 is 0.446 which is not great, suggests out baseline struggled to fit the data well.

The MAPE is 0.301, suggesting that while the model is not great, it is faily okay for getting a ballpark estimation... 

Anyways, time to improve on this.

## Random Forest

I am going to try an approach using tree based models to see if they can improve results over simple linear regression.

I will start with a random forest model.

Since random forest is not significantly affected by multicollinearity, I will keep the original feature set as it should filter what it believes is most important/informative at each step. Depending on performance however I may then try removing some features.

I will be using a train/valid/test split this time as there are some hyperparameters to tune.

In [98]:
data = run_estimation_df
target_column = 'final_runs'
test_size = 0.15
random_state = 42

# Prepare data
X = data.drop(columns=[target_column])
y = data[target_column]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)

In [99]:
def do_random_forest(X_train, y_train, cv=5, random_state=42, **kwargs):
    model = RandomForestRegressor(random_state=random_state, **kwargs)
    
    # Perform cross-validation
    cv_results = cross_validate(
        model,
        X_train,
        y_train,
        cv=cv,
        scoring={
            'r2': 'r2',
            'rmse': 'neg_root_mean_squared_error',
            'mape': 'neg_mean_absolute_percentage_error'
        },
        return_train_score=True
    )
    
    # Fit final model
    model.fit(X_train, y_train)
    
    # Analyze CV results
    analysis = {
        'r2': {
            'train_avg': cv_results['train_r2'].mean(),
            'train_std': cv_results['train_r2'].std(),
            'cv_avg': cv_results['test_r2'].mean(),
            'cv_std': cv_results['test_r2'].std(),
            'gap': cv_results['train_r2'].mean() - cv_results['test_r2'].mean(),
            'consistency': 'Stable' if cv_results['test_r2'].std() < 0.05 else 'Unstable'
        },
        'rmse': {
            'train_avg': -cv_results['train_rmse'].mean(),
            'train_std': cv_results['train_rmse'].std(),
            'cv_avg': -cv_results['test_rmse'].mean(),
            'cv_std': cv_results['test_rmse'].std(),
            'gap': -cv_results['test_rmse'].mean() - (-cv_results['train_rmse'].mean())
        },
        'mape': {
            'train_avg': -cv_results['train_mape'].mean(),
            'train_std': cv_results['train_mape'].std(),
            'cv_avg': -cv_results['test_mape'].mean(),
            'cv_std': cv_results['test_mape'].std(),
            'gap': -cv_results['test_mape'].mean() - (-cv_results['train_mape'].mean())
        }
    }
    
    # Print detailed analysis
    print("Cross-Validation Analysis:")
    print("\nR² Scores:")
    print(f"Training Average: {analysis['r2']['train_avg']:.3f} ± {analysis['r2']['train_std']:.3f}")
    print(f"CV Average: {analysis['r2']['cv_avg']:.3f} ± {analysis['r2']['cv_std']:.3f}")
    print(f"Gap (Train-CV): {analysis['r2']['gap']:.3f}")
    print(f"CV Consistency: {analysis['r2']['consistency']}")
    
    print("\nRMSE Scores:")
    print(f"Training Average: {analysis['rmse']['train_avg']:.3f} ± {analysis['rmse']['train_std']:.3f}")
    print(f"CV Average: {analysis['rmse']['cv_avg']:.3f} ± {analysis['rmse']['cv_std']:.3f}")
    print(f"Gap (CV-Train): {analysis['rmse']['gap']:.3f}")
    
    print("\nMAPE Scores:")
    print(f"Training Average: {analysis['mape']['train_avg']:.3f} ± {analysis['mape']['train_std']:.3f}")
    print(f"CV Average: {analysis['mape']['cv_avg']:.3f} ± {analysis['mape']['cv_std']:.3f}")
    print(f"Gap (CV-Train): {analysis['mape']['gap']:.3f}")
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))
    
    return model, analysis, feature_importance

In [100]:
model, analysis, feature_importance = do_random_forest(X_train, y_train)

Cross-Validation Analysis:

R² Scores:
Training Average: 0.924 ± 0.001
CV Average: 0.454 ± 0.021
Gap (Train-CV): 0.471
CV Consistency: Stable

RMSE Scores:
Training Average: 12.287 ± 0.076
CV Average: 32.971 ± 0.753
Gap (CV-Train): 20.684

MAPE Scores:
Training Average: 0.121 ± 0.003
CV Average: 0.327 ± 0.035
Gap (CV-Train): 0.206

Top 10 Most Important Features:
                                           Feature  Importance
22                        non_powerplay_total_mean    0.288054
14                                total_total_mean    0.123146
2                   total_total_runs_conceded_mean    0.050088
5         powerplay_runs_from_relevant_extras_mean    0.035849
1             total_runs_from_relevant_extras_mean    0.034163
10          non_powerplay_total_runs_conceded_mean    0.032838
15                            total_is_wicket_mean    0.031734
13                     total_high_scoring_hit_mean    0.030962
11  non_powerplay_taken_from_relevant_wickets_mean    0.030770
9   

So... with no tuning the model is not great. It is able to fit the training data very well, but the CV gap demonstrates that there is significant overfitting going on. In fact, we see scores that are marginally worse than the baseline model.

I am going to begin by seeing how much I can improve the model by tuning the hyperparameters. Hopefully automated tuning should help a decent amount, followed by manual tuning depending on results.

In [111]:
def tune_random_forest(X_train, y_train):    
    # Define parameter grid
    param_grid = {
        'max_depth': [10, 20, 30, 50, 100, None],
        'min_samples_split': [2, 5, 10, 20, 50],
        'min_samples_leaf': [1, 2, 4, 8, 16],
        'n_estimators': [100, 200, 300, 400, 500],
        'max_features': ['sqrt', 'log2', None]
    }
    
    # Initialize search with cross-validation
    search = RandomizedSearchCV(
        RandomForestRegressor(random_state=42),
        param_grid,
        cv=5,  # 5-fold cross-validation
        scoring='r2',
        n_jobs=-1,
        verbose=2
    )
    
    # Fit on training data only
    search.fit(X_train, y_train)
    
    # Evaluate on test set
    best_model = search.best_estimator_
    final_score = best_model.score(X_train, y_train)
    
    print("Best parameters:", search.best_params_)
    print("CV Score:", search.best_score_)
    print("Final Score:", final_score)
    
    return best_model, search.best_params_

In [112]:
best_rf, kwargs = tune_random_forest(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=50, n_estimators=300; total time=   4.8s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=50, n_estimators=300; total time=   4.9s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=50, n_estimators=300; total time=   4.9s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=50, n_estimators=300; total time=   4.8s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=50, n_estimators=300; total time=   4.9s
[CV] END max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time=   8.3s
[CV] END max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time=   8.4s
[CV] END max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimator

In [113]:
kwargs

{'n_estimators': 400,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'log2',
 'max_depth': 50}

In [114]:
model, analysis, feature_importance = do_random_forest(X_train, y_train, cv=5, random_state=42, **kwargs)

Cross-Validation Analysis:

R² Scores:
Training Average: 0.871 ± 0.001
CV Average: 0.466 ± 0.018
Gap (Train-CV): 0.405
CV Consistency: Stable

RMSE Scores:
Training Average: 16.042 ± 0.096
CV Average: 32.604 ± 0.716
Gap (CV-Train): 16.562

MAPE Scores:
Training Average: 0.164 ± 0.005
CV Average: 0.326 ± 0.033
Gap (CV-Train): 0.163

Top 10 Most Important Features:
                                    Feature  Importance
22                 non_powerplay_total_mean    0.093018
14                         total_total_mean    0.089463
20           non_powerplay_runs_batter_mean    0.074490
12                   total_runs_batter_mean    0.070077
13              total_high_scoring_hit_mean    0.067810
21      non_powerplay_high_scoring_hit_mean    0.059272
2            total_total_runs_conceded_mean    0.051413
10   non_powerplay_total_runs_conceded_mean    0.042186
8   non_powerplay_batter_runs_conceded_mean    0.041568
18                     powerplay_total_mean    0.037055


Compared to the earlier results, we see very little improvement. The CV gap has been reduced, however the actual scores are only very marginally imroved. This suggests that the model is overfitting less, but this is not leading to an improvement in the model's ability to predict new data. 

I am going to try some extremenly agressive hyperparameter values to see if it helps. I expect they may reduce overfitting, but I doubt they will improve the model's overall performance. Possibly worth a shot though

In [115]:
kwargs = {
    'max_depth': 10,
    'min_samples_split': 10,
    'min_samples_leaf': 5,
    'n_estimators': 500,
    'max_features': 'log2'
}
model, analysis, feature_importance = do_random_forest(X_train, y_train, cv=5, random_state=42, **kwargs)

Cross-Validation Analysis:

R² Scores:
Training Average: 0.646 ± 0.005
CV Average: 0.462 ± 0.018
Gap (Train-CV): 0.184
CV Consistency: Stable

RMSE Scores:
Training Average: 26.549 ± 0.200
CV Average: 32.711 ± 0.749
Gap (CV-Train): 6.162

MAPE Scores:
Training Average: 0.263 ± 0.007
CV Average: 0.329 ± 0.033
Gap (CV-Train): 0.065

Top 10 Most Important Features:
                                    Feature  Importance
22                 non_powerplay_total_mean    0.119190
14                         total_total_mean    0.111705
20           non_powerplay_runs_batter_mean    0.092864
12                   total_runs_batter_mean    0.084491
13              total_high_scoring_hit_mean    0.078198
21      non_powerplay_high_scoring_hit_mean    0.067099
2            total_total_runs_conceded_mean    0.056044
10   non_powerplay_total_runs_conceded_mean    0.039902
8   non_powerplay_batter_runs_conceded_mean    0.036524
0           total_batter_runs_conceded_mean    0.036102


Turns out my hypothesis was indeed correct, the model is overfitting far less, however the scores have not really improved.
I will finally use this final model with my test set.

In [116]:
def test_final_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print("Final Model Performance:")
    print(f"R²: {r2:.3f}")
    print(f"RMSE: {rmse:.3f}")
    print(f"MAPE: {mape:.3f}")
    return r2, rmse, mape

r2, rmse, mape = test_final_model(best_rf, X_test, y_test)


Final Model Performance:
R²: 0.504
RMSE: 31.269
MAPE: 0.265


Compared to the baseline model, we do see improvements in all values, albeit not significant ones. MAPE is 0.265 compared to 0.301, RMSE 31.3 compared to 33.4 and R^2 is 0.504 compared to 0.446.

I would however like to repeat this with only the total features. While random forest should filter out features itself, I am curious to see how it will affect performance.

In [119]:
cols_to_rename = {
    'total_batter_runs_conceded_mean': 'mean_batter_runs_conceded',
    'total_runs_from_relevant_extras_mean': 'mean_extras_runs_conceded',
    'total_taken_from_relevant_wickets_mean': 'mean_wickets_taken',
    'total_runs_batter_mean': 'mean_batter_runs_scored',
    'total_is_wicket_mean': 'mean_wickets_conceded'
}

data = totals_run_estimation_df.drop(columns=['total_total_runs_conceded_mean', 'total_total_mean', 'total_high_scoring_hit_mean']) \
                               .rename(columns=cols_to_rename)

X = data.drop(columns=['final_runs'])
y = data['final_runs']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [120]:
# No Hyperparameter tuning:
model, analysis, feature_importance = do_random_forest(X_train, y_train, cv=5, random_state=42)

Cross-Validation Analysis:

R² Scores:
Training Average: 0.919 ± 0.001
CV Average: 0.418 ± 0.017
Gap (Train-CV): 0.501
CV Consistency: Stable

RMSE Scores:
Training Average: 12.636 ± 0.087
CV Average: 33.942 ± 0.841
Gap (CV-Train): 21.307

MAPE Scores:
Training Average: 0.122 ± 0.002
CV Average: 0.332 ± 0.012
Gap (CV-Train): 0.210

Top 10 Most Important Features:
                     Feature  Importance
3    mean_batter_runs_scored    0.493358
0  mean_batter_runs_conceded    0.143125
1  mean_extras_runs_conceded    0.139528
2         mean_wickets_taken    0.113458
4      mean_wickets_conceded    0.110531


Intereestingly, we see marginally worse results than using all features with no hyperparameter tuning.

In [121]:
model, kwargs = tune_random_forest(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   2.0s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   2.0s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   2.1s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   2.1s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   2.1s
[CV] END max_depth=100, max_features=None, min_samples_leaf=8, min_samples_split=20, n_estimators=200; total time=   3.5s
[CV] END max_depth=100, max_features=None, min_samples_leaf=8, min_samples_split=20, n_estimators=200; total time=   3.7s
[CV] END max_depth=100, max_features=None, min_samples_leaf=8, min_samples_split=20, n_est

In [125]:
model, analysis, feature_importance = do_random_forest(X_train, y_train, cv=5, random_state=42, **kwargs)

Cross-Validation Analysis:

R² Scores:
Training Average: 0.636 ± 0.003
CV Average: 0.449 ± 0.014
Gap (Train-CV): 0.187
CV Consistency: Stable

RMSE Scores:
Training Average: 26.869 ± 0.169
CV Average: 33.042 ± 0.792
Gap (CV-Train): 6.173

MAPE Scores:
Training Average: 0.268 ± 0.003
CV Average: 0.329 ± 0.015
Gap (CV-Train): 0.061

Top 10 Most Important Features:
                     Feature  Importance
3    mean_batter_runs_scored    0.440788
0  mean_batter_runs_conceded    0.275927
4      mean_wickets_conceded    0.128514
1  mean_extras_runs_conceded    0.088376
2         mean_wickets_taken    0.066395


So... There is very little overfitting which is good (likely stemming from the reduced feature set). However scores are marginally worse than our other model with the entire feature set.

Based on the fact that we are experiencing very little overfitting, I am going to attempt to increase the model complexity to see if it improves performance.

In [123]:
kwargs 

{'n_estimators': 300,
 'min_samples_split': 20,
 'min_samples_leaf': 16,
 'max_features': 'log2',
 'max_depth': 20}

In [124]:
kwargs = {
    'n_estimators': 1000,
    'min_samples_split': 10,
    'min_samples_leaf': 8,
    'max_features': 'log2',
    'max_depth': 50
}
tuned_model, tuned_analysis, tuned_feature_importance = do_random_forest(X_train, y_train, cv=5, random_state=42, **kwargs)


Cross-Validation Analysis:

R² Scores:
Training Average: 0.636 ± 0.003
CV Average: 0.449 ± 0.014
Gap (Train-CV): 0.187
CV Consistency: Stable

RMSE Scores:
Training Average: 26.869 ± 0.169
CV Average: 33.042 ± 0.792
Gap (CV-Train): 6.173

MAPE Scores:
Training Average: 0.268 ± 0.003
CV Average: 0.329 ± 0.015
Gap (CV-Train): 0.061

Top 10 Most Important Features:
                     Feature  Importance
3    mean_batter_runs_scored    0.440788
0  mean_batter_runs_conceded    0.275927
4      mean_wickets_conceded    0.128514
1  mean_extras_runs_conceded    0.088376
2         mean_wickets_taken    0.066395


Unfortunately, the actaul results stay relatively similar while performance on the training data improves, suggesting that the increase in complexity is only leading to more overfitting.

I will therefore use the previous model with the test set.

In [126]:
r2, rmse, mape = test_final_model(model, X_test, y_test)

Final Model Performance:
R²: 0.457
RMSE: 33.054
MAPE: 0.296


Comparing the test scores to those of the random forest model with all features, we see that the scores marginally worse. The R^2 is 0.457 compared to 0.504, RMSE is 33.1 compared to 31.3 and MAPE is 0.296 compared to 0.265.

As per my original theory, it is therefore evident that reducing the feature set has not led to an improvement in the random forest model's performance, in fact leading to the opposite. While it does still marginally outperform the baseline model, it is far from ideal.

I am curious to see if a more complex model such as a neural network could improve results. If I have time I will try this later

---
# Wicket Estimation

Learning from run estimation, I will start by using only total stats. Furthermore I won't bother with feature selection due to the small number of features we'll have.

While RFE is useless as above due to the small number of features, I did consider dropping everything except the number of wickets taken/conceded. That being said, in my head it makes sense that either:
- More aggressive runs will lead to more wickets being taken
- more runs = a better batting side and therefore less wickets conceded

I'll evaluate the model results to see if these are the case or if these features were completely insignificant. Hopefully leaving them in shouldn't harm baseline performance much.

In [68]:
cols_to_keep = [col for col in wicket_estimation_df.columns if 'total' in col[:5]] + ['final_wickets']
totals_wicket_estimation_df = wicket_estimation_df[cols_to_keep].copy()

cols_to_rename = {
    'total_batter_runs_conceded_mean': 'mean_batter_runs_conceded',
    'total_runs_from_relevant_extras_mean': 'mean_extras_runs_conceded',
    'total_taken_from_relevant_wickets_mean': 'mean_wickets_taken',
    'total_runs_batter_mean': 'mean_batter_runs_scored',
    'total_is_wicket_mean': 'mean_wickets_conceded'
}

totals_wicket_estimation_df.rename(columns=cols_to_rename, inplace=True)

train_data, test_data = train_test_split(totals_wicket_estimation_df, test_size=0.2, random_state=42)

In [70]:
totals_wicket_estimation_df.columns

Index(['mean_batter_runs_conceded', 'mean_extras_runs_conceded',
       'total_total_runs_conceded_mean', 'mean_wickets_taken',
       'mean_batter_runs_scored', 'total_high_scoring_hit_mean',
       'total_total_mean', 'mean_wickets_conceded', 'final_wickets'],
      dtype='object')

In [69]:
model, metrics, coefficients = train_and_evaluate_baseline_model(train_data, test_data, 'final_wickets')

Model Performance:
Train R²: 0.456
Test R²: 0.456
Train RMSE: 2.066
Test RMSE: 2.152
Train MAPE: 292375532727009.438
Test MAPE: 402287741477278.312

Feature Coefficients:
                          Feature  Coefficient
0       mean_batter_runs_conceded     1.804795
1       mean_extras_runs_conceded    -0.153227
2  total_total_runs_conceded_mean    -1.608579
3              mean_wickets_taken     0.474210
4         mean_batter_runs_scored    -1.244086
5     total_high_scoring_hit_mean    -2.424828
6                total_total_mean     3.149803
7           mean_wickets_conceded     1.454342


In [71]:
totals_wicket_estimation_df['final_wickets'].mean()

np.float64(6.295273499734466)

Okay interesting. The model is okay, but not amazing.

The absurd MAPE values are liekly due to games with 0 wickets conceded as I know that MAPE struggles with 0 values.

THat being said, the result of only 2 wickets taken off is okay. Based on the mean value of 6.30 wickets conceded per game, this suggests that the actual MAPE is again around 0.3 which is not terrible but again really not great.

Similar results for R^2 compared to run estimation, hence similar conclusions.

It is however interesting that the model does not find mean_wickets_taken very important. I want to repeat this one more time with only wickets taken statistics.

In [72]:
wickets_only = totals_wicket_estimation_df[['mean_wickets_taken', 'mean_wickets_conceded', 'final_wickets']]

train_data, test_data = train_test_split(wickets_only, test_size=0.2, random_state=42)

model, metrics, coefficients = train_and_evaluate_baseline_model(train_data, test_data, 'final_wickets')

Model Performance:
Train R²: 0.388
Test R²: 0.388
Train RMSE: 2.190
Test RMSE: 2.285
Train MAPE: 348523715092883.062
Test MAPE: 449098048095465.750

Feature Coefficients:
                 Feature  Coefficient
0     mean_wickets_taken     0.530018
1  mean_wickets_conceded     1.538253


Interestingly this performs even worse, and mean_wickets_taken is not much more important than when all of the other features were included.

This suggests they may indeed be of somewhat importance to predicting the number of wickets taken.

## Random Forest for Wicket Estimation

I will now repeat the same process for wicket estimation as I did for run estimation.
Based on my results above with the baseline and also those of the random forest with all features, I will start with all features. I may attempt to slim down the feature set later, however my experiments on run estimation suggest that this will not lead to any meaningful improvements and would probably be a waste of time.

In [134]:
data = wicket_estimation_df
X = data.drop(columns=['final_wickets'])
y = data['final_wickets']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

In [135]:
model, analysis, feature_importance = do_random_forest(X_train, y_train, cv=5, random_state=42)

Cross-Validation Analysis:

R² Scores:
Training Average: 0.934 ± 0.001
CV Average: 0.535 ± 0.012
Gap (Train-CV): 0.399
CV Consistency: Stable

RMSE Scores:
Training Average: 0.719 ± 0.002
CV Average: 1.910 ± 0.024
Gap (CV-Train): 1.192

MAPE Scores:
Training Average: 97973539648279.156 ± 4057527847656.355
CV Average: 264303690240606.938 ± 45837545190482.172
Gap (CV-Train): 166330150592327.781

Top 10 Most Important Features:
                                           Feature  Importance
15                            total_is_wicket_mean    0.450925
3           total_taken_from_relevant_wickets_mean    0.053848
19                        powerplay_is_wicket_mean    0.049884
11  non_powerplay_taken_from_relevant_wickets_mean    0.049578
23                    non_powerplay_is_wicket_mean    0.044251
13                     total_high_scoring_hit_mean    0.033180
7       powerplay_taken_from_relevant_wickets_mean    0.026933
5         powerplay_runs_from_relevant_extras_mean    0.025796
1   

Compared to out baseline, we see at least a marginally better model performance across the board. The model is able to fit the data better, as evidenced by the greater R^2 values, as well as having a slightly lower RMSE. Again due to the presence of 0 values, the MAPE is pretty useless here.

There is however clear evidence of overfitting, as evidenced by the somewhat high CV gap for both R^2 and RMSE.

In [136]:
model, kwargs = tune_random_forest(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   4.2s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   4.2s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   4.2s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   4.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   4.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   4.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   4.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimator

In [137]:
model, analysis, feature_importance = do_random_forest(X_train, y_train, cv=5, random_state=42, **kwargs)

Cross-Validation Analysis:

R² Scores:
Training Average: 0.799 ± 0.002
CV Average: 0.535 ± 0.014
Gap (Train-CV): 0.264
CV Consistency: Stable

RMSE Scores:
Training Average: 1.257 ± 0.004
CV Average: 1.910 ± 0.029
Gap (CV-Train): 0.654

MAPE Scores:
Training Average: 184495958698196.062 ± 7839447938444.005
CV Average: 270531481338957.938 ± 49280875876057.430
Gap (CV-Train): 86035522640761.875

Top 10 Most Important Features:
                                           Feature  Importance
15                            total_is_wicket_mean    0.539399
3           total_taken_from_relevant_wickets_mean    0.053853
11  non_powerplay_taken_from_relevant_wickets_mean    0.047092
19                        powerplay_is_wicket_mean    0.046759
23                    non_powerplay_is_wicket_mean    0.040804
13                     total_high_scoring_hit_mean    0.031512
5         powerplay_runs_from_relevant_extras_mean    0.019222
7       powerplay_taken_from_relevant_wickets_mean    0.018788
17  

While the model is overfitting less, the scores have not improved whatsoever. We still see the same CV R^2 and RMSE, suggesting that perhaps the model is a) not a good fit for the data or b) the model has reached the capacity of how much it can learn from the data.

Since the model has not capped out at any of the hyperparameters (on an end of either complexity or reducing overfitting), this suggets that it further hyperparameter tuning will be unlikely to improve performance significantly.

Despite this, it is a 5-10 percent improvement in RMSE over the baseline model, so it is not nothing, though not significant. I will evaluate the final performance of the model with the test set and see how it compares.


In [139]:
r2, rmse, mape = test_final_model(model, X_test, y_test)

Final Model Performance:
R²: 0.541
RMSE: 1.914
MAPE: 298297340872460.750


The results here are very similar to those of the validation set. It does suggest minimal overfitting for the model, however again the scores are not a great improvement over the baseline model.

While I do not expect much or any of an improvement, I will repeat this with only the total features to see if I am wrong.

In [141]:
data = totals_wicket_estimation_df
X = data.drop(columns=['final_wickets'])
y = data['final_wickets']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

In [142]:
# No Hyperparameter tuning:
model, analysis, feature_importance = do_random_forest(X_train, y_train, cv=5, random_state=42)

Cross-Validation Analysis:

R² Scores:
Training Average: 0.927 ± 0.000
CV Average: 0.477 ± 0.012
Gap (Train-CV): 0.450
CV Consistency: Stable

RMSE Scores:
Training Average: 0.757 ± 0.003
CV Average: 2.025 ± 0.022
Gap (CV-Train): 1.269

MAPE Scores:
Training Average: 101482163357139.844 ± 3807303728736.096
CV Average: 278734270683032.312 ± 55129956444989.617
Gap (CV-Train): 177252107325892.469

Top 10 Most Important Features:
                          Feature  Importance
7           mean_wickets_conceded    0.490366
3              mean_wickets_taken    0.121427
5     total_high_scoring_hit_mean    0.099266
1       mean_extras_runs_conceded    0.084365
4         mean_batter_runs_scored    0.052550
6                total_total_mean    0.051002
2  total_total_runs_conceded_mean    0.050753
0       mean_batter_runs_conceded    0.050270


In [143]:
model, kwargs = tune_random_forest(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=50, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   3.6s
[CV] END max_depth=50, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   3.7s
[CV] END max_depth=50, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   3.7s[CV] END max_depth=50, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   3.7s

[CV] END max_depth=50, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   3.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=16, min_samples_split=2, n_estimators=500; total time=   3.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=16, min_samples_split=2, n_estimators=500; total time=   3.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=16, min_samples_split=2, n_estimators=

In [144]:
model, analysis, feature_importance = do_random_forest(X_train, y_train, cv=5, random_state=42, **kwargs)

Cross-Validation Analysis:

R² Scores:
Training Average: 0.768 ± 0.001
CV Average: 0.500 ± 0.014
Gap (Train-CV): 0.268
CV Consistency: Stable

RMSE Scores:
Training Average: 1.349 ± 0.005
CV Average: 1.980 ± 0.034
Gap (CV-Train): 0.631

MAPE Scores:
Training Average: 203878937434920.500 ± 8348068630364.134
CV Average: 290926562965318.438 ± 60395113572165.492
Gap (CV-Train): 87047625530397.938

Top 10 Most Important Features:
                          Feature  Importance
7           mean_wickets_conceded    0.478821
3              mean_wickets_taken    0.132890
5     total_high_scoring_hit_mean    0.107585
1       mean_extras_runs_conceded    0.066748
6                total_total_mean    0.063595
4         mean_batter_runs_scored    0.058719
0       mean_batter_runs_conceded    0.045857
2  total_total_runs_conceded_mean    0.045786


Interestingly, the hyperparameter tuning does lead to more of an improvement here than while using all features. That being said, it still performs worse than said model (albeit marginally better than the baseline). I expect altering model complexity to again have very limited impact based on all of the results up to this point, hence I do not expect to gain anything from trying it.

In [145]:
r2, rmse, mape = test_final_model(model, X_test, y_test)

Final Model Performance:
R²: 0.503
RMSE: 1.992
MAPE: 330637638622981.625


As seen with the cross validation results, the model does not outperform the similar one with a full feature set, albeit it does marginally better than the baseline.