In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from joblib import dump, load

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate


In [2]:
data = pd.read_csv('../data/saved_data/standardized_per_game_data.csv')
data.head()

Unnamed: 0,game_id,innings,total_batter_runs_conceded_mean,total_runs_from_relevant_extras_mean,total_total_runs_conceded_mean,total_taken_from_relevant_wickets_mean,powerplay_batter_runs_conceded_mean,powerplay_runs_from_relevant_extras_mean,powerplay_total_runs_conceded_mean,powerplay_taken_from_relevant_wickets_mean,...,powerplay_runs_batter_mean,powerplay_high_scoring_hit_mean,powerplay_total_mean,powerplay_is_wicket_mean,non_powerplay_runs_batter_mean,non_powerplay_high_scoring_hit_mean,non_powerplay_total_mean,non_powerplay_is_wicket_mean,final_runs,final_wickets
0,0021ca69-72b5-4080-9e2e-f7cd9e558b47,1,1.06067,-0.51367,1.011385,-0.551914,0.691736,-0.427798,0.586786,-0.163023,...,0.906615,1.147216,0.869648,-0.755857,0.516191,0.368048,0.520429,0.3956,170,6
1,0021ca69-72b5-4080-9e2e-f7cd9e558b47,2,0.861048,0.147495,0.893025,0.872679,0.559383,-0.006142,0.576349,1.45565,...,0.711668,0.555493,0.538625,-0.559719,0.789972,0.55527,0.796122,-0.310116,147,10
2,002795ac-3340-4ccf-9763-865ab0fb8268,1,0.197794,2.108226,0.479992,0.819249,-0.584629,1.416903,-0.411425,1.396177,...,0.255541,0.150034,0.39928,0.147236,0.737713,0.817304,1.010981,0.106142,136,10
3,002795ac-3340-4ccf-9763-865ab0fb8268,2,0.275919,0.611244,0.38953,-0.137472,0.523102,1.365255,0.900301,-0.881982,...,0.108497,0.167389,0.243849,-0.900453,-0.658484,-1.20801,-0.604155,0.118649,120,7
4,0033798e-430e-4aa1-a401-30db51622847,1,-0.092653,-0.665319,-0.183638,1.374086,0.372929,-0.95686,0.202154,-0.733449,...,1.43199,2.080202,1.475127,-0.129252,0.719002,1.13004,0.81744,0.595391,161,9


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7533 entries, 0 to 7532
Data columns (total 28 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   game_id                                         7533 non-null   object 
 1   innings                                         7533 non-null   int64  
 2   total_batter_runs_conceded_mean                 7533 non-null   float64
 3   total_runs_from_relevant_extras_mean            7533 non-null   float64
 4   total_total_runs_conceded_mean                  7533 non-null   float64
 5   total_taken_from_relevant_wickets_mean          7533 non-null   float64
 6   powerplay_batter_runs_conceded_mean             7533 non-null   float64
 7   powerplay_runs_from_relevant_extras_mean        7533 non-null   float64
 8   powerplay_total_runs_conceded_mean              7533 non-null   float64
 9   powerplay_taken_from_relevant_wickets_mea

In [4]:
# There's one game missing non-powerplay stats, so I will drop that game.
data = data.dropna()

In [5]:
# Seperate test data for final evaluation
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
test_data.to_csv('../data/saved_data/match_test_data.csv', index=False)
data = train_data

In [6]:
run_estimation_df = data.drop(columns=['game_id', 'innings', 'final_wickets'])
wicket_estimation_df = data.drop(columns=['game_id', 'innings', 'final_runs'])


# Baseline Model

For my baseline I am going to use linear regression. I will use RFE for feature selection.

I belive innings score is given as runs and wickets, so I will train two models, one to predict runs and one to predict wickets. I will do these seperately.

# Run Estimation

In [7]:
train_data, test_data = train_test_split(run_estimation_df, test_size=0.2, random_state=42)

In [8]:
def initial_feature_selection(df, n_features_to_select=10, target_column='final_runs'):
    X = df.drop(columns=[target_column])
    y = df[target_column]
    
    estimator = LinearRegression()
    
    selector = RFE(estimator=estimator, 
                  n_features_to_select=n_features_to_select,
                  step=1)
    selector = selector.fit(X, y)
    
    selected_features = X.columns[selector.support_].tolist()
    
    feature_ranking = pd.DataFrame({
        'Feature': X.columns,
        'Ranking': selector.ranking_
    }).sort_values('Ranking')
    print("Feature Rankings:")
    print(feature_ranking)
    
    return selected_features

important_features = initial_feature_selection(train_data)


Feature Rankings:
                                           Feature  Ranking
0                  total_batter_runs_conceded_mean        1
13                     total_high_scoring_hit_mean        1
12                          total_runs_batter_mean        1
22                        non_powerplay_total_mean        1
17                 powerplay_high_scoring_hit_mean        1
20                  non_powerplay_runs_batter_mean        1
18                            powerplay_total_mean        1
21             non_powerplay_high_scoring_hit_mean        1
2                   total_total_runs_conceded_mean        1
1             total_runs_from_relevant_extras_mean        1
15                            total_is_wicket_mean        2
16                      powerplay_runs_batter_mean        3
4              powerplay_batter_runs_conceded_mean        4
9     non_powerplay_runs_from_relevant_extras_mean        5
6               powerplay_total_runs_conceded_mean        6
5         powerplay_ru

In [9]:
def check_correlations(df, features):
    corr_matrix = df[features].corr()
    
    # Find highly correlated 
    high_corr = []
    for i in range(len(features)):
        for j in range(i+1, len(features)):
            if abs(corr_matrix.iloc[i,j]) > 0.7:
                high_corr.append({
                    'feature1': features[i],
                    'feature2': features[j],
                    'correlation': corr_matrix.iloc[i,j]
                })
    
    return pd.DataFrame(high_corr)

# Use it
correlations = check_correlations(train_data, important_features)
print("Highly correlated features:")
correlations

Highly correlated features:


Unnamed: 0,feature1,feature2,correlation
0,total_batter_runs_conceded_mean,total_total_runs_conceded_mean,0.984783
1,total_batter_runs_conceded_mean,total_runs_batter_mean,0.767626
2,total_batter_runs_conceded_mean,total_high_scoring_hit_mean,0.722504
3,total_batter_runs_conceded_mean,non_powerplay_runs_batter_mean,0.741551
4,total_batter_runs_conceded_mean,non_powerplay_total_mean,0.731489
5,total_total_runs_conceded_mean,total_runs_batter_mean,0.733432
6,total_total_runs_conceded_mean,non_powerplay_runs_batter_mean,0.7082
7,total_total_runs_conceded_mean,non_powerplay_total_mean,0.710996
8,total_runs_batter_mean,total_high_scoring_hit_mean,0.963584
9,total_runs_batter_mean,powerplay_high_scoring_hit_mean,0.866549


It is evnident that there is significant correlation between several of the best ranked features, likely stemming from the fact that total stats encompass powerplay and non-powerplay stats. I am going to start by only keeping the total stats for now.

Since I would be eliminating a large chunk of the 'important_features' I will start again from scratch with only the total stats.

In [10]:
cols_to_keep = [col for col in run_estimation_df.columns if 'total' in col[:5]] + ['final_runs']
totals_run_estimation_df = run_estimation_df[cols_to_keep]

train_data, test_data = train_test_split(totals_run_estimation_df, test_size=0.2, random_state=42)

In [11]:
important_features = initial_feature_selection(train_data, target_column='final_runs')

Feature Rankings:
                                  Feature  Ranking
0         total_batter_runs_conceded_mean        1
1    total_runs_from_relevant_extras_mean        1
2          total_total_runs_conceded_mean        1
3  total_taken_from_relevant_wickets_mean        1
4                  total_runs_batter_mean        1
5             total_high_scoring_hit_mean        1
6                        total_total_mean        1
7                    total_is_wicket_mean        1




RFE does not work with <10 features. That being said barring any significant correlations I don't think I have an amount that should negatively impact the model so I will keep all features.

In [12]:
correlations = check_correlations(train_data, important_features)
correlations

Unnamed: 0,feature1,feature2,correlation
0,total_batter_runs_conceded_mean,total_total_runs_conceded_mean,0.984783
1,total_batter_runs_conceded_mean,total_runs_batter_mean,0.767626
2,total_batter_runs_conceded_mean,total_high_scoring_hit_mean,0.722504
3,total_batter_runs_conceded_mean,total_total_mean,0.760415
4,total_total_runs_conceded_mean,total_runs_batter_mean,0.733432
5,total_total_runs_conceded_mean,total_total_mean,0.739177
6,total_runs_batter_mean,total_high_scoring_hit_mean,0.963584
7,total_runs_batter_mean,total_total_mean,0.993446
8,total_high_scoring_hit_mean,total_total_mean,0.964398


so... there are a lot of high correlations, but a couple very high ones.
- The mean runs conceded and the mean batter only runs conceded are highly very highly correlated, likely due to the fact that most of the runs conceded are batter runs and not extras.
- the mean runs conceded scored by the batter is very highly correlated with:
    - the mean no. of high scoring balls hit
    - the mean total runs made by the batting side including batter runs and extras.
- Finally the mean total runs scored is highly correlated with the mean no. of high scoring balls hit.

Based on this, I will remove:
- The mean total runs conceded per game
- The mean number of high scoring balls hit per game
- The mean runs scored by the batting side per game

For the sake of clarity I will also rename the existing features.

In [13]:
train_data = train_data.drop(columns=['total_total_runs_conceded_mean', 'total_total_mean', 'total_high_scoring_hit_mean'])
test_data = test_data.drop(columns=['total_total_runs_conceded_mean', 'total_total_mean', 'total_high_scoring_hit_mean'])

train_data.columns

Index(['total_batter_runs_conceded_mean',
       'total_runs_from_relevant_extras_mean',
       'total_taken_from_relevant_wickets_mean', 'total_runs_batter_mean',
       'total_is_wicket_mean', 'final_runs'],
      dtype='object')

In [14]:
cols_to_rename = {
    'total_batter_runs_conceded_mean': 'mean_batter_runs_conceded',
    'total_runs_from_relevant_extras_mean': 'mean_extras_runs_conceded',
    'total_taken_from_relevant_wickets_mean': 'mean_wickets_taken',
    'total_runs_batter_mean': 'mean_batter_runs_scored',
    'total_is_wicket_mean': 'mean_wickets_conceded'
}

train_data.rename(columns=cols_to_rename, inplace=True)
test_data.rename(columns=cols_to_rename, inplace=True)
train_data.columns

Index(['mean_batter_runs_conceded', 'mean_extras_runs_conceded',
       'mean_wickets_taken', 'mean_batter_runs_scored',
       'mean_wickets_conceded', 'final_runs'],
      dtype='object')

In [15]:
def train_and_evaluate_baseline_model(train_data, test_data, target_column):
    X_train = train_data.drop(columns=[target_column])
    y_train = train_data[target_column]
    X_test = test_data.drop(columns=[target_column])
    y_test = test_data[target_column]

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    metrics = {
        'train': {
            'r2': r2_score(y_train, y_pred_train),
            'rmse': np.sqrt(mean_squared_error(y_train, y_pred_train)),
            'mape': mean_absolute_percentage_error(y_train, y_pred_train)
        },
        'test': {
            'r2': r2_score(y_test, y_pred_test),
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred_test)),
            'mape': mean_absolute_percentage_error(y_test, y_pred_test)
        }
    }
    coefficients = pd.DataFrame({
        'Feature': X_train.columns,
        'Coefficient': model.coef_
    })
    
    print("Model Performance:")
    print(f"Train R²: {metrics['train']['r2']:.3f}")
    print(f"Test R²: {metrics['train']['r2']:.3f}")
    print(f"Train RMSE: {metrics['train']['rmse']:.3f}")
    print(f"Test RMSE: {metrics['test']['rmse']:.3f}")
    print(f"Train MAPE: {metrics['train']['mape']:.3f}")
    print(f"Test MAPE: {metrics['test']['mape']:.3f}")
    print("\nFeature Coefficients:")
    print(coefficients)
    
    return model, metrics, coefficients

model, metrics, coefficients = train_and_evaluate_baseline_model(train_data, test_data, 'final_runs')

Model Performance:
Train R²: 0.456
Test R²: 0.456
Train RMSE: 32.589
Test RMSE: 32.774
Train MAPE: 0.306
Test MAPE: 0.282

Feature Coefficients:
                     Feature  Coefficient
0  mean_batter_runs_conceded    11.130904
1  mean_extras_runs_conceded     6.829709
2         mean_wickets_taken    -0.660033
3    mean_batter_runs_scored    20.373649
4      mean_wickets_conceded    -5.456100


So... The model is not great - but that's fine for a baseline.

The similar test and train results suggest no overfitting, though considering the number of features and the model itself this was unlikely anyway.

The R^2 is 0.446 which is not great, suggests out baseline struggled to fit the data well.

The MAPE is 0.301, suggesting that while the model is not great, it is faily okay for getting a ballpark estimation... 

Anyways, time to improve on this.

## Random Forest

I am going to try an approach using tree based models to see if they can improve results over simple linear regression.

I will start with a random forest model.

Since random forest is not significantly affected by multicollinearity, I will keep the original feature set as it should filter what it believes is most important/informative at each step. Depending on performance however I may then try removing some features.

I will be using a train/valid/test split this time as there are some hyperparameters to tune.

In [16]:
data = run_estimation_df
target_column = 'final_runs'
test_size = 0.15
random_state = 42


X = data.drop(columns=[target_column])
y = data[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)

In [17]:
def do_random_forest(X_train, y_train, cv=5, random_state=42, **kwargs):
    model = RandomForestRegressor(random_state=random_state, **kwargs)
    
    cv_results = cross_validate(
        model,
        X_train,
        y_train,
        cv=cv,
        scoring={
            'r2': 'r2',
            'rmse': 'neg_root_mean_squared_error',
            'mape': 'neg_mean_absolute_percentage_error'
        },
        return_train_score=True
    )
    
    model.fit(X_train, y_train)
    
    analysis = {
        'r2': {
            'train_avg': cv_results['train_r2'].mean(),
            'train_std': cv_results['train_r2'].std(),
            'cv_avg': cv_results['test_r2'].mean(),
            'cv_std': cv_results['test_r2'].std(),
            'gap': cv_results['train_r2'].mean() - cv_results['test_r2'].mean(),
            'consistency': 'Stable' if cv_results['test_r2'].std() < 0.05 else 'Unstable'
        },
        'rmse': {
            'train_avg': -cv_results['train_rmse'].mean(),
            'train_std': cv_results['train_rmse'].std(),
            'cv_avg': -cv_results['test_rmse'].mean(),
            'cv_std': cv_results['test_rmse'].std(),
            'gap': -cv_results['test_rmse'].mean() - (-cv_results['train_rmse'].mean())
        },
        'mape': {
            'train_avg': -cv_results['train_mape'].mean(),
            'train_std': cv_results['train_mape'].std(),
            'cv_avg': -cv_results['test_mape'].mean(),
            'cv_std': cv_results['test_mape'].std(),
            'gap': -cv_results['test_mape'].mean() - (-cv_results['train_mape'].mean())
        }
    }
    
    print("Cross-Validation Analysis:")
    print("\nR² Scores:")
    print(f"Training Average: {analysis['r2']['train_avg']:.3f} ± {analysis['r2']['train_std']:.3f}")
    print(f"CV Average: {analysis['r2']['cv_avg']:.3f} ± {analysis['r2']['cv_std']:.3f}")
    print(f"Gap (Train-CV): {analysis['r2']['gap']:.3f}")
    print(f"CV Consistency: {analysis['r2']['consistency']}")
    
    print("\nRMSE Scores:")
    print(f"Training Average: {analysis['rmse']['train_avg']:.3f} ± {analysis['rmse']['train_std']:.3f}")
    print(f"CV Average: {analysis['rmse']['cv_avg']:.3f} ± {analysis['rmse']['cv_std']:.3f}")
    print(f"Gap (CV-Train): {analysis['rmse']['gap']:.3f}")
    
    print("\nMAPE Scores:")
    print(f"Training Average: {analysis['mape']['train_avg']:.3f} ± {analysis['mape']['train_std']:.3f}")
    print(f"CV Average: {analysis['mape']['cv_avg']:.3f} ± {analysis['mape']['cv_std']:.3f}")
    print(f"Gap (CV-Train): {analysis['mape']['gap']:.3f}")
    
    feature_importance = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))
    
    return model, analysis, feature_importance

In [18]:
model, analysis, feature_importance = do_random_forest(X_train, y_train)

Cross-Validation Analysis:

R² Scores:
Training Average: 0.926 ± 0.001
CV Average: 0.472 ± 0.016
Gap (Train-CV): 0.454
CV Consistency: Stable

RMSE Scores:
Training Average: 12.070 ± 0.092
CV Average: 32.132 ± 0.509
Gap (CV-Train): 20.061

MAPE Scores:
Training Average: 0.111 ± 0.004
CV Average: 0.295 ± 0.040
Gap (CV-Train): 0.184

Top 10 Most Important Features:
                                           Feature  Importance
22                        non_powerplay_total_mean    0.328556
14                                total_total_mean    0.115499
10          non_powerplay_total_runs_conceded_mean    0.039960
2                   total_total_runs_conceded_mean    0.035861
5         powerplay_runs_from_relevant_extras_mean    0.034760
15                            total_is_wicket_mean    0.034508
1             total_runs_from_relevant_extras_mean    0.031531
11  non_powerplay_taken_from_relevant_wickets_mean    0.029799
9     non_powerplay_runs_from_relevant_extras_mean    0.029393
8   

So... with no tuning the model is not great. It is able to fit the training data very well, but the CV gap demonstrates that there is significant overfitting going on. In fact, we see scores that are marginally worse than the baseline model.

I am going to begin by seeing how much I can improve the model by tuning the hyperparameters. Hopefully automated tuning should help a decent amount, followed by manual tuning depending on results.

In [19]:
def tune_random_forest(X_train, y_train):    
    param_grid = {
        'max_depth': [10, 20, 30, 50, 100, None],
        'min_samples_split': [2, 5, 10, 20, 50],
        'min_samples_leaf': [1, 2, 4, 8, 16],
        'n_estimators': [100, 200, 300, 400, 500],
        'max_features': ['sqrt', 'log2', None]
    }
    
    search = RandomizedSearchCV(
        RandomForestRegressor(random_state=42),
        param_grid,
        cv=5,
        scoring='r2',
        n_jobs=-1,
        verbose=2
    )
    
    search.fit(X_train, y_train)
    
    best_model = search.best_estimator_
    final_score = best_model.score(X_train, y_train)
    
    print("Best parameters:", search.best_params_)
    print("CV Score:", search.best_score_)
    print("Final Score:", final_score)
    
    return best_model, search.best_params_

In [20]:
best_rf, kwargs = tune_random_forest(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=16, min_samples_split=10, n_estimators=400; total time=   3.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=16, min_samples_split=10, n_estimators=400; total time=   3.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=16, min_samples_split=10, n_estimators=400; total time=   4.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=16, min_samples_split=10, n_estimators=400; total time=   4.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=16, min_samples_split=10, n_estimators=400; total time=   4.0s
[CV] END max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=   5.9s
[CV] END max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=   6.2s
[CV] END max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split

In [21]:
kwargs

{'n_estimators': 400,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'log2',
 'max_depth': None}

In [22]:
model, analysis, feature_importance = do_random_forest(X_train, y_train, cv=5, random_state=42, **kwargs)

Cross-Validation Analysis:

R² Scores:
Training Average: 0.873 ± 0.002
CV Average: 0.480 ± 0.017
Gap (Train-CV): 0.393
CV Consistency: Stable

RMSE Scores:
Training Average: 15.754 ± 0.073
CV Average: 31.884 ± 0.516
Gap (CV-Train): 16.130

MAPE Scores:
Training Average: 0.147 ± 0.006
CV Average: 0.297 ± 0.039
Gap (CV-Train): 0.149

Top 10 Most Important Features:
                                    Feature  Importance
22                 non_powerplay_total_mean    0.097153
14                         total_total_mean    0.096426
20           non_powerplay_runs_batter_mean    0.076919
12                   total_runs_batter_mean    0.073876
13              total_high_scoring_hit_mean    0.063561
21      non_powerplay_high_scoring_hit_mean    0.056652
2            total_total_runs_conceded_mean    0.045199
18                     powerplay_total_mean    0.041084
10   non_powerplay_total_runs_conceded_mean    0.040734
8   non_powerplay_batter_runs_conceded_mean    0.040404


Compared to the earlier results, we see very little improvement. The CV gap has been reduced, however the actual scores are only very marginally imroved. This suggests that the model is overfitting less, but this is not leading to an improvement in the model's ability to predict new data. 

I am going to try some extremenly agressive hyperparameter values to see if it helps. I expect they may reduce overfitting, but I doubt they will improve the model's overall performance. Possibly worth a shot though

In [23]:
kwargs = {
    'max_depth': 10,
    'min_samples_split': 10,
    'min_samples_leaf': 5,
    'n_estimators': 500,
    'max_features': 'log2'
}
model, analysis, feature_importance = do_random_forest(X_train, y_train, cv=5, random_state=42, **kwargs)

Cross-Validation Analysis:

R² Scores:
Training Average: 0.668 ± 0.002
CV Average: 0.476 ± 0.014
Gap (Train-CV): 0.192
CV Consistency: Stable

RMSE Scores:
Training Average: 25.504 ± 0.116
CV Average: 32.007 ± 0.486
Gap (CV-Train): 6.503

MAPE Scores:
Training Average: 0.235 ± 0.007
CV Average: 0.299 ± 0.038
Gap (CV-Train): 0.065

Top 10 Most Important Features:
                                    Feature  Importance
22                 non_powerplay_total_mean    0.118805
14                         total_total_mean    0.114493
20           non_powerplay_runs_batter_mean    0.094458
12                   total_runs_batter_mean    0.089984
13              total_high_scoring_hit_mean    0.074657
21      non_powerplay_high_scoring_hit_mean    0.065176
2            total_total_runs_conceded_mean    0.046585
10   non_powerplay_total_runs_conceded_mean    0.042021
18                     powerplay_total_mean    0.040333
8   non_powerplay_batter_runs_conceded_mean    0.035816


Turns out my hypothesis was indeed correct, the model is overfitting far less, however the scores have not really improved.
I will finally use this final model with my test set.

In [24]:
def test_final_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print("Final Model Performance:")
    print(f"R²: {r2:.3f}")
    print(f"RMSE: {rmse:.3f}")
    print(f"MAPE: {mape:.3f}")
    return r2, rmse, mape

r2, rmse, mape = test_final_model(best_rf, X_test, y_test)


Final Model Performance:
R²: 0.449
RMSE: 32.648
MAPE: 0.287


In [25]:
dump(best_rf, '../models/match_models/run_model.joblib')

['../models/match_models/run_model.joblib']

Compared to the baseline model, we do see improvements in all values, albeit not significant ones. MAPE is 0.265 compared to 0.301, RMSE 31.3 compared to 33.4 and R^2 is 0.504 compared to 0.446.

I would however like to repeat this with only the total features. While random forest should filter out features itself, I am curious to see how it will affect performance.

In [26]:
cols_to_rename = {
    'total_batter_runs_conceded_mean': 'mean_batter_runs_conceded',
    'total_runs_from_relevant_extras_mean': 'mean_extras_runs_conceded',
    'total_taken_from_relevant_wickets_mean': 'mean_wickets_taken',
    'total_runs_batter_mean': 'mean_batter_runs_scored',
    'total_is_wicket_mean': 'mean_wickets_conceded'
}

data = totals_run_estimation_df.drop(columns=['total_total_runs_conceded_mean', 'total_total_mean', 'total_high_scoring_hit_mean']) \
                               .rename(columns=cols_to_rename)

X = data.drop(columns=['final_runs'])
y = data['final_runs']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
model, analysis, feature_importance = do_random_forest(X_train, y_train, cv=5, random_state=42)

Cross-Validation Analysis:

R² Scores:
Training Average: 0.920 ± 0.001
CV Average: 0.435 ± 0.007
Gap (Train-CV): 0.485
CV Consistency: Stable

RMSE Scores:
Training Average: 12.497 ± 0.067
CV Average: 33.177 ± 0.664
Gap (CV-Train): 20.681

MAPE Scores:
Training Average: 0.114 ± 0.003
CV Average: 0.306 ± 0.038
Gap (CV-Train): 0.192

Top 10 Most Important Features:
                     Feature  Importance
3    mean_batter_runs_scored    0.497249
0  mean_batter_runs_conceded    0.143924
1  mean_extras_runs_conceded    0.137422
2         mean_wickets_taken    0.111080
4      mean_wickets_conceded    0.110324


Intereestingly, we see marginally worse results than using all features with no hyperparameter tuning.

In [28]:
model, kwargs = tune_random_forest(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=100, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   1.4s
[CV] END max_depth=100, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   1.5s
[CV] END max_depth=100, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.6s
[CV] END max_depth=100, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.6s
[CV] END max_depth=100, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   1.6s
[CV] END max_depth=100, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.6s
[CV] END max_depth=100, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.7s
[CV] END max_depth=100, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_est

In [29]:
model, analysis, feature_importance = do_random_forest(X_train, y_train, cv=5, random_state=42, **kwargs)

Cross-Validation Analysis:

R² Scores:
Training Average: 0.564 ± 0.003
CV Average: 0.459 ± 0.013
Gap (Train-CV): 0.106
CV Consistency: Stable

RMSE Scores:
Training Average: 29.161 ± 0.162
CV Average: 32.482 ± 0.744
Gap (CV-Train): 3.322

MAPE Scores:
Training Average: 0.275 ± 0.008
CV Average: 0.306 ± 0.038
Gap (CV-Train): 0.031

Top 10 Most Important Features:
                     Feature  Importance
3    mean_batter_runs_scored    0.467138
0  mean_batter_runs_conceded    0.293383
4      mean_wickets_conceded    0.130179
1  mean_extras_runs_conceded    0.064664
2         mean_wickets_taken    0.044636


So... There is very little overfitting which is good (likely stemming from the reduced feature set). However scores are marginally worse than our other model with the entire feature set.

Based on the fact that we are experiencing very little overfitting, I am going to attempt to increase the model complexity to see if it improves performance.

In [30]:
kwargs 

{'n_estimators': 200,
 'min_samples_split': 2,
 'min_samples_leaf': 16,
 'max_features': 'log2',
 'max_depth': 100}

In [31]:
kwargs = {
    'n_estimators': 1000,
    'min_samples_split': 10,
    'min_samples_leaf': 8,
    'max_features': 'log2',
    'max_depth': 50
}
tuned_model, tuned_analysis, tuned_feature_importance = do_random_forest(X_train, y_train, cv=5, random_state=42, **kwargs)


Cross-Validation Analysis:

R² Scores:
Training Average: 0.641 ± 0.002
CV Average: 0.458 ± 0.014
Gap (Train-CV): 0.184
CV Consistency: Stable

RMSE Scores:
Training Average: 26.460 ± 0.148
CV Average: 32.516 ± 0.761
Gap (CV-Train): 6.056

MAPE Scores:
Training Average: 0.248 ± 0.008
CV Average: 0.304 ± 0.039
Gap (CV-Train): 0.056

Top 10 Most Important Features:
                     Feature  Importance
3    mean_batter_runs_scored    0.446598
0  mean_batter_runs_conceded    0.270235
4      mean_wickets_conceded    0.133793
1  mean_extras_runs_conceded    0.085035
2         mean_wickets_taken    0.064340


Unfortunately, the actaul results stay relatively similar while performance on the training data improves, suggesting that the increase in complexity is only leading to more overfitting.

I will therefore use the previous model with the test set.

In [32]:
r2, rmse, mape = test_final_model(model, X_test, y_test)

Final Model Performance:
R²: 0.443
RMSE: 33.078
MAPE: 0.284


Comparing the test scores to those of the random forest model with all features, we see that the scores marginally worse. The R^2 is 0.457 compared to 0.504, RMSE is 33.1 compared to 31.3 and MAPE is 0.296 compared to 0.265.

As per my original theory, it is therefore evident that reducing the feature set has not led to an improvement in the random forest model's performance, in fact leading to the opposite. While it does still marginally outperform the baseline model, it is far from ideal.

I am curious to see if a more complex model such as a neural network could improve results. If I have time I will try this later

---
# Wicket Estimation

Learning from run estimation, I will start by using only total stats. Furthermore I won't bother with feature selection due to the small number of features we'll have.

While RFE is useless as above due to the small number of features, I did consider dropping everything except the number of wickets taken/conceded. That being said, in my head it makes sense that either:
- More aggressive runs will lead to more wickets being taken
- more runs = a better batting side and therefore less wickets conceded

I'll evaluate the model results to see if these are the case or if these features were completely insignificant. Hopefully leaving them in shouldn't harm baseline performance much.

In [33]:
cols_to_keep = [col for col in wicket_estimation_df.columns if 'total' in col[:5]] + ['final_wickets']
totals_wicket_estimation_df = wicket_estimation_df[cols_to_keep].copy()

cols_to_rename = {
    'total_batter_runs_conceded_mean': 'mean_batter_runs_conceded',
    'total_runs_from_relevant_extras_mean': 'mean_extras_runs_conceded',
    'total_taken_from_relevant_wickets_mean': 'mean_wickets_taken',
    'total_runs_batter_mean': 'mean_batter_runs_scored',
    'total_is_wicket_mean': 'mean_wickets_conceded'
}

totals_wicket_estimation_df.rename(columns=cols_to_rename, inplace=True)

train_data, test_data = train_test_split(totals_wicket_estimation_df, test_size=0.2, random_state=42)

In [34]:
totals_wicket_estimation_df.columns

Index(['mean_batter_runs_conceded', 'mean_extras_runs_conceded',
       'total_total_runs_conceded_mean', 'mean_wickets_taken',
       'mean_batter_runs_scored', 'total_high_scoring_hit_mean',
       'total_total_mean', 'mean_wickets_conceded', 'final_wickets'],
      dtype='object')

In [35]:
model, metrics, coefficients = train_and_evaluate_baseline_model(train_data, test_data, 'final_wickets')

Model Performance:
Train R²: 0.439
Test R²: 0.439
Train RMSE: 2.083
Test RMSE: 1.988
Train MAPE: 296482630123492.750
Test MAPE: 218963026911124.406

Feature Coefficients:
                          Feature  Coefficient
0       mean_batter_runs_conceded     0.360705
1       mean_extras_runs_conceded    -0.337859
2  total_total_runs_conceded_mean    -0.263096
3              mean_wickets_taken     0.454511
4         mean_batter_runs_scored    -1.042803
5     total_high_scoring_hit_mean    -2.386746
6                total_total_mean     2.919206
7           mean_wickets_conceded     1.416949


In [36]:
totals_wicket_estimation_df['final_wickets'].mean()

np.float64(6.32298755186722)

Okay interesting. The model is okay, but not amazing.

The absurd MAPE values are liekly due to games with 0 wickets conceded as I know that MAPE struggles with 0 values.

THat being said, the result of only 2 wickets taken off is okay. Based on the mean value of 6.30 wickets conceded per game, this suggests that the actual MAPE is again around 0.3 which is not terrible but again really not great.

Similar results for R^2 compared to run estimation, hence similar conclusions.

It is however interesting that the model does not find mean_wickets_taken very important. I want to repeat this one more time with only wickets taken statistics.

In [37]:
wickets_only = totals_wicket_estimation_df[['mean_wickets_taken', 'mean_wickets_conceded', 'final_wickets']]

train_data, test_data = train_test_split(wickets_only, test_size=0.2, random_state=42)

model, metrics, coefficients = train_and_evaluate_baseline_model(train_data, test_data, 'final_wickets')

Model Performance:
Train R²: 0.373
Test R²: 0.373
Train RMSE: 2.204
Test RMSE: 2.128
Train MAPE: 346457614875782.062
Test MAPE: 255628949893465.969

Feature Coefficients:
                 Feature  Coefficient
0     mean_wickets_taken     0.500921
1  mean_wickets_conceded     1.514715


Interestingly this performs even worse, and mean_wickets_taken is not much more important than when all of the other features were included.

This suggests they may indeed be of somewhat importance to predicting the number of wickets taken.

## Random Forest for Wicket Estimation

I will now repeat the same process for wicket estimation as I did for run estimation.
Based on my results above with the baseline and also those of the random forest with all features, I will start with all features. I may attempt to slim down the feature set later, however my experiments on run estimation suggest that this will not lead to any meaningful improvements and would probably be a waste of time.

In [38]:
data = wicket_estimation_df
X = data.drop(columns=['final_wickets'])
y = data['final_wickets']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

In [39]:
model, analysis, feature_importance = do_random_forest(X_train, y_train, cv=5, random_state=42)

Cross-Validation Analysis:

R² Scores:
Training Average: 0.931 ± 0.002
CV Average: 0.510 ± 0.048
Gap (Train-CV): 0.421
CV Consistency: Stable

RMSE Scores:
Training Average: 0.729 ± 0.009
CV Average: 1.943 ± 0.075
Gap (CV-Train): 1.214

MAPE Scores:
Training Average: 94192542934474.469 ± 5166293985991.008
CV Average: 253152973135584.312 ± 29277357519708.965
Gap (CV-Train): 158960430201109.844

Top 10 Most Important Features:
                                           Feature  Importance
15                            total_is_wicket_mean    0.446536
3           total_taken_from_relevant_wickets_mean    0.056654
19                        powerplay_is_wicket_mean    0.049234
11  non_powerplay_taken_from_relevant_wickets_mean    0.047735
23                    non_powerplay_is_wicket_mean    0.047499
13                     total_high_scoring_hit_mean    0.033176
7       powerplay_taken_from_relevant_wickets_mean    0.026767
5         powerplay_runs_from_relevant_extras_mean    0.024932
1   

Compared to out baseline, we see at least a marginally better model performance across the board. The model is able to fit the data better, as evidenced by the greater R^2 values, as well as having a slightly lower RMSE. Again due to the presence of 0 values, the MAPE is pretty useless here.

There is however clear evidence of overfitting, as evidenced by the somewhat high CV gap for both R^2 and RMSE.

In [40]:
model, kwargs = tune_random_forest(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=None, max_features=log2, min_samples_leaf=8, min_samples_split=5, n_estimators=500; total time=   5.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=8, min_samples_split=5, n_estimators=500; total time=   5.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=8, min_samples_split=5, n_estimators=500; total time=   5.8s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=8, min_samples_split=5, n_estimators=500; total time=   5.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=8, min_samples_split=5, n_estimators=500; total time=   5.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=8, min_samples_split=10, n_estimators=500; total time=   5.8s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=20, n_estimators=200; total time=  13.7s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=20, n_e

In [41]:
model, analysis, feature_importance = do_random_forest(X_train, y_train, cv=5, random_state=42, **kwargs)

Cross-Validation Analysis:

R² Scores:
Training Average: 0.916 ± 0.003
CV Average: 0.514 ± 0.047
Gap (Train-CV): 0.402
CV Consistency: Stable

RMSE Scores:
Training Average: 0.806 ± 0.010
CV Average: 1.934 ± 0.073
Gap (CV-Train): 1.129

MAPE Scores:
Training Average: 111323872801157.031 ± 4633725188239.227
CV Average: 254275595828873.312 ± 30329714765770.172
Gap (CV-Train): 142951723027716.281

Top 10 Most Important Features:
                                           Feature  Importance
15                            total_is_wicket_mean    0.456886
3           total_taken_from_relevant_wickets_mean    0.055818
19                        powerplay_is_wicket_mean    0.049334
11  non_powerplay_taken_from_relevant_wickets_mean    0.048233
23                    non_powerplay_is_wicket_mean    0.047270
13                     total_high_scoring_hit_mean    0.031911
7       powerplay_taken_from_relevant_wickets_mean    0.025743
5         powerplay_runs_from_relevant_extras_mean    0.024840
1  

While the model is overfitting less, the scores have not improved whatsoever. We still see the same CV R^2 and RMSE, suggesting that perhaps the model is a) not a good fit for the data or b) the model has reached the capacity of how much it can learn from the data.

Since the model has not capped out at any of the hyperparameters (on an end of either complexity or reducing overfitting), this suggets that it further hyperparameter tuning will be unlikely to improve performance significantly.

Despite this, it is a 5-10 percent improvement in RMSE over the baseline model, so it is not nothing, though not significant. I will evaluate the final performance of the model with the test set and see how it compares.


In [42]:
r2, rmse, mape = test_final_model(model, X_test, y_test)

Final Model Performance:
R²: 0.560
RMSE: 1.816
MAPE: 178390577160366.719


In [43]:
dump(model, '../models/match_models/wicket_model.joblib')

['../models/match_models/wicket_model.joblib']

The results here are very similar to those of the validation set. It does suggest minimal overfitting for the model, however again the scores are not a great improvement over the baseline model.

While I do not expect much or any of an improvement, I will repeat this with only the total features to see if I am wrong.

In [44]:
data = totals_wicket_estimation_df
X = data.drop(columns=['final_wickets'])
y = data['final_wickets']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

In [45]:
# No Hyperparameter tuning:
model, analysis, feature_importance = do_random_forest(X_train, y_train, cv=5, random_state=42)

Cross-Validation Analysis:

R² Scores:
Training Average: 0.925 ± 0.002
CV Average: 0.461 ± 0.050
Gap (Train-CV): 0.464
CV Consistency: Unstable

RMSE Scores:
Training Average: 0.761 ± 0.009
CV Average: 2.039 ± 0.073
Gap (CV-Train): 1.277

MAPE Scores:
Training Average: 97283759358546.141 ± 5209345242949.933
CV Average: 253691937645106.094 ± 39418892437442.836
Gap (CV-Train): 156408178286559.938

Top 10 Most Important Features:
                          Feature  Importance
7           mean_wickets_conceded    0.491560
3              mean_wickets_taken    0.123204
5     total_high_scoring_hit_mean    0.100704
1       mean_extras_runs_conceded    0.081088
0       mean_batter_runs_conceded    0.051725
2  total_total_runs_conceded_mean    0.051554
4         mean_batter_runs_scored    0.051499
6                total_total_mean    0.048667


In [46]:
model, kwargs = tune_random_forest(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=50, max_features=None, min_samples_leaf=4, min_samples_split=5, n_estimators=500; total time=  11.9s
[CV] END max_depth=50, max_features=None, min_samples_leaf=4, min_samples_split=5, n_estimators=500; total time=  11.9s
[CV] END max_depth=50, max_features=None, min_samples_leaf=4, min_samples_split=5, n_estimators=500; total time=  12.0s
[CV] END max_depth=100, max_features=None, min_samples_leaf=2, min_samples_split=10, n_estimators=500; total time=  12.3s
[CV] END max_depth=100, max_features=None, min_samples_leaf=2, min_samples_split=10, n_estimators=500; total time=  12.4s
[CV] END max_depth=100, max_features=None, min_samples_leaf=2, min_samples_split=10, n_estimators=500; total time=  12.4s
[CV] END max_depth=100, max_features=None, min_samples_leaf=2, min_samples_split=10, n_estimators=500; total time=  12.5s
[CV] END max_depth=100, max_features=None, min_samples_leaf=2, min_samples_split=10, n_esti

In [47]:
model, analysis, feature_importance = do_random_forest(X_train, y_train, cv=5, random_state=42, **kwargs)

Cross-Validation Analysis:

R² Scores:
Training Average: 0.789 ± 0.005
CV Average: 0.484 ± 0.046
Gap (Train-CV): 0.305
CV Consistency: Stable

RMSE Scores:
Training Average: 1.278 ± 0.011
CV Average: 1.995 ± 0.066
Gap (CV-Train): 0.717

MAPE Scores:
Training Average: 185879094819999.875 ± 6600367833445.821
CV Average: 278097212303978.406 ± 34934462933705.086
Gap (CV-Train): 92218117483978.531

Top 10 Most Important Features:
                          Feature  Importance
7           mean_wickets_conceded    0.415649
3              mean_wickets_taken    0.131446
5     total_high_scoring_hit_mean    0.117771
6                total_total_mean    0.077023
4         mean_batter_runs_scored    0.072836
1       mean_extras_runs_conceded    0.069087
2  total_total_runs_conceded_mean    0.058419
0       mean_batter_runs_conceded    0.057769


Interestingly, the hyperparameter tuning does lead to more of an improvement here than while using all features. That being said, it still performs worse than said model (albeit marginally better than the baseline). I expect altering model complexity to again have very limited impact based on all of the results up to this point, hence I do not expect to gain anything from trying it.

In [48]:
r2, rmse, mape = test_final_model(model, X_test, y_test)

Final Model Performance:
R²: 0.515
RMSE: 1.908
MAPE: 200142408852437.250


As seen with the cross validation results, the model does not outperform the similar one with a full feature set, albeit it does marginally better than the baseline.

---
# Putting it all together

In [49]:
def calculate_results(data, run_model, wicket_model):
    X = data.drop(columns=['game_id', 'innings', 'final_runs', 'final_wickets'])
    y_wicket = data['final_wickets']
    y_run = data['final_runs']

    run_pred = run_model.predict(X)
    wicket_pred = wicket_model.predict(X)
    
    r2_run = r2_score(y_run, run_pred)
    rmse_run = np.sqrt(mean_squared_error(y_run, run_pred))
    mape_run = mean_absolute_percentage_error(y_run, run_pred)
    
    r2_wicket = r2_score(y_wicket, wicket_pred)
    rmse_wicket = np.sqrt(mean_squared_error(y_wicket, wicket_pred))
    mape_wicket = mean_absolute_percentage_error(y_wicket, wicket_pred)

    final_results = np.column_stack((run_pred, wicket_pred))

    print(f"R^2 Run: {r2_run:.3f}, RMSE Run: {rmse_run:.3f}, MAPE Run: {mape_run:.3f}")
    print(f"R^2 Wicket: {r2_wicket:.3f}, RMSE Wicket: {rmse_wicket:.3f}, MAPE Wicket: {mape_wicket:.3f}")
    
    return {
        'final_results': final_results,
        'r2_run': r2_run,
        'rmse_run': rmse_run,
        'mape_run': mape_run,
        'r2_wicket': r2_wicket,
        'rmse_wicket': rmse_wicket,
        'mape_wicket': mape_wicket
    }


In [50]:
test_data = pd.read_csv('../data/saved_data/match_test_data.csv')
run_model = load('../models/match_models/run_model.joblib')
wicket_model = load('../models/match_models/wicket_model.joblib')
results = calculate_results(test_data, run_model, wicket_model)
results['final_results']

R^2 Run: 0.452, RMSE Run: 34.097, MAPE Run: 0.407
R^2 Wicket: 0.550, RMSE Wicket: 1.962, MAPE Wicket: 423238373516824.688


array([[127.80650631,   2.41243519],
       [121.76272114,   7.18928956],
       [107.42967857,   7.26761917],
       ...,
       [102.97829969,   4.03808057],
       [ 84.90747863,   8.36710053],
       [101.88337698,   5.71592155]])