In [61]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [66]:
print(f"pandas version: {pd.__version__}")
print(f"numpy version: {np.__version__}")
print(f"scikit-learn version: {sklearn.__version__}")

pandas version: 2.2.2
numpy version: 2.0.2
scikit-learn version: 1.6.1


In [31]:
ipl_df = pd.read_csv('ipl_data.csv')
print(f"Dataset successfully Imported. Shape: {ipl_df.shape}")

Dataset successfully Imported. Shape: (76014, 15)


In [32]:
print("\nFirst 5 Rows:")
print(ipl_df.head())


First 5 Rows:
   mid        date                  venue               bat_team  \
0    1  2008-04-18  M Chinnaswamy Stadium  Kolkata Knight Riders   
1    1  2008-04-18  M Chinnaswamy Stadium  Kolkata Knight Riders   
2    1  2008-04-18  M Chinnaswamy Stadium  Kolkata Knight Riders   
3    1  2008-04-18  M Chinnaswamy Stadium  Kolkata Knight Riders   
4    1  2008-04-18  M Chinnaswamy Stadium  Kolkata Knight Riders   

                     bowl_team      batsman   bowler  runs  wickets  overs  \
0  Royal Challengers Bangalore   SC Ganguly  P Kumar     1        0    0.1   
1  Royal Challengers Bangalore  BB McCullum  P Kumar     1        0    0.2   
2  Royal Challengers Bangalore  BB McCullum  P Kumar     2        0    0.2   
3  Royal Challengers Bangalore  BB McCullum  P Kumar     2        0    0.3   
4  Royal Challengers Bangalore  BB McCullum  P Kumar     2        0    0.4   

   runs_last_5  wickets_last_5  striker  non-striker  total  
0            1               0        0      

In [33]:
print("\nStatistical Description:")
print(ipl_df.describe())


Statistical Description:
                mid          runs       wickets         overs   runs_last_5  \
count  76014.000000  76014.000000  76014.000000  76014.000000  76014.000000   
mean     308.627740     74.889349      2.415844      9.783068     33.216434   
std      178.156878     48.823327      2.015207      5.772587     14.914174   
min        1.000000      0.000000      0.000000      0.000000      0.000000   
25%      154.000000     34.000000      1.000000      4.600000     24.000000   
50%      308.000000     70.000000      2.000000      9.600000     34.000000   
75%      463.000000    111.000000      4.000000     14.600000     43.000000   
max      617.000000    263.000000     10.000000     19.600000    113.000000   

       wickets_last_5       striker   non-striker         total  
count    76014.000000  76014.000000  76014.000000  76014.000000  
mean         1.120307     24.962283      8.869287    160.901452  
std          1.053343     20.079752     10.795742     29.246231 

In [34]:
print("\nNumber of Unique Values in Each Column:")
print(ipl_df.nunique())


Number of Unique Values in Each Column:
mid               617
date              442
venue              35
bat_team           14
bowl_team          14
batsman           411
bowler            329
runs              252
wickets            11
overs             140
runs_last_5       102
wickets_last_5      8
striker           155
non-striker        88
total             138
dtype: int64


In [35]:
print("\nDataset Info:")
ipl_df.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76014 entries, 0 to 76013
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mid             76014 non-null  int64  
 1   date            76014 non-null  object 
 2   venue           76014 non-null  object 
 3   bat_team        76014 non-null  object 
 4   bowl_team       76014 non-null  object 
 5   batsman         76014 non-null  object 
 6   bowler          76014 non-null  object 
 7   runs            76014 non-null  int64  
 8   wickets         76014 non-null  int64  
 9   overs           76014 non-null  float64
 10  runs_last_5     76014 non-null  int64  
 11  wickets_last_5  76014 non-null  int64  
 12  striker         76014 non-null  int64  
 13  non-striker     76014 non-null  int64  
 14  total           76014 non-null  int64  
dtypes: float64(1), int64(8), object(6)
memory usage: 8.7+ MB


In [36]:
print("\nData Types of All Columns:")
print(ipl_df.dtypes)


Data Types of All Columns:
mid                 int64
date               object
venue              object
bat_team           object
bowl_team          object
batsman            object
bowler             object
runs                int64
wickets             int64
overs             float64
runs_last_5         int64
wickets_last_5      int64
striker             int64
non-striker         int64
total               int64
dtype: object


In [37]:
print("\nColumn Names in the Dataset:")
print(ipl_df.columns)


Column Names in the Dataset:
Index(['mid', 'date', 'venue', 'bat_team', 'bowl_team', 'batsman', 'bowler',
       'runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5', 'striker',
       'non-striker', 'total'],
      dtype='object')


In [38]:
# Drop irrelevant columns
irrelevant = ['mid', 'date', 'venue', 'batsman', 'bowler', 'striker', 'non-striker']
print(f'Before Removing Irrelevant Columns : {ipl_df.shape}')
ipl_df = ipl_df.drop(irrelevant, axis=1)
print(f'After Removing Irrelevant Columns : {ipl_df.shape}')

Before Removing Irrelevant Columns : (76014, 15)
After Removing Irrelevant Columns : (76014, 8)


In [39]:
const_teams = ['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
               'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
               'Delhi Daredevils', 'Sunrisers Hyderabad']

In [40]:
print("Unique Batting Teams:", ipl_df['bat_team'].unique())
print(f'Before Removing Inconsistent Teams : {ipl_df.shape}')
ipl_df = ipl_df[(ipl_df['bat_team'].isin(const_teams)) & (ipl_df['bowl_team'].isin(const_teams))]
print(f'After Removing Inconsistent Teams : {ipl_df.shape}')

Unique Batting Teams: ['Kolkata Knight Riders' 'Chennai Super Kings' 'Rajasthan Royals'
 'Mumbai Indians' 'Deccan Chargers' 'Kings XI Punjab'
 'Royal Challengers Bangalore' 'Delhi Daredevils' 'Kochi Tuskers Kerala'
 'Pune Warriors' 'Sunrisers Hyderabad' 'Rising Pune Supergiants'
 'Gujarat Lions' 'Rising Pune Supergiant']
Before Removing Inconsistent Teams : (76014, 8)
After Removing Inconsistent Teams : (53811, 8)


In [41]:
print(f'Before Removing Overs < 5 : {ipl_df.shape}')
ipl_df = ipl_df[ipl_df['overs'] >= 5.0]
print(f'After Removing Overs < 5 : {ipl_df.shape}')

Before Removing Overs < 5 : (53811, 8)
After Removing Overs < 5 : (40108, 8)


In [42]:
categorical_features = ['bat_team', 'bowl_team']

In [43]:
columnTransformer = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(sparse_output=False), categorical_features)
    ],
    remainder='passthrough'
)

In [44]:
ipl_encoded = columnTransformer.fit_transform(ipl_df)

In [45]:
encoded_columns = columnTransformer.named_transformers_['encoder'].get_feature_names_out(categorical_features)
numeric_cols = [col for col in ipl_df.columns if col not in categorical_features]
final_columns = list(encoded_columns) + numeric_cols

In [46]:
df = pd.DataFrame(ipl_encoded, columns=final_columns)

In [47]:
print("\nEncoded DataFrame Head:")
print(df.head())


Encoded DataFrame Head:
   bat_team_Chennai Super Kings  bat_team_Delhi Daredevils  \
0                           0.0                        0.0   
1                           0.0                        0.0   
2                           0.0                        0.0   
3                           0.0                        0.0   
4                           0.0                        0.0   

   bat_team_Kings XI Punjab  bat_team_Kolkata Knight Riders  \
0                       0.0                             1.0   
1                       0.0                             1.0   
2                       0.0                             1.0   
3                       0.0                             1.0   
4                       0.0                             1.0   

   bat_team_Mumbai Indians  bat_team_Rajasthan Royals  \
0                      0.0                        0.0   
1                      0.0                        0.0   
2                      0.0                        0.

In [48]:
features = df.drop(['total'], axis=1)
labels = df['total']

In [49]:
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels,
    test_size=0.20,
    shuffle=True
)

In [50]:
print(f"\nTraining Set : {train_features.shape}\nTesting Set : {test_features.shape}")


Training Set : (32086, 21)
Testing Set : (8022, 21)


# **RANDOM FOREST CLASSIFIER**

In [51]:
models = dict()

In [52]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse
import numpy as np

In [53]:
forest = RandomForestRegressor()
forest.fit(train_features, train_labels)

In [54]:
train_score_forest = forest.score(train_features, train_labels) * 100
test_score_forest = forest.score(test_features, test_labels) * 100


In [55]:
print(f'Train Score : {train_score_forest:.2f}%')
print(f'Test Score  : {test_score_forest:.2f}%')

Train Score : 99.09%
Test Score  : 93.24%


In [56]:
y_pred = forest.predict(test_features)

In [57]:
print("\nModel Evaluation Metrics:")
print("Mean Absolute Error (MAE): {:.2f}".format(mae(test_labels, y_pred)))
print("Mean Squared Error (MSE): {:.2f}".format(mse(test_labels, y_pred)))
print("Root Mean Squared Error (RMSE): {:.2f}".format(np.sqrt(mse(test_labels, y_pred))))


Model Evaluation Metrics:
Mean Absolute Error (MAE): 4.48
Mean Squared Error (MSE): 60.15
Root Mean Squared Error (RMSE): 7.76


# **SAVING THE MODEL**

In [58]:
pip install joblib



In [59]:
import joblib

joblib.dump(forest, 'random_forest_model.joblib')
print("Model saved successfully!")

Model saved successfully!


# **TESTING DATA**

In [65]:
import joblib
import pandas as pd
import numpy as np

model_filename = '/content/random_forest_model.joblib'
forest = joblib.load(model_filename)
print("Model loaded successfully!")

const_teams = ['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
               'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
               'Delhi Daredevils', 'Sunrisers Hyderabad']

batting_team = 'Rajasthan Royals'
bowling_team = 'Mumbai Indians'
overs = 5.1
runs = 50
wickets = 2
runs_last_5 = 20
wickets_last_5 = 1

prediction_array = []

for team in const_teams:
    prediction_array.append(1 if team == batting_team else 0)
for team in const_teams:
    prediction_array.append(1 if team == bowling_team else 0)

prediction_array += [runs, wickets, overs, runs_last_5, wickets_last_5]
prediction_array = np.array([prediction_array])

predicted_score = int(round(forest.predict(prediction_array)[0]))

print(f"Predicted First Innings Score: {predicted_score - 5} to {predicted_score + 5}")


Model loaded successfully!
Predicted First Innings Score: 147 to 157




In [68]:
import joblib
import pandas as pd
import numpy as np

model_filename = '/content/random_forest_model.joblib'
forest = joblib.load(model_filename)
print("Model loaded successfully!")

const_teams = ['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
               'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
               'Delhi Daredevils', 'Sunrisers Hyderabad']

batting_team = 'Sunrisers Hyderabad'
bowling_team = 'Chennai Super Kings'
overs = 5.1
runs = 50
wickets = 2
runs_last_5 = 20
wickets_last_5 = 1

prediction_array = []

for team in const_teams:
    prediction_array.append(1 if team == batting_team else 0)
for team in const_teams:
    prediction_array.append(1 if team == bowling_team else 0)

prediction_array += [runs, wickets, overs, runs_last_5, wickets_last_5]
prediction_array = np.array([prediction_array])

predicted_score = int(round(forest.predict(prediction_array)[0]))

print(f"Predicted First Innings Score: {predicted_score - 5} to {predicted_score + 5}")


Model loaded successfully!
Predicted First Innings Score: 156 to 166


