In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load the data
df = pd.read_csv('data/aggregated_df.csv')
df.columns

Index(['match_id', 'inning', 'bowling_team', 'batter', 'batter_total_balls',
       'batter_SR', 'batter_score', 'dismissal_kind', 'date', 'match_type',
       'venue', 'no_of_right_arm_pacers_faced', 'no_of_left_arm_pacers_faced',
       'no_of_right_arm_offspinners_faced', 'no_of_left_arm_offpacers_faced',
       'no_of_right_arm_legspinners_faced', 'no_of_left_arm_legspinners_faced',
       'no_of_left_arm_offspinners_faced', 'score_last_5', 'balls_last_5',
       'score_last_1', 'balls_last_1', 'score_last_2', 'balls_last_2',
       'score_last_3', 'balls_last_3', 'score_last_4', 'balls_last_4'],
      dtype='object')

In [2]:
df.drop(columns=['batter_total_balls', "batter_SR", "dismissal_kind", "match_id", "date"], inplace=True)

Index(['inning', 'bowling_team', 'batter', 'batter_score', 'match_type',
       'venue', 'no_of_right_arm_pacers_faced', 'no_of_left_arm_pacers_faced',
       'no_of_right_arm_offspinners_faced', 'no_of_left_arm_offpacers_faced',
       'no_of_right_arm_legspinners_faced', 'no_of_left_arm_legspinners_faced',
       'no_of_left_arm_offspinners_faced', 'score_last_5', 'balls_last_5',
       'score_last_1', 'balls_last_1', 'score_last_2', 'balls_last_2',
       'score_last_3', 'balls_last_3', 'score_last_4', 'balls_last_4'],
      dtype='object')

In [3]:
df = df.reindex(columns=[col for col in df.columns if col != 'batter_score'] + ['batter_score'])
df.columns

Index(['inning', 'bowling_team', 'batter', 'match_type', 'venue',
       'no_of_right_arm_pacers_faced', 'no_of_left_arm_pacers_faced',
       'no_of_right_arm_offspinners_faced', 'no_of_left_arm_offpacers_faced',
       'no_of_right_arm_legspinners_faced', 'no_of_left_arm_legspinners_faced',
       'no_of_left_arm_offspinners_faced', 'score_last_5', 'balls_last_5',
       'score_last_1', 'balls_last_1', 'score_last_2', 'balls_last_2',
       'score_last_3', 'balls_last_3', 'score_last_4', 'balls_last_4',
       'batter_score'],
      dtype='object')

In [4]:
gdf = df[df['batter'] == 'G Gambhir']
gdf.drop(columns=['batter'], inplace=True)

gdf = gdf.fillna(-1).astype({"no_of_left_arm_offspinners_faced": 'int', "score_last_5": 'int', "balls_last_5": 'int', "score_last_1": 'int', "balls_last_1": 'int', "score_last_2": 'int', "balls_last_2": 'int', "score_last_3": 'int', "balls_last_3": 'int', "score_last_4": 'int', "balls_last_4": 'int'})

print(gdf.dtypes)

inning                                int64
bowling_team                         object
match_type                           object
venue                                object
no_of_right_arm_pacers_faced          int64
no_of_left_arm_pacers_faced           int64
no_of_right_arm_offspinners_faced     int64
no_of_left_arm_offpacers_faced        int64
no_of_right_arm_legspinners_faced     int64
no_of_left_arm_legspinners_faced      int64
no_of_left_arm_offspinners_faced      int64
score_last_5                          int64
balls_last_5                          int64
score_last_1                          int64
balls_last_1                          int64
score_last_2                          int64
balls_last_2                          int64
score_last_3                          int64
balls_last_3                          int64
score_last_4                          int64
balls_last_4                          int64
batter_score                          int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdf.drop(columns=['batter'], inplace=True)


In [5]:
X = gdf.iloc[:, :-1].values
y = gdf.iloc[:, -1].values

type(X[:][1])
# print(y)

for i in range(0, len(X)):
    if type(X[i][3]) != str:
        print(i)

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(), [1,2,3])], remainder = 'passthrough')
X = np.array(ct.fit_transform(X))

In [12]:
print(X)

[[1.0 0.0 1.0 ... -1 -1 -1]
 [0.0 1.0 1.0 ... -1 -1 -1]
 [1.0 0.0 1.0 ... -1 -1 -1]
 ...
 [1.0 0.0 1.0 ... 15 32 19]
 [1.0 0.0 1.0 ... 42 12 15]
 [1.0 0.0 1.0 ... 16 55 42]]


In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

print(X_train[0])


# #we don't have to check for dummy variables and which method to choose like backward elimination because the class does that for us
# from sklearn.linear_model import LinearRegression
# regressor = LinearRegression()
# regressor.fit(X_train, y_train)
# #regressor.coef_ gives the coefficients
# #regressor.intercept ....

# y_pred = regressor.predict(X_test)
# #to make a single prediction just do regressor.predict([[1, 0, 0, 160000, 130000, 300000]])
# np.set_printoptions(precision=2)
# print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 2
 'League' 'JSCA International Stadium Complex' 2 1 0 0 0 0 0 14 8 14 15 50
 44 0 2 12 14]
