In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load the data
df = pd.read_csv('data/aggregated_df.csv')
df.columns

Index(['match_id', 'inning', 'bowling_team', 'batter', 'batter_total_balls',
       'batter_SR', 'batter_score', 'dismissal_kind', 'date', 'match_type',
       'venue', 'no_of_right_arm_pacers_faced', 'no_of_left_arm_pacers_faced',
       'no_of_right_arm_offspinners_faced', 'no_of_left_arm_offpacers_faced',
       'no_of_right_arm_legspinners_faced', 'no_of_left_arm_legspinners_faced',
       'no_of_left_arm_offspinners_faced', 'score_last_5', 'balls_last_5',
       'score_last_1', 'balls_last_1', 'score_last_2', 'balls_last_2',
       'score_last_3', 'balls_last_3', 'score_last_4', 'balls_last_4'],
      dtype='object')

In [2]:
df.drop(columns=['batter_total_balls', "batter_SR", "dismissal_kind", "match_id", "date"], inplace=True)
df.columns

Index(['inning', 'bowling_team', 'batter', 'batter_score', 'match_type',
       'venue', 'no_of_right_arm_pacers_faced', 'no_of_left_arm_pacers_faced',
       'no_of_right_arm_offspinners_faced', 'no_of_left_arm_offpacers_faced',
       'no_of_right_arm_legspinners_faced', 'no_of_left_arm_legspinners_faced',
       'no_of_left_arm_offspinners_faced', 'score_last_5', 'balls_last_5',
       'score_last_1', 'balls_last_1', 'score_last_2', 'balls_last_2',
       'score_last_3', 'balls_last_3', 'score_last_4', 'balls_last_4'],
      dtype='object')

In [3]:
df = df.reindex(columns=[col for col in df.columns if col != 'batter_score'] + ['batter_score'])
df.columns

Index(['inning', 'bowling_team', 'batter', 'match_type', 'venue',
       'no_of_right_arm_pacers_faced', 'no_of_left_arm_pacers_faced',
       'no_of_right_arm_offspinners_faced', 'no_of_left_arm_offpacers_faced',
       'no_of_right_arm_legspinners_faced', 'no_of_left_arm_legspinners_faced',
       'no_of_left_arm_offspinners_faced', 'score_last_5', 'balls_last_5',
       'score_last_1', 'balls_last_1', 'score_last_2', 'balls_last_2',
       'score_last_3', 'balls_last_3', 'score_last_4', 'balls_last_4',
       'batter_score'],
      dtype='object')

In [4]:
gdf = df[df['batter'] == 'G Gambhir']
gdf.drop(columns=['batter'], inplace=True)

gdf = gdf.fillna(-1).astype({"no_of_left_arm_offspinners_faced": 'int', "score_last_5": 'int', "balls_last_5": 'int', "score_last_1": 'int', "balls_last_1": 'int', "score_last_2": 'int', "balls_last_2": 'int', "score_last_3": 'int', "balls_last_3": 'int', "score_last_4": 'int', "balls_last_4": 'int'})

print(gdf.dtypes)

inning                                int64
bowling_team                         object
match_type                           object
venue                                object
no_of_right_arm_pacers_faced          int64
no_of_left_arm_pacers_faced           int64
no_of_right_arm_offspinners_faced     int64
no_of_left_arm_offpacers_faced        int64
no_of_right_arm_legspinners_faced     int64
no_of_left_arm_legspinners_faced      int64
no_of_left_arm_offspinners_faced      int64
score_last_5                          int64
balls_last_5                          int64
score_last_1                          int64
balls_last_1                          int64
score_last_2                          int64
balls_last_2                          int64
score_last_3                          int64
balls_last_3                          int64
score_last_4                          int64
balls_last_4                          int64
batter_score                          int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdf.drop(columns=['batter'], inplace=True)


In [5]:
X = gdf.iloc[:, :-1].values
y = gdf.iloc[:, -1].values

print(X)
print(y)

[[2 'Rajasthan Royals' 'League' ... -1 -1 -1]
 [2 'Deccan Chargers' 'League' ... -1 -1 -1]
 [1 'Kings XI Punjab' 'League' ... -1 -1 -1]
 ...
 [2 'Kolkata Knight Riders' 'League' ... 15 32 19]
 [1 'Royal Challengers Bangalore' 'League' ... 42 12 15]
 [2 'Kings XI Punjab' 'League' ... 16 55 42]]
[58 12 18 86 50  1 80 31 10 79 40 39 19 11 15  0 16  8 17 13 71 19 18 19
  8  8 27 47  0 72  9  1 43  1 47 26 17 57  4  1 29 75 35  3 48 18 45 35
  0 16  7 54  8  4 16  0 64 11 22 66 30 93 63 56 36  0 62 27 10 32  2 41
 22 59 53 60 25 26  8 14 12  0 50 14 12 10  0  0  0  1 45  6 54 69 63 14
  6 21  4 28  1 23 57 58 19 11 60  4 12  0 12 31 24 38  1 38 64 90 34 11
 59  6 37 54  5  0 51  8 16 28 76 19 72 15 14 33 14 62 71 11 24 14  8 21
 32 12 55 15  8  3  4]


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(), [1, 2, 3])], remainder = 'passthrough')
X = np.array(ct.fit_transform(X))

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


TypeError: Singleton array array(<151x72 sparse matrix of type '<class 'numpy.float64'>'
	with 2423 stored elements in Compressed Sparse Row format>, dtype=object) cannot be considered a valid collection.