# Predict the first innings score in an ODI


1. Load the dataset from the csv file.
2. Use “groupby” operation, to find the average number of runs, scored by each country,
and represent it on a bar graph.
3. Handle Missing values:
a. If there are null values in continuous numerical column, replace the null values by
the mean of that column
b. If there are null values in ordinal numerical column, replace the null values by the
mode of that column
c. If there are null values in categorical column, replace the null values by the mode
of that column
d. If more than 50% of the values in a column are null, then drop that entire column
4. Remove the columns, that you think, do not contribute to the total score, in the first
innings.
5. Convert the categorical columns (if any), to numeric, using one hot encoding/ dummy
encoding.
6. Pick “total” column, as the target variable
7. Select the relevant features.
8. Perform train-test-split
9. Perform Feature scaling
10. Use
a. Use Linear Regression
b. Use Decision Tree Regression
c. Use Random Forest Regression
11. Evaluate the model
12. Apply prediction

In [1]:
#importing all the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Loading the dataset from csv file

In [2]:
# Load the data into a DataFrame. The path is relative, so 'odi.csv' must sit in
# the current working directory (the recorded run failed with FileNotFoundError).
dataset=pd.read_csv('odi.csv')

FileNotFoundError: [Errno 2] File odi.csv does not exist: 'odi.csv'

In [None]:
# Preview the first 100 rows of the loaded data.
dataset.head(100)

In [None]:
# Inspect the dtype of every column to tell numeric columns apart from
# text (object) columns.
dataset.dtypes

Use the “groupby” operation to find the average number of runs scored by each country, and represent it on a bar graph.

In [None]:
# Average 'runs' per batting team, drawn as a bar chart.
# The 'runs' column is selected *before* aggregating so the mean is computed on
# that column only; averaging the whole frame first raises a TypeError on the
# text columns with pandas >= 2.0 and wastes work on older versions.
dataset.groupby('bat_team')['runs'].mean().plot(kind="bar")

In [None]:
# (number of rows, number of columns) of the loaded data.
dataset.shape

3.Checking the null values in the dataset

In [None]:
# Count the missing values in each column.
dataset.isnull().sum()

**Analysis**<br>
This shows us that there are no null values in any column.<br>

Convert the categorical columns (if any), to numeric, using one hot encoding/ dummy encoding.

In [None]:
# One-hot encode the categorical columns: each listed column is replaced by one
# indicator column per distinct value.
# NOTE(review): the modelling cells further down read from `dataset`, not from
# `encoded_df` -- confirm whether this frame is meant to be used.
encoded_df = pd.get_dummies(
    dataset,
    columns=['bat_team', 'bowl_team', 'batsman', 'bowler', 'date', 'venue'],
)
encoded_df.columns

In [None]:
# Preview the first rows of the one-hot encoded frame.
encoded_df.head()

Perform train-test-split

In [None]:
def custom_accuracy(y_test,y_pred,thresold):
    """Return the percentage of predictions within ``thresold`` of the true value.

    Args:
        y_test: Indexable sequence of true target values.
        y_pred: Indexable sequence of predictions, aligned with ``y_test`` by
            position. Its length decides how many pairs are compared.
        thresold: Largest absolute error that still counts as a correct
            prediction (the comparison is inclusive).

    Returns:
        A float between 0 and 100. An empty ``y_pred`` yields 0.0.
    """
    total = len(y_pred)
    if total == 0:
        # Nothing was predicted: report 0% instead of dividing by zero.
        return 0.0
    right = sum(
        1 for i in range(total) if abs(y_pred[i] - y_test[i]) <= thresold
    )
    return (right / total) * 100
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features are the columns at positions 7, 8, 9, 12 and 13; the target is the
# column at position 14.
# NOTE(review): positional indices depend on the CSV column order -- confirm
# against dataset.columns that position 14 is the 'total' column.
X = dataset.iloc[:, [7, 8, 9, 12, 13]].values
y = dataset.iloc[:, 14].values

# Hold out 25% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Standardise the features. The scaler is fitted on the training rows only and
# then applied to both splits, so no test-set statistics leak into training.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)




# Linear Regression

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Fit a linear model on the scaled training features.
lin = LinearRegression().fit(X_train, y_train)

# Score the model on the held-out rows: R^2 expressed as a percentage, plus the
# share of predictions that land within 20 of the actual value.
y_pred = lin.predict(X_test)
score = 100 * lin.score(X_test, y_test)
print("R square value:" , score)
print("Custom accuracy:" , custom_accuracy(y_test, y_pred, 20))

# Predict for one hand-written sample. It goes through the same scaler as the
# training data because the model was fitted on scaled features.
new_prediction = lin.predict(
    sc.transform(np.array([[100, 0, 13, 50, 50]]))
)
print("Prediction score:" , new_prediction)


# Random forest regressor

In [None]:
def custom_accuracy(y_test,y_pred,thresold):
    """Return the percentage of predictions within ``thresold`` of the true value.

    Args:
        y_test: Indexable sequence of true target values.
        y_pred: Indexable sequence of predictions, aligned with ``y_test`` by
            position. Its length decides how many pairs are compared.
        thresold: Largest absolute error that still counts as a correct
            prediction (the comparison is inclusive).

    Returns:
        A float between 0 and 100. An empty ``y_pred`` yields 0.0.
    """
    total = len(y_pred)
    if total == 0:
        # Nothing was predicted: report 0% instead of dividing by zero.
        return 0.0
    right = sum(
        1 for i in range(total) if abs(y_pred[i] - y_test[i]) <= thresold
    )
    return (right / total) * 100


In [None]:

# Training the dataset
from sklearn.ensemble import RandomForestRegressor
# 100 trees, each split considering every feature (max_features=None).
# random_state is fixed so the fitted forest, and therefore the scores printed
# below, are the same on every run -- matching the seeded split and the seeded
# decision tree used elsewhere in this notebook.
reg = RandomForestRegressor(n_estimators=100,max_features=None,random_state=42)
reg.fit(X_train,y_train)

# Testing the dataset on trained model: R^2 as a percentage, plus the share of
# predictions that land within 20 of the actual value.
y_pred = reg.predict(X_test)
score = reg.score(X_test,y_test)*100
print("R square value:" , score)
print("Custom accuracy:" , custom_accuracy(y_test,y_pred,20))

# Testing with a custom input; it is passed through the fitted scaler because
# the model was trained on scaled features.
import numpy as np
new_prediction = reg.predict(sc.transform(np.array([[100,0,13,50,50]])))
print("Prediction score:" , new_prediction)



# Decision Tree Regression

In [None]:
# Decision Tree Regression Model
from sklearn import tree
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Tree depth is capped at 8 levels; the seed makes the fitted tree repeatable.
D_T = tree.DecisionTreeRegressor(max_depth=8,random_state=42)
D_T.fit(X_train, y_train)
y_pred_dt = D_T.predict(X_test)
# score() returns R^2; printing it for both splits shows how far the training
# fit is ahead of the test fit.
print('The Score on the training set with a decision tree regressor is:',D_T.score(X_train,y_train))
print('The Score on the test set with a decision tree regressor is:',D_T.score(X_test,y_test))

# The Root mean squared error: the square root of the MSE, so it is in the same
# units as the target. The label says "Root" because that is what is printed.
print("Root mean squared error: %.2f"% np.sqrt(mean_squared_error(y_test, y_pred_dt)))

# The R^2 score (same quantity as the test-set score printed above)
print("The r2_score is: ", r2_score(y_test, y_pred_dt))

We can compare all the models by checking their R2 score.<br>
Condition: <br>
* If R2 score is near to 1 -> Best model<br>
* If R2 score is near to 0 -> Worst model<br>

**Models**<br>
Linear Regression :-     0.37<br>
Decision Tree Regressor:-      0.80<br>
K Nearest Neighbours(KNN):- 0.73<br>
Random Forest Regressor:- 0.90<br>

So from observing the "R2 score" we conclude that *RANDOM FOREST REGRESSOR* is the best model for predicting the first-innings total score in an ODI.

In [None]:

# Slot order of the one-hot team encoding; shared by the batting-team and
# bowling-team parts of the feature vector.
_TEAM_ORDER = (
    'Chennai Super Kings',
    'Delhi Daredevils',
    'Kings XI Punjab',
    'Kolkata Knight Riders',
    'Mumbai Indians',
    'Rajasthan Royals',
    'Royal Challengers Bangalore',
    'Sunrisers Hyderabad',
)


def _one_hot_team(team):
    """Return the 8-slot one-hot list for ``team``, or ``[]`` if it is not in ``_TEAM_ORDER``."""
    if team not in _TEAM_ORDER:
        # Unknown teams contribute no columns at all (not a row of zeros).
        return []
    return [1 if name == team else 0 for name in _TEAM_ORDER]


def predict_score(bat_team='Chennai Super Kings', bowl_team='Mumbai Indians', overs=5.1, runs=50, wickets=0, runs_last_5=50, wickets_last_5=0):
    """Predict a score with the module-level ``lin`` model.

    The feature vector is built in this order: one-hot batting team, one-hot
    bowling team, then ``overs, runs, wickets, runs_last_5, wickets_last_5``.
    A team name that is not in ``_TEAM_ORDER`` adds no columns, so the vector
    is 21 values long when both teams are known and 5 when neither is.

    NOTE(review): every known team name is a club side, while this notebook is
    about ODIs and the call below passes 'England' / 'Ireland' -- confirm the
    team list is the intended one.
    NOTE(review): the vector is handed to ``lin`` unscaled, whereas the other
    cells query ``lin`` through ``sc.transform`` -- confirm the feature order
    and scaling match what ``lin`` was trained on.

    Args:
        bat_team: Name of the batting team.
        bowl_team: Name of the bowling team.
        overs: Overs value placed in the feature vector.
        runs: Runs value placed in the feature vector.
        wickets: Wickets value placed in the feature vector.
        runs_last_5: Runs-in-previous-5 value placed in the feature vector.
        wickets_last_5: Wickets-in-previous-5 value placed in the feature vector.

    Returns:
        The model's prediction for the single row, truncated to ``int``.
    """
    features = (
        _one_hot_team(bat_team)
        + _one_hot_team(bowl_team)
        + [overs, runs, wickets, runs_last_5, wickets_last_5]
    )

    # The model expects a 2-D array: one row per sample.
    return int(lin.predict(np.array([features]))[0])

In [None]:
# Predict for overs=0.1 with no runs or wickets recorded yet.
final_score = predict_score(
    bat_team='England',
    bowl_team='Ireland',
    overs=0.1,
    runs=0,
    wickets=0,
    runs_last_5=0,
    wickets_last_5=0,
)
# The reported band is asymmetric: 10 below to 5 above the point prediction.
print("The final predicted score (range): {} to {}".format(final_score - 10, final_score + 5))