#### "Learning Data Mining with Python - Chapter 3"
This Jupyter notebook contains an implementation of a data mining algorithm
to predict the winner of a basketball game.
#### Results :
#Accuracy can be improved by adding new features, using different classes.
#Increased number of features may not be result in better accuracy, model could not learn from those many features(OneHotEncoding Class).

In [38]:
# Necessary packages
import pandas as pd
import numpy as np
from datetime import datetime

In [40]:
#Cleaning up the data set
#The date column includes months' names at some indexes rather than a date obj.
df_original = pd.read_csv(r'NBA_games_2014.csv') 
df_original.columns = ["Date","Time" ,"ScoreType", "VisitorTeam", "VisitorPts", "HomeTeam", "HomePts", "OverTime", "Notes"]
df_original.describe()

Unnamed: 0,Date,Time,ScoreType,VisitorTeam,VisitorPts,HomeTeam,HomePts,OverTime,Notes
count,1235,1235,1235,1235,1235,1235,1235,84,6
unique,168,24,6,35,77,35,70,8,6
top,Wed Apr 16 2014,8:00 pm,Box Score,Chicago Bulls,103,Chicago Bulls,102,OT,at London England
freq,15,269,1230,41,50,41,50,65,1


In [41]:
months = ['October', 'November', 'December', 'January', 'February', 
          'March', 'May', 'April']
row_deleted = []
for month in months:
    a = df_original[df_original.loc[:, 'Date'] == month]
    if not a.empty:
        row_deleted.append(a.index[0])
print("The rows will be DELETED: {0}".format(row_deleted))

The rows will be DELETED: [247, 467, 696, 874, 1112]


In [5]:
df_new = df_original.drop(row_deleted, axis = 0)
df_new.to_csv('new_NBA_games_2014.csv', index=False, header=True)

In [6]:
# Load the clean data
df = pd.read_csv(r"new_NBA_games_2014.csv", parse_dates =["Date"])

In [7]:
df['HomeWin'] = df['HomePts'] > df['VisitorPts']
df.drop(["Notes","OverTime","ScoreType","Time"], axis=1).head()

Unnamed: 0,Date,VisitorTeam,VisitorPts,HomeTeam,HomePts,HomeWin
0,2013-10-29,Orlando Magic,87,Indiana Pacers,97,True
1,2013-10-29,Los Angeles Clippers,103,Los Angeles Lakers,116,True
2,2013-10-29,Chicago Bulls,95,Miami Heat,107,True
3,2013-10-30,Brooklyn Nets,94,Cleveland Cavaliers,98,True
4,2013-10-30,Atlanta Hawks,109,Dallas Mavericks,118,True


In [8]:
# Calculate Win rate for HomeTeam.
n_home_winners = df[df['HomeWin'] == True]['HomeWin'].count()
n_teams = df['HomeWin'].count()
print(f"HomeTeams win at a rate: {round(100*n_home_winners/n_teams,2)}%")

HomeTeams win at a rate: 58.05%


In [9]:
# Add a new column to show Last win of HomeTeam and VisitorTeam.
from collections import defaultdict
won_last = defaultdict(int)

df['HomeLastWin'] = False
df['VisitorLastWin'] = False

for index, row in df.iterrows():
    home_team = row["HomeTeam"]
    visitor_team = row["VisitorTeam"]
    row["HomeLastWin"] = won_last[home_team]
    row["VisitorLastWin"] = won_last[visitor_team]
    df.loc[index] = row
    won_last[home_team] = row["HomeWin"]
    won_last[visitor_team] = not row["HomeWin"]    

In [10]:
# Targed data to be predicted.
y_true = df['HomeWin'].values

In [43]:
# DecisionTree estimation of the accuracy
# Accuracy of 'HomeTeam wins' assumption based on the LastWin features.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
clf_tree = DecisionTreeClassifier(random_state = 14)
X_previouswins = df[["HomeLastWin", "VisitorLastWin"]].values
scores_tree = cross_val_score(clf_tree, X_previouswins, y_true, scoring='accuracy', cv = 10)
print(f"Accuracy : {round(100*scores_tree.mean(),2)}%")

Accuracy : 59.2%


In [12]:
# new feature: standings of the previous year.
standings = pd.read_csv(r"NBA_standings_2013.csv",  header=1)
standings.head()

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,Post,≤3,≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
0,1,Miami Heat,66-16,37-4,29-12,41-11,25-5,14-4,12-6,15-1,...,30-2,9-3,39-8,1-0,10-3,10-5,8-5,12-1,17-1,8-1
1,2,Oklahoma City Thunder,60-22,34-7,26-15,21-9,39-13,7-3,8-2,6-4,...,21-8,3-6,44-6,,13-4,11-2,11-5,7-4,12-5,6-2
2,3,San Antonio Spurs,58-24,35-6,23-18,25-5,33-19,8-2,9-1,8-2,...,16-12,9-5,31-10,1-0,12-4,12-4,12-3,8-3,10-4,3-6
3,4,Denver Nuggets,57-25,38-3,19-22,19-11,38-14,5-5,10-0,4-6,...,24-4,11-7,28-8,0-1,8-8,9-6,12-3,8-4,13-2,7-1
4,5,Los Angeles Clippers,56-26,32-9,24-17,21-9,35-17,7-3,8-2,6-4,...,17-9,3-5,38-12,1-0,8-6,16-0,9-7,8-5,7-7,7-1


In [44]:
# New feature: whether HomeTeam Ranks higher than VisitorTeam in the prev. year.
df["HomeRnkHigher"] = False
for index, row in df.iterrows():
    home_team = row["HomeTeam"]
    visitor_team = row["VisitorTeam"]
    
    if home_team == "New Orleans Pelicans": 
        home_team = "New Orleans Hornets"
    elif visitor_team == "New Orleans Pelicans":
        visitor_team = "New Orleans Hornets"
    
    home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
    visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
    #print("HomeTeamRank: {0}, VisitorTeamRank: {1}".format(home_rank, visitor_rank))
    df["HomeRnkHigher"].values[index] = home_rank < visitor_rank

In [46]:
# Same DecisionTree class with a new feature
X_homehigher = df[["HomeLastWin", "VisitorLastWin", "HomeRnkHigher"]].values
scores_tree_2 = cross_val_score(clf_tree, X_homehigher, y_true, scoring='accuracy', cv=5)
print("Accuracy of the HomeTeam Wins assumption Based on 2013 standings: ", end='')
print(f"{round(100*scores_tree_2.mean(),2)}%")

Accuracy of the HomeTeam Wins assumption Based on 2013 standings: 60.4%


In [15]:
# New feature: Who won the last match between the two teams ?
last_winner = defaultdict(int)
df["HomeTeamWonLast"] = 0

for index, row in df.iterrows():
    home_team = row["HomeTeam"]
    visitor_team = row["VisitorTeam"]
    teams = tuple(sorted([visitor_team, home_team]))
    
    if last_winner[teams] == row["HomeTeam"]:
        row["HomeTeamWonLast"] = 1
    else:
        row["HomeTeamWonLast"] = 0
    df.loc[index] = row
    if row["HomeWin"]:
        winner = row["HomeTeam"]
    else:
        winner = row["VisitorTeam"]
    last_winner[teams] = winner


In [47]:
# Estimation of the accuracy with the new feature.
X_homewonlast = df[["HomeRnkHigher", "HomeTeamWonLast"]].values
scores_3 = cross_val_score(clf_tree, X_homewonlast, y_true, scoring='accuracy', cv=5)
scores_3 = round(scores_3.mean(),2)
print(f"Accuracy Based on the winner of the last match between two teams: {100*scores_3}%")

Accuracy Based on the winner of the last match between two teams: 61.0%


In [50]:
# Estimation using different class of sklearn, OneHotEncoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

encoding = LabelEncoder()
encoding.fit(df["HomeTeam"].values)
home_teams = encoding.transform(df["HomeTeam"].values)
visitor_teams = encoding.transform(df["VisitorTeam"].values)
X_teams = np.vstack([home_teams,visitor_teams]).T

onehot = OneHotEncoder()
X_teams_expanded = onehot.fit_transform(X_teams).todense()

scores_encoder = cross_val_score(clf_tree, X_teams_expanded, y_true, scoring='accuracy', cv=5)
scores_encoder = round(scores_encoder.mean(),3)
print(f"Accuracy: {100*scores_encoder}%")

Accuracy: 60.4%


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [51]:
# RandomForests estimation
from sklearn.ensemble import RandomForestClassifier
clf_forest = RandomForestClassifier(random_state = 14)
scores_forest = cross_val_score(clf_forest, X_teams_expanded, y_true, scoring='accuracy', cv=5)
print("Accuracy Using RandomForest Algorithm: {0:.1f}%".format(100*scores_forest.mean()))

Accuracy Using RandomForest Algorithm: 61.0%




In [31]:
X_all = np.hstack([X_homehigher, X_teams_expanded])
scores_forest_2 = cross_val_score(clf_forest, X_all, y_true, scoring='accuracy', cv=5)
print("Accuracy with new feature added: {0:.1f}%".format(100*scores_forest_2.mean()))

Accuracy with new feature added: 61.3%




In [52]:
# Accuracy estimation using GridSearchCV class
from sklearn.model_selection import GridSearchCV
parameter_space = {
    "max_features": [2, 10, 'auto'],
    "n_estimators": [100,],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6]
    
}
grid = GridSearchCV(clf_forest, parameter_space)
grid.fit(X_all, y_true)
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))



Accuracy: 64.7%
