# About this notebook 

In [1]:
#importing dependencies
import pandas as pd
import numpy as np

#basketball reference dependencies
from basketball_reference_scraper.box_scores import get_box_scores
from basketball_reference_scraper.seasons import get_schedule, get_standings

#machine learning portion dependencies
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
#mapping all nba teams to abbreviations
team_to_abbreviation = {
    "ATLANTA HAWKS" : "ATL",
    "ST. LOUIS HAWKS" : "SLH",
    "MILWAUKEE HAWKS" : "MIL",
    "TRI-CITIES BLACKHAWKS" : "TCB",
    "BOSTON CELTICS" : "BOS",
    "BROOKLYN NETS" : "BRK",
    "NEW JERSEY NETS" : "NJN",
    "CHICAGO BULLS" : "CHI",
    "CHARLOTTE HORNETS": "CHH",
    "CHARLOTTE HORNETS": "CHO",
    "CHARLOTTE BOBCATS" : "CHA",
    "CLEVELAND CAVALIERS": "CLE",
    "DALLAS MAVERICKS" : "DAL",
    "DENVER NUGGETS" : "DEN",
    "DETROIT PISTONS" : "DET",
    "FORT WAYNE PISTONS" : "FWP",
    "GOLDEN STATE WARRIORS" : "GSW",
    "SAN FRANCISCO WARRIORS" : "SFW",
    "PHILADELPHIA WARRIORS" : "PHI",
    "HOUSTON ROCKETS" : "HOU",
    "INDIANA PACERS" : "IND",
    "LOS ANGELES CLIPPERS" : "LAC",
    "SAN DIEGO CLIPPERS" : "SDC",
    "BUFFALO BRAVES" : "BUF",
    "LOS ANGELES LAKERS" : "LAL",
    "MINNEAPOLIS LAKERS" : "MIN",
    "MEMPHIS GRIZZLIES" : "MEM",
    "VANCOUVER GRIZZLIES" : 'VAN',
    "MIAMI HEAT" : "MIA",
    "MILWAUKEE BUCKS" : "MIL",
    "MINNESOTA TIMBERWOLVES" : "MIN",
    "NEW ORLEANS PELICANS" : "NOP",
    "NEW ORLEANS HORNETS" : "NOK",
    "NEW ORLEANS HORNETS" : "NOH",
    "NEW YORK KNICKS" : "NYK",
    "OKLAHOMA CITY HORNETS" : "NOK",
    "OKLAHOMA CITY THUNDER" : "OKC",
    "SEATTLE SUPERSONICS" : "SEA",
    "ORLANDO MAGIC" : "ORL",
    "PHILADELPHIA 76ERS" : "PHI",
    "SYRACUSE NATIONALS" : "SYR",
    "PHOENIX SUNS" : "PHO",
    "PORTLAND TRAIL BLAZERS" : "POR",
    "SACRAMENTO KINGS" : "SAC",
    "KANSAS CITY KINGS" : "KCK",
    "KANSAS CITY-OMAHA KINGS" : "KCK",
    "CINCINNATI ROYALS" : "CIN",
    "ROCHESTER ROYALS" : "ROR",
    "SAN ANTONIO SPURS" : "SAS",
    "TORONTO RAPTORS" : "TOR",
    "UTAH JAZZ" : "UTA",
    "NEW ORLEANS JAZZ" : "NOJ",
    "WASHINGTON WIZARDS" : "WAS",
    "WASHINGTON BULLETS" : "WAS",
    "CAPITAL BULLETS" : "CAP",
    "BALTIMORE BULLETS" : "BAL",
    "CHICAGO ZEPHYRS" : "CHI",
    "CHICAGO PACKERS" : "CHI",
    "ANDERSON PACKERS" : "AND",
    "CHICAGO STAGS" : "CHI",
    "INDIANAPOLIS OLYMPIANS" : "IND",
    "SHEBOYGAN RED SKINS" : "SRS",
    "ST. LOUIS BOMBERS" : "SLB",
    "WASHINGTON CAPITOLS" : "WAS",
    "WATERLOO HAWKS" : "WAT"
}

In [5]:
'''
function made to grab a teams past 10 games before a valid date. still has some bugs
inputs:
    -team: (str) an NBA team's full name (example: Golden State Warriors), no need to worry about capitalization here
    -date: (str) a valid date within the NBA season you are looking for (formatted in: YYYY-MM-DD)
    -season_end_year: (int) the end year to the entered NBA season (example: looking within the 2015-2016 season, you would enter 2016)
    
output:
    -a pandas dataframe containing a teams past 10 game POST-GAME team stats
    
notes:

'''
def last10Games(team, date, season_end_year):
    
    #use the dictionary to get the associated abbreviation with the team
    abbreviation = team_to_abbreviation[team.upper()]
    
    print(f"team abbreviation for {team.title()} is: {abbreviation}")
    print(f"looking up {str(season_end_year-1)}-{season_end_year} season data before {date}")
    
    #get the schedule for the entered season
    season_df = get_schedule(season_end_year, playoffs=False)
    
    #filter all the games played in the season BEFORE that entered day, not inclusive
    season_before_date_df = season_df.loc[season_df["DATE"] < date] 
    
    #look for all the games where inputted team played (either home or visitor)
    team_home_df = season_before_date_df.loc[season_before_date_df["HOME"] == team]
    team_away_df = season_before_date_df.loc[season_before_date_df["VISITOR"] == team]
    
    '''
    combine the collected home and away games, and create a new dataframe 
    containing all the games a team played in a season before a given date
    '''
    team_season = [team_home_df, team_away_df]
    
    #sort by most descending order
    team_season_df = pd.concat(team_season).sort_values(by="DATE", ascending=False)
    
    #grab the 10 most recent games, and if there are less than 10, grab as many as possible
    if (len(team_season_df.index) < 10):
        last_10_games = pd.DataFrame(team_season_df[0:])
    else:
        last_10_games = pd.DataFrame(team_season_df[0:10])
    
    
    last_10_game_data = []
    #for each game, get the box score of the specified team, and create a new dataframe containing the team stats
    for index, row in last_10_games.iterrows():
        game_date = row["DATE"]
        date_no_time = str(game_date).split()[0]
        
        visitor_team = team_to_abbreviation[row["VISITOR"].upper()]
        home_team = team_to_abbreviation[row["HOME"].upper()]
        
        print(f"getting info on game between: {home_team}(HOME) vs. {visitor_team}(AWAY) on {game_date}")
        
        team_box_score = get_box_scores(date_no_time, visitor_team, home_team, period='GAME', stat_type='BASIC')
        team_box_score = pd.DataFrame(team_box_score[abbreviation]).tail(1)
        
        last_10_game_data.append(team_box_score)
    
    
    last_10_game_stats = pd.concat(last_10_game_data)
    last_10_game_stats_df = pd.DataFrame(last_10_game_stats)
    #each index was 13 from each of the box score dataframes, so we try to reset it here
    last_10_game_stats_df = last_10_game_stats_df.reset_index()
    
    #grab the numerical results
    last_10_game_stats_df = last_10_game_stats_df[['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA',
       'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']]
    
    #the types originally were Object types, so we convert them into int/floats so we can call functions like .mean(), .describe(), etc.
    last_10_game_stats_df["FG"] = last_10_game_stats_df["FG"].astype(int)
    last_10_game_stats_df["FGA"] = last_10_game_stats_df["FGA"].astype(int)
    last_10_game_stats_df["FG%"] = last_10_game_stats_df["FG%"].astype(float)
    last_10_game_stats_df["3P"] = last_10_game_stats_df["3P"].astype(int)
    last_10_game_stats_df["3PA"] = last_10_game_stats_df["3PA"].astype(int)
    last_10_game_stats_df["3P%"] = last_10_game_stats_df["3P%"].astype(float)
    last_10_game_stats_df["FT"] = last_10_game_stats_df["FT"].astype(int)
    last_10_game_stats_df["FTA"] = last_10_game_stats_df["FTA"].astype(int)
    last_10_game_stats_df["FT%"] = last_10_game_stats_df["FT%"].astype(float)
    last_10_game_stats_df["ORB"] = last_10_game_stats_df["ORB"].astype(int)
    last_10_game_stats_df["DRB"] = last_10_game_stats_df["DRB"].astype(int)
    last_10_game_stats_df["TRB"] = last_10_game_stats_df["TRB"].astype(int)
    last_10_game_stats_df["AST"] = last_10_game_stats_df["AST"].astype(int)
    last_10_game_stats_df["STL"] = last_10_game_stats_df["STL"].astype(int)
    last_10_game_stats_df["BLK"] = last_10_game_stats_df["BLK"].astype(int)
    last_10_game_stats_df["TOV"] = last_10_game_stats_df["TOV"].astype(int)
    last_10_game_stats_df["PF"] = last_10_game_stats_df["PF"].astype(int)
    last_10_game_stats_df["PTS"] = last_10_game_stats_df["PTS"].astype(int)

    return last_10_game_stats_df
    
    

An example of what we can do with this:

Depending on how familiar you are with basketball, you may or may not recall the 2015-2016 Golden State Warriors when they went 73-9, beating the previously held record by the 1995-1996 Chicago Bulls, which was 72-10.

If you look back at basketball-reference.com, you can see that the Warrior's biggest loss was against the Portland Trail Blazers on February 19, 2016 (02/19/2016), by a margin of 32 points (137-105)

Using this function, we can try to see if there are any upward/downward trends with each team, whether or not a team is building momentum, or losing steam.

Of course, there is going to be more to the data then meets the eye, which will require you to dig deeper to understand what happened on a given day, maybe using the get_box_scores() function to see if someone under/overperformed, maybe looking at get_injury_report() to see if any key players were missing due to injury, etc. These will require you to import more dependencies, which you can refer to at the top of this notebook.

In [6]:
gsw = last10Games("Golden State Warriors", "2016-02-19", 2016)

gsw

team abbreviation for Golden State Warriors is: GSW
looking up 2015-2016 season data before 2016-02-19
getting info on game between: PHO(HOME) vs. GSW(AWAY) on 2016-02-10 00:00:00
getting info on game between: GSW(HOME) vs. HOU(AWAY) on 2016-02-09 00:00:00
getting info on game between: GSW(HOME) vs. OKC(AWAY) on 2016-02-06 00:00:00
getting info on game between: WAS(HOME) vs. GSW(AWAY) on 2016-02-03 00:00:00
getting info on game between: NYK(HOME) vs. GSW(AWAY) on 2016-01-31 00:00:00
getting info on game between: PHI(HOME) vs. GSW(AWAY) on 2016-01-30 00:00:00
getting info on game between: GSW(HOME) vs. DAL(AWAY) on 2016-01-27 00:00:00
getting info on game between: GSW(HOME) vs. SAS(AWAY) on 2016-01-25 00:00:00
getting info on game between: GSW(HOME) vs. IND(AWAY) on 2016-01-22 00:00:00
getting info on game between: CHI(HOME) vs. GSW(AWAY) on 2016-01-20 00:00:00


Unnamed: 0,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,37,89,0.416,14,40,0.35,24,29,0.828,8,42,50,24,12,7,13,22,112
1,44,94,0.468,12,34,0.353,23,26,0.885,14,35,49,28,5,6,12,26,123
2,48,97,0.495,7,26,0.269,13,20,0.65,13,33,46,24,7,4,10,23,116
3,49,93,0.527,20,41,0.488,16,29,0.552,11,38,49,35,10,5,18,24,134
4,42,73,0.575,16,29,0.552,16,23,0.696,3,41,44,31,8,5,18,21,116
5,47,95,0.495,13,30,0.433,1,2,0.5,12,45,57,37,5,7,23,18,108
6,47,81,0.58,14,29,0.483,19,21,0.905,7,33,40,33,5,4,9,18,127
7,44,85,0.518,11,26,0.423,21,25,0.84,10,35,45,31,15,3,21,25,120
8,43,90,0.478,14,33,0.424,22,25,0.88,14,36,50,31,9,4,17,24,122
9,50,95,0.526,12,32,0.375,13,20,0.65,12,43,55,38,8,9,10,17,125


In [7]:
por = last10Games("Portland Trail Blazers", "2016-02-19", 2016)

por

team abbreviation for Portland Trail Blazers is: POR
looking up 2015-2016 season data before 2016-02-19
getting info on game between: POR(HOME) vs. HOU(AWAY) on 2016-02-10 00:00:00
getting info on game between: MEM(HOME) vs. POR(AWAY) on 2016-02-08 00:00:00
getting info on game between: HOU(HOME) vs. POR(AWAY) on 2016-02-06 00:00:00
getting info on game between: POR(HOME) vs. TOR(AWAY) on 2016-02-04 00:00:00
getting info on game between: POR(HOME) vs. MIL(AWAY) on 2016-02-02 00:00:00
getting info on game between: POR(HOME) vs. MIN(AWAY) on 2016-01-31 00:00:00
getting info on game between: POR(HOME) vs. CHO(AWAY) on 2016-01-29 00:00:00
getting info on game between: POR(HOME) vs. SAC(AWAY) on 2016-01-26 00:00:00
getting info on game between: POR(HOME) vs. LAL(AWAY) on 2016-01-23 00:00:00
getting info on game between: POR(HOME) vs. ATL(AWAY) on 2016-01-20 00:00:00


Unnamed: 0,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,41,89,0.461,8,28,0.286,26,34,0.765,16,33,49,21,11,2,13,27,116
1,44,89,0.494,10,29,0.345,14,20,0.7,8,29,37,22,9,6,13,23,112
2,35,87,0.402,11,35,0.314,15,23,0.652,20,34,54,21,15,6,21,21,96
3,38,81,0.469,10,25,0.4,17,23,0.739,10,30,40,21,8,6,14,20,103
4,39,85,0.459,11,31,0.355,18,25,0.72,10,39,49,25,6,7,15,23,107
5,32,76,0.421,6,27,0.222,26,30,0.867,8,29,37,16,7,5,13,21,96
6,45,92,0.489,7,20,0.35,12,16,0.75,13,38,51,24,10,4,11,21,109
7,44,90,0.489,9,30,0.3,15,19,0.789,10,38,48,28,6,8,12,23,112
8,47,93,0.505,10,31,0.323,17,22,0.773,14,28,42,26,12,1,8,22,121
9,41,94,0.436,9,31,0.29,7,11,0.636,13,37,50,23,5,4,12,18,98


In [8]:
gsw.mean()

FG      45.1000
FGA     89.2000
FG%      0.5078
3P      13.3000
3PA     32.0000
3P%      0.4150
FT      16.8000
FTA     22.0000
FT%      0.7386
ORB     10.4000
DRB     38.1000
TRB     48.5000
AST     31.2000
STL      8.4000
BLK      5.4000
TOV     15.1000
PF      21.8000
PTS    120.3000
dtype: float64

In [9]:
por.mean()

FG      40.6000
FGA     87.6000
FG%      0.4625
3P       9.1000
3PA     28.7000
3P%      0.3185
FT      16.7000
FTA     22.3000
FT%      0.7391
ORB     12.2000
DRB     33.5000
TRB     45.7000
AST     22.7000
STL      8.9000
BLK      4.9000
TOV     13.2000
PF      21.9000
PTS    107.0000
dtype: float64

In [29]:
'''
function made to grab a teams season data. still has some bugs
inputs:
    -team: an NBA team's full name (example: Golden State Warriors), no need to worry about capitalization here
    -season_end_year: the end year to the entered NBA season (example: looking within the 2015-2016 season, you would enter 2016)
    
output:
    -a pandas dataframe containing all of the POST-GAME team stats a team played in a given season
    
notes:
    -this function prints a list of wins/losses, which will be used later on for the machine learning portion, where we
    try to using a teams overall stats to predict if they will win or lose

'''
def getSeasonData(team, season_end_year):
    
    abbreviation = team_to_abbreviation[team.upper()]
    print(f"team abbreviation for {team.title()} is: {abbreviation}")    
    season_df = get_schedule(season_end_year, playoffs=False)

    team_home_df = season_df.loc[season_df["HOME"] == team]
    team_away_df = season_df.loc[season_df["VISITOR"] == team]
    
    team_season = [team_home_df, team_away_df]
    team_season_df = pd.concat(team_season).sort_values(by="DATE", ascending=False)
        
    season_data = []
    dates = []
    win_loss = []
        
    for index, row in pd.DataFrame(team_season_df).iterrows():
        game_date = row["DATE"]
        date_no_time = str(game_date).split()[0]
        
        visitor_team = team_to_abbreviation[row["VISITOR"].upper()]
        home_team = team_to_abbreviation[row["HOME"].upper()]
        
        if int(row["HOME_PTS"]) > int(row["VISITOR_PTS"]):
            home_result = "W"
            away_result = "L"
        else:
            home_result = "L"
            away_result = "W"
            
        team_result = (home_result if home_team == abbreviation else away_result)
        
        print(f"getting info on game between: {home_team}(HOME){home_result} vs. {visitor_team}(AWAY){away_result} on {game_date}")
        
        team_box_score = get_box_scores(date_no_time, visitor_team, home_team, period='GAME', stat_type='BASIC')
        team_box_score = pd.DataFrame(team_box_score[abbreviation]).tail(1)
        
        season_data.append(team_box_score)
        dates.append(game_date)
        win_loss.append(team_result)
        
    season_stats = pd.concat(season_data)
    season_stats_df = pd.DataFrame(season_stats)
    season_stats_df = season_stats_df.reset_index()
    
    season_stats_df = season_stats_df[['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA',
       'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']]
    
    season_stats_df["FG"] = season_stats_df["FG"].astype(int)
    season_stats_df["FGA"] = season_stats_df["FGA"].astype(int)
    season_stats_df["FG%"] = season_stats_df["FG%"].astype(float)
    season_stats_df["3P"] = season_stats_df["3P"].astype(int)
    season_stats_df["3PA"] = season_stats_df["3PA"].astype(int)
    season_stats_df["3P%"] = season_stats_df["3P%"].astype(float)
    season_stats_df["FT"] = season_stats_df["FT"].astype(int)
    season_stats_df["FTA"] = season_stats_df["FTA"].astype(int)
    season_stats_df["FT%"] = season_stats_df["FT%"].astype(float)
    season_stats_df["ORB"] = season_stats_df["ORB"].astype(int)
    season_stats_df["DRB"] = season_stats_df["DRB"].astype(int)
    season_stats_df["TRB"] = season_stats_df["TRB"].astype(int)
    season_stats_df["AST"] = season_stats_df["AST"].astype(int)
    season_stats_df["STL"] = season_stats_df["STL"].astype(int)
    season_stats_df["BLK"] = season_stats_df["BLK"].astype(int)
    season_stats_df["TOV"] = season_stats_df["TOV"].astype(int)
    season_stats_df["PF"] = season_stats_df["PF"].astype(int)
    season_stats_df["PTS"] = season_stats_df["PTS"].astype(int)
    season_stats_df["WIN/LOSS"] = win_loss
    season_stats_df["DATE"] = dates
    
#     print(str(win_loss))

    return season_stats_df
    
    

In [30]:
gsw_season = getSeasonData("Golden State Warriors", 2016)

team abbreviation for Golden State Warriors is: GSW
getting info on game between: GSW(HOME)W vs. MEM(AWAY)L on 2016-04-13 00:00:00
getting info on game between: SAS(HOME)L vs. GSW(AWAY)W on 2016-04-10 00:00:00
getting info on game between: MEM(HOME)L vs. GSW(AWAY)W on 2016-04-09 00:00:00
getting info on game between: GSW(HOME)W vs. SAS(AWAY)L on 2016-04-07 00:00:00
getting info on game between: GSW(HOME)L vs. MIN(AWAY)W on 2016-04-05 00:00:00
getting info on game between: GSW(HOME)W vs. POR(AWAY)L on 2016-04-03 00:00:00
getting info on game between: GSW(HOME)L vs. BOS(AWAY)W on 2016-04-01 00:00:00
getting info on game between: UTA(HOME)L vs. GSW(AWAY)W on 2016-03-30 00:00:00
getting info on game between: GSW(HOME)W vs. WAS(AWAY)L on 2016-03-29 00:00:00
getting info on game between: GSW(HOME)W vs. PHI(AWAY)L on 2016-03-27 00:00:00
getting info on game between: GSW(HOME)W vs. DAL(AWAY)L on 2016-03-25 00:00:00
getting info on game between: GSW(HOME)W vs. LAC(AWAY)L on 2016-03-23 00:00:00


In [31]:
gsw_season

Unnamed: 0,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,WIN/LOSS,DATE
0,46,87,0.529,20,47,0.426,13,16,0.813,12,39,51,35,7,7,17,14,125,W,2016-04-13
1,34,77,0.442,10,27,0.370,14,18,0.778,7,39,46,19,7,6,13,18,92,W,2016-04-10
2,39,87,0.448,10,36,0.278,12,13,0.923,7,41,48,20,3,7,11,14,100,W,2016-04-09
3,45,83,0.542,12,25,0.480,10,11,0.909,7,36,43,33,8,2,15,22,112,W,2016-04-07
4,49,98,0.500,12,35,0.343,7,8,0.875,9,37,46,35,9,5,23,29,117,L,2016-04-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,39,85,0.459,10,26,0.385,24,31,0.774,9,33,42,28,6,5,12,22,112,W,2015-11-04
78,43,84,0.512,11,25,0.440,22,30,0.733,10,55,65,32,8,13,14,15,119,W,2015-11-02
79,46,84,0.548,17,30,0.567,25,35,0.714,3,35,38,26,10,4,10,24,134,W,2015-10-31
80,43,93,0.462,9,26,0.346,17,25,0.680,11,36,47,26,9,4,8,24,112,W,2015-10-30


In [32]:
#exporting as csv to for tableau visualizations
gsw_season.to_csv("gsw_15-16.csv")


In [33]:
por_season = getSeasonData("Portland Trail Blazers", 2016)

por_season

team abbreviation for Portland Trail Blazers is: POR
getting info on game between: POR(HOME)W vs. DEN(AWAY)L on 2016-04-13 00:00:00
getting info on game between: POR(HOME)L vs. MIN(AWAY)W on 2016-04-09 00:00:00
getting info on game between: POR(HOME)W vs. OKC(AWAY)L on 2016-04-06 00:00:00
getting info on game between: SAC(HOME)L vs. POR(AWAY)W on 2016-04-05 00:00:00
getting info on game between: GSW(HOME)W vs. POR(AWAY)L on 2016-04-03 00:00:00
getting info on game between: POR(HOME)W vs. MIA(AWAY)L on 2016-04-02 00:00:00
getting info on game between: POR(HOME)W vs. BOS(AWAY)L on 2016-03-31 00:00:00
getting info on game between: POR(HOME)W vs. SAC(AWAY)L on 2016-03-28 00:00:00
getting info on game between: POR(HOME)W vs. PHI(AWAY)L on 2016-03-26 00:00:00
getting info on game between: LAC(HOME)W vs. POR(AWAY)L on 2016-03-24 00:00:00
getting info on game between: POR(HOME)W vs. DAL(AWAY)L on 2016-03-23 00:00:00
getting info on game between: DAL(HOME)W vs. POR(AWAY)L on 2016-03-20 00:00:00

Unnamed: 0,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,WIN/LOSS,DATE
0,39,89,0.438,11,36,0.306,18,22,0.818,13,32,45,19,11,8,15,19,107,W,2016-04-13
1,40,80,0.500,9,24,0.375,16,18,0.889,11,26,37,19,2,4,17,20,105,L,2016-04-09
2,43,84,0.512,15,33,0.455,19,24,0.792,9,30,39,32,11,6,15,23,120,W,2016-04-06
3,40,93,0.430,12,31,0.387,23,27,0.852,19,30,49,20,12,7,14,19,115,W,2016-04-05
4,38,85,0.447,9,24,0.375,26,33,0.788,11,26,37,23,8,4,13,16,111,L,2016-04-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,41,77,0.532,11,25,0.440,15,32,0.469,13,35,48,11,5,7,19,25,108,W,2015-11-04
78,41,89,0.461,9,30,0.300,15,20,0.750,11,35,46,18,5,6,16,27,106,W,2015-11-02
79,33,79,0.418,8,30,0.267,16,28,0.571,10,35,45,20,7,5,16,20,90,L,2015-10-31
80,34,86,0.395,9,24,0.375,15,19,0.789,11,31,42,17,9,5,23,33,92,L,2015-10-30


In [34]:
por_season.to_csv("por_15-16.csv")

Here comes the machine learning portion

In [56]:
X = gsw_season.drop(["WIN/LOSS", "DATE"], axis=1)
y = gsw_season["WIN/LOSS"].values.reshape(-1,1)
print(X.shape, y.shape)

(82, 18) (82, 1)


In [75]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.5)

In [76]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [77]:
classifier.fit(X_train, y_train)

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [78]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9166666666666666
Testing Data Score: 0.8333333333333334


In [79]:
predictions = classifier.predict(X_test)
print(f"Season Predictions:   {predictions}")
print(f"Actual labels: {y_test[:len(predictions)//2].tolist()}")

Season Predictions:   ['W' 'W' 'W' 'L' 'W' 'W' 'L' 'L' 'L' 'L' 'L' 'L' 'W' 'L' 'W' 'L' 'L' 'L'
 'W' 'L' 'W' 'L' 'W' 'L' 'W' 'L' 'L' 'W' 'W' 'W' 'W' 'W' 'L' 'L' 'W' 'L']
Actual labels: [['W'], ['W'], ['W'], ['L'], ['W'], ['W'], ['L'], ['L'], ['L'], ['L'], ['L'], ['L'], ['W'], ['W'], ['W'], ['L'], ['L'], ['W']]


In [80]:
y_test = y_test.flatten()

y_test

array(['W', 'W', 'W', 'L', 'W', 'W', 'L', 'L', 'L', 'L', 'L', 'L', 'W',
       'W', 'W', 'L', 'L', 'W', 'W', 'L', 'W', 'W', 'W', 'L', 'W', 'W',
       'L', 'W', 'L', 'W', 'W', 'W', 'L', 'L', 'L', 'L'], dtype=object)

In [81]:
from sklearn.metrics import accuracy_score

accuracy_score(predictions, y_test)

0.8333333333333334

# Little experimentation with the logistic regression

In [82]:
#trying out logistic regression with a team with around a 50-50 win/loss record 
season = getSeasonData("Brooklyn Nets", 2020)

team abbreviation for Brooklyn Nets is: BRK
getting info on game between: BRK(HOME)L vs. POR(AWAY)W on 2020-08-13 00:00:00
getting info on game between: ORL(HOME)L vs. BRK(AWAY)W on 2020-08-11 00:00:00
getting info on game between: LAC(HOME)L vs. BRK(AWAY)W on 2020-08-09 00:00:00
getting info on game between: BRK(HOME)W vs. SAC(AWAY)L on 2020-08-07 00:00:00
getting info on game between: BOS(HOME)W vs. BRK(AWAY)L on 2020-08-05 00:00:00
getting info on game between: MIL(HOME)L vs. BRK(AWAY)W on 2020-08-04 00:00:00
getting info on game between: BRK(HOME)W vs. WAS(AWAY)L on 2020-08-02 00:00:00
getting info on game between: BRK(HOME)L vs. ORL(AWAY)W on 2020-07-31 00:00:00
getting info on game between: LAL(HOME)L vs. BRK(AWAY)W on 2020-03-10 00:00:00
getting info on game between: BRK(HOME)W vs. CHI(AWAY)L on 2020-03-08 00:00:00
getting info on game between: BRK(HOME)W vs. SAS(AWAY)L on 2020-03-06 00:00:00
getting info on game between: BRK(HOME)L vs. MEM(AWAY)W on 2020-03-04 00:00:00
getting 

In [83]:
#show season dataframe
season

Unnamed: 0,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,WIN/LOSS,DATE
0,53,96,0.552,14,33,0.424,13,19,0.684,12,34,46,28,5,5,14,23,133,L,2020-08-13
1,38,89,0.427,13,43,0.302,19,24,0.792,8,39,47,26,9,4,13,20,108,W,2020-08-11
2,47,85,0.553,20,43,0.465,15,19,0.789,8,38,46,32,5,5,16,25,129,W,2020-08-09
3,42,86,0.488,17,40,0.425,18,28,0.643,7,37,44,30,9,3,9,23,119,W,2020-08-07
4,45,95,0.474,8,32,0.250,17,21,0.810,7,27,34,23,9,2,17,22,115,L,2020-08-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,42,85,0.494,19,32,0.594,20,28,0.714,10,45,55,26,8,7,20,21,123,W,2019-11-01
68,41,93,0.441,10,31,0.323,16,26,0.615,18,36,54,21,6,5,19,19,108,L,2019-10-30
69,44,101,0.436,15,45,0.333,30,37,0.811,11,42,53,28,6,5,17,22,133,L,2019-10-27
70,37,78,0.474,16,37,0.432,23,34,0.676,11,35,46,21,11,2,26,26,113,W,2019-10-25


In [84]:
#machine learning portion
X = season.drop(["WIN/LOSS", "DATE"], axis=1)
y = season["WIN/LOSS"].values.reshape(-1,1)
print(X.shape, y.shape)

(72, 18) (72, 1)


In [85]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size = 0.5)

In [86]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [87]:
classifier.fit(X_train, y_train)

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [88]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9166666666666666
Testing Data Score: 0.8333333333333334


In [89]:
predictions = classifier.predict(X_test)
print(f"Season Predictions:   {predictions}")
print(f"Actual labels: {y_test[:len(predictions)//2].tolist()}")

Season Predictions:   ['W' 'W' 'W' 'L' 'W' 'W' 'L' 'L' 'L' 'L' 'L' 'L' 'W' 'L' 'W' 'L' 'L' 'L'
 'W' 'L' 'W' 'L' 'W' 'L' 'W' 'L' 'L' 'W' 'W' 'W' 'W' 'W' 'L' 'L' 'W' 'L']
Actual labels: [['W'], ['W'], ['W'], ['L'], ['W'], ['W'], ['L'], ['L'], ['L'], ['L'], ['L'], ['L'], ['W'], ['W'], ['W'], ['L'], ['L'], ['W']]


In [90]:
y_test = y_test.flatten()

y_test



array(['W', 'W', 'W', 'L', 'W', 'W', 'L', 'L', 'L', 'L', 'L', 'L', 'W',
       'W', 'W', 'L', 'L', 'W', 'W', 'L', 'W', 'W', 'W', 'L', 'W', 'W',
       'L', 'W', 'L', 'W', 'W', 'W', 'L', 'L', 'L', 'L'], dtype=object)

In [91]:

from sklearn.metrics import accuracy_score

accuracy_score(predictions, y_test)

0.8333333333333334