In [1]:
from nba_api.stats.endpoints import commonplayerinfo
from nba_api.stats.endpoints import leaguegamelog
from nba_api.stats.endpoints import boxscoreplayertrackv2
from nba_api.stats.endpoints import teamgamelog
from nba_api.stats.static import teams
import pandas as pd
import requests
import time

In [4]:
gamelog_for_season = leaguegamelog.LeagueGameLog(2017, timeout=15)
gamelog_df = gamelog_for_season.get_data_frames()[0]

In [5]:
gamelog_df

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE
0,22019,1610612747,LAL,Los Angeles Lakers,0021900002,2019-10-22,LAL @ LAC,L,240,37,...,32,41,20,4,7,15,24,102,-10,1
1,22019,1610612746,LAC,LA Clippers,0021900002,2019-10-22,LAC vs. LAL,W,240,42,...,34,45,24,8,5,14,25,112,10,1
2,22019,1610612740,NOP,New Orleans Pelicans,0021900001,2019-10-22,NOP @ TOR,L,265,43,...,37,53,30,4,9,19,34,122,-8,1
3,22019,1610612761,TOR,Toronto Raptors,0021900001,2019-10-22,TOR vs. NOP,W,265,42,...,41,57,23,7,3,17,24,130,8,1
4,22019,1610612738,BOS,Boston Celtics,0021900008,2019-10-23,BOS @ PHI,L,240,33,...,31,41,18,4,2,11,29,93,-14,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1937,22019,1610612748,MIA,Miami Heat,0021900970,2020-03-11,MIA vs. CHA,L,240,39,...,27,36,32,8,5,14,17,98,-11,1
1938,22019,1610612752,NYK,New York Knicks,0021900969,2020-03-11,NYK @ ATL,W,265,50,...,41,47,32,14,9,12,23,136,5,1
1939,22019,1610612737,ATL,Atlanta Hawks,0021900969,2020-03-11,ATL vs. NYK,L,265,48,...,38,53,26,6,3,17,25,131,-5,1
1940,22019,1610612743,DEN,Denver Nuggets,0021900973,2020-03-11,DEN @ DAL,L,240,39,...,37,40,23,6,4,16,20,97,-16,1


# Step 1: Utility Stuff

We're going to get the IDs of every team and store them. Later, we can cross reference the IDs with the `list_of_teams` to figure out which team was which

In [2]:
list_of_teams = teams.get_teams()
all_team_ids = []
dict_ids_to_name = dict()
for team_obj in list_of_teams:
    all_team_ids.append(team_obj['id'])
    dict_ids_to_name[team_obj['id']] = team_obj['full_name']
dict_ids_to_name

{1610612737: 'Atlanta Hawks',
 1610612738: 'Boston Celtics',
 1610612739: 'Cleveland Cavaliers',
 1610612740: 'New Orleans Pelicans',
 1610612741: 'Chicago Bulls',
 1610612742: 'Dallas Mavericks',
 1610612743: 'Denver Nuggets',
 1610612744: 'Golden State Warriors',
 1610612745: 'Houston Rockets',
 1610612746: 'Los Angeles Clippers',
 1610612747: 'Los Angeles Lakers',
 1610612748: 'Miami Heat',
 1610612749: 'Milwaukee Bucks',
 1610612750: 'Minnesota Timberwolves',
 1610612751: 'Brooklyn Nets',
 1610612752: 'New York Knicks',
 1610612753: 'Orlando Magic',
 1610612754: 'Indiana Pacers',
 1610612755: 'Philadelphia 76ers',
 1610612756: 'Phoenix Suns',
 1610612757: 'Portland Trail Blazers',
 1610612758: 'Sacramento Kings',
 1610612759: 'San Antonio Spurs',
 1610612760: 'Oklahoma City Thunder',
 1610612761: 'Toronto Raptors',
 1610612762: 'Utah Jazz',
 1610612763: 'Memphis Grizzlies',
 1610612764: 'Washington Wizards',
 1610612765: 'Detroit Pistons',
 1610612766: 'Charlotte Hornets'}

Takes a string formatted as "x:xx" or "xx:xx" and returns the equivalent number of seconds as an int... will be useful for filtering players based on playing time

In [3]:
def str_to_mins(inp):
    lst = inp.split(":")
    sec = int(lst[0])*60 + int(lst[1])
    return sec

# Step 2: Storing Individual Games + Winning/Losing Players
We are going to create an empty dataframe where we are going to store everything. Our columns are going to be `game_id`, `winning_team_id`, `winning_team_players`, and `losing_team_players`. To build our basic model where we one-hot encode players names and try to see if our model can predict at better than 50% accuracy who is going to win, this information will suffice.

In [85]:
record = pd.DataFrame(columns=['game_id', 'home_team_id', 'winning_team_id', 'winning_team_players', 'losing_team_players'])

Now, we need to iterate through every season from 1997-1998 to 2018-2019, check every single team's game log (accessed through the `TeamGameLog` endpoint), and insert the game into `record` if the team won. By only inserting the game if the team won, we ensure that we don't duplicate any entries. When considering a given game, we will use the `BoxScorePlayerTrackV2` endpoint, which takes in a `GAME_ID` and returns the boxscore, to get the list of players who played at least 3 minutes on each team. Afterwards, we'll save to a pickle file so that we can access the data easily.

### WARNING: Takes very very long to run... stored in a pickle file so you can just unpack it

In [87]:
import timeit
for year in range(1997,2019):
    for team_id in all_team_ids:
        
        print(year, dict_ids_to_name[team_id])
        
        # a little code block to retry if the NBA site tries to block us
        max_retries = 10
        for _ in range(max_retries):
            try:
                gamelog_for_season = teamgamelog.TeamGameLog(team_id, year, timeout=15)
                break
            except:
                pass
        
        try:
            gamelog_for_season
        except NameError:
            break #raise NameError(f"Tried {max_retries} times but couldn't get gamelog")
                    
        gamelog_df = gamelog_for_season.get_data_frames()[0] #return statement is a list, we want the first element

        start = timeit.default_timer()
        for row in gamelog_df.itertuples():
            if row.WL == "W": #remember, only want the winning team so we don't duplicate anything
                time.sleep(.7)
                
                ministart = timeit.default_timer()
                # a little code block to retry if the NBA site tries to block us
                max_retries = 5
                for _ in range(max_retries):
                    try:
                        bx = boxscoreplayertrackv2.BoxScorePlayerTrackV2(row.Game_ID, timeout=15)
                        break
                    except:
                        pass
                    
                try:
                    bx
                except NameError:
                    break #raise NameError(f"Tried {max_retries} times but couldn't get gamelog")
                ministop = timeit.default_timer()
                if ministop-ministart > 10:
                    print("Fetched boxscore #", row.Index,"in",ministop-ministart,"seconds")
                
                #rest of code, here we are separating the boxscore into the winning team players and losing team players
                players = bx.get_data_frames()[0]
                team_list = players['TEAM_ID'].unique()
                players_dict = dict()
                
                for team in team_list:
                    players_dict[team] = set()
                
                for innerrow in players.itertuples():
                    if str_to_mins(innerrow.MIN) >= 3*60:
                        players_dict[innerrow.TEAM_ID].add(innerrow.PLAYER_NAME)
                        
                
                #definitions for our insertion into the dataframe
                game_id = row.Game_ID
                winning_team_id = team_id
                losing_team_id = next(iter(set(players_dict.keys()) - {winning_team_id}))
                winning_team_players = players_dict[winning_team_id]
                losing_team_players = players_dict[losing_team_id]
                if "vs." in row.MATCHUP:
                    home_team_id  = winning_team_id
                else:
                    home_team_id  = losing_team_id
                record = record.append({'game_id': game_id, \
                                        'home_team_id': home_team_id, \
                                        'winning_team_id': winning_team_id, \
                                        'winning_team_players': winning_team_players, \
                                        'losing_team_players': losing_team_players}, ignore_index=True)
        stop = timeit.default_timer()
        print("Analyzed games in",stop-start,"seconds")

2007 Atlanta Hawks
Analyzed games in 48.89654590000282 seconds
2007 Boston Celtics
Analyzed games in 91.10714340000413 seconds
2007 Cleveland Cavaliers
Analyzed games in 59.33294880000176 seconds
2007 New Orleans Pelicans
Analyzed games in 73.76622520000092 seconds
2007 Chicago Bulls
Analyzed games in 45.34184600000299 seconds
2007 Dallas Mavericks
Analyzed games in 65.3929505000051 seconds
2007 Denver Nuggets
Analyzed games in 66.34178120000433 seconds
2007 Golden State Warriors
Analyzed games in 64.70128560000012 seconds
2007 Houston Rockets
Analyzed games in 75.82649819999642 seconds
2007 Los Angeles Clippers
Analyzed games in 33.71422259999963 seconds
2007 Los Angeles Lakers
Analyzed games in 77.18110159999924 seconds
2007 Miami Heat
Analyzed games in 19.76469480000378 seconds
2007 Milwaukee Bucks
Analyzed games in 37.54614830000355 seconds
2007 Minnesota Timberwolves
Analyzed games in 29.77379629999632 seconds
2007 Brooklyn Nets
Analyzed games in 45.690555200002564 seconds
2007 Ne

Analyzed games in 28.478763099999924 seconds
2011 Chicago Bulls
Analyzed games in 66.48010999999678 seconds
2011 Dallas Mavericks
Analyzed games in 48.590287400002126 seconds
2011 Denver Nuggets
Analyzed games in 48.65105329999642 seconds
2011 Golden State Warriors
Analyzed games in 31.481205000003683 seconds
2011 Houston Rockets
Analyzed games in 45.4374828 seconds
2011 Los Angeles Clippers
Analyzed games in 56.63273409999965 seconds
2011 Los Angeles Lakers
Analyzed games in 56.5901040999961 seconds
2011 Miami Heat
Analyzed games in 58.679571799999394 seconds
2011 Milwaukee Bucks
Analyzed games in 41.50701150000532 seconds
2011 Minnesota Timberwolves
Analyzed games in 32.76223160000518 seconds
2011 Brooklyn Nets
Analyzed games in 29.92449480000505 seconds
2011 New York Knicks
Analyzed games in 46.28276320000441 seconds
2011 Orlando Magic
Analyzed games in 50.21762330000638 seconds
2011 Indiana Pacers
Analyzed games in 54.95446339999762 seconds
2011 Philadelphia 76ers
Analyzed games in

Analyzed games in 97.79939939999895 seconds
2015 Houston Rockets
Analyzed games in 55.531383800000185 seconds
2015 Los Angeles Clippers
Analyzed games in 74.58192769999732 seconds
2015 Los Angeles Lakers
Analyzed games in 24.263291900002514 seconds
2015 Miami Heat
Analyzed games in 64.6399764999951 seconds
2015 Milwaukee Bucks
Analyzed games in 44.7987368000031 seconds
2015 Minnesota Timberwolves
Analyzed games in 40.12679190000199 seconds
2015 Brooklyn Nets
Analyzed games in 27.60579290000169 seconds
2015 New York Knicks
Analyzed games in 43.856619899997895 seconds
2015 Orlando Magic
Analyzed games in 46.54939110000123 seconds
2015 Indiana Pacers
Analyzed games in 61.11312840000028 seconds
2015 Philadelphia 76ers
Analyzed games in 13.375646200001938 seconds
2015 Phoenix Suns
Analyzed games in 32.154083499997796 seconds
2015 Portland Trail Blazers
Analyzed games in 60.72744000000239 seconds
2015 Sacramento Kings
Analyzed games in 44.62793529999908 seconds
2015 San Antonio Spurs
Analyze

In [88]:
record = record.drop_duplicates(subset="game_id")

In [89]:
record.to_csv("one_hot_model_dataset_v2.csv",index=False)

# Step 3: Data Exploration
Now, we can import the pickled dataset and start using it for data exploration. This is a relatively uninteresting dataset, since we only have the names of players. However, one useful thing to do is to store the set of all unique players. We'll use it later when trying to encode our data.

In [90]:
record = pd.read_csv("one_hot_model_dataset_v2.csv")
from ast import literal_eval
record["winning_team_players"] = record["winning_team_players"].apply(lambda x:literal_eval(x))
record["losing_team_players"] = record["losing_team_players"].apply(lambda x:literal_eval(x))
record.head()

Unnamed: 0,game_id,home_team_id,winning_team_id,winning_team_players,losing_team_players
0,29701183,1610612737,1610612737,"{Dikembe Mutombo, Mookie Blaylock, Tyrone Corb...","{Alonzo Mourning, Antonio Lang, Eric Murdock, ..."
1,29701167,1610612737,1610612737,"{Dikembe Mutombo, Mookie Blaylock, Chris Crawf...","{Dell Curry, Donald Royal, Glen Rice, Anthony ..."
2,29701142,1610612737,1610612737,"{Dikembe Mutombo, Mookie Blaylock, Tyrone Corb...","{Derrick Coleman, Anthony Parker, Mark Davis, ..."
3,29701130,1610612764,1610612737,"{Dikembe Mutombo, Mookie Blaylock, Tyrone Corb...","{God Shammgod, Calbert Cheaney, Darvin Ham, Ch..."
4,29701115,1610612766,1610612737,"{Dikembe Mutombo, Mookie Blaylock, Tyrone Corb...","{Dell Curry, Glen Rice, Anthony Mason, Vernon ..."


In [91]:
#Number of unique players
all_players = set()
for row in record.itertuples():
    all_players.update(row.winning_team_players,row.losing_team_players )
all_players = list(all_players)
all_players = sorted(all_players)
print("The # of unique players that have played at least 3 minutes in a game from the 1997-98 season to the 2018-19 season is", len(all_players))

The # of unique players that have played at least 3 minutes in a game from the 1997-98 season to the 2018-19 season is 2023


# Step 4: Data Processing
Cool! Now we have our `record` which is a table that contains all the data we need. Unfortunately, a machine learning algorithm cannot take this as an input. We need to [one-hot encode](https://en.wikipedia.org/wiki/One-hot) our data, secifically, using the Python methods describe [here](https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/). This means we will have to create a column for every player, whose value will be 1 if the player was on the home team, -1 if they were on the away team, and 0 otherwise.

We will encode the outputs as 1 being a victory for the home team and 0 being a victory for the away team.

A useful method. We will require the condensed version of players names a lot, so makes sense to pay the overhead and do it now so that we can access it in constant time instead of having to do expensive string operations at each step

In [92]:
name_to_condensed_dict = dict()
for player in all_players:
    player_name_condensed = player.replace(" ", "")
    name_to_condensed_dict[player] = player_name_condensed

Now we're going to add columns to `ml_record` for every single player indicating whether they played on team A or B, plus a column for the outcome of the game (1 if team A won, 0 if team B won)

In [93]:
cols = []
for player in all_players:
    cols.append(name_to_condensed_dict[player])
cols.append("label")
ml_record = pd.DataFrame(columns=cols, index=range(record.shape[0]))
ml_record

Unnamed: 0,A.C.Green,A.J.Bramlett,A.J.Guyton,AJHammons,AJPrice,AaronBrooks,AaronGordon,AaronGray,AaronHarrison,AaronHoliday,...,ZarkoCabarkapa,ZazaPachulia,ZeljkoRebraca,ZendonHamilton,ZhaireSmith,ZhouQi,ZoranDragic,ZoranPlaninic,ZydrunasIlgauskas,label
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26063,,,,,,,,,,,...,,,,,,,,,,
26064,,,,,,,,,,,...,,,,,,,,,,
26065,,,,,,,,,,,...,,,,,,,,,,
26066,,,,,,,,,,,...,,,,,,,,,,


This is another very important block of code. We are going through each game, considering each player, and changing the appropriate column for that player to 1. Additionally, we are creating a second training example with the opposite result.

In [94]:
offset = record.shape[0]
for row in record.itertuples():
    index = row.Index
    winning_team_players = row.winning_team_players
    losing_team_players = row.losing_team_players
    home_team_id = row.home_team_id
    winning_team_id = row.winning_team_id
    if index%100 == 0:
        print(f"iteratation {index} of {offset}: {round(index/offset*100,2)}%")
    #create the first training example, which has a result of 1 and the winning players on team A
    if winning_team_id == home_team_id:
        for player in winning_team_players:
            player_name_condensed = name_to_condensed_dict[player]
            ml_record.at[index, player_name_condensed] = 1
        for player in losing_team_players:
            player_name_condensed = name_to_condensed_dict[player]
            ml_record.at[index, player_name_condensed] = -1
        ml_record.at[index, "label"] = 1
    else:
        for player in winning_team_players:
            player_name_condensed = name_to_condensed_dict[player]
            ml_record.at[index, player_name_condensed] = -1
        for player in losing_team_players:
            player_name_condensed = name_to_condensed_dict[player]
            ml_record.at[index, player_name_condensed] = 1
        ml_record.at[index, "label"] = 0
    

iteratation 0 of 26068: 0.0%
iteratation 100 of 26068: 0.38%
iteratation 200 of 26068: 0.77%
iteratation 300 of 26068: 1.15%
iteratation 400 of 26068: 1.53%
iteratation 500 of 26068: 1.92%
iteratation 600 of 26068: 2.3%
iteratation 700 of 26068: 2.69%
iteratation 800 of 26068: 3.07%
iteratation 900 of 26068: 3.45%
iteratation 1000 of 26068: 3.84%
iteratation 1100 of 26068: 4.22%
iteratation 1200 of 26068: 4.6%
iteratation 1300 of 26068: 4.99%
iteratation 1400 of 26068: 5.37%
iteratation 1500 of 26068: 5.75%
iteratation 1600 of 26068: 6.14%
iteratation 1700 of 26068: 6.52%
iteratation 1800 of 26068: 6.91%
iteratation 1900 of 26068: 7.29%
iteratation 2000 of 26068: 7.67%
iteratation 2100 of 26068: 8.06%
iteratation 2200 of 26068: 8.44%
iteratation 2300 of 26068: 8.82%
iteratation 2400 of 26068: 9.21%
iteratation 2500 of 26068: 9.59%
iteratation 2600 of 26068: 9.97%
iteratation 2700 of 26068: 10.36%
iteratation 2800 of 26068: 10.74%
iteratation 2900 of 26068: 11.12%
iteratation 3000 of 26

iteratation 24100 of 26068: 92.45%
iteratation 24200 of 26068: 92.83%
iteratation 24300 of 26068: 93.22%
iteratation 24400 of 26068: 93.6%
iteratation 24500 of 26068: 93.98%
iteratation 24600 of 26068: 94.37%
iteratation 24700 of 26068: 94.75%
iteratation 24800 of 26068: 95.14%
iteratation 24900 of 26068: 95.52%
iteratation 25000 of 26068: 95.9%
iteratation 25100 of 26068: 96.29%
iteratation 25200 of 26068: 96.67%
iteratation 25300 of 26068: 97.05%
iteratation 25400 of 26068: 97.44%
iteratation 25500 of 26068: 97.82%
iteratation 25600 of 26068: 98.2%
iteratation 25700 of 26068: 98.59%
iteratation 25800 of 26068: 98.97%
iteratation 25900 of 26068: 99.36%
iteratation 26000 of 26068: 99.74%


Ok, so now we're 90% of the way there. Now, we only need to drop the `game_id`, `winning_team_players`, and `losing_team_players` columns (because they're not going into the model), and then we can start preparing the data for training. Additionally, we will pickle this new dataframe since the method above takes a while to finish.  

In [95]:
ml_record = ml_record.fillna(0)
ml_record.to_csv("encoded_one_hot_model_dataset_v2.csv",index=False)

The extra taining example above unfortunately came out with the blank spaces as `NaN`, but we can replace those with zeroes easily

In [96]:
ml_record = pd.read_csv("encoded_one_hot_model_dataset_v2.csv")
ml_record

Unnamed: 0,A.C.Green,A.J.Bramlett,A.J.Guyton,AJHammons,AJPrice,AaronBrooks,AaronGordon,AaronGray,AaronHarrison,AaronHoliday,...,ZarkoCabarkapa,ZazaPachulia,ZeljkoRebraca,ZendonHamilton,ZhaireSmith,ZhouQi,ZoranDragic,ZoranPlaninic,ZydrunasIlgauskas,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26063,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
26064,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
26065,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
26066,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We're going to drop some useless columns and turn this into a numpy array, which is what scikit-learn likes as an input

In [97]:
#convert to final numpy form
import numpy as np 

# Labels are the values we want to predict
labels = np.array(ml_record['label'])
# Remove the labels column; axis 1 refers to the columns
values = ml_record.drop(['label'], axis = 1)
# Saving feature names for later use
feature_list = list(values.columns)
# Convert to numpy array
values = np.array(values)

Now, finally, we can split this into a training, validation, and testing dataset. We're ready!

In [98]:
#split into training, validation, and testing data
from sklearn.model_selection import train_test_split

X = values
y = labels

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=1) #this gives 20% test, 20% val, 60% train

## Step 6: Machine Learning - Random Forest
We're going to be using scikit-learn's `RandomForestClassifier` for our first attempt. 

In [135]:
#here's the magical step! We train the classifier
from sklearn.ensemble import RandomForestClassifier

# Instantiate model with 500 decision trees
rf = RandomForestClassifier(n_estimators = 200, random_state = 42)
# Train the model on training data
rf.fit(X_train, y_train);

In [136]:
from joblib import dump
dump(rf, 'random_forest_week6_model_v2.joblib')

['random_forest_week6_model_v2.joblib']

In [137]:
from joblib import load
rf = load('random_forest_week6_model_v2.joblib')

In [138]:
# the next part of the code is to predict based on the validation set and evaluate the performance of the model...
# we don't want to use the test set until we're certain that we're absolutely done with tweaking parameters
from sklearn import metrics
from sklearn.metrics import classification_report

y_pred = rf.predict(X_val)
# Model Accuracy, how often is the classifier correct?
print("Accuracy: ",metrics.accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy:  0.6518987341772152
              precision    recall  f1-score   support

           0       0.60      0.49      0.54      2173
           1       0.68      0.77      0.72      3041

    accuracy                           0.65      5214
   macro avg       0.64      0.63      0.63      5214
weighted avg       0.65      0.65      0.65      5214



## Step 7: Machine Learning - Logistic Regression
We're going to be using scikit-learn's `LogisticRegression` for our first attempt. 

In [100]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=0, verbose = 1, max_iter=100000)
lr.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.1s finished


LogisticRegression(max_iter=100000, random_state=0, verbose=1)

In [101]:
from joblib import dump
dump(lr, 'logistic_regression_week6_model_v2.joblib')

['logistic_regression_week6_model_v2.joblib']

In [102]:
from joblib import load
lr = load('logistic_regression_week6_model_v2.joblib')

In [140]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn import metrics

y_pred = lr.predict(X_val)
# Model Accuracy, how often is the classifier correct?
print("Accuracy: ",metrics.accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy:  0.6672420406597622
              precision    recall  f1-score   support

           0       0.62      0.53      0.57      2173
           1       0.69      0.77      0.73      3041

    accuracy                           0.67      5214
   macro avg       0.66      0.65      0.65      5214
weighted avg       0.66      0.67      0.66      5214

