### Data Cleaning + Engineering

NBA Season Date Source: https://en.wikipedia.org/wiki/2017_NBA_Playoffs

In [1]:
import pandas as pd
import numpy as np
import pdb

pd.set_option('display.max_columns', None)

In [2]:
# Bring in the data
df = pd.read_csv("game-data-revised.csv")
df.head(1)

Unnamed: 0,Team,Opponent,Team-Score,Opponent-Score,Win,Date,Time,Location,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,ORtg,DRtg
0,Cleveland Cavaliers,Washington Wizards,94,84,1,10/30/12,7:00 PM,"Quicken Loans Arena, Cleveland, Ohio",36,79,0.456,7,20,0.35,15,22,0.682,18,36,54,22,7,5,20,21,0.53,0.5,0.253,0.278,46.2,66.7,58.1,61.1,8.0,8.6,18.4,106.9,95.5


### Create 'Seasons' Variable

In [3]:
# Prepare the data
df.Date = pd.to_datetime(df.Date)

In [4]:
position = len(df.columns)
df['Season'] = ""

for i in range(0, len(df)):
    # 2012-13 Season: October 30, 2012 - June 20, 2013
    if df.iloc[i, 5] >= pd.Timestamp('2012-10-30 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2013-6-20 00:00:00'):
        df.iloc[i, position] = '2012-13'
    
    # 2013-14 Season: October 29, 2013 - June 15, 2014
    elif df.iloc[i, 5] >= pd.Timestamp('2013-10-29 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2014-6-15 00:00:00'):
        df.iloc[i, position] = '2013-14'
    
    # 2014-15 Season: October 29, 2014 - June 16, 2015
    elif df.iloc[i, 5] >= pd.Timestamp('2014-10-28 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2015-6-16 00:00:00'):
        df.iloc[i, position] = '2014-15'
    
    # 2015-16 Season: October 27, 2015 - June 19, 2016
    elif df.iloc[i, 5] >= pd.Timestamp('2015-10-27 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2016-6-19 00:00:00'):
        df.iloc[i, position] = '2015-16'
    
    # 2016-17 Season: October 25, 2016 - June 12, 2017
    elif df.iloc[i, 5] >= pd.Timestamp('2016-10-25 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2017-6-12 00:00:00'):
        df.iloc[i, position] = '2016-17'
    else: 
        df.iloc[i, position] = None

In [5]:
df['Season'].value_counts()

2013-14    2624
2016-17    2592
2015-16    2500
2012-13    2500
2014-15    2498
Name: Season, dtype: int64

In [6]:
df.head(1)

Unnamed: 0,Team,Opponent,Team-Score,Opponent-Score,Win,Date,Time,Location,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,ORtg,DRtg,Season
0,Cleveland Cavaliers,Washington Wizards,94,84,1,2012-10-30,7:00 PM,"Quicken Loans Arena, Cleveland, Ohio",36,79,0.456,7,20,0.35,15,22,0.682,18,36,54,22,7,5,20,21,0.53,0.5,0.253,0.278,46.2,66.7,58.1,61.1,8.0,8.6,18.4,106.9,95.5,2012-13


### Create 'Playoff' Game Dummy

In [7]:
position = len(df.columns)
df['Playoff'] = ""

for i in range(0, len(df)):
    # 2013 Playoffs: April 20 - June 20, 2013
    if df.iloc[i, 5] >= pd.Timestamp('2013-4-20 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2013-6-20 00:00:00'):
        df.iloc[i, position] = 1
    
    # 2014 Playoffs: April 19 - June 15, 2014
    elif df.iloc[i, 5] >= pd.Timestamp('2014-4-19 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2014-6-15 00:00:00'):
        df.iloc[i, position] = 1
    
    # 2015 Playoffs: April 18 - June 16, 2015
    elif df.iloc[i, 5] >= pd.Timestamp('2015-4-18 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2015-6-16 00:00:00'):
        df.iloc[i, position] = 1
    
    # 2016 Playoffs: April 16 - June 19, 2016
    elif df.iloc[i, 5] >= pd.Timestamp('2016-4-16 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2016-6-19 00:00:00'):
        df.iloc[i, position] = 1
    
    # 2017 Playoffs: April 15 - June 12, 2017
    elif df.iloc[i, 5] >= pd.Timestamp('2017-4-15 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2017-6-12 00:00:00'):
        df.iloc[i, position] = 1
    else: 
        df.iloc[i, position] = 0

In [8]:
df['Playoff'].value_counts()

0    11878
1      836
Name: Playoff, dtype: int64

### Create 'Home/Away' Dummy

In [9]:
position = len(df.columns)
df['Home'] = ""

for i in range(0, len(df)):
    if (df.iloc[i, 0] == "Cleveland Cavaliers") and (df.iloc[i, 7] == "Quicken Loans Arena, Cleveland, Ohio"):
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Los Angeles Lakers" and df.iloc[i, 7] == "STAPLES Center, Los Angeles, California":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Dallas Mavericks" and df.iloc[i, 7] == "American Airlines Center, Dallas, Texas":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Miami Heat" and df.iloc[i, 7] == "AmericanAirlines Arena, Miami, Florida":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Washington Wizards" and df.iloc[i, 7] == "Verizon Center, Washington, District of Columbia":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Boston Celtics" and df.iloc[i, 7] == "TD Garden, Boston, Massachusetts":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Philadelphia 76ers" and df.iloc[i, 7] == "Wells Fargo Center, Philadelphia, Pennsylvania":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Denver Nuggets" and df.iloc[i, 7] == "Pepsi Center, Denver, Colorado":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Phoenix Suns" and df.iloc[i, 7] == "US Airways Center, Phoenix, Arizona":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Golden State Warriors" and df.iloc[i, 7] == "Oracle Arena, Oakland, California":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Sacramento Kings" and df.iloc[i, 7] == "Sleep Train Arena, Sacramento, California":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Chicago Bulls" and df.iloc[i, 7] == "United Center, Chicago, Illinois":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "New Orleans Hornets" and df.iloc[i, 7] == "New Orleans Arena, New Orleans, Louisiana":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "San Antonio Spurs" and df.iloc[i, 7] == "AT&T Center, San Antonio, Texas":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Utah Jazz" and df.iloc[i, 7] == "EnergySolutions Arena, Salt Lake City, Utah":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Toronto Raptors" and df.iloc[i, 7] == "Air Canada Centre, Toronto, Canada":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Indiana Pacers" and df.iloc[i, 7] == "Bankers Life Fieldhouse, Indianapolis, Indiana":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Detroit Pistons" and df.iloc[i, 7] == "The Palace of Auburn Hills, Auburn Hills, Michigan":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Houston Rockets" and df.iloc[i, 7] == "Toyota Center, Houston, Texas":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Memphis Grizzlies" and df.iloc[i, 7] == "FedEx Forum, Memphis, Tennessee":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Los Angeles Clippers" and df.iloc[i, 7] == "STAPLES Center, Los Angeles, California":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Portland Trail Blazers" and df.iloc[i, 7] == "Rose Garden Arena, Portland, Oregon":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Oklahoma City Thunder" and df.iloc[i, 7] == "Chesapeake Energy Arena, Oklahoma City, Oklahoma":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Charlotte Bobcats" and df.iloc[i, 7] == "Time Warner Cable Arena, Charlotte, North Carolina":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Minnesota Timberwolves" and df.iloc[i, 7] == "Target Center, Minneapolis, Minnesota":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Orlando Magic" and df.iloc[i, 7] == "Amway Center, Orlando, Florida":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Milwaukee Bucks" and df.iloc[i, 7] == "BMO Harris Bradley Center, Milwaukee, Wisconsin":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "New York Knicks" and df.iloc[i, 7] == "Madison Square Garden (IV), New York, New York":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Brooklyn Nets" and df.iloc[i, 7] == "Barclays Center, Brooklyn, New York":
        df.iloc[i, position] = 1
    elif df.iloc[i, 0] == "Atlanta Hawks" and df.iloc[i, 7] == "Philips Arena, Atlanta, Georgia":
        df.iloc[i, position] = 1
    else:
        df.iloc[i, position] = 0

In [10]:
df['Home'].value_counts()

0    7009
1    5705
Name: Home, dtype: int64

### Filter by Season

In [11]:
# Regular Season (Non-Playoff) Games
df_12_13 = df[(df.Season == "2012-13") & (df.Playoff == 0)]
df_13_14 = df[(df.Season == "2013-14") & (df.Playoff == 0)]
df_14_15 = df[(df.Season == "2014-15") & (df.Playoff == 0)]
df_15_16 = df[(df.Season == "2015-16") & (df.Playoff == 0)]
df_16_17 = df[(df.Season == "2016-17") & (df.Playoff == 0)]

In [12]:
df_16_17.head(1)

Unnamed: 0,Team,Opponent,Team-Score,Opponent-Score,Win,Date,Time,Location,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,ORtg,DRtg,Season,Playoff,Home
10122,San Antonio Spurs,Golden State Warriors,129,100,1,2016-10-25,10:30 PM,"Oracle Arena, Oakland, California",47,98,0.48,12,24,0.5,23,26,0.885,21,34,55,25,13,3,13,19,0.589,0.541,0.245,0.265,43.8,81.0,61.1,53.2,13.2,5.8,10.6,131.3,101.8,2016-17,0,0


In [13]:
# Write to CSV
df_12_13.to_csv("df_12_13.csv", index = False)
df_13_14.to_csv("df_13_14.csv", index = False)
df_14_15.to_csv("df_14_15.csv", index = False)
df_15_16.to_csv("df_15_16.csv", index = False)
df_16_17.to_csv("df_16_17.csv", index = False)

In [14]:
df_12_13 = pd.read_csv("df_12_13.csv")
df_13_14 = pd.read_csv("df_13_14.csv")
df_14_15 = pd.read_csv("df_14_15.csv")
df_15_16 = pd.read_csv("df_15_16.csv")
df_16_17 = pd.read_csv("df_16_17.csv")

In [15]:
df_12_13.head(1)

Unnamed: 0,Team,Opponent,Team-Score,Opponent-Score,Win,Date,Time,Location,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,ORtg,DRtg,Season,Playoff,Home
0,Cleveland Cavaliers,Washington Wizards,94,84,1,2012-10-30,7:00 PM,"Quicken Loans Arena, Cleveland, Ohio",36,79,0.456,7,20,0.35,15,22,0.682,18,36,54,22,7,5,20,21,0.53,0.5,0.253,0.278,46.2,66.7,58.1,61.1,8.0,8.6,18.4,106.9,95.5,2012-13,0,1


### Create Team Clusters by Season

In [16]:
#Packages for Kmeans
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn import preprocessing
from sklearn.metrics import homogeneity_score
from sklearn.metrics import silhouette_score
from collections import Counter

In [17]:
def find_majority(votes):
    vote_count = Counter(votes)
    top_two = vote_count.most_common(2)
    if len(top_two)>1 and top_two[0][1] == top_two[1][1]:
        # It is a tie
        return top_two[0][0]
    return top_two[0][0]

def get_clusters(dataframe, number_of_clusters):
    # Prepare the data
    Team_Name = dataframe["Team"]
    Opponent_Name = dataframe["Opponent"]
    dataframe = dataframe.drop(['FG%', '3P%', 'FT%', 'TS%', 'eFG%', 'ORB%', 'DRB%', 'BLK%', 'TOV%', 'ORtg', 'Team-Score', 'Opponent-Score', 'Win', 'Season', 'Location', 'Date', 'Time', 'Team', 'Opponent'], axis=1)
    #dataframe = dataframe.drop(['Team', 'Opponent', 'Date', 'Time', 'Location', 'Season', 'Playoff', 'Team', 'Opponent'], axis=1)
    
    # Create the clusters
    kmeans_model = KMeans(n_clusters = number_of_clusters, n_init = 5, n_jobs = 1).fit(dataframe)
    labels = kmeans_model.labels_
    
    # Create the dataframe
    dataframe['Clusters'] = labels
    teams = Team_Name.unique()
    clusters = []
    for team in teams:
        clusters.append(find_majority(dataframe.iloc[Team_Name[Team_Name==team].index]['Clusters']))
    
    cluster_df = pd.DataFrame(list(map(list, zip(teams,clusters))), columns = ['Team', 'Cluster'])
    cluster_df = cluster_df.sort_values(by='Cluster', ascending = True)
    cluster_df = cluster_df.reset_index(drop=True)
    
    return cluster_df

In [18]:
dataframes = [df_12_13, df_13_14, df_14_15, df_15_16, df_16_17]
data = pd.DataFrame(columns = ['Team', 'Cluster'])
year = 2012

for dataframe in dataframes:
    temp = get_clusters(dataframe, 8)
    temp["Year"] = str(year)
    year = year + 1
    data= data.append(temp, ignore_index=True)

In [19]:
data[(data.Cluster == 3) & (data.Year == "2015")]

Unnamed: 0,Cluster,Team,Year
104,3,Portland Trail Blazers,2015
105,3,Sacramento Kings,2015
106,3,Phoenix Suns,2015


In [20]:
# Cluster teams using ALL seasons
frames = [df_12_13, df_13_14, df_14_15, df_15_16, df_16_17]
all_seasons = pd.concat(frames)
get_clusters(all_seasons, 8)

Unnamed: 0,Team,Cluster
0,Cleveland Cavaliers,0
1,Brooklyn Nets,0
2,Atlanta Hawks,0
3,New York Knicks,0
4,Milwaukee Bucks,0
5,Minnesota Timberwolves,0
6,Memphis Grizzlies,0
7,Detroit Pistons,0
8,Toronto Raptors,0
9,New Orleans Hornets,0


### Combine & Get Averages

In [82]:
### REVISED APRIL 2, 2018 ###
def nba_jam(dataframe, number_of_games):
    games = []
    for i in range(0, len(dataframe)):
        team_df = dataframe[(dataframe.Team == dataframe.iloc[i,0]) & (dataframe.Date <= dataframe.iloc[i,5])].sort_values(by='Date', ascending = True).iloc[-number_of_games:]
        opponent_df = dataframe[(dataframe.Team == dataframe.iloc[i,1]) & (dataframe.Date <= dataframe.iloc[i,5])].sort_values(by='Date', ascending = True).iloc[-number_of_games:]
        
        if len(team_df) > 1 and len(opponent_df) > 1:
            
            # Team Data
            attributes = []
            metrics = []
            team_current_game = team_df.iloc[-1:]
            team_df = team_df.drop(team_df.index[-1:])
            
            for column in team_df.columns:
                if column == 'Team' or column == 'Opponent' or column == 'Date' or column == 'Time' or column == 'Location' or column == 'Season' or column == 'Playoff' or column == 'Home' or column == 'Win':
                    attributes.append(team_current_game.iloc[0][column])
                else:
                    metrics.append(np.mean(team_df[column]))
            team = attributes + metrics
            
            # Opponent Data
            opponent = [] 
            opponent_df = opponent_df.drop(opponent_df.index[-1:])
        
            for column in opponent_df.columns:
                if column == 'Team' or column == 'Opponent' or column == 'Date' or column == 'Time' or column == 'Location' or column == 'Season' or column == 'Playoff' or column == 'Team-Score' or column == "Opponent-Score" or column == "Home" or column == 'Win':
                    None
                else:
                    opponent.append(np.mean(opponent_df[column]))
            
            # Target Data
            target = []
            for column in team_current_game.columns:
                if column == 'Team-Score' or column == 'Opponent-Score':
                    target.append(team_current_game.iloc[0][column])
        
            row = team + opponent + target

            # Combine
            games.append(row)
            
    # Column Names
    team_attributes = ['Team', 'Opponent', 'Win', 'Date', 'Time', 'Location', 'Season', 'Playoff', 'Home']
    opp_attributes = ['Team', 'Opponent', 'Win', 'Date', 'Time', 'Location', 'Season', 'Playoff', 'Home', 'Team-Score', 'Opponent-Score']

    team_metrics = list(df_12_13.columns)
    for i in range(0, len(team_attributes)):
        team_metrics.remove(team_attributes[i])

    opp_metrics = list(df_12_13.columns)
    for i in range(0, len(opp_attributes)):
        opp_metrics.remove(opp_attributes[i])
    
    for i in range(0, len(team_metrics)):
        team_metrics[i] = "avg_team_" + team_metrics[i]
    
    for i in range(0, len(opp_metrics)):
        opp_metrics[i] = "avg_opp_" + opp_metrics[i]

    column_names = team_attributes + team_metrics + opp_metrics + ['Team-Score', 'Opponent-Score']
    
    # Create DataFrame
    nba_df = pd.DataFrame(games, columns = column_names)
    nba_df = nba_df.drop(['Playoff'], axis = 1)
        
    return nba_df

### Export Data

In [80]:
df_12_13 = pd.read_csv("df_12_13.csv")
df_13_14 = pd.read_csv("df_13_14.csv")
df_14_15 = pd.read_csv("df_14_15.csv")
df_15_16 = pd.read_csv("df_15_16.csv")
df_16_17 = pd.read_csv("df_16_17.csv")

In [85]:
# AVERAGE Regular Season (Non-Playoff) Games
number_of_games = 5
df_12_13_avg = nba_jam(df_12_13, number_of_games)
df_13_14_avg = nba_jam(df_13_14, number_of_games)
df_14_15_avg = nba_jam(df_14_15, number_of_games)
df_15_16_avg = nba_jam(df_15_16, number_of_games)
df_16_17_avg = nba_jam(df_16_17, number_of_games)

In [86]:
# Write to csv files
df_12_13_avg.to_csv("df_12_13_avg.csv", index = False)
df_13_14_avg.to_csv("df_13_14_avg.csv", index = False)
df_14_15_avg.to_csv("df_14_15_avg.csv", index = False)
df_15_16_avg.to_csv("df_15_16_avg.csv", index = False)
df_16_17_avg.to_csv("df_16_17_avg.csv", index = False)

### Data Modeling

In [88]:
# Core Packages
import seaborn as sns
import matplotlib.pyplot as plt
import os 
%matplotlib inline

# ML Packages
from sklearn.linear_model import SGDRegressor, ElasticNetCV, LogisticRegression
from sklearn.metrics import mean_squared_error, make_scorer, f1_score, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split, learning_curve, RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [123]:
# Read in CSV Files
df_12_13_avg = pd.read_csv("df_12_13_avg.csv")
df_13_14_avg = pd.read_csv("df_13_14_avg.csv")
df_14_15_avg = pd.read_csv("df_14_15_avg.csv")
df_15_16_avg = pd.read_csv("df_15_16_avg.csv")
df_16_17_avg = pd.read_csv("df_16_17_avg.csv")

In [124]:
#train = pd.read_csv("df_12_13_avg.csv")
#test = pd.read_csv("df_13_14_avg.csv")
#frames = [df_12_13_avg, df_13_14_avg, df_14_15_avg, df_15_16_avg]

frames = [df_14_15_avg, df_15_16_avg]
result = pd.concat(frames)

train = result
test = df_16_17_avg

train = train.drop(['Team', 'Opponent', 'Team-Score', 'Location', 'Opponent-Score', 'Date', 'Time', 'Season'], axis = 1)
test = test.drop(['Team', 'Opponent', 'Team-Score', 'Location', 'Opponent-Score', 'Date', 'Time', 'Season'], axis = 1)

target_train = train['Win']
target_test = test['Win']
train = train.drop(['Win'], axis = 1)
test = test.drop(['Win'], axis = 1)

In [125]:
# 4. Scale the data
scaler = StandardScaler()

scaler.fit(train)
scaled_train = scaler.transform(train)

scaler.fit(test)
scaled_test = scaler.transform(test)

In [126]:
len(target_train) == len(scaled_train)

True

In [127]:
len(target_test) == len(scaled_test)

True

In [128]:
print("Train Dimensions:", scaled_train.shape, "\nTest Dimensions:", scaled_test.shape)

Train Dimensions: (4596, 63) 
Test Dimensions: (2406, 63)


In [129]:
X_train, X_test, y_train, y_test = train_test_split(scaled_train, target_train, test_size=0.2, random_state=42)

In [130]:
def evaluation_metrics(y_train, y_test, train_predictions, test_predictions):
    rounding = 2
    print('Accuracy:')
    print("  Train: {}".format(round(accuracy_score(y_train, train_predictions),rounding)), "  Test: {}".format(round(accuracy_score(y_test, test_predictions),rounding)))

    print('Precision:')
    print("  Train: {}".format(round(precision_recall_fscore_support(y_train, train_predictions)[0][0],rounding)), "  Test: {}".format(round(precision_recall_fscore_support(y_test, test_predictions)[0][0],rounding)))

    print('Recall:')
    print("  Train: {}".format(round(precision_recall_fscore_support(y_train, train_predictions)[1][0],rounding)), "  Test: {}".format(round(precision_recall_fscore_support(y_test, test_predictions)[1][0],rounding)))

    print('F1:')
    print("  Train: {}".format(round(precision_recall_fscore_support(y_train, train_predictions)[2][0],rounding)), "  Test: {}".format(round(precision_recall_fscore_support(y_test, test_predictions)[2][0],rounding)))

In [131]:
param_dist = {"n_estimators": [100, 250, 500], "max_depth": [1, 2, 3]}
model = RandomForestClassifier()
rf = GridSearchCV(model, param_grid = param_dist, scoring='f1')
rf.fit(X_train, y_train)
rf_train_predictions = rf.predict(X_train)
rf_test_predictions = rf.predict(X_test)
print("Best Params: {}".format(rf.best_params_))

Best Params: {'max_depth': 3, 'n_estimators': 250}


In [132]:
evaluation_metrics(y_train, y_test, rf_train_predictions, rf_test_predictions)

Accuracy:
  Train: 0.65   Test: 0.6
Precision:
  Train: 0.65   Test: 0.59
Recall:
  Train: 0.65   Test: 0.61
F1:
  Train: 0.65   Test: 0.6


In [133]:
# Train Score
accuracy_score(target_train, rf.predict(scaled_train))

0.6390339425587467

In [134]:
# Test Score
accuracy_score(target_test, rf.predict(scaled_test))

0.5839567747298421

### Choosing a Machine Learning Algorithm
From: [A Data Science Framework: To Achieve 99% Accuracy](https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy)

In [135]:
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics