In [54]:
import zipfile, os, re, datetime
import pandas as pd
import numpy as np 

# Read in the fooball data

In [55]:
_file = os.path.join('ncaa_football_scrapy','Data', 'Data.zip')
zf = zipfile.ZipFile(_file)
filenames = zf.namelist()

# Create Program History DF

In [56]:
history = pd.read_csv(zf.open('history.csv'))

In [57]:
history['year'] = history.apply(lambda x: int(x['Year'][:4]), axis = 1)

# Coach links

In [58]:
coach_links = pd.read_csv(os.path.join("ncaa_football_scrapy", "Links", "links_teaminfo.csv"))
coach_links = coach_links.rename(columns={"txt":"coach", 'team':"Team"})
coach_links = coach_links[coach_links.key == 'people'][['Team', 'coach', 'year']]
coach_links['Team'] = coach_links.Team.str.replace("  ", " ")

# Account for coaches who get fired in the season

In [59]:
coach_dict = coach_links.to_dict(orient='record')

In [60]:
coach_fixed = {}
for x in coach_dict: 
    if x['year'] not in coach_fixed: 
        coach_fixed[x['year']] = {}
    if x['Team'] not in coach_fixed[x['year']]: 
        coach_fixed[x['year']][x['Team']] = []
    coach_fixed[x['year']][x['Team']].append(x['coach'])

In [61]:
coaches_list = []
for yr, teams in coach_fixed.items(): 
    for team, coaches in teams.items(): 
        tmp = {'year':yr, 'Team':team, 'coach':coaches}
        coaches_list.append(tmp)

In [62]:
coaches_list_fixed = pd.DataFrame(coaches_list)

In [63]:
coaches_list_fixed[coaches_list_fixed['Team'] == 'Air Force Falcons']

Unnamed: 0,Team,coach,year
1,Air Force Falcons,[Troy Calhoun],2014
117,Air Force Falcons,[Troy Calhoun],2015
236,Air Force Falcons,[Troy Calhoun],2016
358,Air Force Falcons,[Troy Calhoun],2017


# Create Coach DF 

In [64]:
coaches = pd.read_csv(zf.open('coaches.csv'))

In [65]:
coaches['year'] = coaches.apply(lambda x: int(x['Year'][:4]), axis = 1)
coaches.rename(columns = {'Name':'coach'}, inplace = True)

# Create Roster DF

In [12]:
roster = pd.read_csv(zf.open('roster.csv'))

In [13]:
roster['year'] = roster.apply(lambda x: int(x['Year'][:4]), axis = 1)

In [14]:
def yrRecode(data): 
    yr = {'Fr':0, 'So':1, 'Jr':2, 'Sr':3}
    return yr[data]

In [15]:
roster_by_year = roster[['Team', 'Yr', 'year', 'GS']].groupby(['Team', 'year', 'Yr']).count().reset_index()

In [16]:
roster_games_started = roster[['Team', 'year', 'Yr', 'GS']].groupby(['Team', 'year', "Yr"]).mean().reset_index()

In [17]:
roster_games_played = roster[['Team', 'year', 'Yr', 'GP']].groupby(['Team', 'year', "Yr"]).mean().reset_index()

In [18]:
roster_final = pd.merge(roster_games_played, 
                        roster_games_started, 
                        left_on = ['Team', 'year', 'Yr'], 
                        right_on = ['Team', 'year', 'Yr'],
                        how = 'left'
                       )

In [19]:
roster_final2 = roster_final.set_index(['Team', 'year', 'Yr']).unstack().reset_index()

In [20]:
columns = [x[0] if x[0] in ['Team', 'year'] else x[0] + "_" + x[1] for x in roster_final2.columns.values]
roster_final2.columns = columns

In [21]:
def createvariables(data):
    """Create Opponent, Home, Win/Loss, Overtime, and Scores
    
    """
    regexs = {'WL': "[WL]",
              'team': ["\d+\s\-", "\d+"],
              'opponent_score':["\-\s\d+", '\d+'],
              'OT':["\(\d+OT\)", "\d+"], 
              'opponent':['[\w\s]+\.?\s\@\s\w+']
             }
    
    # Opponent 
    if '@' in data['Opponent']: 
        search = re.search(regexs['opponent'][0], data['Opponent']) 
        if search: 
            opponent = data['Opponent'].split("@")[0].strip()
            home = 0
        else: 
            opponent = data['Opponent'].replace("@","").strip()
            home = 0
    else: 
        opponent = data['Opponent']
        home = 1
    
    
    # Win/Loss
    WinLoss = re.search(regexs['WL'], data['Result']).group()
    
    # Overtime 
    Overtime = 0
    re_ot = re.search(regexs['OT'][0], data['Result'])
    if re_ot:
        Overtime = re.search(regexs['OT'][1], re_ot.group()).group()
        
    # Team Score
    team_score = None
    re_team = re.search(regexs['team'][0], data['Result'])
    if re_team:
        team_score = re.search(regexs['OT'][1], re_team.group()).group()
        
    # Opponent Score
    opponent_score = None
    re_opponent = re.search(regexs['opponent_score'][0], data['Result'])
    if re_opponent:
        opponent_score = re.search(regexs['OT'][1], re_opponent.group()).group()
        
    return pd.Series([opponent, home, WinLoss, Overtime, team_score, opponent_score])    

# Start merging

In [22]:
# Subset the gamebygame files
targetfiles = [file for file in filenames if 'gamebygame' in file ]

In [23]:
idx = targetfiles.index('gamebygame_teamnames.csv')
targetfiles.pop(idx);

In [24]:
# Create a dict of dfs and create a multi-index 
dfs = {}
for f in targetfiles: 
    filename = f.split(".")[0]
    dfs[filename] = pd.read_csv(zf.open(f))
    dfs[filename]['Date'] = pd.to_datetime(dfs[filename]['Date'])
    dfs[filename].set_index(['Team', 'Date', 'OffenseDefense'], inplace = True)

In [25]:
# Concatenate the data together and drop duplicate column names 
master = pd.concat([v for k, v in dfs.items()], axis = 1)
master = master.loc[:, ~master.columns.duplicated()]

In [26]:
def fixTOP(row):
    """ Fix time of possession"""
    if isinstance(row, str) == True and ":" in row: 
        _all = row.split(":")
        _min = int(_all[0]) * 60 
        _sec = int(_all[1]) 
        total_seconds = _min + _sec
    else: 
        total_seconds = int(row)
    val = "{}".format(datetime.timedelta(seconds=total_seconds))
    return pd.Series([val])

In [27]:
def removeSlashes(row):
    """ Remove slashes from some of the variables"""
    global cols
    out = []
    for cell in cols[3:]:
        if cell == 'TOP': 
            out.append(row[cell])
        elif isinstance(row[cell], str) and '/' in row[cell]:
            tmp = row[cell].replace("/", "")
            out.append(float(tmp))
        else: 
            out.append(row[cell])
    return pd.Series(out)

In [28]:
# Fix Time of Possession
master['TOP'] = master.apply(lambda x: fixTOP(x['TOP']) if pd.isnull(x['TOP']) == False else x['TOP'], axis =1)
master['TOP'] = pd.to_timedelta(master['TOP'])

In [29]:
# Remove slashes and convert to numeric 
cols = list(master.columns)
TOP_index = list(cols).index('TOP')
cols.pop(TOP_index)

'TOP'

In [30]:
master[cols[3:]] = master.apply(lambda x: removeSlashes(x), axis =1)

In [31]:
master[cols[3:]] = master[cols[3:]].apply(pd.to_numeric)

In [32]:
#df.iloc[df.index.get_level_values('A') == 1]

master.iloc[(master.index.get_level_values('Team') == 'Air Force Falcons') & 
            (master.index.get_level_values('OffenseDefense') == 'Defense') & 
            (master.index.get_level_values('Date') == '2016-10-01 00:00:00' )
           ]['TOP']

Team               Date        OffenseDefense
Air Force Falcons  2016-10-01  Defense          00:28:00
Name: TOP, dtype: timedelta64[ns]

In [33]:
# Create new variables 
master[['opponent', 'home', 'WinLoss','Overtime', 'team_score', 'opponent_score']] = master.apply(lambda x: createvariables(x), axis = 1)

# Subset Offense Defense

In [34]:
defense = master[master.index.get_level_values('OffenseDefense') == 'Defense'].copy()

In [35]:
offense = master[master.index.get_level_values('OffenseDefense') == 'Offense'].copy()

In [36]:
assert len(defense) == len(offense),  "Datasets different sizes"

# Manipulate

In [37]:
offense.reset_index(inplace = True)

In [38]:
offense['year'] = offense['Date'].dt.year

In [39]:
base = pd.merge(offense,
               roster_final2, 
               left_on = ['Team', 'year'], 
               right_on = ['Team', 'year'],
               how = 'left'
              )

In [40]:
def teamhistory(team, year, duration): 
    global history
    team = team
    target_year = year
    out = []
    # Individual 
    for yr in duration: 
        target_year_min = target_year - yr
        # Wins/Losses
        wins, losses = list(history[(history.Team == team) & 
                                    (history.year <= target_year) & 
                                    (history.year >= target_year_min)][['Wins', 'Losses']].sum()
                           )
        out.extend([wins, losses, wins/(wins+losses)])
    # Max 
    wins, losses = list(history[(history.Team == team)][['Wins', 'Losses']].sum())
    out.extend([wins, losses, wins/(wins+losses)])
    return pd.Series(out)

In [41]:
# Add in team historical data 
yrs = [1, 2, 3, 5, 10]
variables = ['history_wins_{yr}yrs', 'history_losses_{yr}yrs', 'history_WL_{yr}yrs']
years = [v.format(yr = yr) for yr in yrs for v in variables ]
years.extend([v.format(yr = 'max') for v in variables])

base[years] = base.apply(lambda x: teamhistory(x['Team'], x['year'], yrs), axis = 1)

In [42]:
assert len(base) == len(offense), "Size change during merging"

In [43]:
coach_teams = coach_links.Team.unique()
for team in base.Team.unique(): 
    if team not in coach_teams: 
        print(team)

In [44]:
tmp = pd.merge(base, 
               coaches_list_fixed, 
               left_on = ['Team', 'year'], 
               right_on = ['Team', 'year'], 
               how = 'left'
              )

In [45]:
assert len(base) == len(tmp), "Dataset size changed"

# Fix the coach history to get the average of coaches

In [46]:
def coach_history(data, year): 
    global coaches
    t = []
    if isinstance(data, list) and len(data) >=1 :
        for coach in data: 
            coach_record = coaches[(coaches.coach == coach) & 
                                   (coaches.year < year)
                                  ][["coach","Wins","Losses","WL","year"]].groupby("coach").agg(['sum', 'count', 'mean'])
            coach_record.columns = [x[0] + "_" + x[1]  for x in coach_record.columns.values]
            coach_record.drop(['Wins_count', 'Wins_mean', 'Losses_count', 'Losses_mean', 'WL_sum', 'WL_count', 
                                     'year_sum', 'year_mean'], axis=1, inplace = True)
            coach_record.rename(columns = {'Wins_sum':"Coach_wins", 
                                           "Losses_sum":"Coach_losses", 
                                           "WL_mean":"Coach_WL", 
                                           "year_count":"Coach_years"
                                          },
                                inplace = True
                               )
            t.append(coach_record.to_dict(orient='record')[0])
    
        out = pd.Series(pd.DataFrame(t).mean().to_dict())
    else: 
        out = pd.Series({'Coach_wins':None, 
                         "Coach_losses":None, 
                         "Coach_WL":None, 
                         "Coach_years":None
                        })
    return out

In [47]:
coach_variables = ['Coach_wins', 'Coach_losses', 'Coach_WL', 'Coach_years']
tmp = tmp.copy()
tmp[coach_variables] = tmp.apply(lambda x: coach_history(x['coach'], x['year']), axis = 1)

In [48]:
tmp

Unnamed: 0,Team,Date,OffenseDefense,Opponent,Result,G,RushNetYards,ReceivingYards,IntRYds,PuntRetYds,...,history_losses_10yrs,history_WL_10yrs,history_wins_maxyrs,history_losses_maxyrs,history_WL_maxyrs,coach,Coach_wins,Coach_losses,Coach_WL,Coach_years
0,Air Force Falcons,2013-08-31,Offense,Colgate,W 38 - 13,1/,409.0,72.0,,14.0,...,67.0,0.507353,385.0,320.0,0.546099,,,,,
1,Air Force Falcons,2013-09-07,Offense,Utah St.,L 20 - 52,1/,162.0,108.0,,,...,67.0,0.507353,385.0,320.0,0.546099,,,,,
2,Air Force Falcons,2013-09-13,Offense,@ Boise St.,L 20 - 42,1/,188.0,99.0,17.0,,...,67.0,0.507353,385.0,320.0,0.546099,,,,,
3,Air Force Falcons,2013-09-21,Offense,Wyoming,L 23 - 56,1/,346.0,127.0,,-8.0,...,67.0,0.507353,385.0,320.0,0.546099,,,,,
4,Air Force Falcons,2013-09-28,Offense,@ Nevada,L 42 - 45,1/,375.0,78.0,21.0,,...,67.0,0.507353,385.0,320.0,0.546099,,,,,
5,Air Force Falcons,2013-10-05,Offense,@ Navy,L 10 - 28,1/,231.0,82.0,,1.0,...,67.0,0.507353,385.0,320.0,0.546099,,,,,
6,Air Force Falcons,2013-10-10,Offense,San Diego St.,L 20 - 27,1/,169.0,150.0,,,...,67.0,0.507353,385.0,320.0,0.546099,,,,,
7,Air Force Falcons,2013-10-26,Offense,Notre Dame,L 10 - 45,1/,290.0,49.0,,16.0,...,67.0,0.507353,385.0,320.0,0.546099,,,,,
8,Air Force Falcons,2013-11-02,Offense,Army West Point,W 42 - 28,1/,343.0,111.0,,,...,67.0,0.507353,385.0,320.0,0.546099,,,,,
9,Air Force Falcons,2013-11-08,Offense,@ New Mexico,L 37 - 45,1/,257.0,188.0,,,...,67.0,0.507353,385.0,320.0,0.546099,,,,,


# Edit Defense

In [48]:
defense.columns = ['def_' + x for x in defense.columns]
defense.reset_index(inplace = True)

In [49]:
defense['year'] = defense['Date'].dt.year

# Merge offense defense

In [50]:
assert len(tmp) == len(defense), "Dataset file difference"

In [51]:
final = pd.merge(tmp, 
                 defense, 
                 left_on = ['Team', 'Date'],
                 right_on = ['Team', 'Date'],
                 how = 'left'
                )

In [52]:
final['count'] = final.groupby(['Team', 'year_x']).cumcount()+1

# Game by game participation - Starting point


In [53]:
gamebygame_part = pd.read_csv(zf.open('gamebygame_participation.csv'))

In [54]:
gamebygame_part['Date'] = pd.to_datetime(gamebygame_part['Date'])
gamebygame_part['year'] = gamebygame_part['Date'].dt.year
gamebygame_part = gamebygame_part[gamebygame_part.OffenseDefense == 'Offense']

In [55]:
gamebygame_part['count'] = gamebygame_part.groupby(['Team', 'year']).cumcount() +1

# Previous Years

In [56]:
def previous_yrs(team, year, game, cols, debug = False): 
    global final
    cols = ['Team', 'year_y', 'count'] + cols
    out = []
    if game != 1: 
        y = final[cols][(final['Team'] == team) & 
                        (final['year_y'] == year) & 
                        (final['count'] <= game)
                       ]

        out.append(y)
    if game <= 3:
        x = final[cols][(final['Team'] == team) & 
                        (final['year_y'] == year - 1)
                       ]
        
        out.append(x)

    if debug == False: 
        mean = pd.concat(out).mean().to_frame().T.to_dict(orient='record')[0]
        mean2 = [mean[var]  for var in cols[3:]]
        return pd.Series(mean2)
    elif debug == True: 
        mean = pd.concat(out)
        return mean


In [82]:
stats = list(final.columns)

In [183]:
for stat in stats: 
    print(stat)

Team
Date
OffenseDefense_x
Opponent
Result
G
RushNetYards
ReceivingYards
IntRYds
PuntRetYds
FRetYds
KORetYds
APY
All-PurposeYardsG
FumblesRecovered
Int
Blkd
FF
FRet
FRetTD
FGM
FGA
FGBlocksAllowed
LongFGM
KO
KOYds
KOTB
KORet
KickRetTDs
PBU
IntRetTDs
PDef
PassAttempts
Completions
Interceptions
PassYards
PassTDs
PassEff
CompletionsPerGame
YdsPerCompletion
Pct
PassYardsG
Punts
PuntYds
PuntAvg
PuntTBs
LongPunt
PuntRet
PuntRetTDs
Rec
ReceptionsPerGame
YardsPerReception
RecTD
RecYdsG
RZScores
RZPts
RZRushTD
RZPassTD
RZFGMade
RZEndFGA
RZEndFumble
RZEndINT
RZEndDowns
RZEndHalf
RZEndGame
SackUA
SackA
SackYds
Sacks
TDs
KickPAT
PATAtt
RushPAT
Ru2PTAtt
ReceivingPAT
Pass2PtConv
Pass2PTAtt
KickReturnPAT
FumbRetPAT
FG
Safeties
Points
SoloTack
AsstTack
STFL
ATFL
TackleYds
Tackles
TOP
Plays
TotOff
YdsPlay
RushingFirstDowns
PassingFirstDowns
FirstDownsbyPenalty
Penalties
PenaltiesPerGame
PenaltyYards
PenaltyYdsPerGame
TotalOffYardsG
FumblesLost
opponent
home
WinLoss
Overtime
team_score
opponent_score
yea

In [278]:
cols2 = ["Coach_WL", "Coach_years", "FG",
         "RushNetYards","PassYards","Interceptions","PassAttempts","Punts","Points","FumblesLost",
         "Penalties","Plays","def_RushNetYards","def_PassYards","def_Interceptions","def_Punts", 
         "def_FF","def_PenaltyYards","def_PDef","Sacks","Tackles","def_Plays",
        ]

In [279]:
# Filter out game 1 of season 2013

gamebygame_part[cols2] = gamebygame_part[~((gamebygame_part.year == 2013) & (gamebygame_part['count'] == 1))].apply(lambda x: previous_yrs(team = x['Team'], 
                                                     year = x['year'], 
                                                     game = x['count'], 
                                                     cols = cols2, 
                                                     debug = False
                                                     ), axis = 1
                                                   )

# Read in team names

In [280]:
teamnames = pd.read_csv(zf.open('gamebygame_teamnames.csv'))

In [281]:
regex = '[\w\s]+\.?\s\@\s\w+'

def opponent(data, opponentNameVar):
    # Opponent 
    if '@' in data[opponentNameVar]: 
        search = re.search(regex, data[opponentNameVar]) 
        if search: 
            opponent = data[opponentNameVar].split("@")[0].strip()
        else: 
            opponent = data[opponentNameVar].replace("@","").strip()
    else: 
        opponent = data[opponentNameVar]
    return opponent.strip()


In [282]:
teamnames['shortName'] = teamnames.apply(lambda x: opponent(x, 'shortName'), axis=1)
teamnames.drop_duplicates(inplace = True)
teamnames = {x["shortName"]:x["longName"] for x in teamnames.to_dict(orient='record')}

# Convert opponent names

In [283]:
gamebygame_part['Opponent2'] = gamebygame_part.apply(lambda x: opponent(x, 'Opponent'), axis = 1)

In [284]:
gamebygame_part['opponentName'] = gamebygame_part.apply(lambda x:teamnames[x['Opponent2']]
                                                                 if x['Opponent2'] in teamnames.keys()
                                                                 else 'Missing'
                                                                 , 
                                                        axis = 1 )

# Merge in the data for the opponents

In [285]:
gamedata = list(final['Team'].unique())

In [286]:
def opp(team, date, year, cols, debug = False):
    """ Calculate the game-by-game stats for the opponents"""
    game = final[(final['Team'] == team) &  (final['Date'] == date) ]['count'].values[0]
    return previous_yrs(team = team, 
                        year = year, 
                        game = game, 
                        cols = cols2, 
                        debug = False
                       )

In [287]:
# Create the oppoent variable names 
opps = ['opp_' + x for x in cols2]

# Subset the datset to games to keep it short
games = gamebygame_part[~((gamebygame_part.year == 2013) & (gamebygame_part['count'] == 1))].copy()

# Calculate the opponent data 
games[opps] = games.apply(lambda x: opp(team = x['Team'],
                                        date = x['Date'], 
                                        year = x['year'], 
                                        cols = cols2,
                                        debug = False
                                       )
                          if x['opponentName'] in gamedata
                          else pd.Series([None for x in range(0, len(cols2))]),
                          axis = 1
                         )

In [288]:
# Select all the games where they don't have missing data for rush data for home team and opponent team
finalgames = games[(pd.notnull(games['opp_def_RushNetYards'])) & (pd.notnull(games['RushNetYards']))]

In [289]:
def create_variables(data):
    """Create Opponent, Home, Win/Loss, Overtime, and Scores
    
    """
    regexs = {'WL': "[WL]",
              'team': ["\d+\s\-", "\d+"],
              'opponent_score':["\-\s\d+", '\d+'],
              'OT':["\(\d+OT\)", "\d+"], 
              'opponent':['[\w\s]+\.?\s\@\s\w+']
             }
    
    # Opponent 
    if '@' in data['Opponent']: 
        search = re.search(regexs['opponent'][0], data['Opponent']) 
        if search: 
            home = 0
        else: 
            home = 0
    else: 
        home = 1
    
    
    # Win/Loss
    WinLoss = re.search(regexs['WL'], data['Result']).group()
    if WinLoss == 'W': 
        WinLoss = 1
    else: 
        WinLoss = 0
        
    return pd.Series([home, WinLoss])    

In [290]:
finalgames[['HomeAway', 'WinLoss']] = finalgames.apply(lambda x: create_variables(x), axis = 1).copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [291]:
# Variables to drop out for analysis
drop_variables = ['Date', 'Opponent', 'Result', 'G', 'Team', 'OffenseDefense', 
                  'year', 'Opponent2', 'opponentName', 'WinLoss']

In [292]:
y = finalgames['WinLoss'].values
X = finalgames.drop(drop_variables, axis = 1).values

# Run Models

In [293]:
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import  cross_val_score

In [294]:
# Create and fit an imputer for missing data 
imr = Imputer(missing_values = 'NaN', strategy='mean', axis = 0)
imr = imr.fit(X)

In [295]:
X_imp = imr.fit_transform(X)

In [296]:
# Create the train test split 
X_train, X_test, y_train, y_test = train_test_split(X_imp, y, test_size=0.33, random_state=42)

In [297]:
# Create and fit the DT Classifier 
dt = RandomForestClassifier()
dt.fit(X= X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [298]:
print("Accuracy:\t", (y_train == dt.predict(X_train)).mean())
print("Accuracy:\t", (y_test == dt.predict(X_test)).mean())

Accuracy:	 0.978598405371
Accuracy:	 0.634893617021


In [299]:
tree_parameters = {'criterion':['gini','entropy'],
             'max_depth':[1,2,3,4,5,6,7,8,9,10],
             'n_estimators':[10,20,30,40,50,75,80,85,90, 100], 
             'n_jobs': [-1]
            }
clf = GridSearchCV(RandomForestClassifier(), tree_parameters, cv=5)
clf.fit(X_imp, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'criterion': ['gini', 'entropy'], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'n_estimators': [10, 20, 30, 40, 50, 75, 80, 85, 90, 100], 'n_jobs': [-1]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [300]:
tree_model = clf.best_estimator_
print (clf.best_score_, clf.best_params_) 

0.6742551995503092 {'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 80, 'n_jobs': -1}
