<a href="https://colab.research.google.com/github/gdollp/kagglebook/blob/master/submit2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook uses lightGBM to make predictions.

We use the following features
* playerId
* position
* teamId(rosters)
* status(rosters)
* playerBoxScores

and the dates 20210401~20210430 as the validation data.

But I think there is room for improvement.  
If you have better ways, I would appreciate it if you could comment on it.

このnotebookではlightGBMを使って予測します。

特徴量は以下のものを使用しています。
* playerId
* position
* teamId(rosters)
* status(rosters)
* playerBoxScores

20210401~20210430の日付をvalidation dataとしていますが、一考の余地がありそうです。  
もし良さそうな方法があればコメントしていただけると幸いです。

https://www.kaggle.com/columbia2131/mlb-lightgbm-starter-dataset-code-en-ja

## About Dataset

Each column of train.csv is unpacked and stored as its own csv file, as follows.  

train.csvを以下のようにして各カラムをcsvファイルとして保管しています。

In [28]:
%%capture
# Disabled one-off preprocessing, kept as a string literal so nothing below is executed.
# For every column of train.csv except 'date' it evaluates the stringified records,
# expands them into one DataFrame per column and writes `{col}_train.csv` /
# `{col}_train.pkl`. The training cells read those `*_train.pkl` files.
# NOTE(review): the disabled code calls eval() on file contents; only use it on trusted data.
"""
!pip install pandarallel 

import gc

import numpy as np
import pandas as pd
from pathlib import Path

from pandarallel import pandarallel
pandarallel.initialize()

BASE_DIR = Path('../input/mlb-player-digital-engagement-forecasting')
train = pd.read_csv(BASE_DIR / 'train.csv')

null = np.nan
true = True
false = False

for col in train.columns:

    if col == 'date': continue

    _index = train[col].notnull()
    train.loc[_index, col] = train.loc[_index, col].parallel_apply(lambda x: eval(x))

    outputs = []
    for index, date, record in train.loc[_index, ['date', col]].itertuples():
        _df = pd.DataFrame(record)
        _df['index'] = index
        _df['date'] = date
        outputs.append(_df)

    outputs = pd.concat(outputs).reset_index(drop=True)

    outputs.to_csv(f'{col}_train.csv', index=False)
    outputs.to_pickle(f'{col}_train.pkl')

    del outputs
    del train[col]
    gc.collect()
"""

## Training

In [29]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_absolute_error
from datetime import timedelta
from functools import reduce
from tqdm import tqdm
#import optuna.integration.lightgbm as lgbm
import lightgbm as lgbm
#import mlb
import pickle
from datetime import datetime as dt
import copy
import gc
from sklearn.decomposition import PCA
import math

In [30]:
# Mount Google Drive (Colab only) so the /content/drive/MyDrive/... paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
# Data roots on the mounted Drive: the csv lookup tables are read from BASE_DIR,
# the per-column `*_train.pkl` tables from its `archive` sub-folder.
BASE_DIR = Path('/content/drive/MyDrive/mlb/input')
TRAIN_DIR = BASE_DIR / 'archive'

In [32]:
# Static lookup table shipped as csv.
players = pd.read_csv(BASE_DIR / 'players.csv')

# Per-column tables unpacked from train.csv (one pickle per original column).
rosters = pd.read_pickle(TRAIN_DIR / 'rosters_train.pkl')
targets = pd.read_pickle(TRAIN_DIR / 'nextDayPlayerEngagement_train.pkl')
scores1 = pd.read_pickle(TRAIN_DIR / 'playerBoxScores_train.pkl')
# Collapse the box scores to one row per player and day.
# NOTE(review): .sum() also adds up identifier columns such as gamePk when a player
# has several box-score rows on the same date; confirm the later merge on gamePk
# still matches for those rows.
scores = scores1.groupby(['playerId', 'date']).sum().reset_index()
# Same file as before, now addressed through TRAIN_DIR instead of a second absolute path.
twitter = pd.read_pickle(TRAIN_DIR / 'playerTwitterFollowers_train.pkl')
games = pd.read_pickle(TRAIN_DIR / 'games_train.pkl')
events = pd.read_pickle(TRAIN_DIR / 'events_train.pkl')
standings = pd.read_pickle(TRAIN_DIR / 'standings_train.pkl')
teamtwitter = pd.read_pickle(TRAIN_DIR / 'teamTwitterFollowers_train.pkl')
# Fixed copy-paste bug: this used to load teamTwitterFollowers_train.pkl a second time.
# Assumes the unpacking step wrote the `transactions` column to transactions_train.pkl;
# verify the file name in the archive folder.
transaction = pd.read_pickle(TRAIN_DIR / 'transactions_train.pkl')
awards = pd.read_csv(BASE_DIR / 'awards.csv')
seasons = pd.read_csv(BASE_DIR / 'seasons.csv')
teams = pd.read_csv(BASE_DIR / 'teams.csv')
player_target_stats = pd.read_csv(BASE_DIR / 'player_target_stats.csv')
from sklearn.model_selection import KFold

In [33]:
# Per-player count of non-null entries in every awards column, with playerId restored as a column.
awards2 = awards.groupby("playerId").count().reset_index()

In [34]:
# Copy the follower count under a team-specific name so it cannot clash with the
# player-level `numberOfFollowers` column.
teamtwitter["teamnumberOfFollowers"] = teamtwitter["numberOfFollowers"]
# Average follower count per team. The column is selected before aggregating so that
# mean() never touches other (possibly non-numeric) columns, which raises on pandas >= 2.0.
teamtwi = teamtwitter.groupby("teamId")["teamnumberOfFollowers"].mean().reset_index()
teamtwi

Unnamed: 0,teamId,teamnumberOfFollowers
0,108,1081524.075
1,109,582019.5
2,110,785074.3
3,111,2086901.925
4,112,2535440.475
5,113,845282.3
6,114,1043174.525
7,115,561606.025
8,116,1446103.775
9,117,1480179.525


In [35]:
# Re-load the games table; the commented-out lines below are exploratory checks of postponed games.
games = pd.read_pickle(TRAIN_DIR / 'games_train.pkl')
#games.index[games["detailedGameState"] == "Postponed"].shape
#games.drop(games.loc[games['detailedGameState']=='Postponed'].index, inplace=True)
#games["detailedGameState"].unique()
#games[games.duplicated(subset=["gamePk"], keep=False)]
# Scores are not recorded for rows where games['detailedGameState'] == 'Postponed'.

In [36]:
# Strip the dashes so engagementMetricsDate has the same YYYYMMDD layout as `date`.
targets["engagementMetricsDate"] = targets["engagementMetricsDate"].str.replace('-', '')
# Re-key the targets by their engagementMetricsDate and rename them to yest_target1..4.
yesterday_targets = targets.drop(columns='date').rename(
    columns={
        'engagementMetricsDate': 'date',
        **{f'target{k}': f'yest_target{k}' for k in range(1, 5)},
    }
)
yesterday_targets["date"] = yesterday_targets["date"].astype(int)
yesterday_targets["date"]

0          20180102
1          20180102
2          20180102
3          20180102
4          20180102
             ...   
2506171    20210501
2506172    20210501
2506173    20210501
2506174    20210501
2506175    20210501
Name: date, Length: 2506176, dtype: int64

In [37]:

# Building blocks shared by the four per-target feature lists below.
_LABEL_COLS = ['label_playerId', 'label_primaryPositionName', 'label_teamId', 'label_status']
# target{1..4}_{mean,median,std,min,max,prob}, grouped by target.
_TARGET_STAT_COLS = [
    f'target{target_no}_{stat}'
    for target_no in (1, 2, 3, 4)
    for stat in ('mean', 'median', 'std', 'min', 'max', 'prob')
]

# Inputs of the target1 model.
feature_cols = _LABEL_COLS + [
    'battingOrder', 'gamesPlayedBatting', 'flyOuts', 'groundOuts', 'runsScored', 'homeRuns',
    'strikeOuts', 'baseOnBalls', 'hits', 'hitByPitch', 'atBats', 'stolenBases',
    'plateAppearances', 'totalBases', 'rbi', 'leftOnBase', 'sacFlies',
    'gamesPlayedPitching', 'gamesStartedPitching', 'completeGamesPitching', 'winsPitching',
    'lossesPitching', 'flyOutsPitching', 'airOutsPitching', 'groundOutsPitching',
    'runsPitching', 'doublesPitching', 'triplesPitching', 'homeRunsPitching',
    'strikeOutsPitching', 'baseOnBallsPitching', 'hitsPitching', 'hitByPitchPitching',
    'atBatsPitching', 'caughtStealingPitching', 'stolenBasesPitching', 'inningsPitched',
    'saveOpportunities', 'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown',
    'balls', 'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
    'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners', 'inheritedRunnersScored',
    'sacFliesPitching', 'saves', 'holds', 'assists', 'putOuts', 'errors', 'chances',
] + _TARGET_STAT_COLS + ["divisionId", "teamnumberOfFollowers"]

# Inputs of the target2 model.
# NOTE(review): the raw `target1` column is an input here (and target1/target2/target3
# in the lists below). Those are same-day labels, so at prediction time they have to
# come from the earlier models; confirm the inference code does that.
feature_cols2 = _LABEL_COLS + [
    'battingOrder', 'gamesPlayedBatting', 'flyOuts', 'groundOuts', 'runsScored',
    'baseOnBalls', 'hits',
    'plateAppearances', 'totalBases', 'rbi',
    'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
    'winsPitching', 'lossesPitching', 'runsPitching', 'strikeOutsPitching',
    'hitByPitchPitching', 'caughtStealingPitching',
    'stolenBasesPitching', 'inningsPitched',
    'battersFaced', 'balks', 'pickoffsPitching', 'inheritedRunners',
    'putOuts', 'chances',
] + _TARGET_STAT_COLS + ['target1', "divisionId", "teamnumberOfFollowers"]

# Inputs of the target3 model.
feature_cols3 = _LABEL_COLS + [
    'homeRuns', 'totalBases', 'rbi',
    'gamesStartedPitching', 'lossesPitching',
    'inningsPitched', 'battersFaced', 'pitchesThrown',
] + _TARGET_STAT_COLS + ['target1', "divisionId", "target2", "teamnumberOfFollowers"]

# Inputs of the target4 model.
feature_cols4 = _LABEL_COLS + [
    'battingOrder', 'gamesPlayedBatting', 'flyOuts',
    'runsScored', 'triples', 'groundIntoDoublePlay',
    'strikeOuts', 'baseOnBalls', 'hits',
    'atBats', 'caughtStealing',
    'plateAppearances', 'totalBases', 'rbi',
    'leftOnBase', 'sacBunts', 'sacFlies',
    'gamesPlayedPitching', 'winsPitching', 'airOutsPitching',
    'runsPitching', 'doublesPitching',
    'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
    'intentionalWalksPitching', 'hitsPitching',
    'hitByPitchPitching', 'atBatsPitching',
    'inningsPitched', 'saveOpportunities',
    'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
    'strikes', 'wildPitches',
    'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
    'inheritedRunnersScored',
    'sacBuntsPitching', 'holds', 'blownSaves',
    'assists', 'putOuts', 'errors', 'chances',
] + _TARGET_STAT_COLS + ['target1', "divisionId", "target2", "target3", "teamnumberOfFollowers"]

In [38]:
# Column subsets selected from each source table (most of them when `train` is assembled).
targets_cols = ['playerId', 'target1', 'target2', 'target3', 'target4','hasTwitterAccount','date']
# Columns of the re-keyed `yesterday_targets` table.
yesterday_targets_cols = ["date","playerId","yest_target1","yest_target2","yest_target3","yest_target4"]
players_cols = ['playerId', 'primaryPositionName',"birthCountry","primaryPositionCode"]
rosters_cols = ['playerId', 'teamId', 'status', 'date']
# Box-score statistics plus the join keys ('playerId', 'date') and 'gamePk'.
scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances', 'date',"gamePk"]
games_cols = ["gamePk","homeId","dayNight","seriesDescription","gamesInSeries","homeWinner","awayWinner","homeScore","awayScore","gameType"]
# Player follower counts are joined on the YYYYMM key `year_months`.
playertwitter_cols = ["playerId","numberOfFollowers","year_months"]
awards_cols = ["playerId","awardName"]
standings_cols = ["date","teamId","divisionRank","divisionLeader","wildCardLeader","leagueRank","divisionId","gameDate"]
teamtwitter_cols = ["teamId","teamnumberOfFollowers"]


In [39]:
# Build an integer key from the first six characters of the date string;
# player follower counts are later joined on (playerId, year_months).
twitter["strdate"] = twitter["date"].astype(str)
twitter["year_months"] = twitter["strdate"].str.slice(0, 6).astype(int)


In [40]:
# Flag players that appear at least once in the player-follower table.
targets['hasTwitterAccount'] = targets['playerId'].isin(twitter['playerId'])

In [41]:
# Assemble the modelling table: one left join per source, keyed as noted on each line.
# NOTE(review): the recorded outputs show more rows after these joins (2,508,784) than
# in `targets` (2,506,176), so at least one join matches several rows per key; confirm which.
train = targets[targets_cols].merge(players[players_cols], on=['playerId'], how='left')
train = train.merge(rosters[rosters_cols], on=['playerId', 'date'], how='left')
train = train.merge(scores[scores_cols], on=['playerId', 'date'], how='left')
train = train.merge(games[games_cols], on=["gamePk"], how="left")
train = train.merge(standings[standings_cols], on=['date',"teamId"], how='left')
train = train.merge(awards2[awards_cols], on=['playerId'], how='left')
# Inner join: players without a row in player_target_stats are dropped.
train = train.merge(player_target_stats, how='inner', left_on=["playerId"],right_on=["playerId"])
train["strdate"] = train["date"].astype(str)
train["year_months"] = train["strdate"].str[0:6].astype(int)
train = train.merge(twitter[playertwitter_cols], on=['playerId',"year_months"], how='left')
# Every feature list contains "teamnumberOfFollowers", but none of the joins above is
# visibly its source. Attach the per-team average unless a previous join (for example
# player_target_stats) already supplied the column. teamtwi has one row per teamId,
# so this join cannot duplicate rows.
if "teamnumberOfFollowers" not in train.columns:
    train = train.merge(teamtwi[teamtwitter_cols], on="teamId", how="left")




In [42]:
# Repeats the flag assignment made before `train` was built; `train` itself is not touched here.
targets['hasTwitterAccount'] = targets['playerId'].isin(twitter['playerId'])

In [43]:
#make feature
# 1 on the three listed dates, 0 otherwise.
# Fixed: the third comparison was missing its parentheses, so Python evaluated
# `(A | B | train["date"]) == 20190709` instead of `A | B | (train["date"] == 20190709)`.
_allstar_dates = [20170711, 20180717, 20190709]
train["allstar"] = np.where(train["date"].isin(_allstar_dates), 1, 0)

# 1 when the date lies strictly inside one of the yearly (start, end) windows, 0 otherwise.
_season_windows = [
    (20170401, 20171002),
    (20180328, 20181002),
    (20190319, 20190930),
    (20200722, 20200928),
    (20210330, 20211004),
]
_in_season = np.zeros(len(train), dtype=bool)
for _start, _end in _season_windows:
    _in_season |= ((train["date"] > _start) & (train["date"] < _end)).to_numpy()
train["regularseason"] = np.where(_in_season, 1, 0)


In [44]:
# Summary statistics of the target-stat features, the raw targets 1-3 and divisionId.
_describe_cols = [
    f'target{t}_{stat}'
    for t in range(1, 5)
    for stat in ('mean', 'median', 'std', 'min', 'max', 'prob')
] + ['target1', "divisionId", "target2", "target3"]
train[_describe_cols].describe()

Unnamed: 0,target1_mean,target1_median,target1_std,target1_min,target1_max,target1_prob,target2_mean,target2_median,target2_std,target2_min,target2_max,target2_prob,target3_mean,target3_median,target3_std,target3_min,target3_max,target3_prob,target4_mean,target4_median,target4_std,target4_min,target4_max,target4_prob,target1,divisionId,target2,target3
count,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,2508784.0,664141.0,2508784.0,2508784.0
mean,0.8076858,0.2086643,1.491682,0.005020743,6.708475,1.127729,2.09907,1.240186,2.463639,0.308588,11.21529,2.467225,0.5694316,0.07745038,1.400443,0.007896621,6.701864,0.3599559,1.637258,1.148316,1.656985,0.2317022,7.629292,1.803057,0.570536,202.500016,2.459199,0.6899692
std,2.345066,1.035032,3.735682,0.02681383,16.73603,5.960057,3.812537,2.774199,4.353143,0.7452425,19.39111,7.834035,1.812547,0.5275353,3.645426,0.0288441,16.75997,2.807083,3.375947,2.673816,3.324605,0.5438217,14.90432,5.750086,4.17929,1.708287,6.237921,5.073026
min,0.0,0.0,0.0,0.0,0.0,0.0,0.002006257,0.0,0.002742816,0.0,0.008734387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,200.0,0.0,0.0
25%,0.0001469648,0.0,0.0002039305,0.0,0.0008597541,0.0,0.0430423,0.03550086,0.03021539,0.007722604,0.1509848,0.03744477,0.0005350206,0.0,0.000866101,0.0,0.003466685,0.0,0.1682062,0.124533,0.1259744,0.02410859,0.5491057,0.09594096,0.0,201.0,0.07727312,0.0
50%,0.002298143,0.0007948978,0.002027149,0.0,0.01107237,0.0008083251,0.3892553,0.1959924,0.3804404,0.06435571,2.036706,0.1908192,0.006800752,0.003120125,0.006719478,0.0,0.03280409,0.003005112,0.5513352,0.354832,0.501786,0.06982174,2.263134,0.3502988,0.001070771,203.0,0.5575155,0.001698937
75%,0.3930881,0.01362733,0.9672931,0.001668067,4.532737,0.0168828,2.718152,1.341537,2.81632,0.3178461,13.01789,1.808318,0.3037478,0.02156877,0.9168443,0.003192542,4.347933,0.02131486,1.66267,1.049403,1.692484,0.2072833,7.636422,1.215743,0.01833517,204.0,2.239762,0.02089282
max,28.61945,22.51564,33.98812,0.7040277,100.0,100.0,49.69193,44.31772,32.88215,15.50033,100.0,100.0,32.18307,15.90317,35.33198,0.6190328,100.0,63.04345,39.33404,38.42898,34.1018,8.647749,100.0,100.0,100.0,205.0,100.0,100.0


In [45]:
# Calendar year = first four characters of the date string.
train["stryear"] = train["date"].astype(str)
train["year"] = train["stryear"].str.slice(0, 4).astype(int)

In [46]:
#make feature
train["strdate"] = train["date"].astype(str)
# Characters 4-5 of YYYYMMDD hold the month. The previous `.str[5]` kept only the last
# digit, which mapped October/November/December to 0/1/2.
train["month"] = train["strdate"].str[4:6].astype(int)
train["stryear"] = train["date"].astype(str)
train["year"] = train["stryear"].str[0:4].astype(int)
# Number of missing values in each row, counted over all columns present at this point.
train["nullc"] = train.isnull().sum(axis=1)

In [47]:
# Drop the references to source tables that are already merged into `train`.
del awards, scores, rosters, standings

In [48]:
# label encoding
def _first_seen_codes(column):
    """Map every distinct value of `column` to an integer, numbered in order of first appearance."""
    return {value: code for code, value in enumerate(column.unique())}


# One lookup dict per categorical column (kept as module-level names).
player2num = _first_seen_codes(train['playerId'])
position2num = _first_seen_codes(train['primaryPositionName'])
teamid2num = _first_seen_codes(train['teamId'])
status2num = _first_seen_codes(train['status'])
daynight2num = _first_seen_codes(train['dayNight'])
seriesDescription2num = _first_seen_codes(train['seriesDescription'])
# Built but not applied to any column in this cell.
gameType2num = _first_seen_codes(train['gameType'])
bitrhCountry2num = _first_seen_codes(train["birthCountry"])

# Encoded copies are stored under a `label_` prefix ...
for _target_col, _source_col, _codes in (
    ('label_playerId', 'playerId', player2num),
    ('label_primaryPositionName', 'primaryPositionName', position2num),
    ('label_teamId', 'teamId', teamid2num),
    ('label_status', 'status', status2num),
    ("label_daynight", 'dayNight', daynight2num),
    ("label_seriesDescription", "seriesDescription", seriesDescription2num),
    # ... except birthCountry, which is overwritten in place.
    ("birthCountry", "birthCountry", bitrhCountry2num),
):
    train[_target_col] = train[_source_col].map(_codes)

In [49]:
#train.to_csv("/content/drive/MyDrive/mlb/script/output/train.csv")

In [50]:
# One feature matrix per target model.
train_X1, train_X2, train_X3, train_X4 = (
    train[cols] for cols in (feature_cols, feature_cols2, feature_cols3, feature_cols4)
)

_target_names = ['target1', 'target2', 'target3', 'target4']
train_y = train[_target_names]
train_y2 = train[_target_names]

# Rows dated before 20210401 are used for fitting, all later rows for validation.
_index = (train['date'] < 20210401)
gc.collect()


def _fit_and_valid_parts(frame):
    """Return (rows where _index is True, rows where it is False), each with a fresh 0..n-1 index."""
    fit_part = frame.loc[_index].reset_index(drop=True)
    valid_part = frame.loc[~_index].reset_index(drop=True)
    return fit_part, valid_part


x_train1, x_valid1 = _fit_and_valid_parts(train_X1)
y_train, y_valid = _fit_and_valid_parts(train_y)
y_train2, y_valid2 = _fit_and_valid_parts(train_y2)
x_train2, x_valid2 = _fit_and_valid_parts(train_X2)
x_train3, x_valid3 = _fit_and_valid_parts(train_X3)
x_train4, x_valid4 = _fit_and_valid_parts(train_X4)

del train_X1, train_X2, train_X3, train_X4
gc.collect()

0

In [51]:
def fit_lgbm(x_train, y_train, x_valid, y_valid, params: dict=None, verbose=100):
    """Train one LightGBM model with early stopping and score it on the validation set.

    Parameters
    ----------
    x_train, y_train : training features and target.
    x_valid, y_valid : validation features and target; also used for early stopping.
    params : LightGBM parameter dict. ``None`` is treated as an empty dict. The dict is
        copied before training so the caller's object is never modified.
    verbose : evaluation-log period in boosting rounds (default 100, the value that was
        previously hard-coded).

    Returns
    -------
    (oof_pred, model, score) : validation predictions, the trained booster and the
        mean absolute error of the predictions on ``y_valid``.
    """
    train_params = dict(params) if params is not None else {}

    lgb_train = lgbm.Dataset(x_train, y_train)
    lgb_eval = lgbm.Dataset(x_valid, y_valid, reference=lgb_train)
    # NOTE: `early_stopping_rounds` / `verbose_eval` are keyword arguments of older
    # LightGBM releases; LightGBM 4.0 replaced them with callbacks. Port this call
    # when upgrading the library.
    model = lgbm.train(
        train_params,
        lgb_train,
        valid_sets=lgb_eval,
        early_stopping_rounds=100,
        num_boost_round=500,
        verbose_eval=verbose,
    )
    oof_pred = model.predict(x_valid)
    # Conventional (y_true, y_pred) order; MAE is symmetric, so the value is unchanged.
    score = mean_absolute_error(y_valid, oof_pred)
    print('mae:', score)
    return oof_pred, model, score



# training lightgbm
# One parameter set per target model.
# NOTE(review): every set carries `n_estimators` while fit_lgbm also passes
# num_boost_round=500; confirm which of the two LightGBM honours.

params1 = {'objective':'mae','reg_alpha': 0.14947461820098767, 'reg_lambda': 0.10185644384043743, 'n_estimators': 3633, 'learning_rate': 0.08046301304430488, 'num_leaves': 674, 'feature_fraction': 0.9101240539122566, 'bagging_fraction': 0.9884451442950513, 'bagging_freq': 8, 'min_child_samples': 51}

params2 = {
 'objective':'mae',
 'reg_alpha': 0.1,
 'reg_lambda': 0.1, 
 'n_estimators': 5000,
 'learning_rate': 0.1,
 'random_state': 42,
 "num_leaves": 22
}

params4 = {'objective':'mae','reg_alpha': 0.016468100279441976, 'reg_lambda': 0.09128335764019105, 'n_estimators': 9868, 'learning_rate': 0.10528150510326864, 'num_leaves': 157, 'feature_fraction': 0.5419185713426886, 'bagging_fraction': 0.2637405128936662, 'bagging_freq': 19, 'min_child_samples': 71}


# Used for the target3 model.
params = {
 'objective':'mae',
 'reg_alpha': 0.1,
 'reg_lambda': 0.1, 
 'n_estimators': 10000,
 'learning_rate': 0.1,
 'random_state': 42,
 "num_leaves": 100
}


# Each model is pickled right after training. The files are written inside `with`
# blocks so the handles are closed (the bare open() calls used before never were).
oof1, model1, score1 = fit_lgbm(
    x_train1, y_train['target1'],
    x_valid1, y_valid['target1'],
    params1
)
file = '/content/drive/MyDrive/mlb/script/output/model1.pkl'
with open(file, 'wb') as model_file:
    pickle.dump(model1, model_file)

oof2, model2, score2 = fit_lgbm(
    x_train2, y_train['target2'],
    x_valid2, y_valid['target2'],
    params2
)
file = '/content/drive/MyDrive/mlb/script/output/model2.pkl'
with open(file, 'wb') as model_file:
    pickle.dump(model2, model_file)

oof3, model3, score3 = fit_lgbm(
    x_train3, y_train['target3'],
    x_valid3, y_valid['target3'],
    params
)
file = '/content/drive/MyDrive/mlb/script/output/model3.pkl'
with open(file, 'wb') as model_file:
    pickle.dump(model3, model_file)

oof4, model4, score4 = fit_lgbm(
    x_train4, y_train2['target4'],
    x_valid4, y_valid2['target4'],
    params4
)
file = '/content/drive/MyDrive/mlb/script/output/model4.pkl'
with open(file, 'wb') as model_file:
    pickle.dump(model4, model_file)

# Overall score = unweighted mean of the four validation MAEs.
score = (score1+score2+score3+score4) / 4
print(f'score: {score}')




Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 0.675621
[200]	valid_0's l1: 0.674728
[300]	valid_0's l1: 0.674421
Early stopping, best iteration is:
[249]	valid_0's l1: 0.674412
mae: 0.6744124163175812




Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 1.33819
[200]	valid_0's l1: 1.33197
[300]	valid_0's l1: 1.3314
Early stopping, best iteration is:
[241]	valid_0's l1: 1.33069
mae: 1.3306902168046313




Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 0.520672
Early stopping, best iteration is:
[78]	valid_0's l1: 0.520648
mae: 0.5206477358519357




Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 1.0083
[200]	valid_0's l1: 1.00164
[300]	valid_0's l1: 1.00103
Early stopping, best iteration is:
[285]	valid_0's l1: 1.00078
mae: 1.0007753433712772
score: 0.8816314280863563


In [52]:
# Disabled alternative: load the four models from pickles instead of training them.
# Kept as a string literal, so nothing here is executed.
# NOTE(review): pickle.load runs arbitrary code; only load model files you trust.
"""
model1 = pickle.load(open('../input/mymodels/model1.pkl', 'rb'))
model2 = pickle.load(open('../input/mymodels/model2.pkl', 'rb'))
model3 = pickle.load(open('../input/mymodels/model3.pkl', 'rb'))
model4 = pickle.load(open('../input/mymodels/model4.pkl', 'rb'))"""

"\nmodel1 = pickle.load(open('../input/mymodels/model1.pkl', 'rb'))\nmodel2 = pickle.load(open('../input/mymodels/model2.pkl', 'rb'))\nmodel3 = pickle.load(open('../input/mymodels/model3.pkl', 'rb'))\nmodel4 = pickle.load(open('../input/mymodels/model4.pkl', 'rb'))"

In [53]:
# Features whose importance in model1 is exactly zero.
pd.set_option('display.max_rows', 102)
importance = pd.DataFrame({'importance': model1.feature_importance()}, index=feature_cols)
importance.loc[importance['importance'] == 0]

Unnamed: 0,importance
caughtStealingPitching,0
stolenBasesPitching,0
hitBatsmen,0
wildPitches,0
pickoffsPitching,0
errors,0


In [54]:
# Features whose importance in model2 is exactly zero.
pd.set_option('display.max_rows', 102)
importance = pd.DataFrame({'importance': model2.feature_importance()}, index=feature_cols2)
importance.loc[importance['importance'] == 0]

Unnamed: 0,importance
pickoffs,0
gamesStartedPitching,0
hitByPitchPitching,0
stolenBasesPitching,0
battersFaced,0


In [55]:
# Features whose importance in model3 is exactly zero.
pd.set_option('display.max_rows', 102)
importance = pd.DataFrame({'importance': model3.feature_importance()}, index=feature_cols3)
importance.loc[importance['importance'] == 0]

Unnamed: 0,importance
homeRuns,0


In [56]:
# Features whose importance in model4 is exactly zero.
pd.set_option('display.max_rows', 102)
importance = pd.DataFrame({'importance': model4.feature_importance()}, index=feature_cols4)
importance.loc[importance['importance'] == 0]

Unnamed: 0,importance
baseOnBalls,0
caughtStealing,0
leftOnBase,0
sacBunts,0
gamesPlayedPitching,0
winsPitching,0
doublesPitching,0
homeRunsPitching,0
saveOpportunities,0
wildPitches,0


In [57]:
# Column subsets re-declared without the 'date' / 'gamePk' keys used during training
# (presumably for the inference section; confirm against the prediction loop).
players_cols = ['playerId', 'primaryPositionName']
rosters_cols = ['playerId', 'teamId', 'status']
scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances']
standings_cols = ["teamId","divisionRank","divisionLeader","wildCardLeader","leagueRank","divisionId","gameDate"]
# JSON-style literal names bound to their Python equivalents
# (presumably so raw records can be eval()-ed; confirm where they are used).
null = np.nan
true = True
false = False

In [58]:
import pandas as pd
import numpy as np
from datetime import timedelta
from tqdm import tqdm
import gc
from functools import reduce
from sklearn.model_selection import StratifiedKFold

# Relative Kaggle-style input path. NOTE(review): not referenced again in the visible
# part of this cell, and it differs from the Drive paths used by the LightGBM cells.
ROOT_DIR = "../input/mlb-player-digital-engagement-forecasting"

#=======================#
def flatten(df, col):
    """Pivot `col` to wide form: one row per playerId, one `{col}_<EvalDate>` column per EvalDate value."""
    wide = df.pivot(index="playerId", columns="EvalDate", values=col)
    wide = wide.add_prefix(f"{col}_")
    # Remove the "EvalDate" label from the column axis before restoring playerId as a column.
    wide = wide.rename_axis(None, axis=1)
    return wide.reset_index()
#============================#
def reducer(left, right):
    """Join two frames on their shared `playerId` column (default inner join); used to fold a list of frames."""
    return pd.merge(left, right, on="playerId")
#========================

TGTCOLS = ["target1","target2","target3","target4"]
def train_lag(df, lag=1):
    """Append the targets observed `lag` days earlier as `<target>_<lag>` columns.

    The target columns are copied, their EvalDate is moved forward by `lag` days and
    the copy is left-joined back on (playerId, EvalDate). Rows without a matching
    earlier date get NaN in the new columns.
    """
    shifted = df[["playerId", "EvalDate", *TGTCOLS]].copy()
    shifted["EvalDate"] = shifted["EvalDate"] + timedelta(days=lag)
    return df.merge(
        shifted,
        on=["playerId", "EvalDate"],
        suffixes=["", f"_{lag}"],
        how="left",
    )
#=================================
def test_lag(sub):
    """Build the lagged-target features for the single date contained in `sub`.

    Adds a `playerId` column to `sub` in place (the integer after the first "_" of
    `date_playerId`), then reads the rows of the global `LAST` table whose EvalDate
    falls inside the LAGS window before that date and pivots each target to wide form.

    Returns (wide feature frame with one row per playerId, evaluation date as Timestamp).
    """
    sub["playerId"] = sub["date_playerId"].apply(lambda key: int(key.split("_")[1]))
    assert sub.date.nunique() == 1
    dte = sub["date"].unique()[0]

    eval_dt = pd.to_datetime(dte, format="%Y%m%d")
    # One past date per lag, in LAGS order, and the reverse lookup date -> lag.
    dtes = [eval_dt - timedelta(days=k) for k in LAGS]
    mp_dtes = dict(zip(dtes, LAGS))

    in_window = LAST.EvalDate.between(dtes[-1], dtes[0])
    sl = LAST.loc[in_window, ["EvalDate", "playerId"] + TGTCOLS].copy()
    # Replace each date by its lag number so the pivoted columns are named `<target>_<lag>`.
    sl["EvalDate"] = sl["EvalDate"].map(mp_dtes)
    du = reduce(reducer, [flatten(sl, col) for col in TGTCOLS])
    return du, eval_dt
    #
#===============

# NOTE(review): relative Kaggle-style path, unlike the Drive paths of the LightGBM cells;
# the recorded run of this cell ended with FileNotFoundError.
tr = pd.read_csv("../input/mlb-data/target.csv")
print(tr.shape)
gc.collect()

# Parse EvalDate and move it back by one day, then derive the calendar year.
tr["EvalDate"] = pd.to_datetime(tr["EvalDate"]) + timedelta(days=-1)
tr["EvalYear"] = tr["EvalDate"].dt.year

# Per-player, per-year median of each target.
MEDCOLS = ["tgt1_med","tgt2_med", "tgt3_med", "tgt4_med"]
MED_DF = tr.groupby(["playerId","EvalYear"])[TGTCOLS].median().reset_index()
MED_DF.columns = ["playerId","EvalYear"] + MEDCOLS

# Lags 1..20; feature columns are ordered from the largest lag to the smallest.
LAGS = list(range(1,21))
FECOLS = [f"{col}_{lag}" for lag in reversed(LAGS) for col in TGTCOLS]

for lag in tqdm(LAGS):
    tr = train_lag(tr, lag=lag)
    gc.collect()

tr = tr.sort_values(by=["playerId", "EvalDate"])
print(tr.shape)
# Keep only rows without any missing value.
tr = tr.dropna()
print(tr.shape)
tr = tr.merge(MED_DF, on=["playerId","EvalYear"])
gc.collect()

X = tr[FECOLS+MEDCOLS].to_numpy()
y = tr[TGTCOLS].to_numpy()
cl = tr["playerId"].to_numpy()

# Folds are stratified on playerId.
NFOLDS = 6
skf = StratifiedKFold(n_splits=NFOLDS)
folds = list(skf.split(X, cl))

import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

tf.random.set_seed(777)

def make_model(n_in):
    """Build and compile the network: `n_in` inputs -> Dense(50, relu) x 2 -> 4 linear outputs.

    Compiled with mean-absolute-error loss and the Adam optimizer.
    """
    inp = L.Input(name="inputs", shape=(n_in,))
    hidden = inp
    for layer_name in ("d1", "d2"):
        hidden = L.Dense(50, activation="relu", name=layer_name)(hidden)
    preds = L.Dense(4, activation="linear", name="preds")(hidden)

    model = M.Model(inp, preds, name="ANN")
    model.compile(loss="mean_absolute_error", optimizer="adam")
    return model

net = make_model(X.shape[1])
# summary() prints the table itself and returns None; wrapping it in print() added a stray "None".
net.summary()

# Out-of-fold predictions for every row, and one trained network per fold.
oof = np.zeros(y.shape)
nets = []
for idx in range(NFOLDS):
    print("FOLD:", idx)
    tr_idx, val_idx = folds[idx]
    # Keep the weights of the epoch with the lowest validation loss.
    ckpt = ModelCheckpoint(f"w{idx}.h5", monitor='val_loss', verbose=1, save_best_only=True,mode='min')
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,patience=3, min_lr=0.0005)
    es = EarlyStopping(monitor='val_loss', patience=6)
    reg = make_model(X.shape[1])
    reg.fit(X[tr_idx], y[tr_idx], epochs=10, batch_size=35_000, validation_data=(X[val_idx], y[val_idx]),
            verbose=1, callbacks=[ckpt, reduce_lr, es])
    # Restore the checkpointed weights before predicting the held-out fold.
    reg.load_weights(f"w{idx}.h5")
    oof[val_idx] = reg.predict(X[val_idx], batch_size=50_000, verbose=1)
    nets.append(reg)
    gc.collect()

mae = mean_absolute_error(y, oof)
# Root-mean-squared error per target, averaged over the targets. This is the value
# `mean_squared_error(..., squared=False)` returned; that keyword no longer exists in
# recent scikit-learn. The variable keeps its old name `mse`, but the printed label
# now says what the number is.
mse = np.sqrt(mean_squared_error(y, oof, multioutput="raw_values")).mean()
print("mae:", mae)
print("rmse:", mse)

# Historical information to use in prediction time
# Rows dated after 2021-01-01 are kept as the history used when predicting.
bound_dt = pd.to_datetime("2021-01-01")
LAST = tr[tr["EvalDate"] > bound_dt].copy()

# 2021 medians only, without the year column.
LAST_MED_DF = MED_DF[MED_DF["EvalYear"] == 2021].drop(columns="EvalYear")
del tr

#"""
import mlb
# Two independent empty lists.
# NOTE(review): neither name is used in the visible part of the notebook —
# confirm whether they are still needed.
FE = []
SUB = []

FileNotFoundError: ignored

## Inference

In [None]:
def unpack_json(json_str):
    """Parse one JSON-encoded cell into a DataFrame.

    Parameters
    ----------
    json_str : str or missing value
        JSON text. Any value for which ``pd.isna`` is true is treated as missing.

    Returns
    -------
    pandas.DataFrame or float
        The parsed frame, or ``np.nan`` when the input is missing.
    """
    # Local import so the cell stays self-contained.
    from io import StringIO

    if pd.isna(json_str):
        return np.nan
    # Passing literal JSON text to read_json is deprecated since pandas 2.1;
    # wrapping it in a file-like buffer is the supported form.
    return pd.read_json(StringIO(json_str))

In [None]:
# Disabled cell: everything below is a single triple-quoted string, so none of
# these assignments run; the string is only the cell's value.
# The next cell's string repeats the same lists with "gamePk" appended to scores_cols.
"""players_cols = ['playerId', 'primaryPositionName']
rosters_cols = ['playerId', 'teamId', 'status']
scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances']

null = np.nan
true = True
false = False"""

In [None]:
# Disabled cell: the whole inference loop below is one triple-quoted string, so
# nothing in it executes; the string is only the cell's value.
# NOTE(review) before re-enabling:
#   * the text relies on names that are not defined in the visible part of the
#     notebook (e.g. copy, players, rosters, scores, games_cols, awards2,
#     player_target_stats, standings_cols, model1..model4, test_lag) — verify;
#   * it calls eval() on strings taken from the test feed;
#   * it uses DataFrame.append, which was removed in pandas 2.0.
"""players_cols = ['playerId', 'primaryPositionName']
rosters_cols = ['playerId', 'teamId', 'status']
scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances',"gamePk"]

null = np.nan
true = True
false = False

env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set

for (test_df, sample_prediction_df) in iter_test: # make predictions here
    
    sub = copy.deepcopy(sample_prediction_df.reset_index())
    sample_prediction_df = copy.deepcopy(sample_prediction_df.reset_index(drop=True))
    
    
    # creat dataset
    sample_prediction_df['playerId'] = sample_prediction_df['date_playerId']\
                                        .map(lambda x: int(x.split('_')[1]))
    # Dealing with missing values
    if test_df['rosters'].iloc[0] == test_df['rosters'].iloc[0]:
        test_rosters = pd.DataFrame(eval(test_df['rosters'].iloc[0]))
    else:
        test_rosters = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
        for col in rosters.columns:
            if col == 'playerId': continue
            test_rosters[col] = np.nan
            
    if test_df['playerBoxScores'].iloc[0] == test_df['playerBoxScores'].iloc[0]:
        test_scores = pd.DataFrame(eval(test_df['playerBoxScores'].iloc[0]))
    else:
        test_scores = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
        for col in scores.columns:
            if col == 'playerId': continue
            test_scores[col] = np.nan
    
    test_games = unpack_json(test_df["games"].iloc[0])
    test_standings = unpack_json(test_df["standings"].iloc[0])
    
            
            
    

    



        
    test_scores = test_scores.groupby('playerId').sum().reset_index()
    test = sample_prediction_df[['playerId']].copy()
    test = test.merge(players[players_cols], on='playerId', how='left')
    test = test.merge(test_rosters[rosters_cols], on='playerId', how='left')
    test = test.merge(test_scores[scores_cols], on='playerId', how='left')
    test = test.merge(test_games[games_cols], on="gamePk", how="left")
    test = test.merge(awards2[awards_cols], on=['playerId'], how='left')
    test = test.merge(player_target_stats, how='inner', left_on=["playerId"],right_on=["playerId"])
    test = test.merge(test_standings[standings_cols], on=["teamId"], how='left')
    
    #add feature
    test["ongame"] = np.where(test["gamePk"].isnull() == 1,0,1)
    test["ishome"] = np.where(test["teamId"]==test["homeId"],2,test["ongame"])
    test["winorlose"] = np.where(test["teamId"]==test["homeId"],test["homeWinner"],test["awayWinner"])
    test["winorlose"] = test["winorlose"].fillna(2.0).astype(int)
    test["score"] = np.where(test["teamId"]==test["homeId"],test["homeScore"],test["awayScore"])
    test["divisionRank"] = test["divisionRank"].fillna(7.0).astype(int)
    test["divisionLeader"] = test["divisionLeader"].fillna(-1.0).astype(int)
    test["wildCardLeader"] = np.where(test["wildCardLeader"]=="True",1,0)
    
    
    
    
    #label encoding
    test['label_playerId'] = test['playerId'].map(player2num)
    test['label_primaryPositionName'] = test['primaryPositionName'].map(position2num)
    test['label_teamId'] = test['teamId'].map(teamid2num)
    test['label_status'] = test['status'].map(status2num)
    test["label_daynight"] = test['dayNight'].map(daynight2num)
    test["label_seriesDescription"] = test["seriesDescription"].map(seriesDescription2num)
    test["gameType"] = test["gameType"].map(gameType2num)
    display(test)
    
    
    test_X1 = test[feature_cols1]
    test_X2 = test[feature_cols2]
    test_X3 = test[feature_cols3]
    test_X4 = test[feature_cols4]
    
    # predict
    pred1 = model1.predict(test_X1)
    pred2 = model2.predict(test_X2)
    pred3 = model3.predict(test_X3)
    pred4 = model4.predict(test_X4)
    
    # merge submission
    sample_prediction_df['target1'] = np.clip(pred1, 0, 100)
    sample_prediction_df['target2'] = np.clip(pred2, 0, 100)
    sample_prediction_df['target3'] = np.clip(pred3, 0, 100)
    sample_prediction_df['target4'] = np.clip(pred4, 0, 100)
    sample_prediction_df = sample_prediction_df.fillna(0.)
    del sample_prediction_df['playerId']
    
    # TF summit
    # Features computation at Evaluation Date
    sub_fe, eval_dt = test_lag(sub)
    sub_fe = sub_fe.merge(LAST_MED_DF, on="playerId", how="left")
    sub_fe = sub_fe.fillna(0.)
    
    _preds = 0.
    for reg in nets:
        _preds += reg.predict(sub_fe[FECOLS + MEDCOLS]) / NFOLDS
    sub_fe[TGTCOLS] = np.clip(_preds, 0, 100)
    sub.drop(["date"]+TGTCOLS, axis=1, inplace=True)
    sub = sub.merge(sub_fe[["playerId"]+TGTCOLS], on="playerId", how="left")
    sub.drop("playerId", axis=1, inplace=True)
    sub = sub.fillna(0.)
    # Blending
    blend = pd.concat(
        [sub[['date_playerId']],
        (0.1*sub.drop('date_playerId', axis=1) + 0.9*sample_prediction_df.drop('date_playerId', axis=1))],
        axis=1
    )
    env.predict(blend)
    # Update Available information
    sub_fe["EvalDate"] = eval_dt
    #sub_fe.drop(MEDCOLS, axis=1, inplace=True)
    LAST = LAST.append(sub_fe)
    LAST = LAST.drop_duplicates(subset=["EvalDate","playerId"], keep="last")"""

In [None]:
# Scratch cell: display the "games" column of test_df.
# NOTE(review): in the visible part of the notebook test_df is only bound by
# the inference loop, which is currently wrapped in a string literal, so this
# cell fails on a fresh kernel — confirm whether it should be kept.
display(test_df['games'])

In [None]:
def unpack_json(json_str):
    """Parse one JSON-encoded cell into a DataFrame.

    NOTE(review): this redefines the identical helper declared earlier in the
    notebook; one of the two definitions can probably be removed.

    Parameters
    ----------
    json_str : str or missing value
        JSON text. Any value for which ``pd.isna`` is true is treated as missing.

    Returns
    -------
    pandas.DataFrame or float
        The parsed frame, or ``np.nan`` when the input is missing.
    """
    # Local import so the cell stays self-contained.
    from io import StringIO

    if pd.isna(json_str):
        return np.nan
    # Passing literal JSON text to read_json is deprecated since pandas 2.1;
    # wrapping it in a file-like buffer is the supported form.
    return pd.read_json(StringIO(json_str))

In [None]:
# Scratch cell: parse the first "games" entry of test_df with unpack_json and
# show the result.
# NOTE(review): depends on test_df, which in the visible part of the notebook
# is only bound inside the disabled inference loop — confirm.
unpack_json(test_df["games"].iloc[0])

In [None]:
# Scratch cell: preview an equal-weight average of the two prediction frames,
# with the date_playerId key column placed in front.
id_column = sub[['date_playerId']]
averaged_targets = (
    sub.drop(columns='date_playerId') + sample_prediction_df.drop(columns='date_playerId')
) / 2
pd.concat([id_column, averaged_targets], axis=1)

In [None]:
# Scratch cell: display sample_prediction_df as the cell output.
# NOTE(review): in the visible part of the notebook this name is only bound
# inside the disabled inference loop — confirm the cell is still wanted.
sample_prediction_df