In [9]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
from tensorflow.python.data import Dataset

from sklearn.model_selection import train_test_split
from datetime import datetime

train_path = './data/train_V2.csv'
test_path = './data/test_V2.csv'

In [4]:
# class get_data:
#     def __init__(self, path, is_train=True):
#         if is_train:
#             self.df = pd.read_csv(path, nrows=500000)
#             self.shuffle_data()
#         else:
#             self.df = pd.read_csv(path)
            
#     def shuffle_data(self):
#         self.df = self.df.reindex(np.random.permutation(self.df.index))
    
#     def fillna(self):
#         self.df['winPlacePerc'] = self.df

class get_data:
    def __init__(self, path, is_train=True):
        self.is_train = is_train
        data = pd.read_csv(path)
        if is_train:
            data = data.sample(100000)
            self.feature = data.drop('winPlacePerc', axis=1)
            self.label = data['winPlacePerc']
        else:
            self.feature = data
        del data
        
        self.deal_feature()
        self.x_train = self.feature
        if is_train:
            self.y_train = self.label
#         if is_train:
#             self.ts_f, self.ts_l, self.vs_f, self.vs_l = self.split_t_v()
        
    def deal_feature(self):
        self.add_cols()
        self.fillna()
        self.drop_cols()
#         self.featuring()
#         self.ohencode()
        self.drop_ids()
            
    def add_cols(self):
        self.feature['teamPlayers'] = self.feature['groupId'].map(self.feature['groupId'].value_counts())
        self.feature['gamePlayers'] = self.feature['matchId'].map(self.feature['matchId'].value_counts())
        self.feature['enemyPlayers'] = self.feature['gamePlayers'] - self.feature['teamPlayers']
        self.feature['totalDistance'] = self.feature['rideDistance'] + self.feature['swimDistance'] + self.feature['walkDistance']
        self.feature['enemyDamage'] = self.feature['assists'] + self.feature['kills']
        
        totalKills = self.feature.groupby(['matchId', 'groupId']).agg({'kills': lambda x: x.sum()})
        totalKills.rename(columns={'kills': 'squadKills'}, inplace=True)
        self.feature = self.feature.join(other=totalKills, on=['matchId', 'groupId'])
        
        self.feature['medicKits'] = self.feature['heals'] + self.feature['boosts']
        self.feature['medicPerKill'] = self.feature['medicKits'] / self.feature['enemyDamage']
        self.feature['distancePerHeals'] = self.feature['totalDistance'] / self.feature['heals']
        self.feature['headShotKillRatio'] = self.feature['headshotKills'] / self.feature['kills']
        self.feature['headshotKillRate'] = self.feature['headshotKills'] / self.feature['kills']
        self.feature['killPlaceOverMaxPlace'] = self.feature['killPlace'] / self.feature['maxPlace']
        self.feature['kills/walkDistance'] = self.feature['kills'] / self.feature['walkDistance']
        self.feature['avgKills'] = self.feature['squadKills'] / self.feature['teamPlayers']
        self.feature['damageRatio'] = self.feature['damageDealt'] / self.feature['enemyDamage']
        self.feature['distTravelledPerGame'] = self.feature['totalDistance'] / self.feature['matchDuration']
        self.feature['killPlacePerc'] = self.feature['killPlace'] / self.feature['gamePlayers']
        self.feature['playerSkill'] = self.feature['headshotKills'] + self.feature['roadKills'] + self.feature['assists'] - (5 * self.feature['teamKills'])
        self.feature['gamePlacePerc'] = self.feature['killPlace'] / self.feature['maxPlace']
    
    def fillna(self):
        self.feature.fillna(0, inplace=True)
        self.feature.replace(np.inf, 0, inplace=True)
        if self.is_train:
            self.label.fillna(0, inplace=True)
            self.label.replace(np.inf, 0, inplace=True)
            
#     def fillInf(self, val):
#         numcols = self.feature.select_dtypes(include='number').columns
#         cols = numcols[numcols != 'winPlacePerc']
#         self.feature[self.feature == np.Inf] = np.NaN
#         self.feature[self.feature == np.NINF] = np.NaN
#         for c in cols:
#             self.feature[c].fillna(val, inplace=True)
        
    def ohencode(self):
        '''
        solo  <-- solo,solo-fpp,normal-solo,normal-solo-fpp
        duo   <-- duo,duo-fpp,normal-duo,normal-duo-fpp,crashfpp,crashtpp
        squad <-- squad,squad-fpp,normal-squad,normal-squad-fpp,flarefpp,flaretpp
        '''
        mapper = lambda x: 'solo' if ('solo' in x) else 'duo' if ('duo' in x) or ('crash' in x) else 'squad'
        self.feature['matchType'] = self.feature['matchType'].apply(mapper)

        self.feature = pd.concat([self.feature, pd.get_dummies(self.feature['matchType'], prefix='matchType')], axis=1)
    
    def drop_cols(self):
        drop_cols = ['killPoints', 'rankPoints', 'winPoints', 'maxPlace']
        self.feature.drop(columns=drop_cols, inplace=True)
    
    def drop_ids(self):
        self.feature = self.feature.drop(['Id', 'groupId', 'matchId', 'matchType'], axis=1)
    
    def featuring(self):
        features = list(self.feature.columns)
        features.remove("Id")
        features.remove("matchId")
        features.remove("groupId")
        features.remove("matchType")
        condition='False'
        
        if 'winPlacePerc' in self.feature.columns:
            y = np.array(self.feature.groupby(['matchId','groupId'])['winPlacePerc'].agg('mean'), dtype=np.float64)
            features.remove("winPlacePerc")
            condition='True'
        
        # get group mean feature
        agg = self.feature.groupby(['matchId','groupId'])[features].agg('mean')
        agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
        df_out = agg.reset_index()[['matchId','groupId']]
        df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
        df_out = df_out.merge(agg_rank, suffixes=["_mean", "_mean_rank"], how='left', on=['matchId', 'groupId'])
    
        # get group max feature
        agg = self.feature.groupby(['matchId','groupId'])[features].agg('max')
        agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
        df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
        df_out = df_out.merge(agg_rank, suffixes=["_max", "_max_rank"], how='left', on=['matchId', 'groupId'])
        
        # get group min feature
        agg = self.feature.groupby(['matchId','groupId'])[features].agg('min')
        agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
        df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
        df_out = df_out.merge(agg_rank, suffixes=["_min", "_min_rank"], how='left', on=['matchId', 'groupId'])
        
        # get match mean feature
        agg = self.feature.groupby(['matchId'])[features].agg('mean').reset_index()
        df_out = df_out.merge(agg, suffixes=["", "_match_mean"], how='left', on=['matchId'])
        df_id=df_out[["matchId", "groupId"]].copy()
        df_out.drop(["matchId", "groupId"], axis=1, inplace=True)

        del agg, agg_rank
        gc.collect()
        if condition == 'True':
            return df_out,pd.DataFrame(y),df_id
        else:
            return df_out,df_id
        
    def split_t_v(self):
        self.ts_f, self.vs_f, self.ts_l, self.vs_l = \
        train_test_split(
            self.feature,
            self.label,
            test_size=0.2,
            random_state=42
        )

In [14]:
class create_model:
    def __init__(
        self,
        train_df,
        periods = 10,
        steps = 10000,
        save_model = True,
        save_model_name = 'test_v0'
    ):
        self.train = train_df
        self.periods = periods
        self.steps = steps
        
        self.save_model = save_model
        self.save_model_path = './model/estimator/' + save_model_name
        
        self.create_feature_columns()
#         self.create_optimizer()
        
        model_arg = dict(
            feature_columns = self.featureCols,
            hidden_units = [1024, 128, 32],
            optimizer = 'Adam'#self.optimizer
        )
        if self.save_model:
            model_arg['model_dir'] = self.save_model_path
        self.model = tf.estimator.DNNRegressor(**model_arg)
        print('create model!')
    
    def create_feature_columns(self):
#         featureColumns = []
#         featureColumns.append(tf.feature_column.numeric_column('boosts'))
#         featureColumns.append(tf.feature_column.numeric_column('headshotKills'))
#         featureColumns.append(tf.feature_column.numeric_column('heals'))
#         featureColumns.append(tf.feature_column.numeric_column('kills'))
#         featureColumns.append(tf.feature_column.numeric_column('walkDistance'))
#         featureColumns.append(tf.feature_column.numeric_column('weaponsAcquired'))
#         self.featureCols = featureColumns
        self.featureCols = set([tf.feature_column.numeric_column(my_feature) for my_feature in self.train])
    
    def create_optimizer(self):
#         self.optimizer = tf.train.ProximalAdagradOptimizer(
#             learning_rate = 0.003,
#             l1_regularization_strength = 0.001
#         )
        self.optimizer = lambda: tf.contrib.optimizer_v2.AdamOptimizer(
            learning_rate=tf.exponential_decay(
                learning_rate=0.01,
                global_step=tf.get_global_step(),
                decay_steps=10000,
                decay_rate=0.96
            )
        )

    def training(
        self,
        ts_f,
        ts_l,
        vs_f,
        vs_l
    ):
        self.ts_f = ts_f
        self.ts_l = ts_l
        self.vs_f = vs_f
        self.vs_l = vs_l
        self.create_input_fn()
        
        print('training start.')
        step_period = self.steps / self.periods
        for period in range(self.periods):
            print('start eval')
            evaluate = self.model.evaluate(input_fn=self.eval_fn, steps=step_period)
            print('start training')
            training = self.model.train(input_fn=self.train_fn, steps=step_period)
            print('period: ', period)
        evaluate = self.model.evaluate(input_fn=self.eval_fn, steps=step_period)
        print('training done.')
            
    def create_input_fn(self):
        self.create_train_fn()
        self.create_eval_fn()
        
    def create_train_fn(self):
#         self.train_fn = tf.estimator.inputs.pandas_input_fn(
#             x = self.ts_f,
#             y = self.ts_l,
#             batch_size = 30,
#             num_epochs = None,
#             shuffle = True,
#         )
        self.train_fn = lambda: self.my_input_fn(self.ts_f, self.ts_l, batch_size=30, shuffle=True)
        
    def create_eval_fn(self):
#         self.eval_fn = tf.estimator.inputs.pandas_input_fn(
#             x = self.vs_f,
#             y = self.vs_l,
#             batch_size = 1,
#             num_epochs = 1,
#             shuffle = True,
#         )
        self.eval_fn = lambda: self.my_input_fn(self.vs_f, self.vs_l, batch_size=1, num_epochs=1)
        
#     def my_input_fn(self, train_X, train_y, batch_size=64, shuffle=False):
#         dataset = tf.data.Dataset.from_tensor_slices((train_X, train_y))

#         dataset = dataset.map(lambda x,y: self.preprocess_data(x, y))

#         if shuffle:
#             dataset = dataset.shuffle(buffer_size=128)

#         dataset = dataset.batch(batch_size)

#         itr = dataset.make_one_shot_iterator()
#         features, target = itr.get_next()

#         return features, target
    def my_input_fn(self, features, targets, batch_size=1, shuffle=True, num_epochs=None):
        # Convert pandas data into a dict of np arrays.
        features = {key:np.array(value) for key,value in dict(features).items()}
        # Construct a dataset, and configure batching/repeating.
        ds = Dataset.from_tensor_slices((features,targets)) # warning: 2GB limit
        ds = ds.batch(batch_size).repeat(num_epochs)
        # Shuffle the data, if specified.
        if shuffle:
            ds = ds.shuffle(42)
        # Return the next batch of data.
        features, labels = ds.make_one_shot_iterator().get_next()
        return features, labels
    
    def preprocess_data(self, x, y):
        labels = tf.cast(y, tf.int32)
        input_data = tf.cast(x, tf.float32)
        return (dict({'image': input_data}), labels)

    def predict(self, test_df):
        self.test_df = test_df
        self.create_predict_fn()
        prediction = self.model.predict(input_fn=self.predict_fn)
        prediction = [item['predictions'][0] for item in prediction]
        
        self.result = pd.DataFrame(
            [(Id, pred) for Id, pred in zip(testData['Id'], prediction)],
            columns = ['Id', 'winPlacePerc']
        )
        
    def create_predict_fn(self):
        self.predict_fn = tf.estimator.inputs.pandas_input_fn(
            x = self.test_df,
            y = None,
            batch_size = 1,
            num_epochs = 1,
            shuffle = False,
        )
    
    def save_result(self, path='./result/estimator/submission.csv'):
        self.result.to_csv(path, index=False)
        
    def test(self):
        print('model work!')

In [6]:
data= get_data(train_path)
data.feature.head()

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,kills,killStreaks,longestKill,...,headShotKillRatio,headshotKillRate,killPlaceOverMaxPlace,kills/walkDistance,avgKills,damageRatio,distTravelledPerGame,killPlacePerc,playerSkill,gamePlacePerc
3867112,0,1,0.0,0,0,4,56,0,0,0.0,...,0.0,0.0,2.074074,0.0,0.0,0.0,0.790051,56.0,0,2.074074
4087297,0,0,76.78,1,0,0,66,0,0,0.0,...,0.0,0.0,1.32,0.0,0.0,0.0,0.134201,66.0,0,1.32
3565070,0,1,0.0,0,0,2,48,0,0,0.0,...,0.0,0.0,0.505263,0.0,0.0,0.0,2.211442,48.0,0,0.505263
195539,0,2,347.6,0,0,2,6,4,2,109.8,...,0.0,0.0,0.060606,0.008451,4.0,86.9,0.258493,6.0,0,0.060606
2905103,0,0,0.0,0,0,0,50,0,0,0.0,...,0.0,0.0,0.555556,0.0,0.0,0.0,0.573496,50.0,0,0.555556


In [15]:
model = create_model(data.feature)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './model/test/test', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x16b31a048>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
create model!


In [16]:
data.split_t_v()
model.training(
    data.ts_f,
    data.ts_l,
    data.vs_f,
    data.vs_l
)

training start.
start eval
INFO:tensorflow:Could not find trained model in model_dir: ./model/test/test, running initialization to evaluate.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-03-18T20:37:38Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [100/1000]
INFO:tensorflow:Evaluation [200/1000]
INFO:tensorflow:Finished evaluation at 2019-03-18-20:37:39
INFO:tensorflow:Saving dict for global step 0: average_loss = 30559.385, global_step = 0, label/mean = 0.49342495, loss = 30559.385, prediction/mean = -141.3291
start training
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into ./model/test

INFO:tensorflow:global_step/sec: 280.803
INFO:tensorflow:loss = 8.502928, step = 3601 (0.356 sec)
INFO:tensorflow:global_step/sec: 261.662
INFO:tensorflow:loss = 7.6205163, step = 3701 (0.382 sec)
INFO:tensorflow:global_step/sec: 303.189
INFO:tensorflow:loss = 7.021456, step = 3801 (0.330 sec)
INFO:tensorflow:global_step/sec: 313.239
INFO:tensorflow:loss = 5.836385, step = 3901 (0.319 sec)
INFO:tensorflow:Saving checkpoints for 4000 into ./model/test/test/model.ckpt.
INFO:tensorflow:Loss for final step: 7.2370567.
period:  3
start eval
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-03-18T20:38:09Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./model/test/test/model.ckpt-4000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [100/1000]
INFO:tensorflow:Evaluation [200/1000]
INFO:tensorflow:Finished evaluation at 2019-03-18-20:

start training
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./model/test/test/model.ckpt-7000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 7000 into ./model/test/test/model.ckpt.
INFO:tensorflow:loss = 2.5600686, step = 7001
INFO:tensorflow:global_step/sec: 202.347
INFO:tensorflow:loss = 3.345299, step = 7101 (0.495 sec)
INFO:tensorflow:global_step/sec: 241.46
INFO:tensorflow:loss = 2.989903, step = 7201 (0.415 sec)
INFO:tensorflow:global_step/sec: 320.665
INFO:tensorflow:loss = 3.2793467, step = 7301 (0.311 sec)
INFO:tensorflow:global_step/sec: 261.581
INFO:tensorflow:loss = 2.7804868, step = 7401 (0.382 sec)
INFO:tensorflow:global_step/sec: 313.138
INFO:tensorflow:loss = 2.9890137, step = 7501 (0.319 sec)
INFO:tensorflow:global_step/sec: 312.606
INFO:tensorflo