In [42]:
import gc

import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split

# Relative locations of the training and test CSV files.
# train_path is what gets handed to get_data(...) further down in the notebook.
train_path = './data/train_V2.csv'
test_path = './data/test_V2.csv'

In [21]:
class get_data:
    """Load a match-statistics CSV and turn it into a numeric feature table.

    Parameters
    ----------
    path : str
        CSV file readable by ``pd.read_csv``.
    is_train : bool, default True
        When True the ``winPlacePerc`` column is split off as the label and a
        train/validation split is created.

    Attributes
    ----------
    feature / x_train : pd.DataFrame
        Engineered features (identifier columns removed).
    label / y_train : pd.Series
        Target column; only set when ``is_train`` is True.
    ts_f, ts_l, vs_f, vs_l
        Train features, train labels, validation features, validation labels;
        only set when ``is_train`` is True.
    """

    def __init__(self, path, is_train=True):
        self.is_train = is_train
        data = pd.read_csv(path)

        if is_train:
            self.feature = data.drop('winPlacePerc', axis=1)
            self.label = data['winPlacePerc']
        else:
            self.feature = data
        del data

        self.deal_feature()
        self.x_train = self.feature
        if is_train:
            self.y_train = self.label
            # Later notebook cells read ts_f/ts_l/vs_f/vs_l directly, so the
            # split has to exist on a freshly constructed object.
            self.ts_f, self.ts_l, self.vs_f, self.vs_l = self.split_t_v()

    def deal_feature(self):
        """Run the preprocessing pipeline in its required order.

        ``add_cols`` must run before ``drop_cols`` because it reads
        ``maxPlace``, and before ``drop_ids`` because it groups by
        ``groupId``/``matchId``. ``featuring`` and ``ohencode`` are available
        but intentionally not part of the default pipeline.
        """
        self.add_cols()
        self.fillna()
        self.drop_cols()
        self.drop_ids()

    def add_cols(self):
        """Add derived per-player, per-team and per-match columns to ``self.feature``.

        Ratios whose denominator is zero produce NaN/inf here; ``fillna`` is
        expected to clean those up afterwards.
        """
        df = self.feature

        # Group / match sizes, counted as number of rows sharing the id.
        df['teamPlayers'] = df['groupId'].map(df['groupId'].value_counts())
        df['gamePlayers'] = df['matchId'].map(df['matchId'].value_counts())
        df['enemyPlayers'] = df['gamePlayers'] - df['teamPlayers']
        df['totalDistance'] = df['rideDistance'] + df['swimDistance'] + df['walkDistance']
        df['enemyDamage'] = df['assists'] + df['kills']

        # Total kills of the player's group within its match. transform('sum')
        # broadcasts the group total back to every row, replacing the slower
        # lambda aggregation + join.
        df['squadKills'] = df.groupby(['matchId', 'groupId'])['kills'].transform('sum')

        df['medicKits'] = df['heals'] + df['boosts']
        df['medicPerKill'] = df['medicKits'] / df['enemyDamage']
        df['distancePerHeals'] = df['totalDistance'] / df['heals']
        # NOTE(review): the next two columns are identical, as are
        # killPlaceOverMaxPlace and gamePlacePerc below. They are kept so the
        # output schema stays the same; consider dropping the duplicates.
        df['headShotKillRatio'] = df['headshotKills'] / df['kills']
        df['headshotKillRate'] = df['headshotKills'] / df['kills']
        df['killPlaceOverMaxPlace'] = df['killPlace'] / df['maxPlace']
        df['kills/walkDistance'] = df['kills'] / df['walkDistance']
        df['avgKills'] = df['squadKills'] / df['teamPlayers']
        df['damageRatio'] = df['damageDealt'] / df['enemyDamage']
        df['distTravelledPerGame'] = df['totalDistance'] / df['matchDuration']
        df['killPlacePerc'] = df['killPlace'] / df['gamePlayers']
        df['playerSkill'] = (
            df['headshotKills'] + df['roadKills'] + df['assists'] - (5 * df['teamKills'])
        )
        df['gamePlacePerc'] = df['killPlace'] / df['maxPlace']

        self.feature = df

    def fillna(self):
        """Replace NaN and both +inf and -inf with 0 in features (and labels when training)."""
        non_finite = [np.inf, -np.inf]
        self.feature = self.feature.fillna(0).replace(non_finite, 0)
        if self.is_train:
            self.label = self.label.fillna(0).replace(non_finite, 0)

    def ohencode(self):
        '''
        Collapse matchType into three buckets and append one-hot columns.

        solo  <-- solo,solo-fpp,normal-solo,normal-solo-fpp
        duo   <-- duo,duo-fpp,normal-duo,normal-duo-fpp,crashfpp,crashtpp
        squad <-- squad,squad-fpp,normal-squad,normal-squad-fpp,flarefpp,flaretpp
        '''
        mapper = lambda x: 'solo' if ('solo' in x) else 'duo' if ('duo' in x) or ('crash' in x) else 'squad'
        self.feature['matchType'] = self.feature['matchType'].apply(mapper)

        self.feature = pd.concat([self.feature, pd.get_dummies(self.feature['matchType'], prefix='matchType')], axis=1)

    def drop_cols(self):
        """Remove the point/placement columns that are not used as model inputs."""
        drop_cols = ['killPoints', 'rankPoints', 'winPoints', 'maxPlace']
        self.feature = self.feature.drop(columns=drop_cols)

    def drop_ids(self):
        """Remove identifier columns and the raw (string) matchType column."""
        self.feature = self.feature.drop(['Id', 'groupId', 'matchId', 'matchType'], axis=1)

    def featuring(self):
        """Aggregate features per (matchId, groupId) and rank them within each match.

        Must be called while ``self.feature`` still contains ``Id``,
        ``matchId``, ``groupId`` and ``matchType``.

        Returns
        -------
        (df_out, df_id) or (df_out, y, df_id)
            ``df_out`` holds group mean/max/min features with their in-match
            percentile ranks plus per-match means; ``df_id`` holds the matching
            ``matchId``/``groupId`` pairs. ``y`` (group-mean ``winPlacePerc``
            as a DataFrame) is included only if that column is present in
            ``self.feature``.
        """
        features = list(self.feature.columns)
        features.remove("Id")
        features.remove("matchId")
        features.remove("groupId")
        features.remove("matchType")
        has_label = False

        if 'winPlacePerc' in self.feature.columns:
            y = np.array(self.feature.groupby(['matchId','groupId'])['winPlacePerc'].agg('mean'), dtype=np.float64)
            features.remove("winPlacePerc")
            has_label = True

        # get group mean feature
        agg = self.feature.groupby(['matchId','groupId'])[features].agg('mean')
        agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
        df_out = agg.reset_index()[['matchId','groupId']]
        df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
        df_out = df_out.merge(agg_rank, suffixes=["_mean", "_mean_rank"], how='left', on=['matchId', 'groupId'])

        # get group max feature
        agg = self.feature.groupby(['matchId','groupId'])[features].agg('max')
        agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
        df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
        df_out = df_out.merge(agg_rank, suffixes=["_max", "_max_rank"], how='left', on=['matchId', 'groupId'])

        # get group min feature
        agg = self.feature.groupby(['matchId','groupId'])[features].agg('min')
        agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
        df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
        df_out = df_out.merge(agg_rank, suffixes=["_min", "_min_rank"], how='left', on=['matchId', 'groupId'])

        # get match mean feature
        # NOTE(review): by this point every group-level column already carries
        # a suffix, so nothing overlaps and the "_match_mean" suffix is never
        # applied; the match means keep the bare feature names. Confirm that
        # this is intended.
        agg = self.feature.groupby(['matchId'])[features].agg('mean').reset_index()
        df_out = df_out.merge(agg, suffixes=["", "_match_mean"], how='left', on=['matchId'])
        df_id = df_out[["matchId", "groupId"]].copy()
        df_out.drop(["matchId", "groupId"], axis=1, inplace=True)

        # Free the large intermediates before returning.
        del agg, agg_rank
        gc.collect()
        if has_label:
            return df_out, pd.DataFrame(y), df_id
        return df_out, df_id

    def split_t_v(self, test_size=0.1, random_state=2):
        """Split features/labels into train and validation parts.

        Parameters
        ----------
        test_size : float, default 0.1
            Fraction of rows held out for validation.
        random_state : int, default 2
            Seed passed to ``train_test_split`` for a reproducible split.

        Returns
        -------
        tuple
            ``(train_features, train_labels, vali_features, vali_labels)``.
        """
        ts_f, vs_f, ts_l, vs_l = train_test_split(
            self.feature,
            self.label,
            test_size=test_size,
            random_state=random_state
        )
        return ts_f, ts_l, vs_f, vs_l

In [44]:
class create_model:
    """Thin wrapper around ``tf.estimator.LinearRegressor`` for tabular data.

    Parameters
    ----------
    ts_f, ts_l
        Training features (column-name -> values mapping, e.g. a DataFrame)
        and training labels.
    vs_f, vs_l
        Validation features and labels, same layout as the training pair.
    batch_size : int, default 42
        Batch size used for both training and validation input pipelines.
    model_dir : str, default './test'
        Directory handed to the estimator as ``model_dir``.
    """

    def __init__(
        self,
        ts_f, ts_l, vs_f, vs_l,
        batch_size=42,
        model_dir='./test'
    ):
        self.ts_f = ts_f
        self.ts_l = ts_l
        self.vs_f = vs_f
        self.vs_l = vs_l
        self.batch_size = batch_size
        self.estimator = tf.estimator.LinearRegressor(
            feature_columns=self.get_feature_cols(),
            model_dir=model_dir
        )

    def get_feature_cols(self):
        """Return one numeric feature column per column name in the training features."""
        return {tf.feature_column.numeric_column(name) for name in self.ts_f}

    def train(self, epochs=2):
        """Train for ``epochs`` passes, evaluating on the validation set after each.

        Each round feeds the training data exactly once (``num_epochs=1``).
        With an endlessly repeating input and no ``steps`` limit,
        ``estimator.train`` would never return.

        Validation uses ``self.batch_size`` rather than single-row batches, so
        any per-batch metric in the printed result is reported per batch of
        that size.
        """
        def train_input_fn():
            return self.input_fn(
                self.ts_f, self.ts_l,
                batch_size=self.batch_size,
                num_epochs=1
            )

        def vali_input_fn():
            return self.input_fn(
                self.vs_f, self.vs_l,
                batch_size=self.batch_size,
                num_epochs=1,
                shuffle=False
            )

        for i in range(epochs):
            self.estimator.train(input_fn=train_input_fn)
            vali_eval = self.estimator.evaluate(input_fn=vali_input_fn)
            print(f'epoch{i} ', vali_eval)

    def input_fn(
        self,
        features_df,
        targets_df,
        batch_size=1,
        shuffle=True,
        num_epochs=None,
        shuffle_buffer=1000
    ):
        """Build a ``tf.data.Dataset`` of ``(features_dict, label)`` batches.

        Parameters
        ----------
        features_df
            Mapping of column name -> values (DataFrame or dict).
        targets_df
            Labels aligned with the feature rows.
        batch_size : int, default 1
        shuffle : bool, default True
            Shuffle individual examples before batching.
        num_epochs : int or None, default None
            Number of passes over the data; ``None`` repeats indefinitely.
        shuffle_buffer : int, default 1000
            Buffer size for the shuffle step.
        """
        # Convert each column to a plain array so TensorFlow slices it row-wise.
        features = {name: np.asarray(column) for name, column in features_df.items()}

        # warning: from_tensor_slices embeds the data in the graph (2GB limit)
        ds = tf.data.Dataset.from_tensor_slices((features, targets_df))
        # Shuffle examples first, then batch, then repeat. Shuffling after
        # batch/repeat would only permute whole batches and mix epochs.
        if shuffle:
            ds = ds.shuffle(shuffle_buffer)
        return ds.batch(batch_size).repeat(num_epochs)

In [24]:
# Load and preprocess the training CSV, then show the shape of each split
# (train features, train labels, validation features, validation labels).
pg_data = get_data(train_path)
for split_name in ('ts_f', 'ts_l', 'vs_f', 'vs_l'):
    print(getattr(pg_data, split_name).shape)

(4002269, 27)
(4002269,)
(444697, 27)
(444697,)


In [45]:
# Build the regressor from the train/validation splits
# (argument order: train features, train labels, validation features, validation labels).
pg_model = create_model(pg_data.ts_f, pg_data.ts_l, pg_data.vs_f, pg_data.vs_l)

In [46]:
# Run the training loop; create_model.train prints the validation metrics after each round.
pg_model.train()

KeyboardInterrupt: 