In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Show which files are present in the input directory.
print(os.listdir("../input"))


['train_V2.csv', 'test_V2.csv', 'sample_submission_V2.csv']


In [2]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt  

from timeit import default_timer as timer
from sklearn import preprocessing

!pip install ultimate
from ultimate.mlp import MLP 

from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint

import gc, sys
# Make sure automatic garbage collection is switched on; later cells also call
# gc.collect() explicitly after deleting large arrays.
gc.enable()



Using TensorFlow backend.


In [3]:
def state(message,start = True, time = 0):
    """Print a progress line for one processing step.

    message: label of the step being reported.
    start:   truthy -> announce that the step is beginning;
             falsy  -> report that the step finished.
    time:    elapsed seconds (rounded to 3 decimals in the output); only
             used when `start` is falsy.
    """
    if not start:
        # Completion message; the literal ends with a newline, so a blank line follows it.
        print(f'Working on {message} took ({round(time , 3)}) Sec \n')
        return
    print(f'Working on {message} ... ')

In [4]:
# Directory holding the competition CSV files. The trailing slash matters:
# file names are appended to it with plain string concatenation.
INPUT_DIR = "../input/"

In [5]:
def feature_engineering(is_train=True):
    """Load a competition CSV and build the group-level feature matrix.

    Reads ``train_V2.csv`` (``is_train=True``) or ``test_V2.csv`` from
    ``INPUT_DIR``, adds ``totalDistance``, zeroes non-positive ``rankPoints``
    and then, for every remaining feature column, derives:

    * the group mean / max / min (grouped by ``matchId`` and ``groupId``),
    * the within-match percentile rank of each of those group statistics,
    * ``group_size``: number of player rows in the group,
    * the match-level mean of the raw feature,
    * ``match_size``: number of player rows in the match.

    Parameters
    ----------
    is_train : bool, default True
        Selects the file to read. For training data, rows whose match has
        ``maxPlace <= 1`` are dropped and the output has one row per
        (matchId, groupId). For test data the output has one row per input
        row.

    Returns
    -------
    X : numpy.ndarray (float64)
        Feature matrix with ``matchId`` / ``groupId`` removed.
    y : numpy.ndarray (float64) or None
        Group mean of ``winPlacePerc`` (one value per group, same group order
        as ``X``) when ``is_train`` is true, otherwise ``None``.
    """
    group_keys = ['matchId', 'groupId']

    if is_train:
        print("processing train_V2.csv")
        df = pd.read_csv(INPUT_DIR + 'train_V2.csv')
        # Keep only matches with more than one placement slot; degenerate
        # matches could hurt the model.
        df = df[df['maxPlace'] > 1]
    else:
        print("processing test_V2.csv")
        df = pd.read_csv(INPUT_DIR + 'test_V2.csv')

    # Total distance travelled by the player, whatever the means of transport.
    state('totalDistance')
    s = timer()
    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]
    e = timer()
    state('totalDistance', False, e - s)

    # Replace every non-positive rankPoints value (e.g. the -1 placeholder) by 0.
    state('rankPoints')
    s = timer()
    df['rankPoints'] = np.where(df['rankPoints'] <= 0, 0, df['rankPoints'])
    e = timer()
    state('rankPoints', False, e - s)

    target = 'winPlacePerc'
    # Identifier and match-descriptor columns are not used as model inputs.
    features = list(df.columns)
    for column in ("Id", "matchId", "groupId", "matchDuration", "matchType"):
        features.remove(column)

    # One groupby object is reused for every group-level aggregation below.
    grouped = df.groupby(group_keys)

    y = None
    if is_train:
        # The target is aggregated to one value per (matchId, groupId).
        y = np.array(grouped[target].agg('mean'), dtype=np.float64)
        features.remove(target)

    df_out = None
    for stat in ('mean', 'max', 'min'):
        print(f"get group {stat} feature")
        agg = grouped[features].agg(stat)
        # Percentile rank of each group's statistic among the groups of the
        # same match (largest value -> rank 1.0). Must be computed while
        # matchId is still an index level.
        agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
        agg = agg.reset_index()

        if df_out is None:
            # Training output has one row per group; test output keeps one
            # row per original (player) row.
            df_out = agg[group_keys] if is_train else df[group_keys]

        # First merge brings in the statistic under the bare feature names
        # (no overlap yet, so the suffixes are not applied). The second merge
        # then collides on those names, renaming the left columns to
        # `<feature>_<stat>` and the right ones to `<feature>_<stat>_rank`.
        df_out = df_out.merge(agg, suffixes=["", ""], how='left', on=group_keys)
        df_out = df_out.merge(agg_rank, suffixes=[f"_{stat}", f"_{stat}_rank"],
                              how='left', on=group_keys)

    # Number of player rows in each group.
    print("get group size feature")
    agg = grouped.size().reset_index(name='group_size')
    df_out = df_out.merge(agg, how='left', on=group_keys)

    # Match-level mean of every feature. At this point no column of df_out
    # carries a bare feature name, so these columns keep the bare names and
    # the suffixes below are not applied.
    print("get match mean feature")
    agg = df.groupby(['matchId'])[features].agg('mean').reset_index()
    df_out = df_out.merge(agg, suffixes=["", "_match_mean"], how='left', on=['matchId'])

    # Number of player rows in each match (this counts rows, not groups).
    print("get match size feature")
    agg = df.groupby(['matchId']).size().reset_index(name='match_size')
    df_out = df_out.merge(agg, how='left', on=['matchId'])

    # The identifiers were only needed as merge keys.
    df_out.drop(["matchId", "groupId"], axis=1, inplace=True)

    X = np.array(df_out, dtype=np.float64)

    # Drop every reference to the big frames (the groupby object holds one to
    # `df`) so the collection below can actually reclaim them.
    del df, df_out, agg, agg_rank, grouped
    gc.collect()

    return X, y

In [6]:
%%time
# Process the training data :
x_train, y = feature_engineering(True)
# Scale the data to be in the range (-1 , 1)
scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1), copy=False).fit(x_train)

processing train_V2.csv
Working on totalDistance ... 
Working on totalDistance took (0.323) Sec 

Working on rankPoints ... 
Working on rankPoints took (0.059) Sec 

get group mean feature
get group max feature
get group min feature
get group size feature
get match mean feature
get match size feature
CPU times: user 4min 4s, sys: 54.5 s, total: 4min 58s
Wall time: 4min 58s


In [7]:
# Print the value range before and after scaling as a sanity check.
print("x_train", x_train.shape, x_train.max(), x_train.min())
# Rebind the result instead of relying on copy=False mutating the argument:
# the in-place path is only taken when the input needs no conversion, and the
# returned array is the scaled one in either case.
# Note: this cell is not idempotent; running it twice scales the data twice.
x_train = scaler.transform(x_train)
print("x_train", x_train.shape, x_train.max(), x_train.min())

x_train (2026744, 170) 41270.1 0.0
x_train (2026744, 170) 1.0000000000000002 -1.0000000000000002


In [8]:
# Stretch the target linearly (y -> 2*y - 1) so it matches the [-1, 1] range
# used for the features. Run once only: re-running stretches it again.
y = 2 * y - 1
print("y", y.shape, y.max(), y.min())

y (2026744,) 1.0 -1.0


In [9]:
%%time
# create NN_model
NN_model = Sequential()
NN_model.add(Dense(x_train.shape[1],  input_dim = x_train.shape[1], activation='relu'))
NN_model.add(Dense(136, activation='relu'))
NN_model.add(Dense(136, activation='relu'))
NN_model.add(Dense(136, activation='relu'))
NN_model.add(Dense(136, activation='relu'))

# output Layer
NN_model.add(Dense(1, activation='linear'))

# Compile the network :
NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
NN_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 170)               29070     
_________________________________________________________________
dense_2 (Dense)              (None, 136)               23256     
_________________________________________________________________
dense_3 (Dense)              (None, 136)               18632     
_________________________________________________________________
dense_4 (Dense)              (None, 136)               18632     
_________________________________________________________________
dense_5 (Dense)              (None, 136)               18632     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 137       
Total params: 108,359
Trainable params: 108,359
Non-trainable params: 0
_________________________________________________________________
CPU 

In [10]:
# Save a weights file each time the validation loss reaches a new best;
# the epoch number and the val_loss value are embedded in the file name.
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'
checkpoint = ModelCheckpoint(
    checkpoint_name,
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [11]:
all_mae_histories = []
mae_history = history.history['val_mean_absolute_error']
all_mae_histories.append(mae_history)
%%time
NN_model.fit(x=x_train, y=y, batch_size=1000,
             epochs=30, verbose=1, callbacks=callbacks_list,
             validation_split=0.15, validation_data=None, shuffle=True,
             class_weight=None, sample_weight=None, initial_epoch=0,
             steps_per_epoch=None, validation_steps=None)
del x_train, y
gc.collect()

Train on 1722732 samples, validate on 304012 samples
Epoch 1/30

Epoch 00001: val_loss improved from inf to 0.06912, saving model to Weights-001--0.06912.hdf5
Epoch 2/30

Epoch 00002: val_loss improved from 0.06912 to 0.06721, saving model to Weights-002--0.06721.hdf5
Epoch 3/30

Epoch 00003: val_loss improved from 0.06721 to 0.06182, saving model to Weights-003--0.06182.hdf5
Epoch 4/30

Epoch 00004: val_loss improved from 0.06182 to 0.06097, saving model to Weights-004--0.06097.hdf5
Epoch 5/30

Epoch 00005: val_loss did not improve from 0.06097
Epoch 6/30

Epoch 00006: val_loss did not improve from 0.06097
Epoch 7/30

Epoch 00007: val_loss did not improve from 0.06097
Epoch 8/30

Epoch 00008: val_loss did not improve from 0.06097
Epoch 9/30

Epoch 00009: val_loss improved from 0.06097 to 0.06006, saving model to Weights-009--0.06006.hdf5
Epoch 10/30

Epoch 00010: val_loss did not improve from 0.06006
Epoch 11/30

Epoch 00011: val_loss did not improve from 0.06006
Epoch 12/30

Epoch 00

In [None]:
# Average the validation MAE across the recorded runs, epoch by epoch.
# The epoch count comes from the histories themselves (shortest run wins)
# rather than a hard-coded length, so indexing can never run past the end.
num_epochs = min((len(run) for run in all_mae_histories), default=0)
average_mae_history = [np.mean([run[i] for run in all_mae_histories]) for i in range(num_epochs)]

In [None]:
import matplotlib.pyplot as plt

# Averaged validation MAE per epoch; epochs are numbered from 1 on the x axis.
epoch_numbers = range(1, len(average_mae_history) + 1)
plt.plot(epoch_numbers, average_mae_history)
plt.ylabel('Validation MAE')
plt.xlabel('Epochs')
plt.show()

In [None]:
def _plot_train_vs_val(history, train_key, val_key, title, ylabel):
    """Draw one figure comparing a training curve with its validation curve."""
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.xlabel('epoch')
    plt.ylabel(ylabel)
    plt.legend([train_key, val_key], loc='lower right')
    plt.show()


def plot_history(history):
    """Plot the learning curves stored in a training history.

    Two figures are shown, each with the training and the validation curve:
    the mean absolute error metric, then the loss.

    history: object whose ``history`` attribute is a dict containing the
        per-epoch lists ``'mean_absolute_error'``,
        ``'val_mean_absolute_error'``, ``'loss'`` and ``'val_loss'``.

    Raises KeyError if one of those keys is missing.
    """
    # Metric history. This is an error measure (lower is better), so it is
    # labelled as such rather than as "accuracy".
    _plot_train_vs_val(history, 'mean_absolute_error', 'val_mean_absolute_error',
                       title='model mean absolute error', ylabel='mean absolute error')

    # Loss history.
    _plot_train_vs_val(history, 'loss', 'val_loss',
                       title='model loss', ylabel='loss')

In [None]:
# `history` must be the History object returned by NN_model.fit(...).
plot_history(history)

In [13]:
# Build the test features (no target is returned for test data) and apply the
# scaler that was fitted on the training data.
x_test, _no_target = feature_engineering(False)
# Rebind the result instead of relying on copy=False mutating the argument.
x_test = scaler.transform(x_test)
print("x_test", x_test.shape, x_test.max(), x_test.min())
# Test values beyond the training min/max land outside [-1, 1]; clamp them
# in place so the network only sees the range it was trained on.
np.clip(x_test, out=x_test, a_min=-1, a_max=1)
print("x_test", x_test.shape, x_test.max(), x_test.min())

processing test_V2.csv
Working on totalDistance ... 
Working on totalDistance took (0.014) Sec 

Working on rankPoints ... 
Working on rankPoints took (0.027) Sec 

get group mean feature
get group max feature
get group min feature
get group size feature
get match mean feature
get match size feature
x_test (1934174, 170) 2.8037403272580264 -1.0633484162895928
x_test (1934174, 170) 1.0 -1.0


In [14]:
%%time
# Predict with the weights currently held in memory by NN_model.
# NOTE(review): the ModelCheckpoint callback only writes the best epoch to a
# file; presumably nothing reloads it, so this uses the last-epoch weights.
# Confirm whether loading the best checkpoint was intended.
pred = NN_model.predict(x_test)
# x_test is not used again in the cells shown below; free its memory.
del x_test
gc.collect()

CPU times: user 1min 2s, sys: 9.96 s, total: 1min 12s
Wall time: 47.9 s


In [15]:
# Flatten the network output to 1-D and undo the y -> 2*y - 1 target scaling.
pred = (pred.reshape(-1) + 1) / 2

In [16]:
# Reload the raw test rows; the following cells read its 'maxPlace' and 'Id' columns.
df_test = pd.read_csv(INPUT_DIR + 'test_V2.csv')

In [17]:
%%time
print("fix winPlacePerc")
for i in range(len(df_test)):
    winPlacePerc = pred[i]
    maxPlace = int(df_test.iloc[i]['maxPlace'])
    if maxPlace == 0:
        winPlacePerc = 0.0
    elif maxPlace == 1:
        winPlacePerc = 1.0
    else:
        gap = 1.0 / (maxPlace - 1)
        winPlacePerc = round(winPlacePerc / gap) * gap
    
    if winPlacePerc < 0: winPlacePerc = 0.0
    if winPlacePerc > 1: winPlacePerc = 1.0    
    pred[i] = winPlacePerc

fix winPlacePerc
CPU times: user 6min 26s, sys: 0 ns, total: 6min 26s
Wall time: 6min 26s


In [18]:
# Attach the post-processed predictions; relies on pred[i] belonging to row i of df_test.
df_test['winPlacePerc'] = pred

In [19]:
# Write the submission file: one row per test Id with its predicted
# winPlacePerc, without the DataFrame index.
submission = df_test.loc[:, ['Id', 'winPlacePerc']]
submission.to_csv('submission.csv', index=False)