<font size=5>
Import Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import gc
import os
import sys

sns.set_style('darkgrid')
sns.set_palette('bone')
pd.options.display.float_format = '{:,.3f}'.format

In [2]:
def toTapleList(list1,list2):
    return list(itertools.product(list1,list2))

In [3]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                #if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                #    df[col] = df[col].astype(np.float16)
                #el
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        #else:
            #df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
%%time
train = pd.read_csv(open('C:\\Users\\Alicia\\Macheine Learning\\train_V2.csv'))
train = reduce_mem_usage(train)
test = pd.read_csv(open('C:\\Users\\Alicia\\Macheine Learning\\test_V2.csv'))
test = reduce_mem_usage(test)
print(train.shape, test.shape)

Memory usage of dataframe is 983.90 MB --> 339.28 MB (Decreased by 65.5%)
Memory usage of dataframe is 413.18 MB --> 140.19 MB (Decreased by 66.1%)
(4446966, 29) (1934174, 28)
Wall time: 1min 49s


<font size=5>
Feateure Engineering

In [5]:
#combine train and test
#drops the current index of the test dataFrame and replaces it with an index of increasing integers.
all_data = train.append(test, sort=False).reset_index(drop=True)

In [6]:
del train, test
gc.collect()

14

In [7]:
all_data.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0,0.0,0,0.0,0,0,244.8,1,1466,0.444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0,0.004,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0,0.0,0,0.0,0,0,161.8,2,0,0.775
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,...,0,0.0,0,0.0,0,0,202.7,3,0,0.167
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0,0.0,0,0.0,0,0,49.75,2,0,0.188


In [8]:
list(all_data)

['Id',
 'groupId',
 'matchId',
 'assists',
 'boosts',
 'damageDealt',
 'DBNOs',
 'headshotKills',
 'heals',
 'killPlace',
 'killPoints',
 'kills',
 'killStreaks',
 'longestKill',
 'matchDuration',
 'matchType',
 'maxPlace',
 'numGroups',
 'rankPoints',
 'revives',
 'rideDistance',
 'roadKills',
 'swimDistance',
 'teamKills',
 'vehicleDestroys',
 'walkDistance',
 'weaponsAcquired',
 'winPoints',
 'winPlacePerc']

In [9]:
to_drop=['Id', 'groupId','matchId', 'matchDuration', 'numGroups', 'maxPlace','rankPoints', 'winPoints', 'killPoints']
all_data.drop(to_drop, inplace=True, axis=1)

In [10]:
#New feature: headshot kill rate = headshot kill/kill
all_data['HeadShotKillRate'] = all_data['headshotKills'] / all_data['kills']
all_data.drop(['headshotKills'], inplace=True, axis=1)

In [11]:
all_data = reduce_mem_usage(all_data)
print(all_data.shape)

Memory usage of dataframe is 322.53 MB --> 298.19 MB (Decreased by 7.5%)
(6381140, 20)


<font size=5>
Predict

In [12]:
x_train = all_data[all_data['winPlacePerc'].notnull()].reset_index(drop=True)
x_test = all_data[all_data['winPlacePerc'].isnull()].drop(['winPlacePerc'], axis=1).reset_index(drop=True)

In [13]:
del all_data
gc.collect()

14

In [28]:
x_train.drop(['matchType'], inplace=True, axis=1)
x_test.drop(['matchType'], inplace=True, axis=1)

In [15]:
y_train = x_train.pop('winPlacePerc')

In [46]:
list(x_train)

['assists',
 'boosts',
 'damageDealt',
 'DBNOs',
 'heals',
 'killPlace',
 'kills',
 'killStreaks',
 'longestKill',
 'revives',
 'rideDistance',
 'roadKills',
 'swimDistance',
 'teamKills',
 'vehicleDestroys',
 'walkDistance',
 'weaponsAcquired',
 'HeadShotKillRate']

In [47]:
y_train.head()

0   0.444
1   0.640
2   0.775
3   0.167
4   0.188
Name: winPlacePerc, dtype: float32

In [30]:
print(x_train.shape, x_test.shape)

(4446965, 18) (1934175, 18)


In [54]:
#find nan
# 计算在x_train中NaN值的个数
x =  x_train.isnull().sum().sum()

# 输出
print('在我们DataFrame中NaN的数量:', x)

在我们DataFrame中NaN的数量: 2529721


In [57]:
test_train = x_train
print(test_train.shape, x_train.shape)

(4446965, 18) (4446965, 18)


In [58]:
test_train.drop(['HeadShotKillRate'], inplace=True, axis=1)
list(test_train)

['assists',
 'boosts',
 'damageDealt',
 'DBNOs',
 'heals',
 'killPlace',
 'kills',
 'killStreaks',
 'longestKill',
 'revives',
 'rideDistance',
 'roadKills',
 'swimDistance',
 'teamKills',
 'vehicleDestroys',
 'walkDistance',
 'weaponsAcquired']

In [59]:
x =  test_train.isnull().sum().sum()

# 输出
print('在我们DataFrame中NaN的数量:', x)

在我们DataFrame中NaN的数量: 0


In [55]:
x_train.head()

Unnamed: 0,assists,boosts,damageDealt,DBNOs,heals,killPlace,kills,killStreaks,longestKill,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,HeadShotKillRate
0,0,0,0.0,0,0,60,0,0,0.0,0,0.0,0,0.0,0,0,244.8,1,
1,0,0,91.47,0,0,57,0,0,0.0,0,0.004,0,11.04,0,0,1434.0,5,
2,1,0,68.0,0,0,47,0,0,0.0,0,0.0,0,0.0,0,0,161.8,2,
3,0,0,32.9,0,0,75,0,0,0.0,0,0.0,0,0.0,0,0,202.7,3,
4,0,0,100.0,0,0,45,1,1,58.53,0,0.0,0,0.0,0,0,49.75,2,0.0


In [18]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD

Using TensorFlow backend.


In [61]:
model = Sequential()

In [32]:
x_train.shape[1]

18

In [62]:
model.add(Dense(500, input_dim=test_train.shape[1]))
model.add(Activation('relu'))

In [63]:
model.add(Dense(500))
model.add(Activation('relu'))

In [64]:
model.add(Dense(1))
model.add(Activation('relu'))

In [65]:
model.compile(loss='mse', optimizer=SGD(lr=0.1), metrics=['accuracy'])

In [66]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 500)               9000      
_________________________________________________________________
activation_10 (Activation)   (None, 500)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 500)               250500    
_________________________________________________________________
activation_11 (Activation)   (None, 500)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 501       
_________________________________________________________________
activation_12 (Activation)   (None, 1)                 0         
Total params: 260,001
Trainable params: 260,001
Non-trainable params: 0
_________________________________________________________________


In [79]:
from keras import optimizers
from keras.callbacks import LearningRateScheduler, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers import Dense, Dropout, BatchNormalization, PReLU
from keras.models import load_model
from keras.models import Sequential

model.add(Dense(512, kernel_initializer='he_normal', input_dim=test_train.shape[1], activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))

model.add(Dense(256, kernel_initializer='he_normal'))
model.add(PReLU(alpha_initializer='zeros', alpha_regularizer=None, alpha_constraint=None, shared_axes=None))
model.add(BatchNormalization())
model.add(Dropout(0.2))

model.add(Dense(128, kernel_initializer='he_normal'))
model.add(PReLU(alpha_initializer='zeros', alpha_regularizer=None, alpha_constraint=None, shared_axes=None))
model.add(BatchNormalization())
model.add(Dropout(0.1))

model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

optimizer = optimizers.Adam(lr=0.002)
model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
#model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['mae'])

In [80]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 500)               9000      
_________________________________________________________________
activation_10 (Activation)   (None, 500)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 500)               250500    
_________________________________________________________________
activation_11 (Activation)   (None, 500)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 501       
_________________________________________________________________
activation_12 (Activation)   (None, 1)                 0         
_________________________________________________________________
dense_13 (Dense)             (None, 512)               1024      
__________

In [45]:
model.fit(x_train, y_train, batch_size=100, epochs=3)
#在拿掉HeadShotKillRate以前, 結果predic裡都是nan

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x17bdf2f9898>

In [71]:
model.fit(test_train, y_train, batch_size=100, epochs=2)
#activation function都是relu, 結果predict都是array([0.], dtype=float32)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x17c278bc550>

In [None]:
#完全照kaggle上的做
model.fit(test_train, y_train,epochs=2, batch_size=2**16)

Epoch 1/2
Epoch 2/2

<font size=5>
Result

In [72]:
from ipywidgets import interact_manual
predict = model.predict(test_train)

In [73]:
predict[2]

array([0.], dtype=float32)

In [52]:
def test(number):
    print("原值為:", y_train[number])
    print("神經網路判斷為:", predict[number])

In [77]:
test(578);

原值為: 0.0333
神經網路判斷為: [0.]
