https://www.kaggle.com/c/pubg-finish-placement-prediction

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import keras

In [None]:
pd.options.display.max_columns = 30

# Import Data And EDA

In [None]:
# Load the competition training set (path is relative to the notebook's working directory).
train_data = pd.read_csv('../input/train.csv')

In [None]:
train_data.head()

In [None]:
train_data.shape

In [None]:
train_data.describe()

### Data fields
DBNOs - Number of enemy players knocked.

assists - Number of enemy players this player damaged that were killed by teammates.

boosts - Number of boost items used.

damageDealt - Total damage dealt. Note: Self inflicted damage is subtracted.

headshotKills - Number of enemy players killed with headshots.

heals - Number of healing items used.

killPlace - Ranking in match of number of enemy players killed.

killPoints - Kills-based external ranking of player. (Think of this as an Elo ranking where only kills matter.)

killStreaks - Max number of enemy players killed in a short amount of time.

kills - Number of enemy players killed.

longestKill - Longest distance between player and player killed at time of death. This may be misleading, as downing a player and driving away may lead to a large longestKill stat.

matchId - Integer ID to identify match. There are no matches that are in both the training and testing set.

revives - Number of times this player revived teammates.

rideDistance - Total distance traveled in vehicles measured in meters.

roadKills - Number of kills while in a vehicle.

swimDistance - Total distance traveled by swimming measured in meters.

teamKills - Number of times this player killed a teammate.

vehicleDestroys - Number of vehicles destroyed.

walkDistance - Total distance traveled on foot measured in meters.

weaponsAcquired - Number of weapons picked up.

winPoints - Win-based external ranking of player. (Think of this as an Elo ranking where only winning matters.)

groupId - Integer ID to identify a group within a match. If the same group of players plays in different matches, they will have a different groupId each time.

numGroups - Number of groups we have data for in the match.

maxPlace - Worst placement we have data for in the match. This may not match with numGroups, as sometimes the data skips over placements.

winPlacePerc - The target of prediction. This is a percentile winning placement, where 1 corresponds to 1st place, and 0 corresponds to last place in the match. It is calculated off of maxPlace, not numGroups, so it is possible to have missing chunks in a match.

In [None]:
train_data.columns

In [None]:
# The 22 per-player statistics used as model inputs.
feature_names = [
    'assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals',
    'killPlace', 'killPoints', 'kills', 'killStreaks', 'longestKill',
    'maxPlace', 'numGroups', 'revives', 'rideDistance', 'roadKills',
    'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
    'weaponsAcquired', 'winPoints',
]
X = train_data[feature_names]

In [None]:
# Regression target: the placement percentile column of the training set.
y = train_data['winPlacePerc']

In [None]:
# Load the competition test set that the submissions are predicted for.
test_data = pd.read_csv('../input/test.csv')

In [None]:
test_data.columns

In [None]:
test_data.head()

In [None]:
# Sample submission frame; its 'winPlacePerc' column is overwritten with
# model predictions further down before being written back to CSV.
submissions = pd.read_csv('../input/sample_submission.csv')

In [None]:
submissions.head()

In [None]:
# Select from the test set the same 22 columns, in the same order, that are
# used as training inputs.
test_feature_names = [
    'assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals',
    'killPlace', 'killPoints', 'kills', 'killStreaks', 'longestKill',
    'maxPlace', 'numGroups', 'revives', 'rideDistance', 'roadKills',
    'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
    'weaponsAcquired', 'winPoints',
]
pred_feat = test_data[test_feature_names]

## Distribution of Target Variable

In [None]:
# Histogram of the target with 50 bins on a wide 20x8 figure.
fig, ax = plt.subplots(figsize=(20, 8))
y.plot(kind='hist', bins=50)
plt.show()

In [None]:
# Central tendency of the target; mode() returns a Series, printed as-is.
target_mean = y.mean()
print('Average:', target_mean)
target_mode = y.mode()
print('Mode:', target_mode)

In [None]:
def insight(x):
    """Print the mean, 99th percentile and maximum of one training column.

    x: name of a column of the module-level ``train_data`` frame.
    Nothing is returned; the three figures are written to stdout.
    """
    column = train_data[x]
    average = column.mean()
    print('Average: {:.4f}'.format(average))
    q99 = column.quantile(0.99)
    print('99% quantile: {} '.format(q99))
    largest = column.max()
    print('Max: {}'.format(largest))

## Kills

In [None]:
insight('kills')

**Distribution:**

In [None]:
# Bucket every kill count above 7 into a single '8+' label. The labels are
# built as strings from the start rather than writing the string '8+' into an
# integer Series, which recent pandas versions deprecate.
kills = train_data['kills']
data = kills.astype('str').mask(kills > 7, '8+')
plt.figure(figsize=(20,8))
# Pass the labels as x=: newer seaborn accepts only `data` positionally.
# Sorting the labels fixes the left-to-right order of the bars.
sns.countplot(x=data.sort_values())
plt.title("Kill Count",fontsize=15)
plt.show()

Most players don't get even one kill. How often do they knock out, damage, or assist?

In [None]:
# Restrict to players with zero kills, then plot how much they knocked out,
# assisted and damaged. One figure per statistic, in that order.
columns_of_interest = ['kills', 'damageDealt', 'assists', 'DBNOs', 'winPlacePerc']
data = train_data[columns_of_interest].copy()
data = data[data['kills'] == 0]

for heading, column in (("Knockouts by 0 killers", 'DBNOs'),
                        ("Assists by 0 killers", 'assists'),
                        ("Damage Dealt by 0 killers", 'damageDealt')):
    plt.figure(figsize=(20, 8))
    plt.title(heading, fontsize=15)
    sns.distplot(data[column])
    plt.show()

Most of these players didn't assist, knock anyone out, or deal any damage either.

In [None]:
# Expects `data` to still be the zero-kill subset built in the cell above, so
# this counts zero-kill rows whose winPlacePerc is exactly 1.
win_zerokills = len(data[data['winPlacePerc'] == 1])

In [None]:
total_players = len(train_data)

In [None]:
win_zerodamage = len(train_data[(train_data['damageDealt'] == 0) & (train_data['winPlacePerc'] == 1)])

In [None]:
win_zeroassist = len(train_data[(train_data['assists'] == 0) & (train_data['winPlacePerc'] == 1)])

In [None]:
win_zeroknocks = len(train_data[(train_data['DBNOs'] == 0) & (train_data['winPlacePerc'] == 1)])

In [None]:
print('{} ({:.4f}%) players won without even one kill'.format(win_zerokills, win_zerokills*100/total_players))

In [None]:
print('{} ({:.4f}%) players won without even knocking someone out'.format(win_zeroknocks, win_zeroknocks*100/total_players))

In [None]:
print('{} ({:.4f}%) players won without even assisting'.format(win_zeroassist, win_zeroassist*100/total_players))

In [None]:
print('{} ({:.4f}%) players won without dealing some damage'.format(win_zerodamage, win_zerodamage*100/total_players))

**Win percentage v/s kills**

In [None]:
# One joint plot of the target against each combat statistic.
for feature in ('kills', 'assists', 'DBNOs', 'damageDealt'):
    sns.jointplot(x='winPlacePerc', y=feature, data=train_data)
    plt.show()

**Correlation between killing and winning**

In [None]:
# Pairwise correlations between the combat statistics and the target.
combat_columns = ['kills', 'assists', 'DBNOs', 'damageDealt', 'winPlacePerc']
combat_corr = train_data[combat_columns].corr()
sns.heatmap(combat_corr, cmap='RdBu', annot=True)
plt.show()

In [None]:
insight('DBNOs')

In [None]:
insight('assists')

In [None]:
insight('damageDealt')

## Boosts Heals Revives

**Insights**

In [None]:
insight('boosts')

In [None]:
insight('heals')

In [None]:
insight('revives')

In [None]:
# Build a frame of the target, kills, and capped copies of the three item
# columns. In each capped column every value at or above the 99th percentile
# is replaced by a single '<percentile>+' label for plotting.
data = train_data[['winPlacePerc', 'kills']].copy()
for x in ['boosts', 'heals', 'revives']:
    raw = train_data[x]
    print(raw.unique())
    cap = raw.quantile(0.99)  # computed once, on the untouched numeric column
    # Cast to object before writing the label: assigning a string into an
    # integer Series is deprecated in recent pandas versions. The remaining
    # entries stay integers, so the column is still a mix of ints and one label.
    data1 = raw.astype(object)
    data1[raw > (cap - 1)] = str(int(cap)) + '+'
    print(data1.unique())
    data = pd.concat([data, data1], axis=1)

In [None]:
def heal_fix(x):
    """Return a label for *x* that sorts correctly as a string.

    Numeric counts are rendered as text and left-padded with zeros to two
    characters ('3' -> '03', '10' -> '10') so that lexical sorting matches
    numeric order. Values that are already strings (the capped bucket label
    such as '11+') are returned unchanged.

    The previous version special-cased the literals '11+' and 10, so any other
    cap label or any two-digit count was wrongly prefixed with an extra '0'.
    """
    if isinstance(x, str):
        # Already a bucket label; padding it would break its sort position.
        return x
    return str(x).zfill(2)
    

In [None]:
data['heals'] = data['heals'].apply(heal_fix)

In [None]:
data.head()

**Distribution**

In [None]:
plt.figure(figsize=(20,8))
# Pass the labels as x=: newer seaborn accepts only `data` positionally.
# Sorting the string labels fixes the left-to-right order of the bars.
sns.countplot(x=data['boosts'].astype('str').sort_values())
plt.title("No. of Boosts Used",fontsize=15)
plt.show()

In [None]:
plt.figure(figsize=(20,8))
# Pass the labels as x=: newer seaborn accepts only `data` positionally.
# Sorting the string labels fixes the left-to-right order of the bars.
sns.countplot(x=data['heals'].astype('str').sort_values())
plt.title("No. of Heals",fontsize=15)
plt.show()

In [None]:
plt.figure(figsize=(20,8))
# Pass the labels as x=: newer seaborn accepts only `data` positionally.
# Sorting the string labels fixes the left-to-right order of the bars.
sns.countplot(x=data['revives'].astype('str').sort_values())
plt.title("No. of Revives",fontsize=15)
plt.show()

**Correlation**

In [None]:
# Pairwise correlations between item usage, revives and the target.
item_columns = ['boosts', 'heals', 'revives', 'winPlacePerc']
item_corr = train_data[item_columns].corr()
sns.heatmap(item_corr, cmap='RdBu', annot=True)
plt.show()

In [None]:
# One joint plot of the target against each item/revive statistic.
for feature in ('boosts', 'heals', 'revives'):
    sns.jointplot(x='winPlacePerc', y=feature, data=train_data)
    plt.show()

##  Ride Swim Walk

**Insights**

In [None]:
insight('walkDistance')

In [None]:
insight('rideDistance')

In [None]:
insight('swimDistance')

**Distribution**

In [None]:
# Distribution of walkDistance, keeping only values below its 99th percentile.
walk = train_data['walkDistance'].copy()
walk_cutoff = walk.quantile(0.99)
data = walk[walk < walk_cutoff]
plt.figure(figsize=(20, 8))
sns.distplot(data)
plt.title("Walk Distance", fontsize=15)
plt.show()

In [None]:
# Count the rows with a walkDistance of exactly 0 and their share of all rows.
zero_walk = len(train_data[train_data['walkDistance'] == 0])
zero_walk_pct = zero_walk*100/len(train_data)
print('{} ({:.4f}%) walked 0 meters,which means they were killed before even taking a step'
      .format(zero_walk, zero_walk_pct))

In [None]:
# Distribution of rideDistance, keeping only values below its 99th percentile.
ride = train_data['rideDistance'].copy()
ride_cutoff = ride.quantile(0.99)
data = ride[ride < ride_cutoff]
plt.figure(figsize=(20, 8))
sns.distplot(data)
plt.title("Ride Distance", fontsize=15)
plt.show()

In [None]:
# Count the rows with a rideDistance of exactly 0 and their share of all rows.
zero_ride = len(train_data[train_data['rideDistance'] == 0])
zero_ride_pct = zero_ride*100/len(train_data)
print("{} ({:.4f}%) rode 0 meters,which means they couldn't find a vehicle"
      .format(zero_ride, zero_ride_pct))

In [None]:
# Distribution of swimDistance, keeping only values below its 99th percentile.
swim = train_data['swimDistance'].copy()
swim_cutoff = swim.quantile(0.99)
data = swim[swim < swim_cutoff]
plt.figure(figsize=(20, 8))
sns.distplot(data)
plt.title("Swim Distance", fontsize=15)
plt.show()

In [None]:
# Count the rows with a swimDistance of exactly 0 and their share of all rows.
zero_swim = len(train_data[train_data['swimDistance'] == 0])
zero_swim_pct = zero_swim*100/len(train_data)
print("{} ({:.4f}%) swim 0 meters,which means they need not to swim"
      .format(zero_swim, zero_swim_pct))

**Correlation**

In [None]:
# Pairwise correlations between the distance features, the target and kills.
distance_columns = ['walkDistance', 'rideDistance', 'swimDistance', 'winPlacePerc', 'kills']
distance_corr = train_data[distance_columns].corr()
sns.heatmap(distance_corr, cmap='RdBu', annot=True)
plt.show()

Walk Distance is highly correlated with winning

In [None]:
# One joint plot of the target against each distance feature.
for feature in ('walkDistance', 'rideDistance', 'swimDistance'):
    sns.jointplot(x='winPlacePerc', y=feature, data=train_data)
    plt.show()

## Team Kills, Headshot Kills, Longest Kill, Road Kills, Vehicle Destroys, Kill Streaks

**Insights**

In [None]:
insight('headshotKills')

In [None]:
insight('killStreaks')

In [None]:
insight('longestKill')

In [None]:
insight('roadKills')

In [None]:
insight('teamKills')

In [None]:
insight('vehicleDestroys')

**Distribution**

In [None]:
# Distribution of headshotKills over all rows (no percentile trimming).
plt.figure(figsize=(20, 8))
data = train_data['headshotKills'].copy()
sns.distplot(data)
plt.title("Head Shots", fontsize=15)
plt.show()

In [None]:
# Count the rows with no headshot kills and their share of all rows.
zero_headshots = len(train_data[train_data['headshotKills'] == 0])
zero_headshots_pct = zero_headshots*100/len(train_data)
print('{} ({:.4f}%) had 0 headshots'
      .format(zero_headshots, zero_headshots_pct))

In [None]:
# Distribution of killStreaks over all rows (no percentile trimming).
plt.figure(figsize=(20, 8))
data = train_data['killStreaks'].copy()
sns.distplot(data)
plt.title("Kill Streaks", fontsize=15)
plt.show()

In [None]:
# The filter is `killStreaks <= 3`, so the message now says "3 or fewer"
# (it previously claimed "less than 3", which does not match the condition).
streaks_up_to_3 = len(train_data[train_data['killStreaks'] <= 3])
print('{} ({:.4f}%) had kill streaks of 3 or fewer'
      .format(streaks_up_to_3, streaks_up_to_3*100/len(train_data)))

In [None]:
# Distribution of longestKill over all rows (no percentile trimming).
plt.figure(figsize=(20, 8))
data = train_data['longestKill'].copy()
sns.distplot(data)
plt.title("Longest Kill", fontsize=15)
plt.show()

In [None]:
# The filter is `longestKill <= 50`, so the message now says "50 meters or
# less" (it previously claimed "less than 50", which does not match).
short_range = len(train_data[train_data['longestKill'] <= 50])
print('{} ({:.4f}%) had a longest kill of 50 meters or less'
      .format(short_range, short_range*100/len(train_data)))

In [None]:
# Distribution of roadKills over all rows (no percentile trimming).
plt.figure(figsize=(20, 8))
data = train_data['roadKills'].copy()
sns.distplot(data)
plt.title("Road Kills", fontsize=15)
plt.show()

In [None]:
# Count the rows with no road kills and their share of all rows.
zero_road_kills = len(train_data[train_data['roadKills'] == 0])
zero_road_kills_pct = zero_road_kills*100/len(train_data)
print('{} ({:.4f}%) had 0 road kills'
      .format(zero_road_kills, zero_road_kills_pct))

In [None]:
# Distribution of teamKills over all rows (no percentile trimming).
plt.figure(figsize=(20, 8))
data = train_data['teamKills'].copy()
sns.distplot(data)
plt.title("Teams Kills", fontsize=15)
plt.show()

In [None]:
# Count the rows with no team kills and their share of all rows.
zero_team_kills = len(train_data[train_data['teamKills'] == 0])
zero_team_kills_pct = zero_team_kills*100/len(train_data)
print('{} ({:.4f}%) had 0 team kills'
      .format(zero_team_kills, zero_team_kills_pct))

In [None]:
# Distribution of vehicleDestroys over all rows (no percentile trimming).
plt.figure(figsize=(20, 8))
data = train_data['vehicleDestroys'].copy()
sns.distplot(data)
plt.title("Vehicles Destroyed", fontsize=15)
plt.show()

In [None]:
# Count the rows with no destroyed vehicles and their share of all rows.
zero_destroys = len(train_data[train_data['vehicleDestroys'] == 0])
zero_destroys_pct = zero_destroys*100/len(train_data)
print('{} ({:.4f}%) had destroyed 0 vehicles'
      .format(zero_destroys, zero_destroys_pct))

**Correlation**

In [None]:
# Pairwise correlations between the kill-type statistics, the target and kills.
kill_type_columns = ['headshotKills', 'killStreaks', 'longestKill', 'roadKills',
                     'teamKills', 'vehicleDestroys', 'winPlacePerc', 'kills']
kill_type_corr = train_data[kill_type_columns].corr()
sns.heatmap(kill_type_corr, cmap='RdBu', annot=True)
plt.show()

In [None]:
# One joint plot of the target against each kill-type statistic.
for feature in ('headshotKills', 'killStreaks', 'longestKill',
                'roadKills', 'teamKills', 'vehicleDestroys'):
    sns.jointplot(x='winPlacePerc', y=feature, data=train_data)
    plt.show()

## Weapons Acquired, Kills, Kill place, Kill points

**Insights**

In [None]:
insight('weaponsAcquired')

In [None]:
insight('killPlace')

In [None]:
insight('killPoints')

**Distribution**

In [None]:
# Distribution of weaponsAcquired over all rows (no percentile trimming).
plt.figure(figsize=(20, 8))
data = train_data['weaponsAcquired'].copy()
sns.distplot(data)
plt.title("Weapons Acquired", fontsize=15)
plt.show()

In [None]:
# Count the rows with no acquired weapons and their share of all rows.
zero_weapons = len(train_data[train_data['weaponsAcquired'] == 0])
zero_weapons_pct = zero_weapons*100/len(train_data)
print("{} ({:.4f}%) couldn't even acquire a weapon"
      .format(zero_weapons, zero_weapons_pct))

In [None]:
# Distribution of killPlace over all rows (no percentile trimming).
plt.figure(figsize=(20, 8))
data = train_data['killPlace'].copy()
sns.distplot(data)
plt.title("Kill Place", fontsize=15)
plt.show()

In [None]:
# Distribution of killPoints over all rows (no percentile trimming).
plt.figure(figsize=(20, 8))
data = train_data['killPoints'].copy()
sns.distplot(data)
plt.title("Kill Points", fontsize=15)
plt.show()

In [None]:
# Count the rows whose killPoints equal exactly 1000 and their share of all rows.
at_1000 = len(train_data[train_data['killPoints'] == 1000])
at_1000_pct = at_1000*100/len(train_data)
print("{} ({:.4f}%) had 1000 kill points"
      .format(at_1000, at_1000_pct))

**Correlation**

In [None]:
# Pairwise correlations between weapons/kill ranking features, the target and kills.
ranking_columns = ['weaponsAcquired', 'killPlace', 'killPoints', 'winPlacePerc', 'kills']
ranking_corr = train_data[ranking_columns].corr()
sns.heatmap(ranking_corr, cmap='RdBu', annot=True)
plt.show()

In [None]:
# One joint plot of the target against each weapons/kill ranking feature.
for feature in ('weaponsAcquired', 'killPlace', 'killPoints'):
    sns.jointplot(x='winPlacePerc', y=feature, data=train_data)
    plt.show()

## Solos Duos Squads

In [None]:
# Split the rows into three game modes by the number of groups in the match:
# more than 50 -> solos, 26..50 -> duos, 25 or fewer -> squads.
# NOTE(review): the counts below are rows of train_data, although the message
# calls them "games" -- confirm the intended wording.
num_groups = train_data['numGroups']
solos = train_data[num_groups > 50]
duos = train_data[(num_groups > 25) & (num_groups <= 50)]
squads = train_data[num_groups <= 25]

total_rows = len(train_data)
mode_summary = "There are {} ({:.2f}%) solo games,\n\t  {} ({:.2f}%) duo games and\n\t  {} ({:.2f}%) squad games."
print(mode_summary.format(
    len(solos), 100*len(solos)/total_rows,
    len(duos), 100*len(duos)/total_rows,
    len(squads), 100*len(squads)/total_rows,
))

In [None]:
# Overlay winPlacePerc against kills for the three game modes on one figure,
# with a colour-matched text label per mode acting as the legend.
fig, ax = plt.subplots(figsize=(20, 10))
mode_styles = [
    ('Solos', solos, 'black', 0.6),
    ('Duos', duos, '#CC0000', 0.55),
    ('Squads', squads, '#3399FF', 0.5),
]
for _, frame, colour, _ in mode_styles:
    sns.pointplot(x='kills', y='winPlacePerc', data=frame, color=colour, alpha=0.8)
for label, _, colour, height in mode_styles:
    plt.text(37, height, label, color=colour, fontsize=17, style='italic')
plt.xlabel('Number of kills', fontsize=15, color='blue')
plt.ylabel('Win Percentage', fontsize=15, color='blue')
plt.title('Solo vs Duo vs Squad Kills', fontsize=20, color='blue')
plt.grid()
plt.show()

Solos and duos behave pretty much the same.

## Scatterplot of various features with target Variable

In [None]:
# Joint plots for the input columns that were not already plotted in the
# sections above. X must still carry its original column names here, i.e.
# this has to run before X is replaced by the scaled frame.
already_plotted = [
    'kills', 'assists', 'DBNOs', 'damageDealt',
    'boosts', 'heals', 'revives',
    'walkDistance', 'rideDistance', 'swimDistance',
    'headshotKills', 'killStreaks', 'longestKill', 'roadKills', 'teamKills', 'vehicleDestroys',
    'weaponsAcquired', 'killPlace', 'killPoints',
]
remaining_columns = X.columns.drop(already_plotted)
for x in remaining_columns:
    sns.jointplot(x='winPlacePerc', y=x, data=train_data)
    plt.show()

## Correlation

In [None]:
# Annotated correlation heatmap over every column of the training frame.
fig, ax = plt.subplots(figsize=(20, 20))
full_corr = train_data.corr()
sns.heatmap(full_corr, cmap='RdBu', annot=True)
plt.show()

In [None]:
# Heatmap restricted to the k columns with the largest correlation to the
# target (the selection is by correlation value, so the target itself counts).
k = 5
fig, ax = plt.subplots(figsize=(11, 11))
target_corr = train_data.corr().nlargest(k, 'winPlacePerc')['winPlacePerc']
cols = target_corr.index
cm = np.corrcoef(train_data[cols].values.T)
sns.set(font_scale=1.25)
tick_labels = cols.values
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=tick_labels,
    xticklabels=tick_labels,
)
plt.show()

# Preprocessing

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every training feature. The result replaces X with a new
# frame that has default integer column labels instead of the feature names.
scaler = MinMaxScaler()
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X))

In [None]:
# Scale the test features with the scaler that was fitted on the training
# features. Fitting a second scaler on the test set maps the same raw value
# to different scaled values in train and test, so the models would be
# applied to inputs on a different scale than the one they were trained on.
scaler_pred = scaler  # alias kept so any reference to scaler_pred still works
pred_feat = pd.DataFrame(scaler.transform(pred_feat))

# Model Training

In [None]:
from sklearn.model_selection import train_test_split
# Fix the seed so the hold-out split, and every score computed on it, is
# reproducible between runs. The default split proportions are unchanged.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on the training split; the cell's displayed value is the score on the
# held-out split.
model = LinearRegression().fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# Refit on the full training set and predict the test set.
model = LinearRegression()
model.fit(X, y)
# A linear model's output is unbounded, but winPlacePerc is a percentile in
# [0, 1]; clip so the submission never contains out-of-range values.
pred = np.clip(model.predict(pred_feat), 0, 1)

submissions['winPlacePerc'] = pred
submissions.to_csv('LinearRegression.csv', index=False)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a default-parameter forest on the training split; the cell's displayed
# value is the score on the held-out split.
model = RandomForestRegressor().fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# Refit a default-parameter forest on the full training set, predict the test
# set and write the predictions out as a submission file.
model = RandomForestRegressor().fit(X, y)
pred = model.predict(pred_feat)

submissions['winPlacePerc'] = pred
submissions.to_csv('RandomForestRegressor.csv', index=False)

## Keras Neural Network

In [None]:
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout

### [22->11->1]

In [None]:
def NeuralNetwork1():
    """Build and compile the 22 -> 11 -> 1 network.

    One ReLU hidden layer of 11 units followed by batch normalisation and
    10% dropout, then a single sigmoid output unit. Compiled with the Adam
    optimizer, MSE loss and MAE as the reported metric.
    """
    layers = [
        Dense(11, input_dim=22, activation='relu'),
        BatchNormalization(),
        Dropout(0.1),
        Dense(1, activation='sigmoid'),
    ]
    net = Sequential(layers)
    net.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return net

In [None]:
network1 = NeuralNetwork1()

In [None]:
network1.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5)

### [22->44->1]

In [None]:
def NeuralNetwork2():
    """Build and compile the 22 -> 44 -> 1 network.

    One ReLU hidden layer of 44 units followed by batch normalisation and
    10% dropout, then a single sigmoid output unit. Compiled with the Adam
    optimizer, MSE loss and MAE as the reported metric.
    """
    layers = [
        Dense(44, input_dim=22, activation='relu'),
        BatchNormalization(),
        Dropout(0.1),
        Dense(1, activation='sigmoid'),
    ]
    net = Sequential(layers)
    net.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return net

In [None]:
network2 = NeuralNetwork2()

In [None]:
network2.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5)

In [None]:
# Train a fresh 22->44->1 network on the full training set for 10 epochs
# (no validation data is passed), then predict the scaled test features.
network = NeuralNetwork2()
network.fit(X, y, epochs=10)
pred = network.predict(pred_feat)

In [None]:
for p in pred[:5]:
    print(p)

In [None]:
# The network has a single output unit, so its predictions come back as one
# column per row; flatten to 1-D explicitly instead of relying on pandas to
# coerce a 2-D array into a column.
submissions['winPlacePerc'] = np.ravel(pred)
submissions.to_csv('Keras22-44-1.csv', index=False)

### [22->44->22->11->1]

In [None]:
def NeuralNetwork3():
    """Build and compile the 22 -> 44 -> 22 -> 11 -> 1 network.

    Three ReLU hidden layers (44, 22 and 11 units), each followed by batch
    normalisation and 10% dropout, then a single sigmoid output unit.
    Compiled with the Adam optimizer, MSE loss and MAE as the reported metric.
    """
    net = Sequential()
    for position, units in enumerate((44, 22, 11)):
        if position == 0:
            # Only the first layer declares the 22-feature input size.
            net.add(Dense(units, input_dim=22, activation='relu'))
        else:
            net.add(Dense(units, activation='relu'))
        net.add(BatchNormalization())
        net.add(Dropout(0.1))
    net.add(Dense(1, activation='sigmoid'))
    net.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return net

In [None]:
network3 = NeuralNetwork3()

In [None]:
network3.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5)

### [22->512->256->128->1]

In [None]:
def NeuralNetwork4():
    """Build and compile the 22 -> 512 -> 256 -> 128 -> 1 network.

    Three ReLU hidden layers (512, 256 and 128 units), each followed by batch
    normalisation and 10% dropout, then a single sigmoid output unit.
    Compiled with the Adam optimizer, MSE loss and MAE as the reported metric.
    """
    net = Sequential()
    for position, units in enumerate((512, 256, 128)):
        if position == 0:
            # Only the first layer declares the 22-feature input size.
            net.add(Dense(units, input_dim=22, activation='relu'))
        else:
            net.add(Dense(units, activation='relu'))
        net.add(BatchNormalization())
        net.add(Dropout(0.1))
    net.add(Dense(1, activation='sigmoid'))
    net.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return net

In [None]:
network4 = NeuralNetwork4()

In [None]:
network4.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5)

In [None]:
# Train a fresh 22->512->256->128->1 network on the full training set for
# 10 epochs (no validation data is passed), then predict the scaled test
# features.
network = NeuralNetwork4()
network.fit(X, y, epochs=10)
pred = network.predict(pred_feat)

In [None]:
# The network has a single output unit, so its predictions come back as one
# column per row; flatten to 1-D explicitly instead of relying on pandas to
# coerce a 2-D array into a column.
submissions['winPlacePerc'] = np.ravel(pred)
submissions.to_csv('Keras22-512-256-128-1.csv', index=False)