In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [3]:
pubg_train = reduce_mem_usage(pd.read_csv('train_V2.csv'))

Memory usage of dataframe is 983.90 MB
Memory usage after optimization is: 288.39 MB
Decreased by 70.7%


In [4]:
pubg_train.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0,0.0,0,0.0,0,0,244.75,1,1466,0.444336
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.5,0,0,0,57,...,0,0.004501,0,11.039062,0,0,1434.0,5,0,0.640137
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0,0.0,0,0.0,0,0,161.75,2,0,0.775391
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.90625,0,0,0,75,...,0,0.0,0,0.0,0,0,202.75,3,0,0.166748
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [5]:
pubg_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
Id                 object
groupId            object
matchId            object
assists            int8
boosts             int8
damageDealt        float16
DBNOs              int8
headshotKills      int8
heals              int8
killPlace          int8
killPoints         int16
kills              int8
killStreaks        int8
longestKill        float16
matchDuration      int16
matchType          object
maxPlace           int8
numGroups          int8
rankPoints         int16
revives            int8
rideDistance       float16
roadKills          int8
swimDistance       float16
teamKills          int8
vehicleDestroys    int8
walkDistance       float16
weaponsAcquired    int16
winPoints          int16
winPlacePerc       float16
dtypes: float16(6), int16(5), int8(14), object(4)
memory usage: 288.4+ MB


In [6]:
pubg_train['matchType'].unique()

array(['squad-fpp', 'duo', 'solo-fpp', 'squad', 'duo-fpp', 'solo',
       'normal-squad-fpp', 'crashfpp', 'flaretpp', 'normal-solo-fpp',
       'flarefpp', 'normal-duo-fpp', 'normal-duo', 'normal-squad',
       'crashtpp', 'normal-solo'], dtype=object)

In [7]:
solos_df = pubg_train[(pubg_train['matchType'] == 'solo-fpp') | (pubg_train['matchType'] == 'solo') 
                      | (pubg_train['matchType'] == 'normal-solo-fpp')].reset_index().drop('index', axis=1)
solos_df.drop(['groupId', 'matchId', 'DBNOs', 'revives', 'swimDistance', 'walkDistance', 'rideDistance', 'matchType'], axis=1, inplace=True);
solos_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720387 entries, 0 to 720386
Data columns (total 21 columns):
Id                 720387 non-null object
assists            720387 non-null int8
boosts             720387 non-null int8
damageDealt        720387 non-null float16
headshotKills      720387 non-null int8
heals              720387 non-null int8
killPlace          720387 non-null int8
killPoints         720387 non-null int16
kills              720387 non-null int8
killStreaks        720387 non-null int8
longestKill        720387 non-null float16
matchDuration      720387 non-null int16
maxPlace           720387 non-null int8
numGroups          720387 non-null int8
rankPoints         720387 non-null int16
roadKills          720387 non-null int8
teamKills          720387 non-null int8
vehicleDestroys    720387 non-null int8
weaponsAcquired    720387 non-null int16
winPoints          720387 non-null int16
winPlacePerc       720386 non-null float16
dtypes: float16(3), int16(5), int

In [8]:
solos_df.describe()

Unnamed: 0,assists,boosts,damageDealt,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,maxPlace,numGroups,rankPoints,roadKills,teamKills,vehicleDestroys,weaponsAcquired,winPoints,winPlacePerc
count,720387.0,720387.0,720387.0,720387.0,720387.0,720387.0,720387.0,720387.0,720387.0,720387.0,720387.0,720387.0,720387.0,720387.0,720387.0,720387.0,720387.0,720387.0,720387.0,720386.0
mean,0.061474,1.059973,,0.248525,1.01349,47.474106,447.321506,0.933884,0.474645,,1600.831105,93.942782,91.150458,966.801465,0.005115,0.011459,0.004511,3.587341,557.243486,
std,0.291209,1.79953,,0.65769,2.405218,27.723485,595.020288,1.589397,0.563667,,256.823622,10.007331,11.317641,744.266943,0.08798,0.106458,0.069617,2.59487,728.786109,0.0
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,9.0,1.0,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,23.0,0.0,0.0,0.0,0.0,1384.0,94.0,91.0,-1.0,0.0,0.0,0.0,2.0,0.0,0.229248
50%,0.0,0.0,74.25,0.0,0.0,47.0,0.0,0.0,0.0,0.0,1456.0,96.0,93.0,1496.0,0.0,0.0,0.0,3.0,0.0,0.483887
75%,0.0,2.0,169.125,0.0,1.0,71.0,1090.0,1.0,1.0,20.84375,1874.0,97.0,95.0,1517.0,0.0,0.0,0.0,5.0,1492.0,0.747559
max,22.0,28.0,6616.0,42.0,63.0,100.0,1970.0,66.0,18.0,1002.0,2237.0,100.0,100.0,2857.0,18.0,2.0,5.0,153.0,1922.0,1.0


In [9]:
np.nan in solos_df

False

No NaN values in the dataframe, that's good. Should make life easier.

In [10]:
solos_df[solos_df['kills'] >= 50]

Unnamed: 0,Id,assists,boosts,damageDealt,headshotKills,heals,killPlace,killPoints,kills,killStreaks,...,matchDuration,maxPlace,numGroups,rankPoints,roadKills,teamKills,vehicleDestroys,weaponsAcquired,winPoints,winPlacePerc
54296,810f2379261545,20,0,6616.0,13,5,1,0,65,7,...,1798,11,11,1500,0,0,0,60,0,1.0
124549,2ade4369bccd12,12,0,5556.0,7,4,1,0,55,6,...,1798,19,18,1500,0,0,0,66,0,1.0
128897,7f3edd982813e6,6,0,5408.0,16,14,1,1000,56,5,...,1797,37,22,-1,0,0,0,50,1500,1.0
202420,80ac0bbf58bfaf,5,0,6376.0,21,4,1,0,66,8,...,1390,18,12,1500,0,0,0,23,0,1.0
236013,c47bd86daa3de6,2,2,4496.0,42,2,1,1000,50,7,...,1136,30,17,-1,0,0,0,15,1500,1.0
237412,be4ff9afaa5bb1,22,0,5376.0,8,4,3,0,53,3,...,1798,11,11,1500,0,0,0,83,0,0.799805
242116,1f6ceaa5140fb6,15,0,5468.0,13,5,2,0,56,4,...,1798,11,11,1500,0,0,0,60,0,0.899902
421731,436d1530e9eb00,5,0,4348.0,31,0,1,0,53,8,...,1116,37,26,1500,0,0,0,36,0,1.0
474905,97047ef60af1e7,11,0,5520.0,8,4,1,0,53,3,...,1799,16,14,1500,0,0,0,57,0,1.0


Having played the game, I'm fairly certain these are cheaters that used hacking techniques to have instant kills across the map. Especially #236013, I don't like that 84% of his kills were headshot kills, and his longest kill was over 300 meters. That screams cheating.

In [11]:
solos_df = solos_df.drop(solos_df[solos_df['kills'] >= 50].index)
solos_df.describe()

Unnamed: 0,assists,boosts,damageDealt,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,maxPlace,numGroups,rankPoints,roadKills,teamKills,vehicleDestroys,weaponsAcquired,winPoints,winPlacePerc
count,720378.0,720378.0,720378.0,720378.0,720378.0,720378.0,720378.0,720378.0,720378.0,720378.0,720378.0,720378.0,720378.0,720378.0,720378.0,720378.0,720378.0,720378.0,720378.0,720377.0
mean,0.061339,1.059984,,0.248307,1.013444,47.474683,447.324319,0.933192,0.47458,,1600.831073,93.943692,91.1514,966.798971,0.005115,0.011459,0.004512,3.586761,557.246283,
std,0.287729,1.799537,,0.65361,2.405164,27.723178,595.021659,1.577191,0.563333,,256.823257,10.004017,11.314561,744.26799,0.087981,0.106459,0.069617,2.588692,728.786899,0.0
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,9.0,1.0,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,23.0,0.0,0.0,0.0,0.0,1384.0,94.0,91.0,-1.0,0.0,0.0,0.0,2.0,0.0,0.229248
50%,0.0,0.0,74.25,0.0,0.0,47.0,0.0,0.0,0.0,0.0,1456.0,96.0,93.0,1496.0,0.0,0.0,0.0,3.0,0.0,0.483887
75%,0.0,2.0,169.125,0.0,1.0,71.0,1090.0,1.0,1.0,20.84375,1874.0,97.0,95.0,1517.0,0.0,0.0,0.0,5.0,1492.0,0.747559
max,21.0,28.0,5480.0,39.0,63.0,100.0,1970.0,48.0,18.0,1002.0,2237.0,100.0,100.0,2857.0,18.0,2.0,5.0,153.0,1922.0,1.0


Continuing the search for cheaters, anyone with more than an 80% headshot kill to kill ratio and more than 20 kills could be classified as a cheater and should not be considered within the dataset.

In [12]:
solos_df[((solos_df['headshotKills'] / solos_df['kills']) >= 0.8) & (solos_df['kills'] >= 20)]

Unnamed: 0,Id,assists,boosts,damageDealt,headshotKills,heals,killPlace,killPoints,kills,killStreaks,...,matchDuration,maxPlace,numGroups,rankPoints,roadKills,teamKills,vehicleDestroys,weaponsAcquired,winPoints,winPlacePerc
158594,8e9eb1ce0e0135,0,7,2068.0,19,2,1,0,21,2,...,1375,97,94,1507,0,0,0,4,0,1.0
330512,3e28994fb2950b,1,0,3696.0,39,2,1,0,41,8,...,886,17,13,1500,0,0,0,21,0,1.0
544484,a7ad68271a7137,5,0,2560.0,23,0,3,1000,25,9,...,897,21,13,-1,0,0,0,14,1500,0.899902


In [13]:
solos_df = solos_df.drop(solos_df[((solos_df['headshotKills'] / solos_df['kills']) >= 0.8) & (solos_df['kills'] >= 20)].index)

Next we take a look at match duration. Looking for matches that lasted shorter than about 15 minutes could mean that the game started with a well below average number of people.

In [21]:
three_down = np.mean(solos_df['matchDuration']) - 3 * np.std(solos_df['matchDuration'])

print(three_down)

834.5493596376818


So three_down will look for matches in this database that lasted less than 830 seconds, which is about 14 minutes. Games in PUBG typically last around 1600 seconds or around 26 minutes. These are games that could have started with a small amount of people. Only 30 or so people, so getting rid of these will drop all of those games that started with a well below average number of people.

In [15]:
solos_df[(solos_df['matchDuration'] <= three_down)]

Unnamed: 0,Id,assists,boosts,damageDealt,headshotKills,heals,killPlace,killPoints,kills,killStreaks,...,matchDuration,maxPlace,numGroups,rankPoints,roadKills,teamKills,vehicleDestroys,weaponsAcquired,winPoints,winPlacePerc
211,32220283493bc2,2,0,655.00000,2,0,11,1000,6,3,...,492,20,13,-1,0,0,0,9,1500,0.473633
1055,6ab29688a68902,0,2,85.68750,0,7,3,0,1,1,...,314,10,10,1500,0,0,0,1,0,0.777832
6004,8358fd4679e2db,4,0,339.00000,1,0,13,1000,3,1,...,492,20,13,-1,0,0,0,9,1500,0.368408
8206,26809358eab674,1,0,352.25000,0,2,6,0,2,1,...,635,6,5,1500,0,0,0,25,0,0.000000
8415,a16a20a3305521,0,1,522.00000,1,5,6,0,2,1,...,686,7,7,1500,0,0,0,18,0,0.166748
10647,822d53b0965228,4,0,1009.00000,1,0,7,1000,12,3,...,599,21,14,-1,0,0,0,20,1500,0.750000
16032,a2b88f886138cf,0,0,200.00000,1,1,16,1000,2,2,...,384,37,12,-1,0,0,0,7,1500,0.944336
16068,1ac82deebad3df,0,0,0.00000,0,0,13,1000,0,0,...,762,16,15,-1,0,0,0,1,1500,0.133301
16653,d897450a550371,1,1,689.00000,7,3,16,1000,7,2,...,732,32,16,-1,0,0,0,12,1500,0.580566
20197,ef3c16baa36e0f,0,0,29.40625,0,0,6,0,0,0,...,530,9,9,1500,0,0,0,1,0,0.500000


In [16]:
solos_df = solos_df.drop(solos_df[(solos_df['matchDuration'] <= three_down)].index)
solos_df.head()

Unnamed: 0,Id,assists,boosts,damageDealt,headshotKills,heals,killPlace,killPoints,kills,killStreaks,...,matchDuration,maxPlace,numGroups,rankPoints,roadKills,teamKills,vehicleDestroys,weaponsAcquired,winPoints,winPlacePerc
0,315c96c26c9aac,0,0,100.0,0,0,45,0,1,1,...,1424,97,95,1560,0,0,0,2,0,0.1875
1,311b84c6ff4390,0,0,8.539062,0,0,48,1000,0,0,...,1967,96,92,-1,0,0,0,6,1500,0.736816
2,b7807186e3f679,0,1,324.25,1,5,5,986,4,1,...,1886,97,94,-1,0,0,0,6,1462,0.875
3,92022479b92ce7,0,3,254.25,0,12,13,0,2,1,...,1371,96,95,1536,0,0,0,3,0,0.821289
4,47143f942503e0,0,0,136.875,0,0,37,0,1,1,...,1425,96,94,1500,0,0,0,1,0,0.347412


In [17]:
solos_df.describe()

Unnamed: 0,assists,boosts,damageDealt,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,maxPlace,numGroups,rankPoints,roadKills,teamKills,vehicleDestroys,weaponsAcquired,winPoints,winPlacePerc
count,719937.0,719937.0,719937.0,719937.0,719937.0,719937.0,719937.0,719937.0,719937.0,719937.0,719937.0,719937.0,719937.0,719937.0,719937.0,719937.0,719937.0,719937.0,719937.0,719937.0
mean,0.060916,1.060325,,0.247738,1.013297,47.496199,447.310804,0.931598,0.474196,,1601.451408,93.9876,91.19877,966.903819,0.005117,0.011455,0.004514,3.583739,557.156339,
std,0.285408,1.799856,,0.648956,2.405124,27.716388,595.075446,1.571368,0.562466,,255.635701,9.840698,11.153571,744.252849,0.088,0.10644,0.069638,2.580297,728.765535,0.0
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,850.0,5.0,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,23.0,0.0,0.0,0.0,0.0,1384.0,94.0,91.0,-1.0,0.0,0.0,0.0,2.0,0.0,0.229248
50%,0.0,0.0,74.1875,0.0,0.0,47.0,0.0,0.0,0.0,0.0,1456.0,96.0,93.0,1496.0,0.0,0.0,0.0,3.0,0.0,0.483887
75%,0.0,2.0,169.125,0.0,1.0,71.0,1090.0,1.0,1.0,20.8125,1875.0,97.0,95.0,1517.0,0.0,0.0,0.0,5.0,1492.0,0.747559
max,21.0,28.0,5480.0,26.0,63.0,100.0,1970.0,48.0,18.0,1002.0,2237.0,100.0,100.0,2857.0,18.0,2.0,5.0,153.0,1922.0,1.0


In the teamKills column, there is a max of 2. Someone can't kill themselves twice in a solos game. Let's see that data point

In [18]:
solos_df[solos_df['teamKills'] == max(solos_df['teamKills'])]

Unnamed: 0,Id,assists,boosts,damageDealt,headshotKills,heals,killPlace,killPoints,kills,killStreaks,...,matchDuration,maxPlace,numGroups,rankPoints,roadKills,teamKills,vehicleDestroys,weaponsAcquired,winPoints,winPlacePerc
58140,6e34b7909e4691,1,2,881.0,6,17,6,0,12,2,...,1097,11,8,1500,0,2,0,38,0,0.399902
446343,0c575f045d2fb1,3,0,566.0,1,2,23,0,2,1,...,1093,23,17,1500,0,2,0,26,0,0.0


Two of them. These data points have clearly bad data in them, so they must be removed from the data set

In [19]:
solos_df = solos_df.drop(solos_df[solos_df['teamKills'] == 2].index)
solos_df.describe()

Unnamed: 0,assists,boosts,damageDealt,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,maxPlace,numGroups,rankPoints,roadKills,teamKills,vehicleDestroys,weaponsAcquired,winPoints,winPlacePerc
count,719935.0,719935.0,719935.0,719935.0,719935.0,719935.0,719935.0,719935.0,719935.0,719935.0,719935.0,719935.0,719935.0,719935.0,719935.0,719935.0,719935.0,719935.0,719935.0,719935.0
mean,0.060911,1.060325,,0.247729,1.013273,47.496291,447.312046,0.931581,0.474193,,1601.452814,93.987814,91.198989,966.902338,0.005117,0.01145,0.004514,3.58366,557.157886,
std,0.285385,1.799858,,0.648921,2.405053,27.716368,595.075806,1.571316,0.562464,,255.634662,9.83987,11.152813,744.253353,0.088,0.106389,0.069638,2.579846,728.765955,0.0
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,850.0,5.0,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,23.0,0.0,0.0,0.0,0.0,1384.0,94.0,91.0,-1.0,0.0,0.0,0.0,2.0,0.0,0.229248
50%,0.0,0.0,74.1875,0.0,0.0,47.0,0.0,0.0,0.0,0.0,1456.0,96.0,93.0,1496.0,0.0,0.0,0.0,3.0,0.0,0.483887
75%,0.0,2.0,169.125,0.0,1.0,71.0,1090.0,1.0,1.0,20.8125,1875.0,97.0,95.0,1517.0,0.0,0.0,0.0,5.0,1492.0,0.747559
max,21.0,28.0,5480.0,26.0,63.0,100.0,1970.0,48.0,18.0,1002.0,2237.0,100.0,100.0,2857.0,18.0,1.0,5.0,153.0,1922.0,1.0


In [20]:
solos_df = solos_df.reset_index().drop('index', axis=1)