In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast numeric columns to the smallest NumPy dtype that holds their range.

    Only plain NumPy integer and float columns are touched. Object, bool,
    datetime, categorical and other extension-dtype columns are left
    unchanged. Signed integers stay signed, unsigned integers stay unsigned,
    and a column is never cast to a wider dtype than it already has.

    The frame is modified in place and also returned; the memory footprint
    before and after is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be downcast. It is mutated.
    use_float16 : bool, default True
        Allow float columns to shrink to float16. Only the value range is
        checked, not precision: float16 keeps roughly 3 significant decimal
        digits (float32 roughly 7), so stored values get rounded. Pass False
        to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its numeric columns downcast.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes, ordered from narrowest to widest.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Skip everything that is not a plain NumPy int/uint/float column;
        # casting bools, datetimes or categoricals to float would change
        # their meaning (and can even grow them).
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            candidates, limits = signed_types, np.iinfo
        elif col_type.kind == 'u':
            candidates, limits = unsigned_types, np.iinfo
        else:
            candidates, limits = float_types, np.finfo

        for candidate in candidates:
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the dtype limit still fits.
            # NaN min/max (empty or all-NaN column) fails both comparisons,
            # so such columns are left as they are.
            if c_min >= bounds.min and c_max <= bounds.max:
                # Only cast when it actually shrinks the column.
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [3]:
# Read the training CSV and downcast its numeric columns with
# reduce_mem_usage, which also prints the before/after memory footprint.
pubg_train = reduce_mem_usage(pd.read_csv('train_V2.csv'))

Memory usage of dataframe is 983.90 MB
Memory usage after optimization is: 288.39 MB
Decreased by 70.7%


In [4]:
# Preview the first rows of the downcast training frame.
pubg_train.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0,0.0,0,0.0,0,0,244.75,1,1466,0.444336
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.5,0,0,0,57,...,0,0.004501,0,11.039062,0,0,1434.0,5,0,0.640137
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0,0.0,0,0.0,0,0,161.75,2,0,0.775391
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.90625,0,0,0,75,...,0,0.0,0,0.0,0,0,202.75,3,0,0.166748
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [5]:
# Show the per-column dtypes and total memory usage after downcasting.
pubg_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
Id                 object
groupId            object
matchId            object
assists            int8
boosts             int8
damageDealt        float16
DBNOs              int8
headshotKills      int8
heals              int8
killPlace          int8
killPoints         int16
kills              int8
killStreaks        int8
longestKill        float16
matchDuration      int16
matchType          object
maxPlace           int8
numGroups          int8
rankPoints         int16
revives            int8
rideDistance       float16
roadKills          int8
swimDistance       float16
teamKills          int8
vehicleDestroys    int8
walkDistance       float16
weaponsAcquired    int16
winPoints          int16
winPlacePerc       float16
dtypes: float16(6), int16(5), int8(14), object(4)
memory usage: 288.4+ MB


In [6]:
# List the distinct matchType values present in the training data.
pubg_train['matchType'].unique()

array(['squad-fpp', 'duo', 'solo-fpp', 'squad', 'duo-fpp', 'solo',
       'normal-squad-fpp', 'crashfpp', 'flaretpp', 'normal-solo-fpp',
       'flarefpp', 'normal-duo-fpp', 'normal-duo', 'normal-squad',
       'crashtpp', 'normal-solo'], dtype=object)

In [7]:
# Read the test CSV and downcast its numeric columns with
# reduce_mem_usage, which also prints the before/after memory footprint.
pubg_test = reduce_mem_usage(pd.read_csv('test_V2.csv'))

Memory usage of dataframe is 413.18 MB
Memory usage after optimization is: 121.74 MB
Decreased by 70.5%


In [8]:
# Preview the first rows of the downcast test frame.
pubg_test.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints
0,9329eb41e215eb,676b23c24e70d6,45b576ab7daa7f,0,0,51.46875,0,0,0,73,...,1500,0,0.0,0,0.0,0,0,588.0,1,0
1,639bd0dcd7bda8,430933124148dd,42a9a0b906c928,0,4,179.125,0,0,2,11,...,1503,2,4668.0,0,0.0,0,0,2017.0,6,0
2,63d5c8ef8dfe91,0b45f5db20ba99,87e7e4477a048e,1,0,23.40625,0,0,4,49,...,1565,0,0.0,0,0.0,0,0,788.0,4,0
3,cf5b81422591d1,b7497dbdc77f4a,1b9a94f1af67f1,0,0,65.5,0,0,0,54,...,1465,0,0.0,0,0.0,0,0,1812.0,3,0
4,ee6a295187ba21,6604ce20a1d230,40754a93016066,0,4,330.25,1,2,1,7,...,1480,1,0.0,0,0.0,0,0,2964.0,4,0


In [9]:
# Show the per-column dtypes and total memory usage after downcasting.
pubg_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1934174 entries, 0 to 1934173
Data columns (total 28 columns):
Id                 object
groupId            object
matchId            object
assists            int8
boosts             int8
damageDealt        float16
DBNOs              int8
headshotKills      int8
heals              int8
killPlace          int8
killPoints         int16
kills              int8
killStreaks        int8
longestKill        float16
matchDuration      int16
matchType          object
maxPlace           int8
numGroups          int8
rankPoints         int16
revives            int8
rideDistance       float16
roadKills          int8
swimDistance       float16
teamKills          int8
vehicleDestroys    int8
walkDistance       float16
weaponsAcquired    int16
winPoints          int16
dtypes: float16(5), int16(5), int8(14), object(4)
memory usage: 121.7+ MB


In [10]:
# Match types treated as "solo" for the training subset.
# NOTE(review): 'normal-solo' is also present in pubg_train['matchType'] but is
# not selected here -- confirm whether leaving it out is intentional.
solo_match_types = ['solo-fpp', 'solo', 'normal-solo-fpp']

# Keep only the solo rows, renumber them from 0, and drop the group/match
# identifiers, the matchType column itself, and DBNOs/revives (presumably
# because they carry no information for solo play -- TODO confirm).
train_solos_df = (
    pubg_train[pubg_train['matchType'].isin(solo_match_types)]
    .reset_index(drop=True)
    .drop(['groupId', 'matchId', 'DBNOs', 'revives', 'matchType'], axis=1)
)
train_solos_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720387 entries, 0 to 720386
Data columns (total 24 columns):
Id                 720387 non-null object
assists            720387 non-null int8
boosts             720387 non-null int8
damageDealt        720387 non-null float16
headshotKills      720387 non-null int8
heals              720387 non-null int8
killPlace          720387 non-null int8
killPoints         720387 non-null int16
kills              720387 non-null int8
killStreaks        720387 non-null int8
longestKill        720387 non-null float16
matchDuration      720387 non-null int16
maxPlace           720387 non-null int8
numGroups          720387 non-null int8
rankPoints         720387 non-null int16
rideDistance       720387 non-null float16
roadKills          720387 non-null int8
swimDistance       720387 non-null float16
teamKills          720387 non-null int8
vehicleDestroys    720387 non-null int8
walkDistance       720387 non-null float16
weaponsAcquired    720387 non

In [11]:
# Match types treated as "solo" for the test subset.
# NOTE(review): 'normal-solo' is not selected here even though the training
# data contains that match type -- confirm whether leaving it out is intentional.
solo_match_types = ['solo-fpp', 'solo', 'normal-solo-fpp']

# Keep only the solo rows, renumber them from 0, and drop the group/match
# identifiers, the matchType column itself, and DBNOs/revives (presumably
# because they carry no information for solo play -- TODO confirm).
test_solos_df = (
    pubg_test[pubg_test['matchType'].isin(solo_match_types)]
    .reset_index(drop=True)
    .drop(['groupId', 'matchId', 'DBNOs', 'revives', 'matchType'], axis=1)
)
test_solos_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 313866 entries, 0 to 313865
Data columns (total 23 columns):
Id                 313866 non-null object
assists            313866 non-null int8
boosts             313866 non-null int8
damageDealt        313866 non-null float16
headshotKills      313866 non-null int8
heals              313866 non-null int8
killPlace          313866 non-null int8
killPoints         313866 non-null int16
kills              313866 non-null int8
killStreaks        313866 non-null int8
longestKill        313866 non-null float16
matchDuration      313866 non-null int16
maxPlace           313866 non-null int8
numGroups          313866 non-null int8
rankPoints         313866 non-null int16
rideDistance       313866 non-null float16
roadKills          313866 non-null int8
swimDistance       313866 non-null float16
teamKills          313866 non-null int8
vehicleDestroys    313866 non-null int8
walkDistance       313866 non-null float16
weaponsAcquired    313866 non

In [12]:
# Write both solo subsets to CSV in the working directory.
# NOTE(review): index=True (the pandas default) writes the RangeIndex as an
# unnamed first column; switch to index=False if downstream readers do not
# need it.
for solos_frame, csv_path in (
    (train_solos_df, 'train_solos_df.csv'),
    (test_solos_df, 'test_solos_df.csv'),
):
    solos_frame.to_csv(csv_path, index=True)