In [0]:
import pandas as pd
import time
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from timeit import default_timer as timer
from sklearn import preprocessing

import random
random.seed(42)

import os
assert os.system("pip install ultimate==2.15.1")==0

from ultimate.mlp import MLP
import gc, sys
gc.enable()

NUM = 0

In [0]:
def state(message, start = True, time =0):
  if(start):
    print('Working on {message}...')
  else:
    print('Working on {message} took ({round(time, 3)}) Sec \n')

In [0]:

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    #start_mem = df.memory_usage().sum() / 1024**2
    #print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    #end_mem = df.memory_usage().sum() / 1024**2
    #print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    #print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [4]:
from google.colab import drive
drive.mount('/content/Drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/Drive


In [5]:
cd /content/Drive/My Drive/Colab Notebooks/pubg

/content/Drive/My Drive/Colab Notebooks/pubg


In [0]:
INPUT_DIR="/content/Drive/My Drive/Colab Notebooks/pubg"


In [0]:

def reload():
  print("Building dataframe...")
  gc.collect()
  df = reduce_mem_usage(pd.read_csv(INPUT_DIR + '/tt/train_V2.csv')) # <=========== Just a function to reduce memory usage

  # Only take the samples with matches that have more than 1 player 
  # there are matches with no players or just one player ( those samples could affect our model badly) 
  df = df[df['maxPlace'] > 1]
  invalid_match_ids = df[df['winPlacePerc'].isna()]['matchId'].values
  df = df[-df['matchId'].isin(invalid_match_ids)]
  print("Done loading train to dataframe...")
  return df



In [0]:
def train_test_split(df, test_size=0.1):
    match_ids = df['matchId'].unique().tolist()
    train_size = int(len(match_ids) * (1 - test_size))
    train_match_ids = random.sample(match_ids, train_size)

    train = df[df['matchId'].isin(train_match_ids)]
    test = df[-df['matchId'].isin(train_match_ids)]
    
    return train, test
  
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor

# Feature Selectors
def run_lightgbmreg(df, do_shap):
  print("LightGBM: Start Light Gradient Boosted Regression...")

  target = 'winPlacePerc'
  cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', target]
  cols_to_fit = [col for col in df.columns if col not in cols_to_drop]
  train, val = train_test_split(df, 0.1)

  params = {
      'n_estimators': 100,
      'learning_rate': 0.3, 
      'num_leaves': 20,
      'objective': 'regression_l2', 
      'metric': 'mae',
      'verbose': -1,
  }

  model = LGBMRegressor(**params)
  model.fit(
      train[cols_to_fit], train[target],
      eval_metric='mae',
      verbose=20,
  )
  
  y_true = val[target]
  y_pred = model.predict(val[cols_to_fit])
  
  if do_shap:
    show_shap_analysis(model, cols_to_fit, val)
  else:
    print("LightGBM: Selecting features")
    feature_importance = pd.DataFrame(sorted(zip(model.feature_importances_, cols_to_fit), reverse=True), columns=['Value','Feature'])
    print(feature_importance)
  
  return mean_absolute_error(y_true, y_pred)
  
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

def run_selectkbest(df, do_shap):
  print("SelectKBest: Start SelectKBest...")

  target = 'winPlacePerc'
  cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', target]
  cols_to_fit = [col for col in df.columns if col not in cols_to_drop]
  train, val = train_test_split(df, 0.1)

  params = {
      'score_func': f_classif,
      'k': all
  }
  
  model = SelectKBest(**params)
  model.fit(
      train[cols_to_fit], train[target]
  )
  
  y_true = val[target]
  y_pred = model.predict(val[cols_to_fit])
  
  if do_shap:
    show_shap_analysis(model, cols_to_fit, val)
  else:
    print("SelectKBest: Selecting features")
    dfscores = pd.DataFrame(model.scores_)
    dfcolumns = pd.DataFrame(train[cols_to_fit].columns)
    #concat two dataframes for better visualization 
    featureScores = pd.concat([dfcolumns,dfscores],axis=1)
    featureScores.columns = ['Specs','Score']  #naming the dataframe columns
    print(featureScores.nlargest(10,'Score'))  #print 10 best features

  return mean_absolute_error(y_true, y_pred)

  
from sklearn.ensemble import RandomForestRegressor
def run_randomforest(df, do_shap):
  print("RandomForestRegressor: Start Random Forest Regression...")

  target = 'winPlacePerc'
  cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', target]
  cols_to_fit = [col for col in df.columns if col not in cols_to_drop]
  train, val = train_test_split(df, 0.1)

  model = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features='sqrt',
                          n_jobs=-1, verbose = 2)
  model.fit(
      train[cols_to_fit], train[target]
  )
  
  y_true = val[target]
  y_pred = model.predict(val[cols_to_fit])
  
  if do_shap:
    show_shap_analysis(model, cols_to_fit, val)
  else:
    print("RandomForestRegressor: Selecting features")
    feature_importance = pd.DataFrame(sorted(zip(model.feature_importances_, cols_to_fit), reverse=True), columns=['Value','Feature'])
    print(feature_importance)
    
  return mean_absolute_error(y_true, y_pred)

  
from sklearn.linear_model import LassoCV
def run_lasso(df, do_shap):
  print("Lasso CV: Start Lasso Regularization for selecting features...")

  target = 'winPlacePerc'
  cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', target]
  cols_to_fit = [col for col in df.columns if col not in cols_to_drop]
  train, val = train_test_split(df, 0.1)

  model = LassoCV()
  model.fit(train[cols_to_fit], train[target])
  
  y_true = val[target]
  y_pred = model.predict(val[cols_to_fit])
  
  if do_shap:
    show_shap_analysis(model, cols_to_fit, val)
  
  else:
    print("Best alpha using built-in LassoCV: %f" % model.alpha_)
    print("Best score using built-in LassoCV: %f" % model.score(train[cols_to_fit], train[target]))
    coef = pd.Series(model.coef_, index = train[cols_to_fit].columns)

    print("LassoCV: Selecting features")
    print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables")

    imp_coef = coef.sort_values()
    plt.rcParams['figure.figsize'] = (8.0, 10.0)
    imp_coef.plot(kind = "barh")
    plt.title("Feature importance using Lasso Model")
  
  return mean_absolute_error(y_true, y_pred)
  
  
from xgboost import XGBRegressor
def run_xgboost(df, do_shap):
  print("XGBoost: Start XGBoost Regression...")

  target = 'winPlacePerc'
  cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', target]
  cols_to_fit = [col for col in df.columns if col not in cols_to_drop]
  train, val = train_test_split(df, 0.1)

  params = {
      'n_estimators': 40,
      'learning_rate': 0.1, 
      'num_leaves': 20,
      'objective': 'binary:logistic', 
      'metric': 'mae',
      'verbose': 20,
      'seed' : 42
  }

  model = XGBRegressor(**params)
  model.fit(
      train[cols_to_fit], train[target],
      eval_set=[(val[cols_to_fit], val[target])],
      eval_metric='mae',
      verbose=20,
  )
  
  print("XGBoost: Selecting features")
  feature_importance = pd.DataFrame(sorted(zip(model.feature_importances_, cols_to_fit), reverse=True), columns=['Value','Feature'])
  print(feature_importance)
  return model

In [0]:
import lightgbm as lgb
def run_lgb2(df, do_shap):
  print("LightGBM: Start Light Gradient Boosted Regression...")

  target = 'winPlacePerc'
  cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', target]
  cols_to_fit = [col for col in df.columns if col not in cols_to_drop]
  train, val = train_test_split(df, 0.1)

  params = {"objective" : "regression", "metric" : "mae", 'n_estimators':1000,
              "num_leaves" : 31, "learning_rate" : 0.5, "bagging_fraction" : 0.7,
               "bagging_seed" : 0, "num_threads" : 4,"colsample_bytree" : 0.7
             }

  model = LGBMRegressor(**params)
  model.fit(
      train[cols_to_fit], train[target],
      eval_metric='mae',
      verbose=20,
  )
  
  y_true = val[target]
  y_pred = model.predict(val[cols_to_fit])
  
  if do_shap:
    show_shap_analysis(model, cols_to_fit, val)
  else:
    print("LightGBM: Selecting features")
    feature_importance = pd.DataFrame(sorted(zip(model.feature_importances_, cols_to_fit), reverse=True), columns=['Value','Feature'])
    print(feature_importance)
  
  return mean_absolute_error(y_true, y_pred)


params = {
      'n_estimators': 100,
      'learning_rate': 0.3, 
      'num_leaves': 20,
      'objective': 'regression_l2', 
      'metric': 'mae',
      'verbose': -1,
  }

  model = LGBMRegressor

In [0]:
def run_experiment(preprocess):
    df = reload()
    df.drop(columns=['matchType'], inplace=True)
    
    df = preprocess(df)

    #score = run_lightgbmreg(df, False)
    score = run_lgb2(df, False)
    #score=run_selectkbest(df, False)
    #score=run_xgboost(df, False)
    #score = feature_engineering(True)
    return score

def run_experiments(preprocesses):
    results = []
    for preprocess in preprocesses:
        start = time.time()
        score = run_experiment(preprocess)
        execution_time = time.time() - start
        results.append({
            'name': preprocess.__name__,
            'score': score,
            'execution time': f'{round(execution_time, 2)}s'
        })
        gc.collect()
        
    return pd.DataFrame(results, columns=['name', 'score', 'execution time']).sort_values(by='score')

In [0]:
def original(df):
    return df
  #preprocessing functions

In [0]:
def generate_train_test_set(df, split):
  print("Generating train and test set...")
  df.drop(columns=['matchType'], inplace=True)
  
  cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType']
  cols_to_fit = [col for col in df.columns if col not in cols_to_drop]
  train, val = train_test_split(df, split)
  
  return train[cols_to_fit], val[cols_to_fit]

def load_test():
  print("Building dataframe...")
  df = reduce_mem_usage(pd.read_csv(INPUT_DIR + '/tt/test_V2.csv')) # <=========== Just a function to reduce memory usage

  cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType']
  cols_to_fit = [col for col in df.columns if col not in cols_to_drop]
  print("Done loading train to dataframe...")
  return df[cols_to_fit]

In [13]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LeakyReLU
from keras.regularizers import l2

# adjusted because im still testing so i reduced hidden units from 64 to 32, epoch reduced
class NNModel():
  # network parameters
  batch_size = 32
  hidden_units = 32
  dropout = 0.1
  kernel_regularizer = l2(0.0001)
  leaky_relu = (5 ** 0.5 - 3) / 2
  
  def __init__(self, input_size):
    # Regression has 1 output layer
    output_shape = 1
    
    self.model = model = Sequential()
    self.model.add(Dense(self.hidden_units, input_dim=input_size))
    # self.model.add(LeakyReLU(alpha=self.leaky_relu))
    self.model.add(Activation('sigmoid'))
    self.model.add(Dropout(self.dropout))
    self.model.add(Dense(self.hidden_units))
    self.model.add(Activation('sigmoid'))
    # self.model.add(LeakyReLU(alpha=self.leaky_relu))
    self.model.add(Dropout(self.dropout))
    self.model.add(Dense(output_shape))
    # this is the output for one-hot vector
    self.model.add(Activation('linear'))
    
  def _summarize(self):
    self.model.summary()
  
  def _compile(self):
    self.model.compile(loss='mse',
              optimizer='adam',
              metrics=['mse', 'mae'])
    
  def _train(self, x_train, y_train, epochs):
    self.model.fit(x_train, y_train, 
              epochs=epochs, batch_size=self.batch_size)
    
  def _evaluate(self, x_test, y_test):
    return self.model.evaluate(x_test, y_test, batch_size=self.batch_size)
  
  def _predict(self, x_test):
    return self.model.predict(x_test)

Using TensorFlow backend.


In [14]:
# Put Feature Selection Experiments here
experiment_scores = run_experiments([original]) # original dataframe


# Print Scores
print(experiment_scores)
# Best combination should be used for training and testing on NN

Building dataframe...
Done loading train to dataframe...
LightGBM: Start Light Gradient Boosted Regression...
LightGBM: Selecting features
    Value          Feature
0    3741        killPlace
1    3454    matchDuration
2    3314     walkDistance
3    2746         maxPlace
4    2526        numGroups
5    1939      longestKill
6    1782      damageDealt
7    1296     rideDistance
8    1120       rankPoints
9    1110  weaponsAcquired
10    965            kills
11    899        winPoints
12    892       killPoints
13    871            heals
14    794           boosts
15    625            DBNOs
16    511          assists
17    361     swimDistance
18    341      killStreaks
19    308    headshotKills
20    225          revives
21    123        teamKills
22     36        roadKills
23     21  vehicleDestroys
       name    score execution time
0  original  0.05834        168.92s


In [15]:
# Training Phase
from sklearn.preprocessing import StandardScaler
df = reload()
target = 'winPlacePerc'

print("Num of rows of csv is ", len(df.index))

train, val = generate_train_test_set(df, 0.1)
x_train, y_train = train.drop(target, axis=1).to_numpy(), train[target].to_numpy()
x_eval, y_eval = val.drop(target, axis=1).to_numpy(), val[target].to_numpy()

scaler = StandardScaler().fit(x_train)
rescaled_x_train = scaler.transform(x_train)

del x_train
del train
del val
gc.collect()

num_labels = len(x_eval[0])
model1 = NNModel(num_labels)
model1._summarize()
model1._compile()
model1._train(rescaled_x_train, y_train, 2)

Building dataframe...
Done loading train to dataframe...
Num of rows of csv is  4446965
Generating train and test set...




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                800       
_________________________________________________________________
activation_1 (Activation)    (None, 32)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
activation_2 (Activation)    (None, 32)                0         
____________

In [16]:
# Training Evaluation
# USE THIS FOR DEBUGGING PURPOSES: final training should have all training dataset for accuracy
rescaled_x_test = scaler.transform(x_eval)
score = model1._evaluate(rescaled_x_test, y_eval)
print("\nMean Squared Error [smaller the better]: %f" % (score[1]))


Mean Squared Error [smaller the better]: 0.007771


In [18]:
# testing phase
# https://www.kaggle.com/ceshine/a-simple-post-processing-trick-lb-0237-0204/
def transform_preds(df_test, pred):
  for i in range(len(df_test)):
      winPlacePerc_m = pred[i]
      maxPlace = int(df_test.iloc[i]['maxPlace'])
      if maxPlace == 0:
          winPlacePerc_m = 0.0
      elif maxPlace == 1:
          winPlacePerc_m = 1.0
      else:
          gap = 1.0 / (maxPlace - 1)
          winPlacePerc_m = np.round(winPlacePerc_m / gap) * gap

      if winPlacePerc_m < 0: winPlacePerc_m = 0.0
      if winPlacePerc_m > 1: winPlacePerc_m = 1.0    
      pred[i] = winPlacePerc_m

      if (i + 1) % 100000 == 0:
          print(i, flush=True, end=" ")

  df_test['winPlacePerc_mod'] = pred
  return df_test

df_test = load_test() # load test data from csv
x_test = df_test.to_numpy() # df to numpy array
scaled_x_test = scaler.transform(x_test) # scaling

del x_test
gc.collect()
pred = model1._predict(scaled_x_test)
df_test['orig_preds'] = pred
df_test = transform_preds(df_test['maxPlace', 'orig_preds'], pred)
# Sample Table
df_test[:5]

Building dataframe...
Done loading train to dataframe...


KeyError: ignored