# **데이터 불러오기 및 메모리 축소**

In [6]:
#필요한 라이브러리 불러오기 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMRegressor
import lightgbm as lgb

warnings.filterwarnings('ignore')
%matplotlib inline

pd.options.display.float_format = '{:,.3f}'.format
sns.set()

Collecting plotly
  Downloading plotly-5.13.1-py2.py3-none-any.whl (15.2 MB)
Collecting tenacity>=6.2.0
  Downloading tenacity-8.2.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.13.1 tenacity-8.2.1


In [7]:
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import gc

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
#train 데이터 불러오기 

pubg_data = pd.read_csv("train_V2.csv")   #각자 구글 드라이브에 들어가면 바로 보이게 파일 넣으시고 경로 변경 없이 바로 데이터 불러올 수 있게 설정함 
df = pubg_data
df.info()

# 그냥 불러오면 983.9MB로 메모리가 굉장히 큰 상태 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Id               object 
 1   groupId          object 
 2   matchId          object 
 3   assists          int64  
 4   boosts           int64  
 5   damageDealt      float64
 6   DBNOs            int64  
 7   headshotKills    int64  
 8   heals            int64  
 9   killPlace        int64  
 10  killPoints       int64  
 11  kills            int64  
 12  killStreaks      int64  
 13  longestKill      float64
 14  matchDuration    int64  
 15  matchType        object 
 16  maxPlace         int64  
 17  numGroups        int64  
 18  rankPoints       int64  
 19  revives          int64  
 20  rideDistance     float64
 21  roadKills        int64  
 22  swimDistance     float64
 23  teamKills        int64  
 24  vehicleDestroys  int64  
 25  walkDistance     float64
 26  weaponsAcquired  int64  
 27  winPoints   

In [9]:
%%time
df.info()

# Memory usages in Bytes
print("Reduced Memory size: ",df.memory_usage(index=True).sum()/(1024*1024), "MB")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Id               object 
 1   groupId          object 
 2   matchId          object 
 3   assists          int64  
 4   boosts           int64  
 5   damageDealt      float64
 6   DBNOs            int64  
 7   headshotKills    int64  
 8   heals            int64  
 9   killPlace        int64  
 10  killPoints       int64  
 11  kills            int64  
 12  killStreaks      int64  
 13  longestKill      float64
 14  matchDuration    int64  
 15  matchType        object 
 16  maxPlace         int64  
 17  numGroups        int64  
 18  rankPoints       int64  
 19  revives          int64  
 20  rideDistance     float64
 21  roadKills        int64  
 22  swimDistance     float64
 23  teamKills        int64  
 24  vehicleDestroys  int64  
 25  walkDistance     float64
 26  weaponsAcquired  int64  
 27  winPoints   

In [10]:
for column_name in df:
    if df[column_name].dtype=='float64':
        df[column_name] = pd.to_numeric(df[column_name], downcast= 'float')
    if df[column_name].dtype=='int64':
        df[column_name] = pd.to_numeric(df[column_name],downcast='integer')

In [11]:
%%time
df.info()

# Memory usages in Bytes
print("Reduced Memory size: ",df.memory_usage(index=True).sum()/(1024*1024), "MB")

# 데이터의 메모리가 상당히 많이 줄었음을 알 수 있음 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Id               object 
 1   groupId          object 
 2   matchId          object 
 3   assists          int8   
 4   boosts           int8   
 5   damageDealt      float32
 6   DBNOs            int8   
 7   headshotKills    int8   
 8   heals            int8   
 9   killPlace        int8   
 10  killPoints       int16  
 11  kills            int8   
 12  killStreaks      int8   
 13  longestKill      float32
 14  matchDuration    int16  
 15  matchType        object 
 16  maxPlace         int8   
 17  numGroups        int8   
 18  rankPoints       int16  
 19  revives          int8   
 20  rideDistance     float32
 21  roadKills        int8   
 22  swimDistance     float32
 23  teamKills        int8   
 24  vehicleDestroys  int8   
 25  walkDistance     float32
 26  weaponsAcquired  int16  
 27  winPoints   

# **데이터 살펴보기 및 전처리**

In [12]:
df.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0,0.0,0,0.0,0,0,244.8,1,1466,0.444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0,0.004,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0,0.0,0,0.0,0,0,161.8,2,0,0.775
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,...,0,0.0,0,0.0,0,0,202.7,3,0,0.167
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0,0.0,0,0.0,0,0,49.75,2,0,0.188


In [13]:
df.columns

Index(['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints', 'winPlacePerc'],
      dtype='object')

In [14]:
print("Train : ", df.shape)

Train :  (4446966, 29)


In [15]:

# col 생략 없이 출력
pd.set_option('display.max_columns', None)

In [16]:
df.describe()

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
count,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446965.0
mean,0.234,1.107,130.633,0.658,0.227,1.37,47.599,505.006,0.925,0.544,22.993,1579.506,44.505,43.008,892.01,0.165,606.092,0.003,4.509,0.024,0.008,1148.517,3.66,606.46,0.473
std,0.589,1.716,169.887,1.146,0.602,2.68,27.463,627.505,1.558,0.711,51.476,258.74,23.828,23.289,736.648,0.472,1496.47,0.073,30.238,0.167,0.093,1180.553,2.457,739.7,0.307
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,9.0,1.0,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0,1367.0,28.0,27.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,155.1,2.0,0.0,0.2
50%,0.0,0.0,84.24,0.0,0.0,0.0,47.0,0.0,0.0,0.0,0.0,1438.0,30.0,30.0,1443.0,0.0,0.0,0.0,0.0,0.0,0.0,685.6,3.0,0.0,0.458
75%,0.0,2.0,186.0,1.0,0.0,2.0,71.0,1172.0,1.0,1.0,21.32,1851.0,49.0,47.0,1500.0,0.0,0.191,0.0,0.0,0.0,0.0,1976.0,5.0,1495.0,0.741
max,22.0,33.0,6616.0,53.0,64.0,80.0,101.0,2170.0,72.0,20.0,1094.0,2237.0,100.0,100.0,5910.0,39.0,40710.0,18.0,3823.0,12.0,5.0,25780.0,236.0,2013.0,1.0


In [17]:
df.isnull().sum()  #결측치는 'winPlacePerc'(승률 변수)에서 존재   

Id                 0
groupId            0
matchId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       1
dtype: int64

In [18]:
df[df['winPlacePerc'].isna()==True]

# 전체 4446966 행에서 결측치 행 1행은 소수
# winPlacePerc 변수에서 결측치 값이 있는 1 행만 추출: 별 의미 없는 행
# ---> 최종적으로 이 행 제거 

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
2744604,f70c74418bb064,12dfbede33f92b,224a123c53e008,0,0,0.0,0,0,0,1,0,0,0,0.0,9,solo-fpp,1,1,1574,0,0.0,0,0.0,0,0,0.0,0,0,


In [19]:
df.dropna(inplace=True)

In [20]:
df.isnull().sum() # 모든 컬럼에 결측치가 없음을 확인 

Id                 0
groupId            0
matchId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       0
dtype: int64

In [77]:
# train.drop(['Id'], axis=1, inplace=True) # 'Id' 변수는 플레이어의 id(처음 게임 생성했을 때 지정하는 닉네임 변수)이기에 큰 영향을 주지 않을 것 같아서 drop ----> 추후 이상치 제거에 사용할 변수기에 주석 처리 

# **매치 타입 레이블 인코딩**

In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
df['matchType'] = df['matchType'].apply(lambda x: 'solo' if ('solo' in x) else 'duo' if ('duo' in x) else 'etc' if ('flare' in x) or ('crash' in x) else 'squad')
encoder = LabelEncoder()
df['matchType'] = encoder.fit_transform(df['matchType'])
df['matchType'].value_counts()


3    2400402
0    1315970
2     720712
1       9881
Name: matchType, dtype: int64

In [23]:
print(encoder.inverse_transform([0,1,2,3])) # duo는 0으로 etc는 1로 solo는 2로 squad는 3으로 레이블인코딩 됨 

['duo' 'etc' 'solo' 'squad']


In [24]:
df.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,1241,0,0,0.0,1306,3,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,0,0,0,0.0,1777,3,26,25,1484,0,0.004,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,0,0,0,0.0,1318,0,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.775
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,0,0,0,0.0,1436,3,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.167
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,0,1,1,58.53,1424,2,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.188


# **이상치 제거**

현재 train 데이터에는 정상적인 게임에서 나올 수 없는 행들이 존재 ex)  
* 이동이 없는데 kill 수가 존재  
* 한 매치에서 최대 킬 수가 해당매치 참여 인원보다 많은 경우  
* 차량 탄 거리가 0인데 roadkill이 1이상인 경우   
.   
.   
.   


**1) 이동 거리 관련**  
* totalDistance = walkDistance + rideDistance + swimDistance인 파생변수 생성

In [25]:
# totalDistance 파생 변수 생성 
df['totalDistance'] = df['walkDistance']+df['rideDistance']+df['swimDistance']
df.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,totalDistance
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,1241,0,0,0.0,1306,3,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.444,244.8
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,0,0,0,0.0,1777,3,26,25,1484,0,0.004,0,11.04,0,0,1434.0,5,0,0.64,1445.045
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,0,0,0,0.0,1318,0,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.775,161.8
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,0,0,0,0.0,1436,3,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.167,202.7
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,0,1,1,58.53,1424,2,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.188,49.75


In [26]:
# totalDistance=0인데 Kills, teamkills, killStreaks, longestkill,  headshot, boosts, hill, assist, weaponsAcquired>0 인 데이터프레임
# ---> 총 거리량이 0인데 총 킬 수, 팀킬 횟수, 최대 킬 수, 킬 사이 최장 거리 ... 이 0 이상일 수 없음 

df.loc[(df.totalDistance==0) & ((df.assists>0) | (df.kills>0) | (df.teamKills>0) | (df.DBNOs>0) | (df.killStreaks>0) | (df.longestKill>0) | 
                                      (df.headshotKills>0) | (df.boosts>0) |(df.heals>0) | (df.weaponsAcquired>0)),] #20836행행

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,totalDistance
711,b9a35e03db6a7f,7b1b2b23d70cad,e1d2ac9fe9bbe1,0,0,0.000,0,0,0,95,0,0,0,0.000,1408,0,48,47,1500,0,0.000,0,0.000,0,0,0.000,1,0,0.000,0.000
846,0df7c355497d77,325464bcf4c846,f3a64f99badeca,0,0,0.000,0,0,0,5,0,0,0,0.000,1659,2,19,1,1500,0,0.000,0,0.000,0,0,0.000,6,0,0.000,0.000
1357,72f70c6c074ca9,da042db555b932,f3c5fa51aacb95,0,0,722.800,0,0,0,5,0,0,0,0.000,1808,2,18,2,1500,0,0.000,0,0.000,0,0,0.000,15,0,0.000,0.000
1433,0a5036716cd6e0,65d3de7c9396c4,75cbdf89be0139,0,0,85.600,0,0,0,90,0,0,0,0.000,1884,2,90,89,1763,0,0.000,0,0.000,0,0,0.000,1,0,0.000,0.000
1814,269108abb1ba4e,28d3205dd35436,61ec8e0f6bd01d,0,0,0.000,0,0,0,94,0,0,0,0.000,1292,2,95,95,1500,0,0.000,0,0.000,0,0,0.000,1,0,0.011,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4446464,672a8673146c3e,910ef021cfa4ee,7d66e01d3d9cd5,0,0,0.000,0,0,0,91,0,0,0,0.000,1414,2,95,95,1538,0,0.000,0,0.000,0,0,0.000,1,0,0.053,0.000
4446682,b444f8c7af62b5,4e5b8eefdd669e,54277ef2ed69b7,0,0,41.950,0,0,0,48,0,1,1,43.380,599,2,54,27,1500,0,0.000,0,0.000,0,0,0.000,4,0,0.943,0.000
4446786,12c053c0567dca,9db52b9e2d4f8c,fe0c3ccbea8181,0,0,0.000,0,0,0,84,0,0,0,0.000,1462,2,84,83,1564,0,0.000,0,0.000,0,0,0.000,1,0,0.000,0.000
4446812,484990766633ce,77770865680fe0,fb5acb99d6dc45,0,0,0.000,0,0,0,95,0,0,0,0.000,1899,2,95,90,1502,0,0.000,0,0.000,0,0,0.000,1,0,0.000,0.000


In [27]:
df = df.drop(df[df.totalDistance==0][df.assists>0].index) # 총 거리량이 0인데 어시스트가 0 이상인 행 제거
df = df.drop(df[df.totalDistance==0][df.kills>0].index) # 총 거리량이 0인데 킬 수가 0 이상인 행 제거 
df = df.drop(df[df.totalDistance==0][df.teamKills>0].index) # 총 거리량이 0인데 팀킬 수가 0 이상인 행 제거 
df = df.drop(df[df.totalDistance==0][df.DBNOs>0].index) # 총 거리량이 0인데 DBNOs가 0 이상인 행 제거
df = df.drop(df[df.totalDistance==0][df.killStreaks>0].index) # 총 거리량이 0인데 단기간 최대 킬 수가 0 이상인 행 제거 
df = df.drop(df[df.totalDistance==0][df.longestKill>0].index) # 총 거리량이 0인데 킬과 킬 사이 최장거리가 0 이상인 행 제거
df = df.drop(df[df.totalDistance==0][df.headshotKills>0].index) # 총 거리량이 0인데 헤드샷 킬 수가 0 이상인 행 제거 
df = df.drop(df[df.totalDistance==0][df.boosts>0].index) # 총 거리량이 0인데 부스트 아이템 사용 횟수가 0 이상인 행 제거 
df = df.drop(df[df.totalDistance==0][df.heals>0].index) # 총 거리량이 0인데 힐링 아이템 사용 횟수가 0 이상인 행 제거 
df = df.drop(df[df.totalDistance==0][df.weaponsAcquired>0].index) # 총 거리량이 0인데 얻은 무기의 수가 0 이상인 행 제거 

In [28]:
# 자동차를 탄 거리량이 0인데 자동차킬이 1 이상인 경우----> 로드킬은 자동차를 타고 이동할 때만 생기는 건데 차량 거리량이 0인데 로드킬이 1이상은 이상치 
df.loc[(df.rideDistance==0) & (df.roadKills>0), ] #186행

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,totalDistance
820,209314d1a20b48,72a26f1ded95d6,7abce2d2ca9e49,1,1,334.800,4,0,1,17,1187,2,1,50.980,1714,3,28,28,-1,0,0.000,1,0.000,0,0,838.000,6,1533,0.370,838.000
40362,81863d8837256d,af6e04e1b77f4b,59ad07f21f77dd,0,1,272.400,1,1,2,22,0,2,1,115.600,1336,3,29,29,1500,0,0.000,1,127.700,0,0,2443.000,4,0,0.536,2570.700
46376,ed7575c25f2a5e,9be59a8d7ae94b,6c5c240ed3e339,2,5,298.900,3,0,6,12,1378,2,1,15.090,1870,0,50,49,-1,1,0.000,1,0.000,0,0,3054.000,4,1509,0.898,3054.000
50918,b5f8da95af493f,1a2f5a68a2720c,ae50bb56b6da4e,0,0,169.300,1,0,0,39,0,1,1,14.420,1871,0,46,44,1453,0,0.000,1,0.000,0,0,148.600,1,0,0.133,148.600
55266,77178fffc09d09,86fd3da1738c43,620f0ea9a8c997,0,0,100.000,2,0,0,31,0,1,1,64.060,1351,3,28,24,1486,1,0.000,1,0.000,1,0,696.000,3,0,0.370,696.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4417743,e32a8f224bee8a,a2f7c3e4f4ff2e,15fbf879405faf,0,3,328.900,1,1,2,4,1050,4,2,127.000,1363,0,47,45,-1,0,0.000,1,104.600,0,0,2505.000,4,1499,0.870,2609.600
4418648,8dc09552147fe2,0284db08778c3b,f9a88397f45677,0,0,96.430,1,0,0,31,1211,1,1,7.680,1848,3,27,27,-1,0,0.000,1,0.000,0,0,1293.000,3,1429,0.346,1293.000
4426830,c9df7b60fd1b00,b7460fb7ea1462,e31f8af291bb6e,0,0,100.000,1,0,0,36,0,1,1,0.800,1889,3,26,25,1512,0,0.000,1,0.000,0,0,29.410,1,0,0.040,29.410
4444830,027b3f672516a1,766b70fa9c7750,4248befe3f7d28,0,6,442.900,4,0,6,4,1175,5,2,59.230,1371,0,50,49,-1,0,0.000,1,0.000,0,0,2875.000,5,1503,0.837,2875.000


In [29]:
# rideDistance==0 & roadKills>0 인 값을 가진 행들 제거 
df=df.drop(index=df[(df['rideDistance']==0) & (df['roadKills']>0)  ].index)

In [30]:
# 총 거리량이 0인데 총 넣은 데미지가 0이상인 행들 ----> 모델 성능 고려해서 추후 제거 
df.loc[(df.totalDistance==0) & (df.damageDealt>0), ].head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,totalDistance
3752,2ec3865d7f8375,e1d8504f4c83ed,a9a5b78ce24aa0,0,0,91.44,0,0,0,82,0,0,0,0.0,1883,2,95,91,1518,0,0.0,0,0.0,0,0,0.0,0,0,0.128,0.0
3893,48186ea18d02c9,3a68a217c22234,df0626352a542c,0,0,39.6,0,0,0,94,1061,0,0,0.0,1290,0,49,44,-1,0,0.0,0,0.0,0,0,0.0,0,1436,0.0,0.0
11943,668d230a40b90d,8f44220517fc92,d18b4304f2447a,0,0,19.8,0,0,0,98,0,0,0,0.0,1868,3,29,29,1509,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0.0
15254,6207d048c76aac,f27a26d6024ddc,dad458cea40ce3,0,0,69.84,0,0,0,97,1710,0,0,0.0,1384,2,99,97,-1,0,0.0,0,0.0,0,0,0.0,0,1641,0.01,0.0
15886,ccf3084fb4ee65,773be317f20edc,7573b5f1709038,0,0,34.83,0,0,0,84,1151,0,0,0.0,1356,3,26,23,-1,0,0.0,0,0.0,0,0,0.0,0,1425,0.0,0.0


**2) 매치 관련**

In [31]:
df['num']=df.groupby('matchId')['kills'].transform('sum') #matchId 기준으로 그룹을 묶어서 킬 횟수를 세는 'num' 변수 생성 

In [32]:
df.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,totalDistance,num
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,1241,0,0,0.0,1306,3,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.444,244.8,84.0
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,0,0,0,0.0,1777,3,26,25,1484,0,0.004,0,11.04,0,0,1434.0,5,0,0.64,1445.045,83.0
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,0,0,0,0.0,1318,0,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.775,161.8,91.0
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,0,0,0,0.0,1436,3,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.167,202.7,82.0
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,0,1,1,58.53,1424,2,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.188,49.75,91.0


In [33]:
# matchid로 groupby 했을 때, kills가 모두 0인 경우----> 한 매치당 전체 킬 수가 0일 수가 없음 
df.loc[(df.num==0), ] #1082행행

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,totalDistance,num
3962,b553c49b7cea2a,f603d46c0cf8e6,c62a9311ca6624,0,0,0.000,0,0,0,17,0,0,0,0.000,1808,2,17,2,1500,0,0.000,0,0.000,0,0,0.000,0,0,0.000,0.000,0.000
6771,f7acd8915c42be,7ce7d87b8db94a,e0802c8084d50d,0,0,0.000,0,0,0,17,0,0,0,0.000,1808,2,19,2,1500,0,0.000,0,0.000,0,0,0.000,0,0,0.000,0.000,0.000
7105,b04b8dd27fda90,00af6bab6a98cc,48beae05cf8f80,0,0,0.000,0,0,0,19,0,0,0,0.000,1808,2,19,2,1500,0,0.000,0,0.000,0,0,0.000,0,0,0.000,0.000,0.000
11609,cf20461d1207da,65c0fd07b89577,d1fdec9e4632dc,0,0,0.000,0,0,0,2,0,0,0,0.000,1808,2,16,2,1500,0,0.000,0,0.000,0,0,130.300,4,0,0.000,130.300,0.000
12628,09886f67d5a30d,5b2405c02106d6,8ca30ed6e8926d,0,0,0.000,0,0,0,11,0,0,0,0.000,1808,2,19,2,1500,0,0.000,0,0.000,0,0,0.000,0,0,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4422789,5125476e21721a,e4bbbbd7af29dc,d1c663de6e23bf,0,1,0.000,0,0,2,1,0,0,0,0.000,1808,2,16,2,1500,0,8682.000,0,0.000,0,0,806.300,2,0,1.000,9488.300,0.000
4422909,5e030e2f841f25,2d03c8cef40074,ad64e518208d26,0,0,0.000,0,0,0,18,0,0,0,0.000,1808,2,18,2,1500,0,0.000,0,0.000,0,0,0.000,0,0,0.000,0.000,0.000
4427944,edee71fedeec1d,d029d09dbba6fe,cf0cb51c829eb5,0,0,0.000,0,0,0,18,0,0,0,0.000,1808,2,18,2,1500,0,0.000,0,0.000,0,0,0.000,0,0,0.000,0.000,0.000
4430451,69f2d577afb916,1ca82eb3229fe3,8b1af8b54b0edb,0,0,0.000,0,0,0,2,0,0,0,0.000,1808,2,18,2,1500,0,0.000,0,0.000,0,0,0.000,0,0,1.000,0.000,0.000


In [34]:
df=df.drop(index=df[(df['num']==0) ].index) # 한 매치당 전체 킬 수가 0인 행 제거 

In [35]:
df.drop(['num'], axis=1, inplace=True) # 위에서 생성한 'num' 변수 필요 없으니 제거 

In [36]:
# 매치에서의 킬 횟수 순위가 100 초과인 경우 ---> 한 매치당 최대 100인이기 때문에 순위가 100 초과일 수가 없음음
df.loc[(df.killPlace)>100,] #1행행

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,totalDistance
3679420,a1e45f366ad76f,05c7966baad337,fe57e25e37dbfd,0,0,66.65,0,0,0,101,0,0,0,0.0,1864,3,25,25,1500,0,0.0,0,0.0,0,0,8.277,1,0,0.0,8.277


In [37]:
# killPlace가 match 참여 인원 수보다 큰 경우 제거
df=df.drop(index=df[df['killPlace']>100].index) 

In [38]:
# matchId 별 player 수
df['cnt_per_matchid'] = df.groupby('matchId').Id.transform('count')

In [39]:
#killplace가 match 참여 인원 수보다 큰 경우 ----> 킬 순위가 매치 참여 인원 수보다 클 수가 없음 
df.loc[(df.cnt_per_matchid)<(df.killPlace),] #9810행행

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,totalDistance,cnt_per_matchid
109,de93ba20446980,29276739992710,7456e2eaa324a2,0,0,19.350,0,0,0,96,0,0,0,0.000,1923,2,96,89,1500,0,0.000,0,0.000,0,0,20.340,1,0,0.000,20.340,95
994,b890be433d01b4,9ad75b9e378d08,df5f39b08f895f,0,0,0.000,0,0,0,96,1302,0,0,0.000,1972,2,96,91,-1,0,0.000,0,0.000,0,0,24.560,0,1493,0.000,24.560,95
1109,f76f2ac18df53a,5f8da43e34bc03,c3e76b61c37d55,0,0,0.000,0,0,0,94,0,0,0,0.000,1876,0,47,44,1495,0,0.000,0,0.000,0,0,25.720,0,0,0.000,25.720,93
1247,60526516205691,35bc41f9273e30,e3dfb129298dc5,1,0,95.360,0,0,0,97,0,0,0,0.000,1363,2,98,98,1617,0,0.000,0,0.000,0,0,32.390,2,0,0.010,32.390,96
1345,78d68043fc136b,f594b264467f3e,5369152a17bd77,0,0,200.000,2,0,0,94,0,0,0,0.000,1371,3,28,27,1357,0,0.000,0,0.000,0,0,16.200,1,0,0.000,16.200,93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4444224,e60375c1d7d144,b0311938baec3d,808a1bf6efac6c,0,0,0.000,0,0,0,94,0,0,0,0.000,1326,0,47,47,1498,0,0.000,0,0.000,0,0,0.630,1,0,0.000,0.630,93
4445339,6ca57c637f9f02,92f103c845d61f,ec64831a3777fc,0,0,0.000,0,0,0,91,0,0,0,0.000,1811,3,27,27,1443,0,0.000,0,0.000,0,0,3.980,1,0,0.000,3.980,90
4445523,e05809111d5452,593c9c92ba759a,3be85d463dd278,0,0,0.000,0,0,0,94,1152,0,0,0.000,1355,3,31,29,-1,0,0.000,0,0.000,0,0,18.860,1,1426,0.000,18.860,93
4445904,ec34f4c895e1cc,88575ae9e22d7c,7d3e5a3621abe4,0,0,0.000,0,0,0,95,0,0,0,0.000,1389,2,95,94,1574,0,0.000,0,0.000,0,0,3.433,1,0,0.000,3.433,93


In [40]:
# killPlace가 match 참여 인원 수보다 큰 경우 제거
df=df.drop(index=df[df['cnt_per_matchid'] < df['killPlace']].index) 

**3) kill 관련**

In [41]:
# 데미지 양이 없는데 킬 수가 있는 경우----> 킬을 해야 해당 킬에 따라서 데미지가 생기기 때문에 딜이 없다면 킬 수가 생길 수가 없음 
df.loc[(df.damageDealt==0) & (df.kills>0)] #4151행

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,totalDistance,cnt_per_matchid
892,1483b507bf43be,e468ae00a05134,19b3a8232074e5,1,0,0.000,0,0,1,37,1523,1,1,10.600,1347,3,28,28,-1,0,0.000,0,0.000,0,0,231.100,2,1443,0.296,231.100,98
942,0da074947acace,780051d11688f1,56b36999cfe3ee,0,0,0.000,0,0,0,39,1058,1,1,8.493,1391,3,27,25,-1,0,0.000,0,0.000,0,0,62.340,2,1474,0.192,62.340,91
2448,c63b184658a1a3,3dc6e06ab727eb,e26d9b4f6dfdf1,0,0,0.000,0,0,0,26,0,1,1,16.110,1382,3,30,29,1496,0,0.000,0,0.000,1,0,1514.000,3,0,0.552,1514.000,93
2850,751d4916e02c7c,f1d428a4b98ba1,5ae3a69bef506e,0,0,0.000,0,1,0,28,0,1,1,0.000,1828,3,27,27,1473,1,2214.000,0,0.000,0,0,914.800,5,0,0.577,3128.800,96
3011,b155164fd1f063,e29566c9a0d7b4,abad739a729c29,0,3,0.000,0,0,2,37,1239,1,1,43.110,1386,3,29,29,0,0,0.000,0,0.000,0,0,2087.000,8,1492,0.464,2087.000,99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4444208,09f0e19baacdb9,66b5af9fbd9540,1f7460f60f8645,0,0,0.000,0,0,0,37,1093,1,1,34.900,1444,3,28,28,-1,0,0.000,0,0.000,0,0,31.670,2,1536,0.296,31.670,98
4444586,2b0f82148db808,aa9a1d8659d27d,a830228514f1a5,0,0,0.000,0,0,0,42,0,1,1,61.010,1371,0,49,49,1508,0,0.000,0,0.000,0,0,123.400,2,0,0.354,123.400,98
4444646,532b9428583e93,afcf2b32044608,dfd7327dc91394,0,0,0.000,0,0,0,29,0,1,1,24.630,1360,3,27,25,1418,1,0.000,0,0.000,0,0,335.200,3,0,0.269,335.200,90
4445499,44fb23554a8930,4549cedf3cce66,6366ed38dbb461,0,0,0.000,0,0,0,29,1070,1,1,8.689,1757,3,28,28,-1,0,0.000,0,0.000,0,0,13.800,1,1484,0.000,13.800,97


In [42]:
# danageDealt==0 & kills>0인 값을 가진 행들 제거 
df=df.drop(index=df[ (df['damageDealt']==0) & (df['kills']>0)  ].index)

In [43]:
# 한 매치에서 최대 킬 수가 해당 매치의 참여 인원 수 보다 많은 경우 -----> ex) 매치의 참여 인원 수가 95명인데 나의 최대 킬 수가 96일 수가 없음

# 같은 게임에 참여한 사람 수 컬럼 수 만듦
df['num']=df.groupby('matchId')['Id'].transform('count')
# 한 게임에서 최대 킬수 컬럼
df['max']=df.groupby('matchId')['kills'].transform('max')

df.loc[df['max']>df['num']] #2114 행 

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,totalDistance,cnt_per_matchid,num,max
1292,919f57a28a5e02,79a34f910375bc,b181e5bc4f0c1d,6,0,1750.000,0,6,2,7,0,15,3,69.880,1034,3,8,8,1500,0,1657.000,0,0.000,0,0,1137.000,43,0,0.714,2794.000,26,23,38
2769,89120d2f4e44e0,fa54efeb11b8a6,662c9c7f9e87d0,4,0,970.500,6,4,17,4,1000,11,2,21.610,1263,0,7,7,-1,2,0.000,0,0.000,0,0,90.790,28,1500,1.000,90.790,10,9,23
5479,875a95c6a48744,47f6d77a0582b3,e263f4a227313a,3,0,745.700,3,2,10,3,1000,7,2,29.900,887,0,6,6,-1,0,0.000,0,0.000,0,0,528.700,16,1500,0.400,528.700,8,8,17
12210,06be812fd8703a,b4d1ecd6204d22,93edb943c25530,2,1,974.700,0,4,2,19,0,7,2,163.000,1368,3,7,7,1500,0,0.000,0,0.000,0,0,2107.000,18,0,0.167,2107.000,20,16,30
13923,a43aba1ea56253,0c0ff478329513,9a4e500e3cf355,2,0,515.900,0,0,5,21,0,4,1,90.830,597,3,15,15,1500,0,0.000,0,0.000,0,0,3263.000,12,0,0.500,3263.000,35,27,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4434989,1e40a124f96e18,69c3e37e3b211e,76f7e1fca94cb7,2,0,458.000,0,3,0,12,0,3,1,233.100,1244,2,16,11,1500,0,3424.000,0,0.000,0,0,1803.000,11,0,0.267,5227.000,15,14,23
4435441,e9a9cdf2db7276,f3ecd137b8cc62,10f129a6c2be8d,4,2,992.500,0,2,2,8,1000,6,1,133.500,890,0,6,5,-1,0,0.000,0,0.000,0,0,1506.000,12,1500,0.000,1506.000,12,11,16
4435972,085fd66a43671e,31241cba41823e,1ac375e4121651,8,0,1248.000,16,3,9,13,1000,10,2,140.700,1264,3,5,5,-1,0,0.000,0,0.000,0,0,614.200,17,1500,0.000,614.200,34,32,33
4438414,e977f0f5cc2313,61189f050e4592,1ac375e4121651,5,0,1649.000,15,3,4,8,1000,13,2,174.500,1264,3,5,5,-1,2,0.000,0,0.000,0,0,1025.000,23,1500,0.250,1025.000,34,32,33


In [44]:
# 최대 킬수가 한 게임 사람 수 보다 많을 수 없음, 행 제거
df=df.drop(index=df[(df['max']>df['num'])].index)

In [45]:
df.drop(['num', 'max'], axis=1, inplace=True)  # 이상치 제거를 위해 생성한 num, max 컬럼 제거

# **파생 변수 생성**

In [46]:
# walk, ride, swim 중 하나만 했을 수 있어서 totalDistance 변수 생성
df['totalDistance'] = df['walkDistance'] + df['rideDistance'] + df['swimDistance']

In [45]:
# heals + boosts 값만
df['total_heals'] = df['heals'] + df['boosts']

In [48]:
df.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,totalDistance,cnt_per_matchid,total_heals
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,1241,0,0,0.0,1306,3,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.444,244.8,96,0
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,0,0,0,0.0,1777,3,26,25,1484,0,0.004,0,11.04,0,0,1434.0,5,0,0.64,1445.045,91,0
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,0,0,0,0.0,1318,0,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.775,161.8,98,0
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,0,0,0,0.0,1436,3,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.167,202.7,91,0
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,0,1,1,58.53,1424,2,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.188,49.75,97,0


In [151]:
# items = heals + boosts + weapon
df['items'] = df['heals'] + df['boosts'] +df['weaponsAcquired']

In [152]:
# teamwork = 아군을 보조 + 부활
df['teamwork'] = df['assists'] + df['revives']

In [153]:
# agg = df.groupby('groupId').size().to_frame('players_in_team')
# df = df.merge(agg, on='groupId')

In [154]:
# kill 대비 headshot
df['headshots_over_kills'] = df['headshotKills'] / df['kills']
df['headshots_over_kills'] = df['headshots_over_kills'].fillna(0)

In [155]:
df.loc[df['headshotKills']>df['kills']]  
#위에서 생성한 파생변수에서는 총 킬수 대비의 헤드샷 킬 수 이기 때문에 전제가 헤드샷이 킬 수 보다 크면 안되는 거기 때문에, 해당 행이 존재하는지 확인
# ----> 0행이니 위에서 생성한 파생변수 유의미함 

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,totalDistance,cnt_per_matchid,total_heals,items,teamwork,headshots_over_kills


In [156]:
# (kill횟수 순위) / (최악의 순위)
df['killPlace_over_maxPlace'] = df['killPlace'] / df['maxPlace']

In [157]:
# heals 대비 걸은 거리
df['walkDistance_over_heals'] = df['walkDistance'] / df['heals']
df['walkDistance_over_heals'] = df['walkDistance_over_heals'].replace(np.inf, 0)

In [158]:
# kill 대비 걸은 거리
df['workDistance_over_kills'] = df['walkDistance'] / df['kills']
df['workDistance_over_kills'] = df['workDistance_over_kills'].fillna(0)
df['workDistance_over_kills'] = df['workDistance_over_kills'].replace(np.inf, 0)

In [159]:
# matchId 별 player 수
df['cnt_per_matchid'] = df.groupby('matchId').Id.transform('count')

In [160]:
# match 내 player수 대비 kill 수
df['kills_over_match_cnt']=df['kills']/df['cnt_per_matchid']

In [161]:
display(df)

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,totalDistance,cnt_per_matchid,total_heals,items,teamwork,headshots_over_kills,killPlace_over_maxPlace,walkDistance_over_heals,workDistance_over_kills,kills_over_match_cnt
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.000,0,0,0,60,1241,0,0,0.000,1306,3,28,26,-1,0,0.000,0,0.000,0,0,244.800,1,1466,0.444,244.800,96,0,1,0,0.000,2.143,0.000,0.000,0.000
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.470,0,0,0,57,0,0,0,0.000,1777,3,26,25,1484,0,0.004,0,11.040,0,0,1434.000,5,0,0.640,1445.045,91,0,5,0,0.000,2.192,0.000,0.000,0.000
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.000,0,0,0,47,0,0,0,0.000,1318,0,50,47,1491,0,0.000,0,0.000,0,0,161.800,2,0,0.775,161.800,98,0,2,1,0.000,0.940,0.000,0.000,0.000
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.900,0,0,0,75,0,0,0,0.000,1436,3,31,30,1408,0,0.000,0,0.000,0,0,202.700,3,0,0.167,202.700,91,0,3,0,0.000,2.419,0.000,0.000,0.000
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.000,0,0,0,45,0,1,1,58.530,1424,2,97,95,1560,0,0.000,0,0.000,0,0,49.750,2,0,0.188,49.750,97,0,2,0,0.000,0.464,0.000,49.750,0.010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4446961,afff7f652dbc10,d238e426f50de7,18492834ce5635,0,0,0.000,0,0,0,74,1029,0,0,0.000,1873,3,29,28,-1,0,1292.000,0,0.000,0,0,1019.000,3,1507,0.179,2311.000,91,0,3,0,0.000,2.552,0.000,0.000,0.000
4446962,f4197cf374e6c0,408cdb5c46b2ac,ee854b837376d9,0,1,44.150,0,0,0,69,0,0,0,0.000,1435,2,93,93,1501,0,0.000,0,0.000,0,0,81.700,6,0,0.294,81.700,91,1,7,0,0.000,0.742,0.000,0.000,0.000
4446963,e1948b1295c88a,e26ac84bdf7cef,6d0cd12784f1ab,0,0,59.060,0,0,0,66,0,0,0,0.000,1321,3,28,28,1500,0,0.000,0,2.184,0,0,788.700,4,0,0.481,790.884,98,0,4,0,0.000,2.357,0.000,0.000,0.000
4446964,cc032cdd73b7ac,c2223f35411394,c9c701d0ad758a,0,4,180.400,1,1,2,11,0,2,1,98.500,1373,3,26,25,1418,2,0.000,0,0.000,0,0,2748.000,8,0,0.800,2748.000,92,6,14,2,0.500,0.423,1374.000,1374.000,0.022


## solo model
+ model에서 공통적으로 빠져도 되는 columns: 'Id', 'groupId', 'matchId', 'KillPoints', ' rankPoints', 'winPoints', 'maxPlace', 'matchDuration'
+ Solo model에서 빠져야하는 columns: 'DBNOs', 'revives', 'teamKills', 'assists'



*   matchDuration(matchId별로 동일한 값 가지므로 순위에 영향 X)
*   killpoints, rankpoints, winpoints(과거 기록이므로 각 게임 순위 결정에 영향 X)
*   maxPlace(numGroups와 비슷한 의미를 가지므로 제거)





In [47]:
solo_drop = ['Id', 'groupId', 'matchId', 'DBNOs', 'revives', 'teamKills', 'assists', 'killPoints', 'rankPoints', 'winPoints', 'maxPlace', 'matchDuration']
solo_model = df.drop(solo_drop, axis = 'columns')
solo_model.head()

Unnamed: 0,boosts,damageDealt,headshotKills,heals,killPlace,kills,killStreaks,longestKill,matchType,numGroups,rideDistance,roadKills,swimDistance,vehicleDestroys,walkDistance,weaponsAcquired,winPlacePerc,totalDistance,cnt_per_matchid
0,0,0.0,0,0,60,0,0,0.0,3,26,0.0,0,0.0,0,244.8,1,0.444,244.8,96
1,0,91.47,0,0,57,0,0,0.0,3,25,0.004,0,11.04,0,1434.0,5,0.64,1445.045,91
2,0,68.0,0,0,47,0,0,0.0,0,47,0.0,0,0.0,0,161.8,2,0.775,161.8,98
3,0,32.9,0,0,75,0,0,0.0,3,30,0.0,0,0.0,0,202.7,3,0.167,202.7,91
4,0,100.0,0,0,45,1,1,58.53,2,95,0.0,0,0.0,0,49.75,2,0.188,49.75,97


In [49]:
solo_model = solo_model[solo_model['matchType'] == 2]
solo_model = solo_model.drop('matchType', axis = 'columns')
solo_model.head()

Unnamed: 0,boosts,damageDealt,headshotKills,heals,killPlace,kills,killStreaks,longestKill,numGroups,rideDistance,roadKills,swimDistance,vehicleDestroys,walkDistance,weaponsAcquired,winPlacePerc,totalDistance,cnt_per_matchid
4,0,100.0,0,0,45,1,1,58.53,95,0.0,0,0.0,0,49.75,2,0.188,49.75,97
7,0,8.538,0,0,48,0,0,0.0,92,2004.0,0,0.0,0,1089.0,6,0.737,3093.0,96
13,1,324.2,1,5,5,4,1,49.83,94,1228.0,0,76.84,0,2050.0,6,0.875,3354.84,97
17,3,254.3,0,12,13,2,1,36.0,95,2367.0,0,15.29,0,1787.0,3,0.821,4169.29,96
35,0,136.9,0,0,37,1,1,22.83,94,0.0,0,0.0,0,270.7,1,0.347,270.7,96


In [50]:
len(solo_model)

702076

### 1) 파생변수 제외하고 VIF

In [51]:
vif_drop = ['cnt_per_matchid', 'winPlacePerc']
solo_vif = solo_model.drop(vif_drop, axis = 'columns')

In [52]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(solo_vif.values, i) for i in range(solo_vif.shape[1])]
vif['features'] = solo_vif.columns
vif = vif.sort_values('VIF Factor', ascending=False).reset_index(drop=True)
vif

Unnamed: 0,VIF Factor,features
0,22405968295375.6,walkDistance
1,11301379240578.408,totalDistance
2,6173543012159.693,rideDistance
3,4081639713.762,swimDistance
4,45.707,numGroups
5,24.473,killPlace
6,16.795,kills
7,15.393,damageDealt
8,7.064,killStreaks
9,5.42,weaponsAcquired



*   rideDistance, swimDistance, walkDistance 컬럼 삭제, totalDistance만 남김 

#### 변수들 1차 제거 후 VIF 결과

In [53]:
vif_drop = ['cnt_per_matchid', 'winPlacePerc', 'rideDistance', 'walkDistance', 'swimDistance']
solo_vif = solo_model.drop(vif_drop, axis = 'columns')

In [54]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(solo_vif.values, i) for i in range(solo_vif.shape[1])]
vif['features'] = solo_vif.columns
vif = vif.sort_values('VIF Factor', ascending=False).reset_index(drop=True)
vif

Unnamed: 0,VIF Factor,features
0,49.805,numGroups
1,23.747,killPlace
2,17.748,kills
3,15.855,damageDealt
4,6.965,killStreaks
5,5.447,weaponsAcquired
6,3.042,boosts
7,2.857,totalDistance
8,2.32,headshotKills
9,2.288,longestKill




*   killPlace는 비슷한 변수가 많으므로 numGroups가 아닌 Killplace 삭제



#### 변수들 2차 제거 후 VIF 결과

In [55]:
vif_drop = ['cnt_per_matchid', 'winPlacePerc', 'rideDistance', 'walkDistance', 'swimDistance','killPlace']
solo_vif = solo_model.drop(vif_drop, axis = 'columns')

In [56]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(solo_vif.values, i) for i in range(solo_vif.shape[1])]
vif['features'] = solo_vif.columns
vif = vif.sort_values('VIF Factor', ascending=False).reset_index(drop=True)
vif

Unnamed: 0,VIF Factor,features
0,17.565,kills
1,15.822,damageDealt
2,4.65,weaponsAcquired
3,3.814,killStreaks
4,3.629,numGroups
5,3.009,boosts
6,2.692,totalDistance
7,2.309,headshotKills
8,2.285,longestKill
9,1.62,heals




*   damageDealt보다 kills가 더 결과에 직접적인 영향을 미치므로 kills 대신 damageDealt 삭제



#### 변수 3차 제거 후 VIF 결과

In [57]:
vif_drop = ['cnt_per_matchid', 'winPlacePerc', 'rideDistance', 'walkDistance', 'swimDistance','killPlace', 'damageDealt']
solo_vif = solo_model.drop(vif_drop, axis = 'columns')

In [58]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(solo_vif.values, i) for i in range(solo_vif.shape[1])]
vif['features'] = solo_vif.columns
vif = vif.sort_values('VIF Factor', ascending=False).reset_index(drop=True)
vif

Unnamed: 0,VIF Factor,features
0,5.863,kills
1,4.642,weaponsAcquired
2,3.814,killStreaks
3,3.472,numGroups
4,2.998,boosts
5,2.691,totalDistance
6,2.308,headshotKills
7,2.283,longestKill
8,1.619,heals
9,1.034,roadKills


In [60]:
solo_vif

Unnamed: 0,boosts,headshotKills,heals,kills,killStreaks,longestKill,numGroups,roadKills,vehicleDestroys,weaponsAcquired,totalDistance
4,0,0,0,1,1,58.530,95,0,0,2,49.750
7,0,0,0,0,0,0.000,92,0,0,6,3093.000
13,1,1,5,4,1,49.830,94,0,0,6,3354.840
17,3,0,12,2,1,36.000,95,0,0,3,4169.290
35,0,0,0,1,1,22.830,94,0,0,1,270.700
...,...,...,...,...,...,...,...,...,...,...,...
4446950,0,0,0,1,1,11.270,92,0,0,5,1559.000
4446954,0,0,0,0,0,0.000,97,0,0,6,2146.000
4446959,0,0,0,0,0,0.000,95,0,0,1,40.250
4446962,1,0,0,0,0,0.000,93,0,0,6,81.700


## Scaling(하기전 linear: 0.68, rf: 0.83)

In [61]:
solo_vif_1 = solo_vif

In [68]:
from sklearn.preprocessing import StandardScaler

std = StandardScaler()
std.fit(solo_vif_1)

## model 적용

In [69]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X = solo_vif_1
y = solo_model['winPlacePerc']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [71]:
model = LinearRegression()  #normalize=True, n_jobs=8

lreg = model.fit(X_train, y_train)

In [72]:
print("Linear Train Score:", lreg.score(X_train, y_train))
print("Linear Test Score:", lreg.score(X_test, y_test))

Linear Train Score: 0.6816395093738771
Linear Test Score: 0.6816052489859002


In [73]:
from sklearn.ensemble import RandomForestRegressor


X = solo_vif_1
y = solo_model['winPlacePerc']

size = 0.20
seed = 42

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size, random_state=seed)

model = RandomForestRegressor(n_estimators=10)

forest = model.fit(X_train,y_train)

print("Random Forest Train Score:", forest.score(X_train, y_train))
print("Random Forest Test Score:", forest.score(X_test, y_test))

Random Forest Train Score: 0.9656992946723479
Random Forest Test Score: 0.8342001869363954


## duo, squad model
+ model에서 공통적으로 빠져도 되는 columns: 'Id', 'groupId', 'matchId', 'KillPoints', ' rankPoints', 'winPoints', 'maxPlace', 'matchDuration'
+ solo model에서 vif값 크게 나왔던 'rideDistance','swimDistance','swimDistance' columns 삭제

In [74]:
ds_drop = ['Id', 'groupId', 'matchId','rideDistance','swimDistance', 'swimDistance','killPoints', 'rankPoints', 'winPoints', 'maxPlace','matchDuration']
ds_model = df.drop(ds_drop, axis = 'columns')
ds_model.head()

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,kills,killStreaks,longestKill,matchType,numGroups,revives,roadKills,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPlacePerc,totalDistance,cnt_per_matchid
0,0,0,0.0,0,0,0,60,0,0,0.0,3,26,0,0,0,0,244.8,1,0.444,244.8,96
1,0,0,91.47,0,0,0,57,0,0,0.0,3,25,0,0,0,0,1434.0,5,0.64,1445.045,91
2,1,0,68.0,0,0,0,47,0,0,0.0,0,47,0,0,0,0,161.8,2,0.775,161.8,98
3,0,0,32.9,0,0,0,75,0,0,0.0,3,30,0,0,0,0,202.7,3,0.167,202.7,91
4,0,0,100.0,0,0,0,45,1,1,58.53,2,95,0,0,0,0,49.75,2,0.188,49.75,97


In [75]:
ds_model = ds_model[(ds_model['matchType'] == 0) | (ds_model['matchType'] == 3)]
ds_model = ds_model.drop('matchType', axis = 'columns')
ds_model.head()

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,kills,killStreaks,longestKill,numGroups,revives,roadKills,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPlacePerc,totalDistance,cnt_per_matchid
0,0,0,0.0,0,0,0,60,0,0,0.0,26,0,0,0,0,244.8,1,0.444,244.8,96
1,0,0,91.47,0,0,0,57,0,0,0.0,25,0,0,0,0,1434.0,5,0.64,1445.045,91
2,1,0,68.0,0,0,0,47,0,0,0.0,47,0,0,0,0,161.8,2,0.775,161.8,98
3,0,0,32.9,0,0,0,75,0,0,0.0,30,0,0,0,0,202.7,3,0.167,202.7,91
5,0,0,100.0,1,1,0,44,1,1,18.44,28,0,0,0,0,34.7,1,0.037,34.7,95


In [76]:
len(ds_model)

3696933

### 1) 파생변수 제외하고 VIF

In [78]:
vif_drop = ['cnt_per_matchid', 'winPlacePerc']
ds_vif = ds_model.drop(vif_drop, axis = 'columns')

In [79]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(ds_vif.values, i) for i in range(ds_vif.shape[1])]
vif['features'] = ds_vif.columns
vif = vif.sort_values('VIF Factor', ascending=False).reset_index(drop=True)
vif

Unnamed: 0,VIF Factor,features
0,11.13,kills
1,10.886,numGroups
2,9.575,damageDealt
3,6.476,killPlace
4,6.202,walkDistance
5,5.812,killStreaks
6,5.425,DBNOs
7,4.959,weaponsAcquired
8,4.454,totalDistance
9,3.359,boosts


#### 변수 1차 제거 후 VIF 결과

In [81]:
vif_drop = ['cnt_per_matchid', 'winPlacePerc', 'damageDealt']
ds_vif = ds_model.drop(vif_drop, axis = 'columns')

In [82]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(ds_vif.values, i) for i in range(ds_vif.shape[1])]
vif['features'] = ds_vif.columns
vif = vif.sort_values('VIF Factor', ascending=False).reset_index(drop=True)
vif

Unnamed: 0,VIF Factor,features
0,10.79,numGroups
1,8.887,kills
2,6.476,killPlace
3,6.201,walkDistance
4,5.786,killStreaks
5,4.95,weaponsAcquired
6,4.454,totalDistance
7,4.379,DBNOs
8,3.327,boosts
9,2.066,headshotKills


In [84]:
ds_vif_1 = ds_vif

In [85]:
from sklearn.preprocessing import StandardScaler

std = StandardScaler()
std.fit(solo_vif_1)

### duo-squad 모델 적용

In [86]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X = ds_vif_1
y = ds_model['winPlacePerc']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [87]:
model = LinearRegression()  #normalize=True, n_jobs=8

lreg = model.fit(X_train, y_train)

In [88]:
print("Linear Train Score:", lreg.score(X_train, y_train))
print("Linear Test Score:", lreg.score(X_test, y_test))

Linear Train Score: 0.8141498882799033
Linear Test Score: 0.8136771346481353


In [89]:
from sklearn.ensemble import RandomForestRegressor


X = ds_vif_1
y = ds_model['winPlacePerc']

size = 0.20
seed = 42

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size, random_state=seed)

model = RandomForestRegressor(n_estimators=10)

forest = model.fit(X_train,y_train)

print("Random Forest Train Score:", forest.score(X_train, y_train))
print("Random Forest Test Score:", forest.score(X_test, y_test))

Random Forest Train Score: 0.9820208325164432
Random Forest Test Score: 0.8988818243934593
