# library

In [72]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import os
import gc
import pickle

from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectFromModel, SelectPercentile
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm                       # 진행바
from sklearn.metrics import roc_auc_score   # AUC 스코어 계산
from sklearn.model_selection import KFold   # K-fold CV    
from bayes_opt import BayesianOptimization  # 베이지안 최적화 라이브러리  
from functools import partial               # 함수 변수 고정
import lightgbm as lgb 
import xgboost as xgb


import warnings                             
warnings.filterwarnings("ignore")

def euclidean_distance(pt1, pt2):
  distance = 0
  for i in range(len(pt1)):
    distance += (pt1[i] - pt2[i]) ** 2
  return distance ** 0.5

In [2]:
train = pd.read_csv('C:/Users/ASUS/Documents/data_competition/스타크래프트 승률 예측/data/train.csv')
test = pd.read_csv('C:/Users/ASUS/Documents/data_competition/스타크래프트 승률 예측/data/test.csv')

In [158]:
#카메라가 상대방의 특정 영역안으로 들어간 수
df_train['p0_over'] = 0
df_train['p1_over'] = 0
compare = euclidean_distance([0,0], [40,40])
    
for j in range(0,38872):
    new_train = train.loc[train['game_id']==j]
    new_train = new_train.loc[new_train['event']=='Camera']
    p0_df = new_train.loc[new_train['player']==0]
    p1_df = new_train.loc[new_train['player']==1]
        
    player_0_camera = np.array([item.replace('at (', '').replace(')', '').split(',') for item in p0_df['event_contents']]).astype(float)
    player_1_camera = np.array([item.replace('at (', '').replace(')', '').split(',') for item in p1_df['event_contents']]).astype(float)
    
    if len(player_0_camera) == 0:
        p0_over = 0
    if len(player_1_camera) != 0:
        p0_over = 0
        p1_start = player_1_camera[0]
        for i in player_0_camera:
            dist = euclidean_distance(p1_start, i)
            if dist < compare:
                p0_over += 1
                
    if len(player_1_camera) == 0:
        p1_over = 0
    if len(player_0_camera) != 0:
        p1_over = 0
        p0_start = player_0_camera[0] 
        for i in player_1_camera:
            dist = euclidean_distance(p0_start, i)
            if dist < compare:
                p1_over += 1
            
    df_train['p0_over'][j] = p0_over
    df_train['p1_over'][j] = p1_over

In [211]:
# 처음 기록 된 카메라 좌표를 기록

df_train_p0 = train[(train.event=='Camera')&(train.player==0)]
df_train_p0 = df_train_p0[df_train_p0.shift(1).game_id!=df_train_p0.game_id] # 쉬프트를 이용하여 각 게임의 첫번째 데이터 찾기
df_train_p0 = df_train_p0.iloc[:, [0,6]].rename({'event_contents':'player0_starting'}, axis = 1)
df_train_p0.index = df_train_p0['game_id']
df_train_p0 = df_train_p0.drop(['game_id'], axis=1)
df_train = pd.merge(df_train, df_train_p0, on='game_id', how='left')
del df_train_p0

df_train_p1 = train[(train.event=='Camera')&(train.player==1)]
df_train_p1 = df_train_p1[df_train_p1.shift(1).game_id!=df_train_p1.game_id]
df_train_p1 = df_train_p1.iloc[:, [0,6]].rename({'event_contents':'player1_starting'}, axis = 1)
df_train_p1.index = df_train_p1['game_id']
df_train_p1 = df_train_p1.drop(['game_id'], axis=1)
df_train = pd.merge(df_train, df_train_p1, on='game_id', how='left')
del df_train_p1

In [212]:
# x, y 값으로 분리

df_train['player0_starting'] = df_train.player0_starting.str.split('(').str[1]
df_train['player0_starting'] = df_train.player0_starting.str.split(')').str[0]
split_xy = df_train.player0_starting.str.split(',')
df_train['player0_x'] = split_xy.str[0].astype('float')
df_train['player0_y'] = split_xy.str[1].astype('float')
del split_xy

df_train['player1_starting'] = df_train.player1_starting.str.split('(').str[1]
df_train['player1_starting'] = df_train.player1_starting.str.split(')').str[0]
split_xy = df_train.player1_starting.str.split(',')
df_train['player1_x'] = split_xy.str[0].astype('float')
df_train['player1_y'] = split_xy.str[1].astype('float')
del split_xy

In [213]:
# 플레이어의 x,y 좌표를 하나로 모음

location_p0 = df_train.loc[:, ['player0_x', 'player0_y']]
location_p0 = location_p0.rename({'player0_x':'location_x', 'player0_y':'location_y'}, axis=1)

location_p1 = df_train.loc[:, ['player1_x', 'player1_y']]
location_p1 = location_p1.rename({'player1_x':'location_x', 'player1_y':'location_y'}, axis=1)
location_p1.index += location_p0.index[-1]+1

location = pd.concat([location_p0, location_p1])
location = location.dropna()
del location_p0, location_p1

In [214]:
kmeans_clst = KMeans(n_clusters=15).fit(location)
location['starting'] = kmeans_clst.labels_+1

In [215]:
# kmeans로 찾은 15개의 포인트에서 각 데이터들의 거리 계산
for cluster in range(15):
    point = location[location.starting==cluster+1]
    loc = point.loc[:,['location_x', 'location_y']]
    del point
    loc['center_x'] = kmeans_clst.cluster_centers_[cluster][0]
    loc['center_y'] = kmeans_clst.cluster_centers_[cluster][1]
    distance = np.sqrt(np. square(loc.location_x - loc.center_x) + np.square(loc.location_y - loc.center_y))
    location.loc[loc.index, 'distance'] = distance
    del loc

In [216]:
# 일정 거리(5)이상 떨어진 데이터는 starting을 0으로 지정
idx = location[location.distance>5].index
location.loc[idx, 'starting'] = 0
del idx

In [217]:
# 클러스터링한 결과 반영
df_train['player0_starting'] = location.loc[df_train.index, 'starting']
location.index -= (df_train.index[-1]+1)
df_train['player1_starting'] = location.loc[df_train.index, 'starting']
del location

# 불필요한 컬럼 삭제
df_train = df_train.drop(['player0_x', 'player0_y', 'player1_x', 'player1_y'], axis = 1)
df_train = df_train.fillna(0)

In [167]:
# 스타팅 포인트를 이용하여 맵 분류
map_list = []
for point in range(1,16):
    couple = df_train[df_train.player0_starting == point].player1_starting.value_counts()
    if couple[couple.index[1]]<100:
        map_list.append([point, couple.index[0], 999])
    else:
        map_list.append([point, couple.index[0], couple.index[1]])
map_list = np.sort(map_list, axis = 1)
map_list = np.unique(map_list, axis = 0)

In [168]:
# map_list와 상대편 위치 정보를 이용하여 모르는 스타팅 찾기
for m in map_list:
    idx = df_train[(df_train.player0_starting == 0)&((df_train.player1_starting == m[0])|(df_train.player1_starting == m[2]))].index
    df_train.loc[idx, 'player0_starting'] = m[1]
    del idx
    idx = df_train[(df_train.player0_starting == 0)&((df_train.player1_starting == m[1])|(df_train.player1_starting == m[2]))].index
    df_train.loc[idx, 'player0_starting'] = m[0]
    del idx
    
    idx = df_train[(df_train.player1_starting == 0)&((df_train.player0_starting == m[0])|(df_train.player0_starting == m[2]))].index
    df_train.loc[idx, 'player1_starting'] = m[1]
    del idx
    idx = df_train[(df_train.player1_starting == 0)&((df_train.player0_starting == m[1])|(df_train.player0_starting == m[2]))].index
    df_train.loc[idx, 'player1_starting'] = m[0]
    del idx

In [169]:
# 맵 컬럼 추가
for map_num, m in enumerate(map_list):
    idx = df_train[(df_train.player0_starting == m[0])|(df_train.player0_starting == m[1])|(df_train.player0_starting == m[2])].index
    df_train.loc[idx, 'map'] = map_num
del idx, map_list

In [171]:
df_train = df_train.drop(df_train.columns[[-2, -3]], axis='columns')

In [219]:
df_train.columns

Index(['game_id', 'time', 'event_count_0', 'event_count_1', 'Camera_count_0',
       'Camera_count_1', 'Selection_count_0', 'Selection_count_1',
       'Ability_count_0', 'Ability_count_1', 'Right Click_count_0',
       'Right Click_count_1', 'SetControlGroup_count_0',
       'SetControlGroup_count_1', 'GetControlGroup_count_0',
       'GetControlGroup_count_1', 'AddToControlGroup_count_0',
       'AddToControlGroup_count_1', 'ControlGroup_count_0',
       'ControlGroup_count_1', 'p0_over', 'p1_over', 'map', 'p0_species',
       'p1_species', 'player0_starting', 'player1_starting'],
      dtype='object')

In [22]:
#target 추가
df_train['winner'] = pd.Series(np.array(train[train.shift(-1).game_id != train.game_id].winner))

In [222]:
with open('C:/Users/ASUS/Documents/data_competition/스타크래프트 승률 예측/data/df_train.pickle', 'wb') as f:
    pickle.dump(df_train, f, pickle.HIGHEST_PROTOCOL)

In [None]:
with open('C:/Users/ASUS/Documents/data_competition/스타크래프트 승률 예측/data/df_train.pickle', 'rb') as f:
    df_train = pickle.load(f)

# 모델링

In [174]:
label = LabelEncoder()
df_train['p0_species'] = label.fit_transform(df_train['player0_species'])
df_train['p1_species'] = label.fit_transform(df_train['player1_species'])
df_train = df_train.drop(df_train.columns[[2, 3]], axis='columns')

In [None]:
y_train = np.array(train[train.shift(-1).game_id != train.game_id].winner)

In [228]:
def lgb_cv(num_leaves, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda, x_data=None, y_data=None, n_splits=5, output='score'):
    score = 0
    kf = KFold(n_splits=n_splits)
    models = []
    for train_index, valid_index in kf.split(x_data):
        x_train, y_train = x_data.iloc[train_index], y_data[train_index]
        x_valid, y_valid = x_data.iloc[valid_index], y_data[valid_index]
        
        model = lgb.LGBMClassifier(
            num_leaves = int(num_leaves), 
            learning_rate = learning_rate, 
            n_estimators = int(n_estimators), 
            subsample = np.clip(subsample, 0, 1), 
            colsample_bytree = np.clip(colsample_bytree, 0, 1), 
            reg_alpha = reg_alpha, 
            reg_lambda = reg_lambda,
        )
        
        model.fit(x_train, y_train)
        models.append(model)
        
        pred = model.predict_proba(x_valid)[:, 1]
        true = y_valid
        score += roc_auc_score(true, pred)/n_splits
    
    if output == 'score':
        return score
    if output == 'model':
        return models

In [229]:
func_fixed = partial(lgb_cv, x_data=df_train, y_data=y_train, n_splits=5, output='score')

lgbBO = BayesianOptimization(
    func_fixed, 
    {
        'num_leaves': (16, 1024),        # num_leaves,       범위(16~1024)
        'learning_rate': (0.0001, 0.1),  # learning_rate,    범위(0.0001~0.1)
        'n_estimators': (16, 1024),      # n_estimators,     범위(16~1024)
        'subsample': (0, 1),             # subsample,        범위(0~1)
        'colsample_bytree': (0, 1),      # colsample_bytree, 범위(0~1)
        'reg_alpha': (0, 10),            # reg_alpha,        범위(0~10)
        'reg_lambda': (0, 50),           # reg_lambda,       범위(0~50)
    }, 
    random_state=4321                    # 시드 고정
)
lgbBO.maximize(init_points=5, n_iter=30) # 처음 5회 랜덤 값으로 score 계산 후 30회 최적화

|   iter    |  target   | colsam... | learni... | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
|  1        |  0.6127   |  0.0708   |  0.08152  |  790.0    |  304.6    |  1.931    |  48.95    |  0.4062   |
|  2        |  0.6499   |  0.7578   |  0.009006 |  328.4    |  639.9    |  4.599    |  10.92    |  0.6635   |
|  3        |  0.6344   |  0.6787   |  0.09504  |  299.5    |  640.8    |  3.833    |  20.02    |  0.9427   |
|  4        |  0.6293   |  0.9299   |  0.09484  |  394.5    |  361.0    |  6.648    |  2.116    |  0.2322   |
|  5        |  0.648    |  0.4301   |  0.007886 |  788.7    |  876.7    |  1.504    |  5.057    |  0.2709   |
|  6        |  0.6357   |  0.8323   |  0.08186  |  16.27    |  1.023e+0 |  8.912    |  3.989    |  0.2484   |
|  7        |  0.6496   |  0.9067   |  0.07567  |  1.021e+0 |  16.71    |  4.453    |  8.167    |  0.9516   |
|  8      

In [230]:
params = lgbBO.max['params']
models = lgb_cv(
    params['num_leaves'], 
    params['learning_rate'], 
    params['n_estimators'], 
    params['subsample'], 
    params['colsample_bytree'], 
    params['reg_alpha'], 
    params['reg_lambda'], 
    x_data=x_train, y_data=y_train, n_splits=5, output='model')

# test 전처리

In [239]:
# 처음 기록 된 카메라 좌표를 기록
df_test_p0 = test[(test.event=='Camera')&(test.player==0)]
df_test_p0 = df_test_p0[df_test_p0.shift(1).game_id!=df_test_p0.game_id] # 쉬프트를 이용하여 각 게임의 첫번째 데이터 찾기
df_test_p0 = df_test_p0.iloc[:, [0,5]].rename({'event_contents':'player0_starting'}, axis = 1)
df_test_p0.index = df_test_p0['game_id']
df_test_p0 = df_test_p0.drop(['game_id'], axis=1)
df_test = pd.merge(df_test, df_test_p0, on='game_id', how='left')
del df_test_p0

df_test_p1 = test[(test.event=='Camera')&(test.player==1)]
df_test_p1 = df_test_p1[df_test_p1.shift(1).game_id!=df_test_p1.game_id]
df_test_p1 = df_test_p1.iloc[:, [0,5]].rename({'event_contents':'player1_starting'}, axis = 1)
df_test_p1.index = df_test_p1['game_id']
df_test_p1 = df_test_p1.drop(['game_id'], axis=1)
df_test = pd.merge(df_test, df_test_p1, on='game_id', how='left')
del df_test_p1


# x, y 값으로 분리
df_test['player0_starting'] = df_test.player0_starting.str.split('(').str[1]
df_test['player0_starting'] = df_test.player0_starting.str.split(')').str[0]
split_xy = df_test.player0_starting.str.split(',')
df_test['player0_x'] = split_xy.str[0].astype('float')
df_test['player0_y'] = split_xy.str[1].astype('float')
del split_xy

df_test['player1_starting'] = df_test.player1_starting.str.split('(').str[1]
df_test['player1_starting'] = df_test.player1_starting.str.split(')').str[0]
split_xy = df_test.player1_starting.str.split(',')
df_test['player1_x'] = split_xy.str[0].astype('float')
df_test['player1_y'] = split_xy.str[1].astype('float')
del split_xy

# 플레이어의 x,y 좌표를 하나로 모음
location_p0 = df_test.loc[:, ['player0_x', 'player0_y']]
location_p0 = location_p0.rename({'player0_x':'location_x', 'player0_y':'location_y'}, axis=1)

location_p1 = df_test.loc[:, ['player1_x', 'player1_y']]
location_p1 = location_p1.rename({'player1_x':'location_x', 'player1_y':'location_y'}, axis=1)
location_p1.index += location_p0.index[-1]+1

location = pd.concat([location_p0, location_p1])
location = location.dropna()
del location_p0, location_p1

kmeans_clst = KMeans(n_clusters=15).fit(location)
location['starting'] = kmeans_clst.labels_+1

# kmeans로 찾은 15개의 포인트에서 각 데이터들의 거리 계산
for cluster in range(15):
    point = location[location.starting==cluster+1]
    loc = point.loc[:,['location_x', 'location_y']]
    del point
    loc['center_x'] = kmeans_clst.cluster_centers_[cluster][0]
    loc['center_y'] = kmeans_clst.cluster_centers_[cluster][1]
    distance = np.sqrt(np. square(loc.location_x - loc.center_x) + np.square(loc.location_y - loc.center_y))
    location.loc[loc.index, 'distance'] = distance
    del loc
    
# 일정 거리(5)이상 떨어진 데이터는 starting을 0으로 지정
idx = location[location.distance>5].index
location.loc[idx, 'starting'] = 0
del idx

# 클러스터링한 결과 반영
df_test['player0_starting'] = location.loc[df_test.index, 'starting']
location.index -= (df_test.index[-1]+1)
df_test['player1_starting'] = location.loc[df_test.index, 'starting']
del location

# 불필요한 컬럼 삭제
df_test = df_test.drop(['player0_x', 'player0_y', 'player1_x', 'player1_y'], axis = 1)
df_test = df_test.fillna(0)

In [58]:
#게임 번호 추가
df_test = pd.DataFrame(columns=['game_id'])
df_test.game_id = test.game_id.unique()

#게임 시간 추가
df_test['time'] = pd.Series(np.array(test[test.shift(-1).game_id != test.game_id].time))

#플레이어 종족 추가
df = test[test.player==0]
df_test['player0_species']=pd.Series(np.array(df[df.shift(-1).game_id != df.game_id].species))
df = test[test.player==1]
df_test['player1_species']=pd.Series(np.array(df[df.shift(-1).game_id != df.game_id].species))

#이벤트 수 추가
df = test[test.player==0]
df_test['event_count_0'] = pd.Series(np.array(df.game_id.value_counts()[df.game_id.unique()]))
df = test[test.player==1]
df_test['event_count_1'] = pd.Series(np.array(df.game_id.value_counts()[df.game_id.unique()]))

# 이벤트가 없는것 0으로 처리
for event in test.event.unique():
    df = test[(test.player==0)&(test.event==event)]
    df = pd.DataFrame(df.game_id.value_counts()[df.game_id.unique()]).rename({'game_id':event+'_count_0'}, axis = 1)
    df['game_id']= np.array(df.index)
    df_test = pd.merge(df_test, df, on='game_id', how='left')

    df = test[(test.player==1)&(test.event==event)]
    df = pd.DataFrame(df.game_id.value_counts()[df.game_id.unique()]).rename({'game_id':event+'_count_1'}, axis = 1)
    df['game_id']= np.array(df.index)
    df_test = pd.merge(df_test, df, on='game_id', how='left')
df_test = df_test.fillna(0)


KeyboardInterrupt: 

In [194]:
#카메라가 상대방의 특정 영역안으로 들어간 수
p0 = []
p1 = []
compare = euclidean_distance([0,0], [40,40])
    
for j in range(38872,55659):
    new_test = test.loc[test['game_id']==j]
    new_test = new_test.loc[new_test['event']=='Camera']
    p0_df = new_test.loc[new_test['player']==0]
    p1_df = new_test.loc[new_test['player']==1]
        
    player_0_camera = np.array([item.replace('at (', '').replace(')', '').split(',') for item in p0_df['event_contents']]).astype(float)
    player_1_camera = np.array([item.replace('at (', '').replace(')', '').split(',') for item in p1_df['event_contents']]).astype(float)
    
    if len(player_0_camera) == 0:
        p0_over = 0
    if len(player_1_camera) != 0:
        p0_over = 0
        p1_start = player_1_camera[0]
        for i in player_0_camera:
            dist = euclidean_distance(p1_start, i)
            if dist < compare:
                p0_over += 1
                
    if len(player_1_camera) == 0:
        p1_over = 0
    if len(player_0_camera) != 0:
        p1_over = 0
        p0_start = player_0_camera[0] 
        for i in player_1_camera:
            dist = euclidean_distance(p0_start, i)
            if dist < compare:
                p1_over += 1
            
    p0.append(p0_over)
    p1.append(p1_over)
    
df_test['p0_over'] = p0
df_test['p1_over'] = p1

In [240]:
kmeans_clst = KMeans(n_clusters=15).fit(location)
location['starting'] = kmeans_clst.labels_+1

# kmeans로 찾은 15개의 포인트에서 각 데이터들의 거리 계산
for cluster in range(15):
    point = location[location.starting==cluster+1]
    loc = point.loc[:,['location_x', 'location_y']]
    del point
    loc['center_x'] = kmeans_clst.cluster_centers_[cluster][0]
    loc['center_y'] = kmeans_clst.cluster_centers_[cluster][1]
    distance = np.sqrt(np. square(loc.location_x - loc.center_x) + np.square(loc.location_y - loc.center_y))
    location.loc[loc.index, 'distance'] = distance
    del loc
    
# 일정 거리(5)이상 떨어진 데이터는 starting을 0으로 지정
idx = location[location.distance>5].index
location.loc[idx, 'starting'] = 0
del idx

# 클러스터링한 결과 반영
df_test['player0_starting'] = location.loc[df_test.index, 'starting']
location.index -= (df_test.index[-1]+1)
df_test['player1_starting'] = location.loc[df_test.index, 'starting']
del location

# 불필요한 컬럼 삭제
df_test = df_test.drop(['player0_x', 'player0_y', 'player1_x', 'player1_y'], axis = 1)
df_test = df_test.fillna(0)

In [199]:
df_test['p0_over'] = p0
df_test['p1_over'] = p1

In [244]:
label = LabelEncoder()
df_test['p0_species'] = label.fit_transform(df_test['player0_species'])
df_test['p1_species'] = label.fit_transform(df_test['player1_species'])
df_test = df_test.drop(df_test.columns[[2, 3]], axis='columns')

In [248]:
preds = []
for model in models:
    pred = model.predict_proba(df_test)[:, 1]
    preds.append(pred)
pred = np.mean(preds, axis=0)

submission = pd.read_csv('C:/Users/ASUS/Documents/data_competition/스타크래프트 승률 예측/data/sample_submission.csv', index_col=0)
submission['winner'] = submission['winner'] + pred
submission.to_csv('C:/Users/ASUS/Documents/data_competition/스타크래프트 승률 예측/data/submission.csv')
submission.head()

Unnamed: 0_level_0,winner
game_id,Unnamed: 1_level_1
38872,0.340857
38873,0.347046
38874,0.329556
38875,0.384774
38876,0.410008


In [53]:
def species_converter(string):
    if string == 'T':
        return 0
    elif string == 'P':
        return 1
    elif string == 'Z':
        return 2
    else:
        raise ValueError

def data_preparation(df, answer=False):
    game_ids = df['game_id'].unique()
    events = ['Ability', 'AddToControlGroup', 'Camera', 'ControlGroup', 'GetControlGroup', 'Right Click', 'Selection', 'SetControlGroup']
    unique_event_0, unique_event_1, delta_event = {}, {}, {}
    for event in events:
        unique_event_0['P0_' + event] = 0
        unique_event_1['P1_' + event] = 0
        delta_event['delta_' + event] = 0
        
    species = df.groupby(['game_id', 'player']).species.unique()
    event_count = df.groupby(['game_id', 'player']).event.value_counts()
    if answer:
        winners = df.groupby(['game_id']).winner.max()
    
    x_data, y_data = [], []
    for game_id in tqdm(game_ids):
        df_event_count = event_count[game_id].unstack(level=-1)
        df = pd.DataFrame(species[game_id])
        df = pd.concat([df, df_event_count], axis=1)   
        df = df.fillna(0)
        
        df_P0_species = pd.DataFrame([species_converter(df.loc[0]['species'][0])], columns=['P0_species'])        
        df_P1_species = pd.DataFrame([species_converter(df.loc[1]['species'][0])], columns=['P1_species'])
        df = df.drop(['species'], axis=1)

        df_P0_event = unique_event_0.copy()
        for column in df.columns:
            df_P0_event['P0_' + column] = df.loc[0][column]
        df_P0_event = pd.DataFrame(pd.Series(df_P0_event)).T

        df_P1_event = unique_event_1.copy()
        for column in df.columns:
            df_P1_event['P1_' + column] = df.loc[1][column]
        df_P1_event = pd.DataFrame(pd.Series(df_P1_event)).T
        
        df_delta_event = delta_event.copy()
        for column in df.columns:
            df_delta_event['delta_' + column] = df_P0_event['P0_' + column][0] - df_P1_event['P1_' + column][0]
        df_delta_event = pd.DataFrame(pd.Series(df_delta_event)).T

        out = pd.concat([df_P0_species, df_P0_event, df_P1_species, df_P1_event, df_delta_event], axis=1)
        out.index = [game_id]
        out.index.name = 'game_id'
        
        x_data.append(out)
        if answer:
            y_data.append(winners[game_id])  

    x_data = pd.concat(x_data)
    y_data = np.array(y_data)
    
    return x_data, y_data

In [251]:
X_train, Y_train = data_preparation(train, answer=True)

100%|███████████████████████████████████████████████████████████████████████████| 38872/38872 [05:46<00:00, 112.21it/s]


In [422]:
with open('C:/Users/ASUS/Documents/data_competition/스타크래프트 승률 예측/data/X_train.pickle', 'wb') as f:
    pickle.dump(X_train, f, pickle.HIGHEST_PROTOCOL)

In [3]:
with open('C:/Users/ASUS/Documents/data_competition/스타크래프트 승률 예측/data/df_train.pickle', 'rb') as f:
    X_train = pickle.load(f)
    
y_train = np.array(train[train.shift(-1).game_id != train.game_id].winner)

In [13]:
game_id = X_train['game_id']
X_train = X_train.drop(['game_id'], axis = 1)

In [41]:
std_scaler = StandardScaler()
fitted = std_scaler.fit(X_train)
output = std_scaler.transform(X_train)
output = pd.DataFrame(output, columns=X_train.columns)

In [420]:
over= X_train.loc[:,['p0_over', 'p1_over']]
over['winner'] = Y_train

In [281]:
X_train['p0_over'] = df_train['p0_over']
X_train['p1_over'] = df_train['p1_over']
X_train['player0_starting'] = df_train['player0_starting']
X_train['player1_starting'] = df_train['player1_starting']


In [42]:
def lgb_cv(num_leaves, learning_rate, n_estimators, subsample,colsample_bytree, reg_alpha, reg_lambda,bagging_fraction,max_depth, x_data=None, y_data=None, n_splits=5, output='score'):
    score = 0
    kf = KFold(n_splits=n_splits)
    models = []
    for train_index, valid_index in kf.split(x_data):
        x_train, y_train = x_data.iloc[train_index], y_data[train_index]
        x_valid, y_valid = x_data.iloc[valid_index], y_data[valid_index]
        
        model = lgb.LGBMClassifier(
            num_leaves = int(num_leaves), 
            learning_rate = learning_rate, 
            n_estimators = int(n_estimators), 
            subsample = np.clip(subsample, 0, 1), 
            colsample_bytree = np.clip(colsample_bytree, 0, 1), 
            reg_alpha = reg_alpha, 
            reg_lambda = reg_lambda,
            bagging_fraction = bagging_fraction,
            max_depth = int(max_depth),
        )
        
        model.fit(x_train, y_train)
        models.append(model)
        
        pred = model.predict_proba(x_valid)[:, 1]
        true = y_valid
        score += roc_auc_score(true, pred)/n_splits
    
    if output == 'score':
        return score
    if output == 'model':
        return models

In [399]:
def xgb_cv(learning_rate, subsample,colsample_bytree ,gamma,reg_alpha, reg_lambda,max_depth, x_data=None, y_data=None, n_splits=5, output='score'):
    score = 0
    kf = KFold(n_splits=n_splits)
    models = []
    for train_index, valid_index in kf.split(x_data):
        x_train, y_train = x_data.iloc[train_index], y_data[train_index]
        x_valid, y_valid = x_data.iloc[valid_index], y_data[valid_index]
        
        model = xgb.XGBClassifier(
            learning_rate = learning_rate, 
            subsample = np.clip(subsample, 0, 1), 
            colsample_bytree = np.clip(colsample_bytree, 0, 1), 
            reg_alpha = reg_alpha, 
            reg_lambda = reg_lambda,
            max_depth = int(max_depth),
            gamma = gamma,

        )
        
        model.fit(x_train, y_train)
        models.append(model)
        
        pred = model.predict_proba(x_valid)[:, 1]
        true = y_valid
        score += roc_auc_score(true, pred)/n_splits
    
    if output == 'score':
        return score
    if output == 'model':
        return models

In [403]:
func_fixed = partial(xgb_cv, x_data=X_train, y_data=Y_train, n_splits=5, output='score')


xgbBO = BayesianOptimization(
    func_fixed, 
    {
        'learning_rate': (0.01,0.1),  # learning_rate,    범위(0.0001~0.1)
        'subsample': (0.1, 1),            #subsample,        범위(0~1)
        'max_depth' : (0, 30),
        'colsample_bytree': (0.1, 1),      # colsample_bytree, 범위(0~1)
        'reg_alpha': (1, 10),            # reg_alpha,        범위(0~10)
        'reg_lambda': (0.1, 50),           # reg_lambda,       범위(0~50)
        'gamma' : (0.01, 1),

    }, 
    random_state=4321                # 시드 고정
)
xgbBO.maximize(init_points=5, n_iter=30) # 처음 5회 랜덤 값으로 score 계산 후 30회 최적화

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
|  1        |  0.6393   |  0.1637   |  0.8169   |  0.07911  |  8.591    |  2.738    |  48.95    |  0.4656   |
|  2        |  0.6494   |  0.782    |  0.09826  |  0.03789  |  18.57    |  5.139    |  10.99    |  0.6972   |
|  3        |  0.6523   |  0.7108   |  0.9508   |  0.03531  |  18.6     |  4.45     |  20.08    |  0.9484   |
|  4        |  0.6498   |  0.9369   |  0.9489   |  0.04379  |  10.27    |  6.983    |  2.212    |  0.309    |
|  5        |  0.6351   |  0.4871   |  0.08716  |  0.07899  |  25.62    |  2.354    |  5.147    |  0.3438   |
|  6        |  0.6505   |  0.3609   |  0.2887   |  0.058    |  29.5     |  9.967    |  49.27    |  0.8102   |
|  7        |  0.5      |  0.1914   |  0.8096   |  0.05285  |  0.5651   |  9.965    |  34.79    |  0.3456   |
|  8      

KeyboardInterrupt: 

In [43]:
func_fixed = partial(lgb_cv, x_data=output, y_data=y_train, n_splits=5, output='score')

lgbBO = BayesianOptimization(
    func_fixed, 
    {
        'num_leaves': (64,128),        # num_leaves,       범위(16~1024)
        'learning_rate': (0.001,0.01),  # learning_rate,    범위(0.0001~0.1)
        'n_estimators': (2048, 4096),      # n_estimators,     범위(16~1024)
        'subsample': (0.1, 1),            #subsample,        범위(0~1)
        'max_depth' : (-1,-1),
        'colsample_bytree': (0.1, 1),      # colsample_bytree, 범위(0~1)
        'reg_alpha': (1, 10),            # reg_alpha,        범위(0~10)
        'reg_lambda': (1, 50),           # reg_lambda,       범위(0~50)
        'bagging_fraction': (0.1,1),
    }, 
    random_state=4321                # 시드 고정
)
lgbBO.maximize(init_points=5, n_iter=30) # 처음 5회 랜덤 값으로 score 계산 후 30회 최적화

|   iter    |  target   | baggin... | colsam... | learni... | max_depth | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
|  1        |  0.6514   |  0.1637   |  0.8336   |  0.007911 | -1.0      |  2.443e+0 |  126.7    |  4.656    |  38.13    |  0.1802   |
|  2        |  0.6531   |  0.3789   |  0.657    |  0.005139 | -1.0      |  3.407e+0 |  107.4    |  9.553    |  14.78    |  0.6579   |
|  3        |  0.648    |  0.445    |  0.4603   |  0.009484 | -1.0      |  3.99e+03 |  88.03    |  4.081    |  33.57    |  0.1381   |
|  4        |  0.6567   |  0.309    |  0.4871   |  0.001701 | -1.0      |  3.797e+0 |  73.63    |  1.91     |  14.27    |  0.1271   |
|  5        |  0.652    |  0.854    |  0.6378   |  0.00938  | -1.0      |  2.819e+0 |  79.6     |  6.416    |  17.16    |  0.5155   |
|  6        |  0.6384   |  0.3095   |  0.159    |  0.003552 | 

In [51]:
params = lgbBO.max['params']
models = lgb_cv(
    params['num_leaves'], 
    params['learning_rate'], 
    params['n_estimators'], 
    params['subsample'], 
    params['colsample_bytree'], 
    params['reg_alpha'], 
    params['reg_lambda'],
    params['bagging_fraction'],
    params['max_depth'],
    x_data=X_train, y_data=y_train, n_splits=5, output='model')

In [59]:
x_test, _ = data_preparation(test, answer=False)

100%|███████████████████████████████████████████████████████████████████████████| 16787/16787 [02:31<00:00, 110.53it/s]


In [65]:
#카메라가 상대방의 특정 영역안으로 들어간 수
p0 = []
p1 = []
compare = euclidean_distance([0,0], [40,40])
    
for j in range(38872,55659):
    new_test = test.loc[test['game_id']==j]
    new_test = new_test.loc[new_test['event']=='Camera']
    p0_df = new_test.loc[new_test['player']==0]
    p1_df = new_test.loc[new_test['player']==1]
        
    player_0_camera = np.array([item.replace('at (', '').replace(')', '').split(',') for item in p0_df['event_contents']]).astype(float)
    player_1_camera = np.array([item.replace('at (', '').replace(')', '').split(',') for item in p1_df['event_contents']]).astype(float)
    
    if len(player_0_camera) == 0:
        p0_over = 0
    if len(player_1_camera) != 0:
        p0_over = 0
        p1_start = player_1_camera[0]
        for i in player_0_camera:
            dist = euclidean_distance(p1_start, i)
            if dist < compare:
                p0_over += 1
                
    if len(player_1_camera) == 0:
        p1_over = 0
    if len(player_0_camera) != 0:
        p1_over = 0
        p0_start = player_0_camera[0] 
        for i in player_1_camera:
            dist = euclidean_distance(p0_start, i)
            if dist < compare:
                p1_over += 1
            
    p0.append(p0_over)
    p1.append(p1_over)
    
x_test['p0_over'] = p0
x_test['p1_over'] = p1

In [67]:
# 처음 기록 된 카메라 좌표를 기록
x_test_p0 = test[(test.event=='Camera')&(test.player==0)]
x_test_p0 = x_test_p0[x_test_p0.shift(1).game_id!=x_test_p0.game_id] # 쉬프트를 이용하여 각 게임의 첫번째 데이터 찾기
x_test_p0 = x_test_p0.iloc[:, [0,5]].rename({'event_contents':'player0_starting'}, axis = 1)
x_test_p0.index = x_test_p0['game_id']
x_test_p0 = x_test_p0.drop(['game_id'], axis=1)
x_test = pd.merge(x_test, x_test_p0, on='game_id', how='left')
del x_test_p0

x_test_p1 = test[(test.event=='Camera')&(test.player==1)]
x_test_p1 = x_test_p1[x_test_p1.shift(1).game_id!=x_test_p1.game_id]
x_test_p1 = x_test_p1.iloc[:, [0,5]].rename({'event_contents':'player1_starting'}, axis = 1)
x_test_p1.index = x_test_p1['game_id']
x_test_p1 = x_test_p1.drop(['game_id'], axis=1)
x_test = pd.merge(x_test, x_test_p1, on='game_id', how='left')
del x_test_p1


# x, y 값으로 분리
x_test['player0_starting'] = x_test.player0_starting.str.split('(').str[1]
x_test['player0_starting'] = x_test.player0_starting.str.split(')').str[0]
split_xy = x_test.player0_starting.str.split(',')
x_test['player0_x'] = split_xy.str[0].astype('float')
x_test['player0_y'] = split_xy.str[1].astype('float')
del split_xy

x_test['player1_starting'] = x_test.player1_starting.str.split('(').str[1]
x_test['player1_starting'] = x_test.player1_starting.str.split(')').str[0]
split_xy = x_test.player1_starting.str.split(',')
x_test['player1_x'] = split_xy.str[0].astype('float')
x_test['player1_y'] = split_xy.str[1].astype('float')
del split_xy

# 플레이어의 x,y 좌표를 하나로 모음
location_p0 = x_test.loc[:, ['player0_x', 'player0_y']]
location_p0 = location_p0.rename({'player0_x':'location_x', 'player0_y':'location_y'}, axis=1)

location_p1 = x_test.loc[:, ['player1_x', 'player1_y']]
location_p1 = location_p1.rename({'player1_x':'location_x', 'player1_y':'location_y'}, axis=1)
location_p1.index += location_p0.index[-1]+1

location = pd.concat([location_p0, location_p1])
location = location.dropna()
del location_p0, location_p1

kmeans_clst = KMeans(n_clusters=15).fit(location)
location['starting'] = kmeans_clst.labels_+1

# kmeans로 찾은 15개의 포인트에서 각 데이터들의 거리 계산
for cluster in range(15):
    point = location[location.starting==cluster+1]
    loc = point.loc[:,['location_x', 'location_y']]
    del point
    loc['center_x'] = kmeans_clst.cluster_centers_[cluster][0]
    loc['center_y'] = kmeans_clst.cluster_centers_[cluster][1]
    distance = np.sqrt(np. square(loc.location_x - loc.center_x) + np.square(loc.location_y - loc.center_y))
    location.loc[loc.index, 'distance'] = distance
    del loc
    
# 일정 거리(5)이상 떨어진 데이터는 starting을 0으로 지정
idx = location[location.distance>5].index
location.loc[idx, 'starting'] = 0
del idx

# 클러스터링한 결과 반영
x_test['player0_starting'] = location.loc[x_test.index, 'starting']
location.index -= (x_test.index[-1]+1)
x_test['player1_starting'] = location.loc[x_test.index, 'starting']
del location

# 불필요한 컬럼 삭제
x_test = x_test.drop(['player0_x', 'player0_y', 'player1_x', 'player1_y'], axis = 1)
x_test = x_test.fillna(0)

In [70]:
X_train.columns

Index(['time', 'event_count_0', 'event_count_1', 'Camera_count_0',
       'Camera_count_1', 'Selection_count_0', 'Selection_count_1',
       'Ability_count_0', 'Ability_count_1', 'Right Click_count_0',
       'Right Click_count_1', 'SetControlGroup_count_0',
       'SetControlGroup_count_1', 'GetControlGroup_count_0',
       'GetControlGroup_count_1', 'AddToControlGroup_count_0',
       'AddToControlGroup_count_1', 'ControlGroup_count_0',
       'ControlGroup_count_1', 'p0_over', 'p1_over', 'p0_species',
       'p1_species', 'player0_starting', 'player1_starting'],
      dtype='object')

In [71]:
x_test.columns

Index(['P0_species', 'P0_Ability', 'P0_AddToControlGroup', 'P0_Camera',
       'P0_ControlGroup', 'P0_GetControlGroup', 'P0_Right Click',
       'P0_Selection', 'P0_SetControlGroup', 'P1_species', 'P1_Ability',
       'P1_AddToControlGroup', 'P1_Camera', 'P1_ControlGroup',
       'P1_GetControlGroup', 'P1_Right Click', 'P1_Selection',
       'P1_SetControlGroup', 'delta_Ability', 'delta_AddToControlGroup',
       'delta_Camera', 'delta_ControlGroup', 'delta_GetControlGroup',
       'delta_Right Click', 'delta_Selection', 'delta_SetControlGroup',
       'p0_over', 'p1_over', 'player0_starting', 'player1_starting'],
      dtype='object')

In [69]:
preds = []
for model in models:
    pred = model.predict_proba(x_test)[:, 1]
    preds.append(pred)
pred = np.mean(preds, axis=0)

submission = pd.read_csv('C:/Users/ASUS/Documents/data_competition/스타크래프트 승률 예측/data/sample_submission.csv', index_col=0)
submission['winner'] = submission['winner'] + pred
submission.to_csv('C:/Users/ASUS/Documents/data_competition/스타크래프트 승률 예측/data/submission.csv')
submission.head()

ValueError: Number of features of the model must match the input. Model n_features_ is 25 and input n_features is 30 

In [50]:
params

{'bagging_fraction': 0.2618582257562451,
 'colsample_bytree': 0.8897619128614919,
 'learning_rate': 0.002839555434377229,
 'max_depth': -1.0,
 'n_estimators': 4093.642050024844,
 'num_leaves': 68.655010840897,
 'reg_alpha': 1.0939750550975016,
 'reg_lambda': 46.80670758865449,
 'subsample': 0.9281966491632396}

In [56]:
x_test.head()

Unnamed: 0_level_0,P0_species,P0_Ability,P0_AddToControlGroup,P0_Camera,P0_ControlGroup,P0_GetControlGroup,P0_Right Click,P0_Selection,P0_SetControlGroup,P1_species,...,P1_Selection,P1_SetControlGroup,delta_Ability,delta_AddToControlGroup,delta_Camera,delta_ControlGroup,delta_GetControlGroup,delta_Right Click,delta_Selection,delta_SetControlGroup
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
38872,1,31.0,1.0,232.0,0.0,9.0,241.0,52.0,5.0,1,...,41.0,0.0,-7.0,-3.0,-235.0,0.0,-34.0,119.0,11.0,5.0
38873,2,34.0,0.0,200.0,4.0,18.0,51.0,114.0,0.0,1,...,19.0,5.0,6.0,-1.0,54.0,4.0,-104.0,-17.0,95.0,-5.0
38874,0,33.0,0.0,245.0,0.0,63.0,79.0,42.0,7.0,0,...,36.0,5.0,4.0,0.0,25.0,0.0,-363.0,-29.0,6.0,2.0
38875,0,95.0,8.0,515.0,0.0,255.0,218.0,123.0,9.0,1,...,99.0,2.0,37.0,7.0,158.0,0.0,231.0,-104.0,24.0,7.0
38876,2,7.0,2.0,129.0,0.0,387.0,30.0,25.0,2.0,2,...,35.0,1.0,-1.0,2.0,-32.0,0.0,386.0,23.0,-10.0,1.0
