# map data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

import warnings                             
warnings.filterwarnings("ignore")

In [2]:
%%time
# Load the training event log from data2/train.csv into a DataFrame.
train = pd.read_csv('data2/train.csv')

CPU times: user 49.1 s, sys: 30.5 s, total: 1min 19s
Wall time: 2min 25s


In [4]:
# Remove the 'winner' column before train is stacked with test
# (presumably the label column, absent from test.csv -- TODO confirm).
train = train.drop(columns=['winner'])

In [3]:
%%time
# Load the test event log from data2/test.csv into a DataFrame.
test = pd.read_csv('data2/test.csv')

CPU times: user 20.3 s, sys: 10.1 s, total: 30.4 s
Wall time: 1min 2s


In [5]:
# Stack the train and test logs into one frame; each frame keeps its own row
# labels. pd.concat is used because DataFrame.append was deprecated and later
# removed from pandas.
total = pd.concat([train, test])

In [10]:
# Column-less frame indexed by every distinct game_id (index named 'game_id');
# per-game columns are merged onto it afterwards.
unique_games = pd.Index(total['game_id'].unique(), name='game_id')
df_total = pd.DataFrame(index=unique_games)
del unique_games

In [13]:
%%time
# Record the first logged camera position of each player in every game.

def _first_camera_position(events, player, column):
    """Return the first 'Camera' event_contents of `player` per run of game_id.

    events: event log with 'game_id', 'player', 'event' and 'event_contents'
        columns.
    player: value matched against the 'player' column.
    column: name given to the returned event_contents column.

    Returns a one-column DataFrame indexed by game_id.
    """
    # Columns are picked by name rather than by position, so a different
    # column order in the CSV cannot silently select the wrong data.
    camera = events.loc[
        (events['event'] == 'Camera') & (events['player'] == player),
        ['game_id', 'event_contents'],
    ]
    # A row is the first of its game when the previous camera row belongs to a
    # different game_id. Only the game_id column is shifted; shifting the whole
    # frame would copy every column just to compare one.
    starts_game = camera['game_id'] != camera['game_id'].shift(1)
    first_rows = camera.loc[starts_game]
    return first_rows.rename(columns={'event_contents': column}).set_index('game_id')


for _player, _column in ((0, 'player0_starting'), (1, 'player1_starting')):
    # Left merge keeps every game in df_total; games without a camera event
    # for this player get NaN.
    df_total = pd.merge(
        df_total,
        _first_camera_position(total, _player, _column),
        on='game_id',
        how='left',
    )
del _player, _column

CPU times: user 22.7 s, sys: 59.6 s, total: 1min 22s
Wall time: 5min 58s


In [15]:
# Split each player's starting text into numeric x and y columns.
for prefix in ('player0', 'player1'):
    raw = df_total[prefix + '_starting']
    # Keep the second '('-separated field, cut at its first ')'.
    inner = raw.str.split('(').str[1].str.split(')').str[0]
    df_total[prefix + '_starting'] = inner
    # The remaining text is comma-separated: first field is x, second is y.
    parts = inner.str.split(',')
    df_total[prefix + '_x'] = parts.str[0].astype('float')
    df_total[prefix + '_y'] = parts.str[1].astype('float')
del prefix, raw, inner, parts

In [16]:
# Stack both players' x/y coordinates into one two-column frame.
xy_names = ['location_x', 'location_y']

p0_xy = df_total[['player0_x', 'player0_y']].rename(
    columns={'player0_x': xy_names[0], 'player0_y': xy_names[1]}
)
p1_xy = df_total[['player1_x', 'player1_y']].rename(
    columns={'player1_x': xy_names[0], 'player1_y': xy_names[1]}
)
# Offset player-1 labels by (last player-0 label + 1) so that they do not
# reuse the player-0 labels.
p1_xy.index = p1_xy.index + (p0_xy.index[-1] + 1)

# Rows with a missing coordinate are discarded.
location = pd.concat([p0_xy, p1_xy]).dropna()
del xy_names, p0_xy, p1_xy

In [17]:
# # Visualize all starting points
# sns.lmplot('location_x', 'location_y', data = location, fit_reg=False)
# plt.title('starting point')
# plt.show()

In [18]:
# # Counting the values of the starting column shows that 15 points occur very often
# # -> there are 15 starting points
# df_train.player0_starting.value_counts().head(20)

In [19]:
# Cluster the starting coordinates into 15 groups with k-means.
# - Only the two coordinate columns are fitted, so re-running this cell after
#   'starting' has been added does not feed the old labels back in.
# - random_state is pinned so the cluster numbering (and everything derived
#   from it) is reproducible between runs.
# - n_init is spelled out because the library default has changed across
#   scikit-learn releases.
kmeans_clst = KMeans(n_clusters=15, n_init=10, random_state=0).fit(
    location[['location_x', 'location_y']]
)
# Cluster labels are shifted to start at 1.
location['starting'] = kmeans_clst.labels_ + 1

In [20]:
# location.head()

In [21]:
# # Visualize the clustering result
# sns.lmplot('location_x', 'location_y', data = location, fit_reg=False, hue="starting")
# plt.title('starting point')
# plt.show()

# # Games whose early-game camera records are missing end up in the wrong cluster

In [22]:
# Distance from every row to the centre of the k-means cluster it belongs to.
labels = location['starting'].values
# Labels are 1-based; rows with label 0 have no centre and are left as NaN.
has_cluster = labels > 0
centers = kmeans_clst.cluster_centers_[labels[has_cluster] - 1]
dx = location['location_x'].values[has_cluster] - centers[:, 0]
dy = location['location_y'].values[has_cluster] - centers[:, 1]
distance = np.full(len(location), np.nan)
distance[has_cluster] = np.sqrt(np.square(dx) + np.square(dy))
# Assigned positionally (plain array), so no index alignment is involved.
location['distance'] = distance
del labels, has_cluster, centers, dx, dy, distance

In [23]:
# Rows whose distance to their cluster centre exceeds 5 get starting = 0.
too_far = location['distance'] > 5
location.loc[too_far, 'starting'] = 0
del too_far

In [24]:
# # In the plot, games with an unknown starting point show up in pink (0), and the 15 starting points are clearly visible.
# sns.lmplot('location_x', 'location_y', data = location, fit_reg=False, hue="starting")
# plt.title('starting point')
# plt.show()

In [25]:
# df_train.head()

In [26]:
# Rebuild df_total as one row per game_id holding both players' cluster labels
# (player0_starting / player1_starting), with 0 where no label is available.
#
# In `location`, player-0 rows are labelled with the game_id itself and
# player-1 rows with game_id + (last game_id + 1), so each player's label is
# looked up directly by index. `location` itself is left unmodified.
p1_offset = df_total.index[-1] + 1
cluster_label = location['starting']

df_total = pd.DataFrame(
    {
        'player0_starting': cluster_label.reindex(df_total.index).values,
        'player1_starting': cluster_label.reindex(df_total.index + p1_offset).values,
    },
    index=df_total.index,
)
# Games whose coordinates were dropped have no row in `location`; mark them 0.
df_total = df_total.fillna(0)
del p1_offset, cluster_label

In [27]:
# Group starting points into maps: points that face each other belong to the
# same map. Each row of map_list is one map, sorted ascending, with 999 as a
# filler when the map has no third point.
map_rows = []
for point in range(1, 16):
    partners = df_total.loc[df_total.player0_starting == point, 'player1_starting']
    # 0 marks an unknown start, not a real location, so it cannot be a partner.
    partners = partners[partners != 0].value_counts()
    if partners.empty:
        # This point never appears as player 0 with a known opponent start.
        continue
    # value_counts is sorted by frequency; a second partner only counts when
    # it was seen in at least 100 games.
    has_third = len(partners) > 1 and partners.iloc[1] >= 100
    third = partners.index[1] if has_third else 999
    map_rows.append([point, partners.index[0], third])
# reshape keeps the array two-dimensional even when no row was collected.
map_list = np.array(map_rows, dtype=float).reshape(-1, 3)
# Sorting inside each row makes the triples order-independent, so np.unique
# collapses the copies found from each member point.
map_list = np.unique(np.sort(map_list, axis=1), axis=0)
del map_rows

In [28]:
# There are six 2-player maps and one 3-player map.

# Side note: the only 3-player map ever used on the StarCraft II: Legacy of the Void ladder was 'Catallena'.
# This tells us the games were played between 2017-07-20 and 2017-11-16.
# The other six maps are 'Abyssal Reef', 'Ascension to Aiur', 'Acolyte', 'Interloper', 'Odyssey' and 'Mech Depot'.
# The map names do not matter for predicting the result, though.
map_list

array([[  1.,   5.,  14.],
       [  2.,  10., 999.],
       [  3.,   7., 999.],
       [  4.,   8., 999.],
       [  6.,  15., 999.],
       [  9.,  11., 999.],
       [ 12.,  13., 999.]])

In [29]:
# Number of games in which at least one player's starting point is 0 (unknown).
int(df_total[['player0_starting', 'player1_starting']].eq(0).any(axis=1).sum())

360

In [30]:
# Infer an unknown (0) starting point from map_list and the opponent's start.
column_pairs = (
    ('player0_starting', 'player1_starting'),  # fill player 0 from player 1
    ('player1_starting', 'player0_starting'),  # then player 1 from player 0
)
for first_pt, second_pt, third_pt in map_list:
    for unknown_col, known_col in column_pairs:
        # Opponent on the first (or third) point -> assign the second point;
        # any still-unknown row with the opponent on the second (or third)
        # point -> assign the first point.
        for opponent_pt, assigned_pt in ((first_pt, second_pt), (second_pt, first_pt)):
            to_fill = (df_total[unknown_col] == 0) & df_total[known_col].isin(
                [opponent_pt, third_pt]
            )
            df_total.loc[to_fill, unknown_col] = assigned_pt
del column_pairs

In [31]:
# Show the first games that still have an unknown (0) starting point; the
# recorded output has no rows, i.e. every game was resolved.
df_total[(df_total.player0_starting == 0)|(df_total.player1_starting == 0)].head()

Unnamed: 0_level_0,player0_starting,player1_starting
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1


In [32]:
# Add a 'map' column: the row number in map_list of the map that contains
# player 0's starting point.
for map_num, map_points in enumerate(map_list):
    on_this_map = df_total['player0_starting'].isin(map_points)
    df_total.loc[on_this_map, 'map'] = map_num
del map_list

In [33]:
# Starting-point and map clustering is finished; preview the result.
df_total.head()

Unnamed: 0_level_0,player0_starting,player1_starting,map
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2.0,10.0,1.0
1,6.0,15.0,4.0
2,13.0,12.0,6.0
3,7.0,3.0,2.0
4,9.0,11.0,5.0


In [34]:
# Write the per-game starting points and map id to data2/map_data.csv
# (the game_id index is written as the first column).
df_total.to_csv('data2/map_data.csv')