In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics import roc_auc_score   # AUC 스코어 계산
from sklearn.model_selection import KFold   # K-fold CV    
from bayes_opt import BayesianOptimization  # 베이지안 최적화 라이브러리  
from functools import partial               # 함수 변수 고정
import lightgbm as lgb

In [3]:
train=pd.read_csv('train.csv',encoding="utf-8")
test=pd.read_csv('test.csv',encoding="utf-8")
sample_submission=pd.read_csv('sample_submission.csv',encoding="utf-8")   

## 1.EDA

In [4]:
train.head()

Unnamed: 0,game_id,winner,time,player,species,event,event_contents
0,0,1,0.0,0,T,Camera,"at (145.25, 21.5078125)"
1,0,1,0.0,1,T,Camera,"at (22.75, 147.0078125)"
2,0,1,0.02,0,T,Selection,['OrbitalCommand [3080001]']
3,0,1,0.02,0,T,Ability,(1360) - TrainSCV
4,0,1,0.14,0,T,Camera,"at (142.99609375, 24.50390625)"


### Ability만 분류

In [4]:
df_train = train[train["event"] == "Ability"]
df_test = test[test["event"] == "Ability"]

In [5]:
df_train

Unnamed: 0,game_id,winner,time,player,species,event,event_contents
3,0,1,0.02,0,T,Ability,(1360) - TrainSCV
19,0,1,0.27,0,T,Ability,"(1021) - BuildSupplyDepot; Location: (135.0, 4..."
27,0,1,0.31,0,T,Ability,(480) - Stop
29,0,1,0.33,1,T,Ability,(1360) - TrainSCV
37,0,1,0.35,0,T,Ability,(1360) - TrainSCV
...,...,...,...,...,...,...,...
67091714,38871,0,8.41,0,Z,Ability,(17E1) - ResearchEvolveMuscularAugments
67091715,38871,0,8.42,1,T,Ability,(B40) - CalldownMULE; Target: DestructibleSign...
67091726,38871,0,8.44,0,Z,Ability,(16E2) - BuildExtractor; Target: DestructibleS...
67091731,38871,0,8.46,1,T,Ability,"(102D) - BuildArmory; Location: (79.5, 114.5, ..."


In [6]:
df_test

Unnamed: 0,game_id,time,player,species,event,event_contents
3,38872,0.01,1,P,Ability,(15E0) - TrainProbe
29,38872,0.11,0,P,Ability,(15E0) - TrainProbe
69,38872,0.25,1,P,Ability,"(1541) - BuildPylon; Location: (121.0, 150.0, ..."
75,38872,0.31,1,P,Ability,(15E0) - TrainProbe
82,38872,0.36,0,P,Ability,(15E0) - TrainProbe
...,...,...,...,...,...,...
28714832,55658,4.49,0,Z,Ability,"(5A0) - Attack; Location: (125.526611328125, 4..."
28714834,55658,4.50,1,T,Ability,"(5A0) - Attack; Location: (120.14794921875, 47..."
28714836,55658,4.50,0,Z,Ability,"(5A0) - Attack; Location: (126.2021484375, 44...."
28714837,55658,4.51,1,T,Ability,"(5A0) - Attack; Location: (126.635498046875, 4..."


In [7]:
#12BE는 설명이 없음
df_test[df_test.event_contents.str.contains('12BE')]

Unnamed: 0,game_id,time,player,species,event,event_contents
96691,38927,3.55,0,T,Ability,(12BE)
1234337,39584,7.47,1,T,Ability,(12BE)
1308299,39625,4.41,0,T,Ability,(12BE)
1381990,39666,7.25,1,T,Ability,(12BE)
1596592,39786,5.00,0,T,Ability,(12BE)
...,...,...,...,...,...,...
26623369,54446,7.57,0,T,Ability,(12BE)
27268900,54824,5.35,0,T,Ability,(12BE)
27358047,54874,8.19,1,T,Ability,(12BE)
27438881,54924,9.21,1,T,Ability,(12BE)


In [8]:
df_train.event_contents.nunique()

906693

In [9]:
# 칼럼 단어만 추출하기
def eliminated_sc(text):
    word = text.split(';')
    col_name = word[0].split(' - ')
    if (len(col_name) >1):
        result = col_name[1]
    else:
        result = 'nothing'
    return result

In [10]:
df_train.event_contents = df_train.event_contents.apply(lambda x : eliminated_sc(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [11]:
df_test.event_contents = df_test.event_contents.apply(lambda x : eliminated_sc(x))

In [12]:
df_train

Unnamed: 0,game_id,winner,time,player,species,event,event_contents
3,0,1,0.02,0,T,Ability,TrainSCV
19,0,1,0.27,0,T,Ability,BuildSupplyDepot
27,0,1,0.31,0,T,Ability,Stop
29,0,1,0.33,1,T,Ability,TrainSCV
37,0,1,0.35,0,T,Ability,TrainSCV
...,...,...,...,...,...,...,...
67091714,38871,0,8.41,0,Z,Ability,ResearchEvolveMuscularAugments
67091715,38871,0,8.42,1,T,Ability,CalldownMULE
67091726,38871,0,8.44,0,Z,Ability,BuildExtractor
67091731,38871,0,8.46,1,T,Ability,BuildArmory


In [13]:
df_test

Unnamed: 0,game_id,time,player,species,event,event_contents
3,38872,0.01,1,P,Ability,TrainProbe
29,38872,0.11,0,P,Ability,TrainProbe
69,38872,0.25,1,P,Ability,BuildPylon
75,38872,0.31,1,P,Ability,TrainProbe
82,38872,0.36,0,P,Ability,TrainProbe
...,...,...,...,...,...,...
28714832,55658,4.49,0,Z,Ability,Attack
28714834,55658,4.50,1,T,Ability,Attack
28714836,55658,4.50,0,Z,Ability,Attack
28714837,55658,4.51,1,T,Ability,Attack


In [14]:
#아무것도 아닌 값은 제외
df_train = df_train[df_train.event_contents.str.contains('nothing') == False]

In [15]:
df_test = df_test[df_test.event_contents.str.contains('nothing') == False]

In [169]:
def time_make_column(game_id,player,species,event_contents,time):
    player_num = "p"+str(player)
    time_limit = int(time)
    if(time_limit > 7): #8분부터
        time_column = 'last'
    elif(time_limit > 3): #4분부터
        time_column = 'mid'
    else: #4분이하
        time_column = 'init'
    col_name = player_num+"_"+str(event_contents) +"_t_" + time_column
    col_name2 = player_num+"_species"
    if col_name in df.columns:
        df.loc[game_id,col_name] += 1
        df.loc[game_id,col_name2] = str(species)
    else:
        df[col_name] = 0
        df[col_name2] = 0
        df.loc[game_id,col_name] += 1
        df.loc[game_id,col_name2] = str(species)            

In [1]:
def make_column(game_id,player,species,event_contents):
    player_num = "p"+str(player)
    col_name = player_num+"_"+str(event_contents)
    col_name2 = player_num+"_species"
    if col_name in df.columns:
        df.loc[game_id,col_name] += 1
        df.loc[game_id,col_name2] = str(species)
    else:
        df[col_name] = 0
        df[col_name2] = 0
        df.loc[game_id,col_name] += 1
        df.loc[game_id,col_name2] = str(species)            

In [16]:
df = pd.DataFrame(train["game_id"].unique(),columns=["game_id"])

In [17]:
df

Unnamed: 0,game_id
0,0
1,1
2,2
3,3
4,4
...,...
38867,38867
38868,38868
38869,38869
38870,38870


In [21]:
df_train.apply(lambda x : make_column(x["game_id"],x["player"],x["species"], x["event_contents"]) , axis = 1 )

3           None
19          None
27          None
29          None
37          None
            ... 
67091714    None
67091715    None
67091726    None
67091731    None
67091775    None
Length: 3797368, dtype: object

In [None]:
df['time'] = np.array(train[train.shift(-1).game_id != train.game_id].time)

In [None]:
df_t = pd.DataFrame(test["game_id"].unique(),columns=["game_id"])
df_t = df_t.set_index("game_id")

In [182]:
def make_column_test_time(game_id,player,species,event_contents,time):
    player_num = "p"+str(player)
    time_limit = int(time)
    if(time_limit > 7): #8분부터
        time_column = 'last'
    elif(time_limit > 3): #4분부터
        time_column = 'mid'
    else: #4분이하
        time_column = 'init'
    col_name = player_num+"_"+str(event_contents) +"_t_" + time_column
    col_name2 = player_num+"_species"
    if col_name in df_t.columns:
        df_t.loc[game_id,col_name] += 1
        df_t.loc[game_id,col_name2] = str(species)
    else:
        df_t[col_name] = 0
        df_t[col_name2] = 0
        df_t.loc[game_id,col_name] += 1
        df_t.loc[game_id,col_name2] = str(species)            

In [22]:
def make_column_test(game_id,player,species,event_contents):
    player_num = "p"+str(player)
    col_name = player_num+"_"+str(event_contents) 
    col_name2 = player_num+"_species"
    if col_name in df_t.columns:
        df_t.loc[game_id,col_name] += 1
        df_t.loc[game_id,col_name2] = str(species)  
    else:
        df_t[col_name] = 0
        df_t[col_name2] = 0
        df_t.loc[game_id,col_name] += 1
        df_t.loc[game_id,col_name2] = str(species)  

In [23]:
df_test.apply(lambda x : make_column_test(x["game_id"],x["player"],x["species"], x["event_contents"]) , axis = 1 )

3           None
29          None
69          None
75          None
82          None
            ... 
28714832    None
28714834    None
28714836    None
28714837    None
28714839    None
Length: 1630618, dtype: object

In [173]:
df.shape

(38872, 1619)

In [38]:
df

Unnamed: 0,game_id,p0_TrainSCV,p0_species,p0_BuildSupplyDepot,p0_Stop,p1_TrainSCV,p1_species,p1_BuildSupplyDepot,p1_BuildRefinery,p0_BuildBarracks,...,p1_HallucinateVoidRay,p1_PsionicStorm,p1_UpgradeGroundWeapons3,p0_BurrowRavagerDown,p0_BurrowRavagerUp,p0_StimpackRedirect,p0_HallucinateArchon,p0_EvolveAdrenalGlands,p0_BurrowInfestor,p0_SpawnInfestedTerran
0,0,9,0,4,1,6,0,3,2,2,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,17,0,2,2,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,7,0,6,8,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
4,4,14,0,2,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38867,38867,11,T,2,0,0,P,0,0,1,...,0,0,0,0,0,0,0,0,0,0
38868,38868,7,T,3,0,0,P,0,0,1,...,0,0,0,0,0,0,0,0,0,0
38869,38869,0,P,0,1,0,Z,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38870,38870,0,P,0,0,0,P,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
df.to_csv("traindata_200313.csv",encoding="utf-8",index=False)

In [175]:
df.to_csv("train_time_add.csv",encoding="utf-8",index=False)

In [39]:
df_t

Unnamed: 0_level_0,p1_TrainProbe,p1_species,p0_TrainProbe,p0_species,p1_BuildPylon,p1_BuildForge,p0_BuildAssimilator,p1_BuildAssimilator,p1_BuildPhotonCannon,p0_BuildPylon,...,p0_CancelUpgradeToPlanetaryFortress,p1_UnloadUnitCommandCenter,p1_HealMedivac,p0_ResearchTerranVehicleAndShipArmorsLevel2,p0_EvolveFlyerAttacks2,p1_UpgradeShields2,p0_EvolveFlyerCarapace2,p1_EvolveChitinousPlating,p0_LurkerHoldFire,p0_BuildPointDefenseDrone
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
38872,12,0,11,0,5,1,2,2,3,2,...,0,0,0,0,0,0,0,0,0,0
38873,13,0,0,0,4,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
38874,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38875,19,0,0,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38876,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55654,0,T,0,Z,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
55655,0,T,0,Z,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
55656,8,P,6,P,10,1,1,2,5,2,...,0,0,0,0,0,0,0,0,0,0
55657,0,T,0,T,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [192]:
df_t.to_csv("test_time_add.csv",encoding="utf-8",index=False)

In [42]:
df_t.to_csv("testdata_200313.csv",encoding="utf-8")