# Data Preprocess

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 500)

from sklearn import preprocessing

### Load Data

In [2]:
pokemon = pd.read_csv("./data/pokemon.csv")
combats = pd.read_csv("./data/combats.csv", names=['player1', 'player2', 'winner'])
tests = pd.read_csv("./data/tests.csv", names=['player1', 'player2'])

In [3]:
pokemon[:1]

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False


In [4]:
combats[:1]

Unnamed: 0,player1,player2,winner
0,266,298,298


In [5]:
tests[:1]

Unnamed: 0,player1,player2
0,129,117


## Fill empty values

In [6]:
pokemon['Name'] = pokemon['Name'].fillna('Unknown')
pokemon['Type 1'] = pokemon['Type 1'].fillna('Unknown')
pokemon['Type 2'] = pokemon['Type 2'].fillna('Unknown')
pokemon['Legendary'] = pokemon['Legendary'].fillna('Unknown')

### Categorize

In [7]:
columns_to_categorize = ['Type 1', 'Type 2', 'Name', 'Legendary']

In [8]:
for column in columns_to_categorize:
    categorizer = preprocessing.LabelEncoder()
    pokemon[column] = categorizer.fit(pokemon[column]).transform(pokemon[column])

In [9]:
pokemon[:1]

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,68,9,13,45,49,49,65,65,45,1,0


In [10]:
combats[:1]

Unnamed: 0,player1,player2,winner
0,266,298,298


In [11]:
tests[:1]

Unnamed: 0,player1,player2
0,129,117


### Add Test

In [12]:
tests['order'] = range(tests.shape[0])

### Join Data

In [13]:
def join1(combats, pokemon):
    return pd.merge(combats, pokemon, how='right', left_on=['player1'], right_on=['#'])

In [14]:
def join2(combats, pokemon):
    return pd.merge(combats, pokemon, how='right', left_on=['player2'], right_on=['#'])

###### Join First Column

In [15]:
joined = join1(combats, pokemon)

##### Rename Columns

In [16]:
joined.columns = ['player1', u'player2', u'winner', u'#', u'Name_p1', u'Type_1_p1', u'Type_2_p1',
       u'HP_p1', u'Attack_p1', u'Defense_p1', u'SpAtk_p1', u'SpDef_p1', u'Speed_p1',
       u'Generation_p1', u'Legendary_p1']

###### Join Second Column

In [17]:
joined_final = join2(joined, pokemon)

In [18]:
joined_final[:1]

Unnamed: 0,player1,player2,winner,#_x,Name_p1,Type_1_p1,Type_2_p1,HP_p1,Attack_p1,Defense_p1,SpAtk_p1,SpDef_p1,Speed_p1,Generation_p1,Legendary_p1,#_y,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,266.0,298,298.0,266.0,348.0,15.0,10.0,50.0,64.0,50.0,45.0,50.0,41.0,2.0,0.0,298,501,9,1,70,70,40,60,40,60,3,0


###### Rename Columns

In [19]:
joined_final.columns = [     u'player1',      u'player2',       u'winner',          u'#_x',
            u'Name_p1',    u'Type_1_p1',    u'Type_2_p1',        u'HP_p1',
          u'Attack_p1',   u'Defense_p1',      u'SpAtk_p1',      u'SpDef_p1',
            u'Speed_p1', u'Generation_p1',  u'Legendary_p1',          u'#_y',
               u'Name_p2',       u'Type_1_p2',       u'Type_2_p2',           u'HP_p2',
             u'Attack_p2',      u'Defense_p2',      u'SpAtk_p2',      u'SpDef_p2',
              u'Speed_p2',   u'Generation_p2',    u'Legendary_p2']

#### Rename Winner column to Boolean

In [20]:
joined_final['label'] = np.where(joined_final['player1'] == joined_final['winner'], 0, 1)

##### Remove unused columns

In [21]:
joined_final = joined_final.drop(['player1', 'player2', 'winner', '#_x', '#_y'], 1)

##### Delete unused rows

In [22]:
joined_final = joined_final[:50000]

## Join Test Data

In [23]:
test_joined = join1(tests, pokemon)

In [24]:
test_joined = join2(test_joined, pokemon)

### Normalize (optional)

In [25]:
df_processed = preprocessing.MinMaxScaler().fit_transform(joined_final)
df_processed = pd.DataFrame(df_processed, columns=joined_final.columns)

In [26]:
df_test_processed = preprocessing.MinMaxScaler().fit_transform(test_joined)
df_test_processed = pd.DataFrame(df_test_processed, columns=test_joined.columns)

## Order Tests

In [27]:
df_test_processed = df_test_processed.sort_values(['order'])

##### Remove unused columns

In [30]:
df_test_processed = df_test_processed.drop(['player1', 'player2', 'order', '#_x', '#_y'], 1)

### Extract to CSV

In [31]:
df_processed.to_csv('./data/data_processed.csv', header=True, index=False)

In [32]:
df_test_processed.to_csv('./data/data_test_processed.csv', header=True, index=False)