# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 500)

from sklearn import preprocessing

### Load Data

In [2]:
# Read the three raw CSV inputs from ./data.
# combats.csv is read with explicit column names and no `header=` argument,
# so pandas treats its first line as data.
# NOTE(review): if combats.csv actually starts with a header row, that row is
# ingested as a combat record -- confirm against the file.
combat_columns = ['player1', 'player2', 'winner']

pokemon = pd.read_csv("./data/pokemon.csv")
combats = pd.read_csv("./data/combats.csv", names=combat_columns)
# NOTE(review): `tests` is not referenced by any later cell visible here.
tests = pd.read_csv("./data/tests.csv")

### Join Data

In [3]:
def join1(pokemon, combats):
    """Attach the player-1 Pokemon's attributes to every combat row.

    Parameters
    ----------
    pokemon : pandas.DataFrame
        Pokemon table; must contain the id column ``'#'``.
    combats : pandas.DataFrame
        Combat table; must contain the column ``'player1'``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``combats`` followed by the columns of ``pokemon``.
        Rows follow the order of ``combats``; when ``'#'`` is unique in
        ``pokemon`` there is exactly one output row per combat. A combat
        whose ``player1`` id has no match gets NaN in the Pokemon columns.
    """
    # Left join: the combats are the records being enriched. A right join
    # would instead keep every row of `pokemon` (emitting all-NaN combat rows
    # for Pokemon that never appear as player1), drop unmatched combats, and
    # tie the row order to `pokemon` rather than to `combats`.
    return pd.merge(combats, pokemon, how='left', left_on=['player1'], right_on=['#'])

In [4]:
def join2(pokemon, combats):
    """Attach the player-2 Pokemon's attributes to every combat row.

    Parameters
    ----------
    pokemon : pandas.DataFrame
        Pokemon table; must contain the id column ``'#'``.
    combats : pandas.DataFrame
        Combat table; must contain the column ``'player2'``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``combats`` followed by the columns of ``pokemon``
        (pandas appends ``_x`` / ``_y`` to column names present in both).
        Rows follow the order of ``combats``; when ``'#'`` is unique in
        ``pokemon`` there is exactly one output row per combat. A combat
        whose ``player2`` id has no match gets NaN in the Pokemon columns.
    """
    # Left join: the combats are the records being enriched. A right join
    # would instead keep every row of `pokemon` (emitting all-NaN combat rows
    # for Pokemon that never appear as player2), drop unmatched combats, and
    # tie the row order to `pokemon` rather than to `combats`.
    return pd.merge(combats, pokemon, how='left', left_on=['player2'], right_on=['#'])

###### Join First Column

In [5]:
# First pass: merge the Pokemon table onto the combats via the `player1` id.
joined = join1(pokemon=pokemon, combats=combats)

##### Rename Columns

In [6]:
# Relabel the merged frame positionally: the three combat columns and the
# Pokemon id keep plain names, every Pokemon attribute gets a `_p1` suffix.
# The assignment is by position, so it relies on the merge's column order.
p1_fields = ['Name', 'Type_1', 'Type_2', 'HP', 'Attack', 'Defense',
             'SpAtk', 'SpDef', 'Speed', 'Generation', 'Legendary']
joined.columns = (
    ['player1', 'player2', 'winner', '#']
    + [field + '_p1' for field in p1_fields]
)

###### Join Second Column

In [7]:
# Second pass: merge the Pokemon table onto the already-enriched combats via
# the `player2` id.
joined_final = join2(pokemon=pokemon, combats=joined)

###### Rename Columns

In [8]:
# Relabel the doubly-merged frame positionally. Layout after the second merge:
#   combat columns, first-merge id (`#_x`), the `_p1` attributes,
#   second-merge id (`#_y`), then the same attributes suffixed `_p2`.
# The assignment is by position, so it relies on the merge's column order.
stat_fields = ['Name', 'Type_1', 'Type_2', 'HP', 'Attack', 'Defense',
               'SpAtk', 'SpDef', 'Speed', 'Generation', 'Legendary']
joined_final.columns = (
    ['player1', 'player2', 'winner', '#_x']
    + [field + '_p1' for field in stat_fields]
    + ['#_y']
    + [field + '_p2' for field in stat_fields]
)

#### Encode the winner as a binary `label` column (0 = player1 won, 1 = otherwise)

In [9]:
# Binary target: 0 when the winner id equals player1's id, 1 in every other
# case (including rows where the comparison is False because of NaN).
player1_won = joined_final['player1'] == joined_final['winner']
joined_final['label'] = np.where(player1_won, 0, 1)

##### Remove unused columns

In [10]:
# The id columns are no longer needed once `label` has been derived from them.
# `axis` is passed by keyword: recent pandas releases accept only the labels
# positionally in `DataFrame.drop`, so the former `drop([...], 1)` raises
# a TypeError there.
joined_final = joined_final.drop(['player1', 'player2', 'winner', '#_x', '#_y'], axis=1)

##### Keep only the first 50,000 rows

In [11]:
# Keep the first 50000 rows by position and discard the rest.
# NOTE(review): 50000 is presumably the number of combat records; which rows
# fall past that cut-off depends on the row order produced by the merges
# above -- confirm that only filler rows are being discarded.
joined_final = joined_final.iloc[:50000]

## Fill empty values

In [12]:
# Replace missing values in the categorical columns of both players with the
# placeholder string 'Unknown'. Other columns are left untouched.
for column_name in ('Name_p1', 'Type_1_p1', 'Type_2_p1', 'Legendary_p1',
                    'Name_p2', 'Type_1_p2', 'Type_2_p2', 'Legendary_p2'):
    joined_final[column_name] = joined_final[column_name].fillna('Unknown')

### Categorize

In [13]:
# Categorical columns that are converted to integer codes in the next cell.
columns_to_categorize = [
    # player 1
    'Type_1_p1',
    'Type_2_p1',
    'Name_p1',
    'Legendary_p1',
    # player 2
    'Name_p2',
    'Type_1_p2',
    'Type_2_p2',
    'Legendary_p2',
]

In [14]:
# Replace each categorical column with integer codes from its own, freshly
# fitted LabelEncoder. The encoders are not kept after the loop.
# NOTE(review): because every column gets an independent encoder, the same
# value may receive different codes in a `_p1` column and its `_p2`
# counterpart when the two columns do not contain the same set of values.
for column_name in columns_to_categorize:
    joined_final[column_name] = preprocessing.LabelEncoder().fit_transform(joined_final[column_name])

### Normalize

In [15]:
# Min-max scale every column of `joined_final`, then wrap the resulting array
# in a new DataFrame that reuses the original column names (the new frame
# gets a default index).
scaler = preprocessing.MinMaxScaler()
scaled_values = scaler.fit_transform(joined_final)
df_processed = pd.DataFrame(scaled_values, columns=joined_final.columns)

### Extract to CSV

In [16]:
# Export the normalized frame. Writing `joined_final` here would discard the
# min-max scaling computed in the previous cell and save unscaled values.
# The file gets a header row and no index column.
df_processed.to_csv('./data/data_processed.csv', header=True, index=False)