# Reading database into csv

In [1]:
import sqlite3
conn = sqlite3.connect('final.db')
cur = conn.cursor()

In [2]:
# Methods
def fetch_table_names(cur: sqlite3.Cursor) -> [str]:
    '''
    Retrieves all tables in the database
    '''
    
    cur.execute('SELECT name FROM sqlite_master WHERE type = \'table\';')
    return [x[0] for x in cur.fetchall()]

def fetch_column_names(cur: sqlite3.Cursor, table: str) -> [str]:
    '''
    Retrieves all columns in a table
    '''
    
    cur.execute('PRAGMA table_info(' + table + ');')
    return [x[1] for x in cur.fetchall()]

def fetch_col_values(cur: sqlite3.Cursor, table: str, col: str) -> []:
    '''
    Returns all values for a specific column
    '''
    
    cur.execute('SELECT ' + col + ' FROM ' + table + ';')
    return [x[0] for x in cur.fetchall()]

In [3]:
fetch_table_names(cur)

['SCHOOL_TRAIN', 'SCHOOL_TEST', 'POKEMON_TRAIN', 'POKEMON_TEST']

In [4]:
names = fetch_column_names(cur, 'POKEMON_TRAIN')
names

['rowid',
 'unique_id',
 'types',
 'abilities',
 'base_happiness',
 'height_m',
 'weight_kg',
 'poke_stats',
 'is_legendary']

In [5]:
import pandas as pd
import numpy as np
train = pd.DataFrame()

In [6]:
for name in names:
    train[name] = fetch_col_values(cur, 'POKEMON_TRAIN', name)

In [7]:
train.to_csv('Pokemon_train.csv', index=False)

# Cleaning

In [8]:
train.head(10)

Unnamed: 0,rowid,unique_id,types,abilities,base_happiness,height_m,weight_kg,poke_stats,is_legendary
0,1,665,"fighting, nan","['Guts', 'Sheer Force', 'Iron Fist']",70.0,0.6,12.5,"nan, 80.0, 55.0, 25.0, 35.0, 35.0",0
1,2,288,"dragon, nan","['Bulletproof', 'Soundproof', 'Overcoat']",70.0,0.6,29.7,"nan, nan, 65.0, 45.0, 45.0, 45.0",0
2,3,349,"poison, fire","['Corrosion', 'Oblivious']",70.0,1.2,22.2,"nan, 64.0, 60.0, 111.0, 60.0, 117.0",0
3,4,260,"ground, nan","['Rock Head', 'Lightningrod', 'Battle Armor']",70.0,0.4,6.5,"nan, 50.0, 95.0, 40.0, 50.0, 35.0",0
4,5,536,"water, nan","['Torrent', 'Sheer Force']",70.0,0.6,9.5,"nan, 65.0, 64.0, 44.0, 48.0, 43.0",0
5,6,293,"dragon, fire",['Turboblaze'],0.0,3.2,330.0,"nan, 120.0, 100.0, 150.0, 120.0, 90.0",1
6,7,181,"ground, dragon",['Levitate'],70.0,1.1,15.3,"nan, 70.0, 50.0, 50.0, 50.0, 70.0",0
7,8,554,"poison, ground","['Poison Point', 'Rivalry', 'Sheer Force']",70.0,1.3,60.0,"nan, 92.0, 87.0, 75.0, 85.0, 76.0",0
8,9,507,"bug, electric",['Battery'],70.0,0.5,10.5,"nan, 82.0, 95.0, 55.0, 75.0, 36.0",0
9,10,140,"rock, poison",['Beast Boost'],0.0,1.2,55.5,"109, 53.0, 47.0, 127.0, 131.0, 103.0",1


### Get unique pokemon types in this df

In [9]:
unq = set()
main = []
for row in train['types']:
    temp = row.split(', ')
    main.append(temp)
    for typ in temp:
        unq.add(typ)
print(unq)

for typ in unq:
    train[typ] = 0

{'grass', 'psychic', 'fire', 'ground', 'ghost', 'dragon', 'dark', 'flying', 'steel', 'ice', 'electric', 'normal', 'water', 'bug', 'poison', 'rock', 'nan', 'fairy', 'fighting'}


### Fills in pokemon types in df
1 if pokemon is of that type, 0 else

In [10]:
for typ in unq:
    i = 0
    for var in train[typ]:
        first,second = main[i][0],main[i][1]
        if (first == typ) or (second == typ):
            train[typ][i] = 1
        i += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [11]:
train.head()

Unnamed: 0,rowid,unique_id,types,abilities,base_happiness,height_m,weight_kg,poke_stats,is_legendary,grass,...,ice,electric,normal,water,bug,poison,rock,nan,fairy,fighting
0,1,665,"fighting, nan","['Guts', 'Sheer Force', 'Iron Fist']",70.0,0.6,12.5,"nan, 80.0, 55.0, 25.0, 35.0, 35.0",0,0,...,0,0,0,0,0,0,0,1,0,1
1,2,288,"dragon, nan","['Bulletproof', 'Soundproof', 'Overcoat']",70.0,0.6,29.7,"nan, nan, 65.0, 45.0, 45.0, 45.0",0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,349,"poison, fire","['Corrosion', 'Oblivious']",70.0,1.2,22.2,"nan, 64.0, 60.0, 111.0, 60.0, 117.0",0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,260,"ground, nan","['Rock Head', 'Lightningrod', 'Battle Armor']",70.0,0.4,6.5,"nan, 50.0, 95.0, 40.0, 50.0, 35.0",0,0,...,0,0,0,0,0,0,0,1,0,0
4,5,536,"water, nan","['Torrent', 'Sheer Force']",70.0,0.6,9.5,"nan, 65.0, 64.0, 44.0, 48.0, 43.0",0,0,...,0,0,0,1,0,0,0,1,0,0


In [12]:
import re

In [13]:
i = 0
for val in train['abilities']:
    t = val
    t = re.sub('[\'\[\]]', '', t)
    t = t.split(', ')
    abSum = len(t)
    train['abilities'][i] = abSum
    i += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [14]:
train.head()

Unnamed: 0,rowid,unique_id,types,abilities,base_happiness,height_m,weight_kg,poke_stats,is_legendary,grass,...,ice,electric,normal,water,bug,poison,rock,nan,fairy,fighting
0,1,665,"fighting, nan",3,70.0,0.6,12.5,"nan, 80.0, 55.0, 25.0, 35.0, 35.0",0,0,...,0,0,0,0,0,0,0,1,0,1
1,2,288,"dragon, nan",3,70.0,0.6,29.7,"nan, nan, 65.0, 45.0, 45.0, 45.0",0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,349,"poison, fire",2,70.0,1.2,22.2,"nan, 64.0, 60.0, 111.0, 60.0, 117.0",0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,260,"ground, nan",3,70.0,0.4,6.5,"nan, 50.0, 95.0, 40.0, 50.0, 35.0",0,0,...,0,0,0,0,0,0,0,1,0,0
4,5,536,"water, nan",2,70.0,0.6,9.5,"nan, 65.0, 64.0, 44.0, 48.0, 43.0",0,0,...,0,0,0,1,0,0,0,1,0,0


# Split poke_stats into seperate stat columns

In [15]:
poke_stats = train['poke_stats']
stat_type = ['HP', 'Atk', 'Def', 'SpA', 'SpD', 'Spe']

In [16]:
stat_list = []
for i in range(520):
    stat_list.append(poke_stats.iloc[i].split(', '))

In [17]:
stats = pd.DataFrame(stat_list, columns=stat_type)

In [18]:
stats.head()

Unnamed: 0,HP,Atk,Def,SpA,SpD,Spe
0,,80.0,55.0,25.0,35.0,35.0
1,,,65.0,45.0,45.0,45.0
2,,64.0,60.0,111.0,60.0,117.0
3,,50.0,95.0,40.0,50.0,35.0
4,,65.0,64.0,44.0,48.0,43.0


In [19]:
def combine_dfs(df1, df2):
    #adds df2 columns to the end of df1
    cols = df2.columns
    for col_names in cols:
        df1[col_names] = df2[col_names]
    return df1

In [20]:
train = combine_dfs(train, stats)

In [21]:
train.head(3)

Unnamed: 0,rowid,unique_id,types,abilities,base_happiness,height_m,weight_kg,poke_stats,is_legendary,grass,...,rock,nan,fairy,fighting,HP,Atk,Def,SpA,SpD,Spe
0,1,665,"fighting, nan",3,70.0,0.6,12.5,"nan, 80.0, 55.0, 25.0, 35.0, 35.0",0,0,...,0,1,0,1,,80.0,55.0,25.0,35.0,35.0
1,2,288,"dragon, nan",3,70.0,0.6,29.7,"nan, nan, 65.0, 45.0, 45.0, 45.0",0,0,...,0,1,0,0,,,65.0,45.0,45.0,45.0
2,3,349,"poison, fire",2,70.0,1.2,22.2,"nan, 64.0, 60.0, 111.0, 60.0, 117.0",0,0,...,0,0,0,0,,64.0,60.0,111.0,60.0,117.0


# Fix data types

In [22]:
train.HP  = train.HP.replace({"nan": np.nan})
train.Atk = train.Atk.replace({"nan": np.nan})
train.Def = train.Def.replace({"nan": np.nan})
train.SpA = train.SpA.replace({"nan": np.nan})
train.SpD = train.SpD.replace({"nan": np.nan})
train.Spe = train.Spe.replace({"nan": np.nan})

In [23]:
train['HP']             = pd.to_numeric(train['HP'])
train['Atk']            = pd.to_numeric(train['Atk'])
train['Def']            = pd.to_numeric(train['Def'])
train['SpA']            = pd.to_numeric(train['SpA'])
train['SpD']            = pd.to_numeric(train['SpD'])
train['Spe']            = pd.to_numeric(train['Spe'])
train['height_m']       = pd.to_numeric(train['height_m'])
train['weight_kg']      = pd.to_numeric(train['weight_kg'])
train['base_happiness'] = pd.to_numeric(train['base_happiness'])
train['unique_id']      = train['unique_id'].astype(np.int64)
train['is_legendary']   = train['is_legendary'].astype(np.int64)
train['abilities']      = train['abilities'].astype(np.int64)

In [24]:
train.dtypes

rowid               int64
unique_id           int64
types              object
abilities           int64
base_happiness    float64
height_m          float64
weight_kg         float64
poke_stats         object
is_legendary        int64
grass               int64
psychic             int64
fire                int64
ground              int64
ghost               int64
dragon              int64
dark                int64
flying              int64
steel               int64
ice                 int64
electric            int64
normal              int64
water               int64
bug                 int64
poison              int64
rock                int64
nan                 int64
fairy               int64
fighting            int64
HP                float64
Atk               float64
Def               float64
SpA               float64
SpD               float64
Spe               float64
dtype: object

## Cleaning mostly done

In [25]:
train.head()

Unnamed: 0,rowid,unique_id,types,abilities,base_happiness,height_m,weight_kg,poke_stats,is_legendary,grass,...,rock,nan,fairy,fighting,HP,Atk,Def,SpA,SpD,Spe
0,1,665,"fighting, nan",3,70.0,0.6,12.5,"nan, 80.0, 55.0, 25.0, 35.0, 35.0",0,0,...,0,1,0,1,,80.0,55.0,25.0,35.0,35.0
1,2,288,"dragon, nan",3,70.0,0.6,29.7,"nan, nan, 65.0, 45.0, 45.0, 45.0",0,0,...,0,1,0,0,,,65.0,45.0,45.0,45.0
2,3,349,"poison, fire",2,70.0,1.2,22.2,"nan, 64.0, 60.0, 111.0, 60.0, 117.0",0,0,...,0,0,0,0,,64.0,60.0,111.0,60.0,117.0
3,4,260,"ground, nan",3,70.0,0.4,6.5,"nan, 50.0, 95.0, 40.0, 50.0, 35.0",0,0,...,0,1,0,0,,50.0,95.0,40.0,50.0,35.0
4,5,536,"water, nan",2,70.0,0.6,9.5,"nan, 65.0, 64.0, 44.0, 48.0, 43.0",0,0,...,0,1,0,0,,65.0,64.0,44.0,48.0,43.0
